In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,r2_score,mean_absolute_error,mean_squared_error


Read data


In [13]:
movie_data=pd.read_csv("IMDb_Movies_India.csv",encoding='latin1')

Preprocess

In [14]:
movie_data.head()
movie_data.isnull().sum()



Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

remove null rows

In [15]:
movie_data.dropna(subset=['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)

remove duplictes

In [16]:
movie_data['Name'] = movie_data['Name'].str.extract('([A-Za-z\s\'\-]+)')
movie_data['Year'] = movie_data['Year'].astype(str).str.replace(r'[()]', '',regex=True).astype(int)
movie_data['Duration']=pd.to_numeric(movie_data['Duration'].astype(str).str.replace(r' min','',regex=True),errors='coerce')
movie_data['Votes']=pd.to_numeric(movie_data['Votes'].astype(str).str.replace(',',''),errors='coerce')
movie_data['Genre']=movie_data['Genre'].str.split(',')
movie_data=movie_data.explode('Genre')
movie_data['Genre'].fillna(movie_data['Genre'].mode()[0],inplace=True)



duplicate = movie_data.groupby(['Name', 'Year']).filter(lambda x: len(x) > 1)
duplicate.head(5)


  movie_data['Name'] = movie_data['Name'].str.extract('([A-Za-z\s\'\-]+)')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movie_data['Genre'].fillna(movie_data['Genre'].mode()[0],inplace=True)


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
3,Yaaram,2019,110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,Yaaram,2019,110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,1997,147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,1997,147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,1997,147,Musical,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [17]:
movie_data.drop_duplicates(subset=['Name'],keep=False)
movie_data.drop('Name',axis=1,inplace=True)
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12008 entries, 1 to 15508
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      12008 non-null  int32  
 1   Duration  12008 non-null  int64  
 2   Genre     12008 non-null  object 
 3   Rating    12008 non-null  float64
 4   Votes     12008 non-null  int64  
 5   Director  12008 non-null  object 
 6   Actor 1   12008 non-null  object 
 7   Actor 2   12008 non-null  object 
 8   Actor 3   12008 non-null  object 
dtypes: float64(1), int32(1), int64(2), object(5)
memory usage: 891.2+ KB


In [18]:
genre_mean = movie_data.groupby('Genre')['Rating'].transform('mean')
movie_data['Genre_group'] = genre_mean
director_mean=movie_data.groupby('Director')['Rating'].transform('mean')
movie_data['Director_group']=director_mean
actor_1_mean=movie_data.groupby('Actor 1')['Rating'].transform('mean')
movie_data['Actor_1_group']=actor_1_mean
actor_2_mean=movie_data.groupby('Actor 2')['Rating'].transform('mean')
movie_data['Actor_2_group']=actor_2_mean
actor_3_mean=movie_data.groupby('Actor 3')['Rating'].transform('mean')
movie_data['Actor_3_group']=actor_3_mean
movie_data

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Genre_group,Director_group,Actor_1_group,Actor_2_group,Actor_3_group
1,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.248697,7.000000,6.850000,7.000000,7.000000
3,2019,110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.838423,4.400000,5.250000,4.400000,4.460000
3,2019,110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.838739,4.400000,5.250000,4.400000,4.460000
5,1997,147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,5.838423,5.335135,4.793617,5.730000,5.930000
5,1997,147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,5.882451,5.335135,4.793617,5.730000,5.930000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15503,1989,125,Drama,5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,5.882451,6.290476,6.377419,5.800000,6.018750
15505,1999,129,Action,4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,5.511985,5.175000,5.440845,4.603704,5.754545
15505,1999,129,Drama,4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,5.882451,5.175000,5.440845,4.603704,5.754545
15508,1998,130,Action,6.2,20,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja,5.511985,4.090625,6.045128,5.632558,5.780000


Model

In [19]:
model1=RandomForestRegressor()
model2=LinearRegression()
x=movie_data[['Year','Votes','Duration','Genre_group','Director_group','Actor_1_group','Actor_2_group','Actor_3_group']]
y=movie_data[['Rating']]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=20)

In [20]:
model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
lin_pred=model2.predict(x_test)
rand_pred=model1.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [21]:
print('Random Forest Regressor: ')
print('mean_squared_error: ',mean_squared_error(y_test,rand_pred))
print('mean_absolute_error: ',mean_absolute_error(y_test,rand_pred))
print('r2_score: ',r2_score(y_test,rand_pred))
print("/n")
print('Linear Regression: ')
print('mean_squared_error: ',mean_squared_error(y_test,lin_pred))
print('mean_absolute_error: ',mean_absolute_error(y_test,lin_pred))
print('r2_score: ',r2_score(y_test,lin_pred))

Random Forest Regressor: 
mean_squared_error:  0.1016911136552873
mean_absolute_error:  0.17933014154870958
r2_score:  0.9465091792923737
/n
Linear Regression: 
mean_squared_error:  0.43576577729566723
mean_absolute_error:  0.4892541008332029
r2_score:  0.7707816521426202


Since Random Forest has higher R2_score, it will be used

In [22]:
model=model1

Build interface 

In [23]:
import gradio as gr
def predict(name,year,votes,duration,genre_mean_rating,director_mean_rating,actor_1_mean_rating,actor_2_mean_rating,actor_3_mean_rating):
    year=int(year)
    votes=int(votes)
    duration=int(duration)
    genre_mean_rating=float(genre_mean_rating)
    director_mean_rating=float(director_mean_rating)
    actor_1_mean_rating=float(director_mean_rating)
    actor_2_mean_rating=float(actor_2_mean_rating)
    actor_3_mean_rating=float(actor_3_mean_rating)
    # print(year,votes,duration,genre_mean_rating,actor_1_mean_rating,actor_2_mean_rating,actor_3_mean_rating)
    features=[[year,votes,duration,genre_mean_rating,director_mean_rating,actor_1_mean_rating,actor_2_mean_rating,actor_3_mean_rating]]
    prediction=model.predict(features)[0]
    return prediction

demo = gr.Interface(
    fn=predict,
    inputs=[
        "text",
        "number",
        "number",
        "number",
        "number",
        "number",
        "number",
        "number",
        "number",        
    ],
    outputs=["text"],
    
)
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7875
Running on public URL: https://fee174b74c79c68f27.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




