In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [45]:
df = pd.read_csv("IMDb Movies India.csv", encoding="latin1")

In [46]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [47]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [49]:
df.isnull()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,False,True,True,False,True,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
15504,False,False,True,False,False,False,False,False,False,False
15505,False,False,False,False,False,False,False,False,False,False
15506,False,False,True,False,True,True,False,False,True,True
15507,False,False,True,False,True,True,True,True,True,True


In [50]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [51]:
df.dtypes

Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [52]:
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [53]:
df['Year'].mode()

0    (2019)
Name: Year, dtype: object

In [54]:
df['Duration'].mode()

0    120 min
Name: Duration, dtype: object

In [55]:
df['Genre'].mode()

0    Drama
Name: Genre, dtype: object

In [56]:
df['Votes'].mode()

0    8
Name: Votes, dtype: object

In [57]:
df['Rating'].mode()

0    6.2
Name: Rating, dtype: float64

In [58]:
df['Director'].mode()

0    Jayant Desai
Name: Director, dtype: object

In [59]:
df['Director'].fillna('Jayant Desai',inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Director'].fillna('Jayant Desai',inplace = True)


In [60]:
df['Actor 1'].mode()

0    Ashok Kumar
Name: Actor 1, dtype: object

In [61]:
df['Actor 2'].mode()

0    Rekha
Name: Actor 2, dtype: object

In [62]:
df['Actor 3'].mode()

0    Pran
Name: Actor 3, dtype: object

In [63]:
df_encoded = pd.get_dummies(df,drop_first = True)
df_encoded.head()

Unnamed: 0,Rating,Name_#Gadhvi (He thought he was Gandhi),Name_#Homecoming,Name_#Yaaram,Name_'83,Name_'Ferry',Name_'Sambandh',Name_...And Once Again,Name_...Aur Pyaar Ho Gaya,Name_...Yahaan,...,Actor 3_Zeenat Aman,Actor 3_Zeeshan Khan,Actor 3_Zeishan Quadri,Actor 3_Zenobia Shroff,Actor 3_Zohra,Actor 3_Zoya Hussain,Actor 3_Zubeida,Actor 3_Zuber K. Khan,Actor 3_Zulfi Sayed,Actor 3_Zunaid Memon
0,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4.4,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [64]:
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director       0
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [65]:
cat_cols = df.select_dtypes(include= ['object']).columns
cat_cols

Index(['Name', 'Year', 'Duration', 'Genre', 'Votes', 'Director', 'Actor 1',
       'Actor 2', 'Actor 3'],
      dtype='object')

In [67]:
X = df_encoded.drop('Rating',axis = 1)
y = df_encoded['Rating']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2 , random_state = 42)

In [None]:
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R2 Score:", r2)