# Importing Libraries

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Loading the IMDb India Movies Dataset

In [4]:
# Load the dataset
file_path = "/content/IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Split the data into features (X) and target variable (y)
X = df[['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
y = df['Rating']

In [12]:
df


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


# Data Exploration and Pre processing

In [13]:
df.head

<bound method NDFrame.head of                                      Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes  

In [14]:
df.shape

(15509, 10)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [16]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [17]:
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [18]:
df.isnull().mean()*100

Name         0.000000
Year         3.404475
Duration    53.317429
Genre       12.102650
Rating      48.939326
Votes       48.932878
Director     3.385131
Actor 1     10.426204
Actor 2     15.371720
Actor 3     20.272100
dtype: float64

In [20]:
df.dropna(inplace= True)

In [21]:
df.isnull().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [22]:
df['Name'] = df['Name'].str.extract('([A-Za-z\s\'\-]+)')

In [23]:
df['Name']

1                  Gadhvi 
3                   Yaaram
5        Aur Pyaar Ho Gaya
6                   Yahaan
8          A Question Mark
               ...        
15493               Zubaan
15494             Zubeidaa
15503      Zulm Ki Zanjeer
15505                Zulmi
15508         Zulm-O-Sitam
Name: Name, Length: 5659, dtype: object

In [24]:
df['Year'] = df['Year'].str.replace(r'[()]', '', regex=True).astype(int)

In [25]:
df['Year']

1        2019
3        2019
5        1997
6        2005
8        2012
         ... 
15493    2015
15494    2001
15503    1989
15505    1999
15508    1998
Name: Year, Length: 5659, dtype: int64

In [26]:
df['Duration'] = pd.to_numeric(df['Duration'].str.replace(r' min', '', regex=True), errors='coerce')

In [27]:
df['Duration']

1        109
3        110
5        147
6        142
8         82
        ... 
15493    115
15494    153
15503    125
15505    129
15508    130
Name: Duration, Length: 5659, dtype: int64

In [30]:
df['Votes'] = df['Votes'].str.replace(',', '', regex=True).astype(int)

In [31]:
df['Votes']

1           8
3          35
5         827
6        1086
8         326
         ... 
15493     408
15494    1496
15503      44
15505     655
15508      20
Name: Votes, Length: 5659, dtype: int64

In [32]:
df.shape

(5659, 10)

In [33]:
df.drop_duplicates(inplace = True)

In [34]:
df.shape

(5659, 10)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5652 non-null   object 
 1   Year      5659 non-null   int64  
 2   Duration  5659 non-null   int64  
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   int64  
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 486.3+ KB


In [36]:
df.describe()

Unnamed: 0,Year,Duration,Rating,Votes
count,5659.0,5659.0,5659.0,5659.0
mean,1996.24757,133.439124,5.898533,2697.649585
std,19.741839,25.319939,1.381165,13651.503584
min,1931.0,21.0,1.1,5.0
25%,1983.0,119.0,5.0,30.0
50%,2002.0,135.0,6.1,131.0
75%,2013.0,150.0,6.9,922.5
max,2021.0,321.0,10.0,591417.0


# Data Encoding

In [39]:
### Converting text data to numeric form
categorical_variables = ['Genre', 'Director', 'Actor 1','Actor 2','Actor 3']
for feature in categorical_variables:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

# Features and and Target variables

In [41]:
### Defining Features and and Target variables
X = df[['Genre', 'Director', 'Actor 1','Actor 2','Actor 3']]
y = df['Rating']

In [43]:
X

Unnamed: 0,Genre,Director,Actor 1,Actor 2,Actor 3
1,229,629,1352,2272,319
3,184,1335,1198,719,2148
5,157,1530,378,75,2045
6,289,2044,692,1112,2524
8,320,135,1934,1175,1013
...,...,...,...,...,...
15493,229,1223,1861,1801,1615
15494,133,2059,763,1619,1184
15503,28,1793,406,754,1685
15505,38,1025,112,2164,314


In [44]:
y

1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15493    6.1
15494    6.2
15503    5.8
15505    4.5
15508    6.2
Name: Rating, Length: 5659, dtype: float64

In [46]:
scaler = StandardScaler()

In [47]:
features_scaled = scaler.fit_transform(X)

In [49]:
X= features_scaled
y = df['Rating']

In [50]:
X

array([[ 0.52931791, -0.84886812,  0.6816723 ,  1.63359215, -1.32934107],
       [ 0.11341838,  0.17933035,  0.40902916, -0.69552904,  1.21099562],
       [-0.13612133,  0.46332284, -1.04270705, -1.66137196,  1.06793675],
       ...,
       [-1.32836665,  0.84634861, -0.99313557, -0.64303758,  0.56792515],
       [-1.23594453, -0.2721449 , -1.51363611,  1.47161849, -1.33628568],
       [-1.23594453, -0.46147323, -0.88337015, -0.64453734, -1.35156381]])

In [51]:
y

1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15493    6.1
15494    6.2
15503    5.8
15505    4.5
15508    6.2
Name: Rating, Length: 5659, dtype: float64

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [53]:
print(X.shape, X_train.shape, X_test.shape)

(5659, 5) (3961, 5) (1698, 5)


# Model Selection and Training

In [54]:
## Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions and Model Evaluation

In [55]:
### We will use test set to make predictions
y_pred = model.predict(X_test)

In [56]:
## Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [57]:
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 1.81
R-squared: 0.01
