# Movie Rating Prediction 

# 1.Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

movie_data = pd.read_csv('IMDb Movies India.csv',encoding='ISO-8859-1')
movie_data.head()


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [2]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [3]:
# Remove the first row
movie_data = movie_data.drop(0, axis=0)
movie_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [4]:
# Extract year and convert to float
movie_data['Year'] = movie_data['Year'].str.extract('(\d{4})').astype(float)

# fill NaN with median year
median_year = movie_data['Year'].median()
movie_data['Year'].fillna(median_year, inplace=True)

# convert to integer
movie_data['Year'] = movie_data['Year'].astype(int)

print(movie_data['Year'].head())

1    2019
2    2021
3    2019
4    2010
5    1997
Name: Year, dtype: int32


In [5]:
# Extract the numeric part of the 'Duration' column and convert it to float
movie_data['Duration'] = movie_data['Duration'].str.extract('(\d+)').astype(float)

# Calculate the median of the 'Duration' column
median_duration = movie_data['Duration'].median()

# Fill missing values in 'Duration' with the median
movie_data['Duration'].fillna(median_duration, inplace=True)

# convert 'Duration' to integer if you don't need decimal precision
movie_data['Duration'] = movie_data['Duration'].astype(int)

print(movie_data['Duration'].head())

1    109
2     90
3    110
4    105
5    147
Name: Duration, dtype: int32


In [6]:
# Fill missing 'Genre' values with the most common genre
most_common_genre = movie_data['Genre'].mode()[0]
movie_data['Genre'].fillna(most_common_genre, inplace=True)

In [7]:
# Fill missing values for 'Director' and 'Actors' with 'Unknown'
movie_data['Director'].fillna('Unknown', inplace=True)
movie_data['Actor 1'].fillna('Unknown', inplace=True)
movie_data['Actor 2'].fillna('Unknown', inplace=True)
movie_data['Actor 3'].fillna('Unknown', inplace=True)

In [8]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15508 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15508 non-null  object 
 1   Year      15508 non-null  int32  
 2   Duration  15508 non-null  int32  
 3   Genre     15508 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  15508 non-null  object 
 7   Actor 1   15508 non-null  object 
 8   Actor 2   15508 non-null  object 
 9   Actor 3   15508 non-null  object 
dtypes: float64(1), int32(2), object(7)
memory usage: 1.1+ MB


In [9]:
# Convert 'Votes' to numeric, setting errors to 'coerce' which converts non-convertible values to NaN
movie_data['Votes'] = pd.to_numeric(movie_data['Votes'], errors='coerce')

In [10]:
# Fill NaNs in 'Votes' with the median
votes_median = movie_data['Votes'].median()
movie_data['Votes'].fillna(votes_median, inplace=True)

In [11]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15508 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15508 non-null  object 
 1   Year      15508 non-null  int32  
 2   Duration  15508 non-null  int32  
 3   Genre     15508 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     15508 non-null  float64
 6   Director  15508 non-null  object 
 7   Actor 1   15508 non-null  object 
 8   Actor 2   15508 non-null  object 
 9   Actor 3   15508 non-null  object 
dtypes: float64(2), int32(2), object(6)
memory usage: 1.1+ MB


In [12]:
# Create a model to fill in missing 'Rating' based on other numerical columns
model = LinearRegression()

# Train the model on available data
train_data = movie_data.dropna(subset=['Rating'])
model.fit(train_data[['Year', 'Duration', 'Votes']], train_data['Rating'])

# Predict and fill missing ratings
missing_data = movie_data[movie_data['Rating'].isnull()]
predicted_ratings = model.predict(missing_data[['Year', 'Duration', 'Votes']])
movie_data.loc[movie_data['Rating'].isnull(), 'Rating'] = predicted_ratings

In [13]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15508 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15508 non-null  object 
 1   Year      15508 non-null  int32  
 2   Duration  15508 non-null  int32  
 3   Genre     15508 non-null  object 
 4   Rating    15508 non-null  float64
 5   Votes     15508 non-null  float64
 6   Director  15508 non-null  object 
 7   Actor 1   15508 non-null  object 
 8   Actor 2   15508 non-null  object 
 9   Actor 3   15508 non-null  object 
dtypes: float64(2), int32(2), object(6)
memory usage: 1.1+ MB


# 2.Feature Engineering

In [14]:
# A. Textual Features

# 1. Number of Genres
movie_data['Genre_Count'] = movie_data['Genre'].apply(lambda x: len(x.split(', ')))


In [15]:
# 2. Genre Presence (One-Hot Encoding already planned to be handled in ColumnTransformer)

# B. Temporal Features

# 3. Movie Age
current_year = 2024
movie_data['Movie_Age'] = current_year - movie_data['Year']

In [16]:
# C. Interaction Features

# Example: interaction between director and genres might be complex to capture directly without creating many sparse columns,
# a simpler interaction might be between movie age and votes
# However, here's a placeholder for potential interaction features to might consider
# This requires careful consideration and domain knowledge to implement effectively

# Set up the ColumnTransformer with OneHotEncoder for genres
categorical_features = ['Genre']
numeric_features = ['Year', 'Duration', 'Votes', 'Genre_Count', 'Movie_Age']

one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features),
        ('num', 'passthrough', numeric_features)
    ],
    remainder='drop')  # Adjust as necessary


In [17]:
# 3. Modeling Setup (using RandomForest)
X = movie_data.drop(['Rating', 'Name', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], axis=1)  # Adjust as necessary
y = movie_data['Rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = model_pipeline.predict(X_test)

# Calculate and print the Root Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 0.8669538135327022
