In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Step 1: Data Investigation

In [2]:
# Load the raw IMDb India movie listing, decoding the file as latin1 text.
# NOTE(review): the path is an absolute path on the author's machine; anyone
# else running this notebook has to point it at their own copy of the CSV.
df = pd.read_csv(
    filepath_or_buffer="D:/projects/CodSoft_Projects/Task2/IMDb Movies India.csv",
    encoding='latin1',
)
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [3]:
# (rows, columns) of the raw frame as loaded from the CSV.
df.shape

(15509, 10)

In [4]:
# Per-column dtype and non-null count, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


## Dropping null values

In [5]:
# Discard every row that has at least one missing value (the dropna defaults,
# spelled out), modifying the frame in place, then re-check the schema.
df.dropna(axis=0, how='any', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   object 
 2   Duration  5659 non-null   object 
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   object 
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), object(9)
memory usage: 486.3+ KB


In [6]:
# Summary statistics; at this point Rating is the only numeric column.
df.describe()

Unnamed: 0,Rating
count,5659.0
mean,5.898533
std,1.381165
min,1.1
25%,5.0
50%,6.1
75%,6.9
max,10.0


# Step 2: Data Exploration, Analysis, and Preprocessing

## Top rated movie

In [7]:
# Row(s) whose rating equals the highest rating present in the data.
# The maximum is computed instead of hard-coding 10 so the cell stays correct
# if the dataset changes, mirroring how the top-voted movie is selected below.
df[df['Rating'] == df['Rating'].max()]

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
8339,Love Qubool Hai,(2020),94 min,"Drama, Romance",10.0,5,Saif Ali Sayeed,Ahaan Jha,Mahesh Narayan,Rajasree Rajakumari


## Extracting numeric features and converting the columns

In [8]:
# Year is stored like "(2019)" and Duration like "109 min": keep only the digits.
# The pattern is a raw string so "\d" is not treated as an (invalid) Python
# string escape, and expand=False makes str.extract return a Series for the
# single capture group instead of a one-column DataFrame.
df['Year'] = df['Year'].str.extract(r'(\d+)', expand=False).astype(int)
df['Duration'] = df['Duration'].str.extract(r'(\d+)', expand=False).astype(int)
# Strip commas (presumably thousands separators) as a literal, not a regex,
# before casting the vote counts to integers.
df['Votes'] = df['Votes'].str.replace(',', '', regex=False).astype(int)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   int32  
 2   Duration  5659 non-null   int32  
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   int32  
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), int32(3), object(6)
memory usage: 420.0+ KB


## Top voted movie

In [9]:
# Distribution of vote counts after conversion to integers.
df['Votes'].describe()

count      5659.000000
mean       2697.649585
std       13651.503584
min           5.000000
25%          30.000000
50%         131.000000
75%         922.500000
max      591417.000000
Name: Votes, dtype: float64

In [10]:
# Row(s) holding the largest vote count in the cleaned data.
df.loc[df['Votes'].eq(df['Votes'].max())]

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
8219,Life of Pi,2012,127,"Adventure, Drama, Fantasy",7.9,591417,Ang Lee,Suraj Sharma,Irrfan Khan,Adil Hussain


## Most Frequent Genres

In [11]:
# How often each exact genre combination (the full comma-separated string) occurs.
df['Genre'].value_counts()

Drama                              844
Drama, Romance                     332
Action, Crime, Drama               329
Action, Drama                      206
Comedy, Drama                      205
                                  ... 
Comedy, Crime, Musical               1
History, Romance                     1
Drama, History, Sport                1
Animation, Comedy, Drama             1
Documentary, Biography, Musical      1
Name: Genre, Length: 376, dtype: int64

## Top rated years 

In [12]:
# Mean rating per release year, highest-rated years first.
years = (
    df.groupby('Year')['Rating']
    .mean()
    .sort_values(ascending=False)
)
years.head()

Year
1952    7.212500
1957    7.080645
1944    7.075000
1940    7.050000
1953    6.955000
Name: Rating, dtype: float64

## Top voted years

In [13]:
# Mean vote count per release year, most-voted years first.
# Note: this rebinds `years`, replacing the per-year rating means computed above.
years = (
    df.groupby('Year')['Votes']
    .mean()
    .sort_values(ascending=False)
)
years.head()

Year
2012    9900.971429
2007    7212.017094
2016    6780.601156
2009    5938.805085
2020    5819.257143
Name: Votes, dtype: float64

## Most frequent directors

In [14]:
# Number of movies per director in the cleaned data.
df['Director'].value_counts()

David Dhawan            41
Mahesh Bhatt            39
Ram Gopal Varma         33
Hrishikesh Mukherjee    33
Shakti Samanta          33
                        ..
Sriram Raja              1
Randeep Jha              1
Vicky Bhardwaj           1
Salar Shaikh             1
Mozez Singh              1
Name: Director, Length: 2431, dtype: int64

## Most frequent actors

In [15]:
# Stack the three billing columns into one long Series so that an actor is
# counted once per movie appearance, whichever slot they were listed in.
actors = pd.concat([df[column] for column in ('Actor 1', 'Actor 2', 'Actor 3')])
actors.value_counts()

Mithun Chakraborty    160
Amitabh Bachchan      148
Dharmendra            146
Ashok Kumar           124
Akshay Kumar          120
                     ... 
Rajeev Dassani          1
Rehaan Engineer         1
Glenn                   1
Pernia Qureshi          1
Shatakshi Gupta         1
Length: 5041, dtype: int64

# Step 3: Feature Engineering

## Combining features

In [16]:
# Join the three actor names of each movie into one ", "-separated string.
df['Combined Actors'] = df['Actor 1'] + ', ' + df['Actor 2'] + ', ' + df['Actor 3']
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Combined Actors
1,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,"Rasika Dugal, Vivek Ghamande, Arvind Jangid"
3,#Yaaram,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,"Prateik, Ishita Raj, Siddhant Kapoor"
5,...Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,"Bobby Deol, Aishwarya Rai Bachchan, Shammi Kapoor"
6,...Yahaan,2005,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,"Jimmy Sheirgill, Minissha Lamba, Yashpal Sharma"
8,?: A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,"Yash Dave, Muntazir Ahmad, Kiran Bhatia"


In [17]:
# How often each exact three-actor combination (same names, same order) repeats.
df['Combined Actors'].value_counts()

Dharmendra, Sunny Deol, Bobby Deol                  3
Anil Kapoor, Sridevi, Anupam Kher                   3
Dibakar Banerjee, Karan Johar, Anurag Kashyap       3
Sunny Deol, Anil Kapoor, Sridevi                    2
Dharmendra, Saira Banu, Feroz Khan                  2
                                                   ..
Sunny Deol, Meenakshi Sheshadri, Danny Denzongpa    1
Mani Kaul, Saeed Akhtar Mirza, Kamal Swaroop        1
Anil Kapoor, Raveena Tandon, Rambha                 1
Amol Palekar, Zarina Wahab, Sadhu Meher             1
Dharmendra, Jaya Prada, Arjun Sarja                 1
Name: Combined Actors, Length: 5604, dtype: int64

## Dropping unused columns

In [18]:
# The title and the individual actor columns are not used past this point
# (the actors now live in 'Combined Actors'), so remove them in place.
df.drop(columns=['Name', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)

df.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Combined Actors
1,2019,109,Drama,7.0,8,Gaurav Bakshi,"Rasika Dugal, Vivek Ghamande, Arvind Jangid"
3,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,"Prateik, Ishita Raj, Siddhant Kapoor"
5,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,"Bobby Deol, Aishwarya Rai Bachchan, Shammi Kapoor"
6,2005,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,"Jimmy Sheirgill, Minissha Lamba, Yashpal Sharma"
8,2012,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,"Yash Dave, Muntazir Ahmad, Kiran Bhatia"


## Frequency Encoding 

In [19]:
# Frequency encoding: replace each categorical value by the number of rows
# that share that exact value.
# NOTE(review): the counts are computed over every row of df, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# encoded features. Consider deriving the counts from the training split only.
genre_counts = df['Genre'].value_counts().to_dict()
director_counts = df['Director'].value_counts().to_dict()
actors_counts = df['Combined Actors'].value_counts().to_dict()

# Append the encoded columns, then drop the text columns they were derived from.
df = df.assign(
    genre_encoded=df['Genre'].map(genre_counts),
    director_encoded=df['Director'].map(director_counts),
    actors_encoded=df['Combined Actors'].map(actors_counts),
).drop(columns=['Genre', 'Director', 'Combined Actors'])

df.head()

Unnamed: 0,Year,Duration,Rating,Votes,genre_encoded,director_encoded,actors_encoded
1,2019,109,7.0,8,844,1,1
3,2019,110,4.4,35,107,1,1
5,1997,147,4.7,827,45,15,1
6,2005,142,7.4,1086,5,6,1
8,2012,82,5.6,326,9,1,1


In [20]:
# Confirm that every remaining column is numeric before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5659 entries, 1 to 15508
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              5659 non-null   int32  
 1   Duration          5659 non-null   int32  
 2   Rating            5659 non-null   float64
 3   Votes             5659 non-null   int32  
 4   genre_encoded     5659 non-null   int64  
 5   director_encoded  5659 non-null   int64  
 6   actors_encoded    5659 non-null   int64  
dtypes: float64(1), int32(3), int64(3)
memory usage: 287.4 KB


# Step 4: Data splitting and Model training/evaluation

In [21]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

## Data Splitting

In [22]:
# The target is the rating; the features are every other column.
y = df['Rating']
X = df.drop(columns='Rating')
X.head()

Unnamed: 0,Year,Duration,Votes,genre_encoded,director_encoded,actors_encoded
1,2019,109,8,844,1,1
3,2019,110,35,107,1,1
5,1997,147,827,45,15,1
6,2005,142,1086,5,6,1
8,2012,82,326,9,1,1


In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape[0], X_test.shape[0]

(4527, 1132)

## Model Training

In [24]:
# Candidate regressors, keyed by the short name used in the results below.
regressors = {
    'lr': LinearRegression(),
    'knn': KNeighborsRegressor(n_neighbors=5),
    'svm': SVR(),
    'dt': DecisionTreeRegressor(random_state=1),
    'rf': RandomForestRegressor(n_estimators=100, random_state=1),
    'gb': GradientBoostingRegressor(n_estimators=100, random_state=60),
    'lgb': LGBMRegressor(n_estimators=100, random_state=60),
    'cat': CatBoostRegressor(n_estimators=100, random_state=1, verbose=False),
    'xgb': XGBRegressor(n_estimators=100, random_state=1),
}

# Every candidate gets its own StandardScaler in front of it, so all models
# are trained on identically pre-processed inputs.
pipelines = {
    name: make_pipeline(StandardScaler(), regressor)
    for name, regressor in regressors.items()
}

In [25]:
# Train every pipeline on the training split and keep the fitted pipelines
# under the same short names.
fit_models = {
    algo: pipeline.fit(X_train, y_train)
    for algo, pipeline in pipelines.items()
}

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 573
[LightGBM] [Info] Number of data points in the train set: 4527, number of used features: 6
[LightGBM] [Info] Start training from score 5.892931


## Model Evaluation

In [26]:
# Score every fitted model on the held-out split.
for algo, model in fit_models.items():
    pred = model.predict(X_test)

    # R² is multiplied by 100, so it is printed as a percentage; MAE and MSE
    # are in rating points and squared rating points respectively.
    r2_percent = r2_score(y_test, pred) * 100
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)

    print("The r2 score of {} model is: {}".format(algo, r2_percent))
    print("The mean absolute error of {} model is: {}".format(algo, mae))
    print("The mean squared error of {} model is: {}".format(algo, mse))
    print()

The r2 score of lr model is: 8.594424975564896
The mean absolute error of lr model is: 1.0527545782670058
The mean squared error of lr model is: 1.6925969277246453

The r2 score of knn model is: 7.638422293689873
The mean absolute error of knn model is: 1.0159717314487633
The mean squared error of knn model is: 1.7102996466431097

The r2 score of svm model is: 24.92156813758252
The mean absolute error of svm model is: 0.9091456223854014
The mean squared error of svm model is: 1.3902600916272425

The r2 score of dt model is: -28.25129244927682
The mean absolute error of dt model is: 1.158922261484099
The mean squared error of dt model is: 2.3748851590106006

The r2 score of rf model is: 29.044459541725608
The mean absolute error of rf model is: 0.857489399293286
The mean squared error of rf model is: 1.3139147120141343

The r2 score of gb model is: 30.839144379469396
The mean absolute error of gb model is: 0.8609344768128369
The mean squared error of gb model is: 1.280681749563134

The 

### The best model is the LightGBM (light gradient boosting) model, with an R² score of 31.32%

# Note: Since the score is low, we may need to go back to the feature engineering phase and redo it to see whether the score improves

### Saving the best model

In [29]:
import pickle

# Persist the fitted LightGBM pipeline (scaler + regressor) to disk.
# NOTE(review): the destination is an absolute path on the author's machine.
best_model = fit_models['lgb']
with open('D:/projects/CodSoft_Projects/Task2/Movies.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)