In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Read the movie listing into a DataFrame. The encoding is given explicitly as
# latin1 -- presumably the file is not valid UTF-8 (TODO confirm).
csv_path = "IMDbMoviesIndia.csv"
data = pd.read_csv(csv_path, encoding='latin1')

# Report (rows, columns), then preview the first rows as the cell's output.
rows_and_cols = data.shape
print(rows_and_cols)
data.head()

(15509, 10)


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [5]:
# List the column labels of the freshly loaded frame.
column_labels = data.columns
print(column_labels)

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


In [6]:
# Convert strings such as "109 min" to a float number of minutes by taking the
# first run of digits; entries with no digits (or missing values) become NaN.
# The pattern is a raw string: in a normal literal '\d' is an invalid escape
# sequence and makes Python emit a warning when the cell is compiled.
# expand=False returns a Series (one capture group) instead of a one-column
# DataFrame, which is the natural thing to assign back to a single column.
data['Duration'] = data['Duration'].str.extract(r'(\d+)', expand=False).astype(float)

  data['Duration'] = data['Duration'].str.extract('(\d+)').astype(float)


In [7]:
# Remove thousands separators, then parse the remainder as a number.
# errors='coerce' turns anything that still fails to parse into NaN.
votes_without_commas = data['Votes'].str.replace(',', '')
data['Votes'] = pd.to_numeric(votes_without_commas, errors='coerce')

In [8]:
# Convert strings such as "(2019)" to a float year by taking the first run of
# digits; entries with no digits (or missing values) become NaN.
# The pattern is a raw string: in a normal literal '\d' is an invalid escape
# sequence and makes Python emit a warning when the cell is compiled.
# expand=False returns a Series (one capture group) instead of a one-column
# DataFrame, which is the natural thing to assign back to a single column.
data['Year'] = data['Year'].str.extract(r'(\d+)', expand=False).astype(float)

  data['Year'] = data['Year'].str.extract('(\d+)').astype(float)


In [9]:
# Keep only rows with no missing value in ANY column. Note that this also
# discards rows whose only gap is in a column that is not used as a model
# input later (e.g. Name, Actor 2, Actor 3).
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
data = data.dropna().copy()
print(data.shape)

(5659, 10)


In [10]:
# Frequency-encode the director: each row gets the number of rows in the frame
# that share its director. The counts are taken over the whole frame, i.e.
# before the train/test split that happens further down.
director_counts = data['Director'].value_counts()
data['Director_freq'] = data['Director'].map(director_counts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Director_freq'] = data['Director'].map(data['Director'].value_counts())


In [12]:
# Frequency-encode the first-billed actor: each row gets the number of rows in
# the frame that share its 'Actor 1' value. Only 'Actor 1' is used here;
# 'Actor 2' and 'Actor 3' do not contribute to this feature.
lead_actor_counts = data['Actor 1'].value_counts()
data['Actor_freq'] = data['Actor 1'].map(lead_actor_counts)

In [13]:
def _primary_genre(genre_text):
    """Return the part of a comma-separated genre string before the first comma."""
    return genre_text.split(',')[0]


# Collapse multi-genre entries such as "Drama, Musical" to their first genre.
data['Genre'] = data['Genre'].map(_primary_genre)

In [14]:
# One-hot encode the single remaining genre per row. drop_first=True omits the
# indicator for the first genre level, so rows of that genre are the ones with
# every Genre_* column False.
data = pd.get_dummies(
    data,
    columns=['Genre'],
    drop_first=True,
)

In [15]:
# Model inputs: the numeric columns built above plus every one-hot genre
# indicator (any column whose name contains 'Genre_').
base_columns = ['Duration', 'Votes', 'Year', 'Director_freq', 'Actor_freq']
genre_columns = [name for name in data.columns if 'Genre_' in name]

features = data[base_columns + genre_columns]

# Regression target.
target = data['Rating']

In [16]:
# List the dtype of each feature column.
feature_dtypes = features.dtypes
print(feature_dtypes)

Duration             float64
Votes                float64
Year                 float64
Director_freq          int64
Actor_freq             int64
Genre_Adventure         bool
Genre_Animation         bool
Genre_Biography         bool
Genre_Comedy            bool
Genre_Crime             bool
Genre_Documentary       bool
Genre_Drama             bool
Genre_Family            bool
Genre_Fantasy           bool
Genre_History           bool
Genre_Horror            bool
Genre_Music             bool
Genre_Musical           bool
Genre_Mystery           bool
Genre_Romance           bool
Genre_Sci-Fi            bool
Genre_Sport             bool
Genre_Thriller          bool
Genre_War               bool
dtype: object


In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
split_parts = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# Random-forest baseline, seeded for repeatability.
# NOTE(review): the next code cell rebinds `model` to a
# GradientBoostingRegressor before any prediction is made, so this forest is
# fitted but never evaluated in the visible cells.
forest_params = {
    'n_estimators': 300,
    'max_depth': 15,
    'min_samples_split': 5,
    'random_state': 42,
}
model = RandomForestRegressor(**forest_params)

model.fit(X_train, y_train)

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees, seeded for repeatability. This assignment replaces
# whatever `model` was bound to before; the evaluation cells below use it.
boosting_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 4,
    'random_state': 42,
}
model = GradientBoostingRegressor(**boosting_params)

model.fit(X_train, y_train)

In [24]:
# Score the held-out rows with the currently bound `model`.
predictions = model.predict(X_test)

# Mean squared error (in squared rating units) and coefficient of determination.
test_mse = mean_squared_error(y_test, predictions)
test_r2 = r2_score(y_test, predictions)

print("MSE:", test_mse)
print("R2 Score:", test_r2)

MSE: 1.209133980743494
R2 Score: 0.3470294966208094


In [25]:
# Side-by-side table of true and predicted ratings, indexed like y_test.
# `predictions` is positional, in the same row order as X_test / y_test.
comparison = y_test.to_frame(name='Actual Rating').assign(
    **{'Predicted Rating': predictions}
)

print(comparison.head(10))

       Actual Rating  Predicted Rating
10971            6.0          5.229711
14052            2.4          6.477908
10002            3.8          4.929699
3970             3.8          4.657787
8840             7.2          6.115279
5071             6.3          5.369855
2709             8.6          5.437840
11842            3.9          5.931378
10910            6.6          5.603631
4485             6.6          6.817325


In [26]:
# Absolute prediction error per held-out row.
signed_error = comparison['Actual Rating'] - comparison['Predicted Rating']
comparison['Error'] = signed_error.abs()
print(comparison.head(10))

       Actual Rating  Predicted Rating     Error
10971            6.0          5.229711  0.770289
14052            2.4          6.477908  4.077908
10002            3.8          4.929699  1.129699
3970             3.8          4.657787  0.857787
8840             7.2          6.115279  1.084721
5071             6.3          5.369855  0.930145
2709             8.6          5.437840  3.162160
11842            3.9          5.931378  2.031378
10910            6.6          5.603631  0.996369
4485             6.6          6.817325  0.217325


In [27]:
# Mean of the absolute errors, i.e. the mean absolute error on the test rows.
average_error = comparison['Error'].mean()
print("Average Error:", average_error)

Average Error: 0.8274250557264132
