<a href="https://colab.research.google.com/github/nirmal20092003/data-science/blob/main/MovieRatingPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


Load the Dataset

In [None]:
# Read the IMDb India movies CSV into a DataFrame.
# The encoding is passed explicitly (presumably the file is not valid UTF-8 -- confirm).
# NOTE(review): this is a Colab-specific absolute path; adjust it when running elsewhere.
df = pd.read_csv("/content/sample_data/IMDbMoviesIndia.csv", encoding='ISO-8859-1')

# Preview the first five rows.
print(df.head())

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

 Data Preprocessing


Clean Missing and Irrelevant Data

Keep only rows with ratings

In [None]:
# Boolean mask: True for rows whose target value ('Rating') is present.
has_rating = df['Rating'].notna()
# Keep only those rows; rows without a rating are discarded.
df = df.loc[has_rating]

Drop rows with missing important features

In [None]:
# Columns that must all be populated for a row to be kept.
required_cols = ['Genre', 'Director', 'Actor 1']
# A row survives only when every required column is non-null.
df = df[df[required_cols].notna().all(axis=1)]

Drop irrelevant columns

In [None]:
# Narrow the frame to the three feature columns plus the target, in this order.
kept_cols = ['Genre', 'Director', 'Actor 1', 'Rating']
df = df.loc[:, kept_cols]


 Feature Engineering


Convert categorical data using one-hot encoding

Define features and target

In [None]:
# Target: the rating column.
y = df['Rating']
# Features: the three label columns used to predict the rating.
feature_cols = ['Genre', 'Director', 'Actor 1']
X = df[feature_cols]

Define categorical columns

In [None]:
# Columns that are routed through the categorical encoder.
categorical_cols = [
    'Genre',
    'Director',
    'Actor 1',
]

Preprocessing for categorical data

In [None]:
# One-hot encoder; handle_unknown='ignore' makes categories not seen during
# fit encode as all zeros at transform time instead of raising.
onehot = OneHotEncoder(handle_unknown='ignore')

# Apply the encoder (registered under the name 'cat') to the categorical columns.
preprocessor = ColumnTransformer(transformers=[('cat', onehot, categorical_cols)])


Build and Train the Model

Create a pipeline with preprocessing + model

In [None]:

# Random forest with 100 trees and a fixed seed.
forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain encoding and regression so both are fitted and applied together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', forest),
]
model = Pipeline(steps=pipeline_steps)








Split data

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train

In [None]:
# Fit the whole pipeline (encoder, then forest) on the training split.
# Left as the cell's last expression so the fitted pipeline is displayed.
model.fit(X=X_train, y=y_train)

Evaluate the Model

Predict

In [None]:
# Predict ratings for the held-out rows with the fitted pipeline.
y_pred = model.predict(X=X_test)


# Evaluation

In [None]:
# Score the held-out predictions against the true ratings.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to two decimals.
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

Mean Squared Error: 1.64
R-squared Score: 0.15


Predictions

In [None]:
# Single-row frame with the same feature columns the model was trained on.
sample = pd.DataFrame({
    'Genre': ['Drama'],
    'Director': ['Sridhar Reddy'],
    'Actor 1': ['Tarun Arora'],
})

# predict() returns one value per input row; element 0 is this sample's rating.
predicted_rating = model.predict(sample)
print(f"Predicted Rating: {predicted_rating[0]:.2f}")


Predicted Rating: 5.31
