In [3]:
import pandas as pd

# Load the new dataset to understand its structure
dataset_path = "IMDb Movies India.csv"
df = pd.read_csv(dataset_path)

# Display basic info and first few rows
df.info(), df.head()


ModuleNotFoundError: No module named 'pandas'

In [7]:
import pandas as pd

# Load the dataset
df = pd.read_csv("IMDb Movies India.csv", encoding="ISO-8859-1")

# Display basic info
print(df.info())

# Show first few rows
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [9]:
def preprocess_movie_data(df):
    df = df.copy()  # Avoid modifying original DataFrame

    # Clean 'Year' column (extract numeric values)
    df["Year"] = df["Year"].astype(str).str.extract(r"(\d{4})").astype(float)

    # Convert 'Duration' to numeric (extract minutes)
    df["Duration"] = df["Duration"].astype(str).str.extract(r"(\d+)").astype(float)

    # Handle 'Votes' column (remove '$' and 'M' if present, convert to float)
    def clean_votes(votes):
        if isinstance(votes, str):
            votes = votes.replace(",", "").replace("$", "").replace("M", "")
            try:
                return float(votes)
            except ValueError:
                return None
        return votes

    df["Votes"] = df["Votes"].apply(clean_votes)

    # Fill missing values (median for numeric, 'Unknown' for categorical)
    df["Year"].fillna(df["Year"].median(), inplace=True)
    df["Duration"].fillna(df["Duration"].median(), inplace=True)
    df["Votes"].fillna(df["Votes"].median(), inplace=True)
    df["Rating"].fillna(df["Rating"].median(), inplace=True)
    df["Genre"].fillna("Unknown", inplace=True)
    df["Director"].fillna("Unknown", inplace=True)

    return df

# Apply preprocessing
df_cleaned = preprocess_movie_data(df)

# Display processed data
print(df_cleaned.info())
df_cleaned.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      15509 non-null  float64
 2   Duration  15509 non-null  float64
 3   Genre     15509 non-null  object 
 4   Rating    15509 non-null  float64
 5   Votes     15509 non-null  float64
 6   Director  15509 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(4), object(6)
memory usage: 1.2+ MB
None


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,1991.0,131.0,Drama,6.0,55.0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),2019.0,109.0,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021.0,90.0,"Drama, Musical",6.0,55.0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019.0,110.0,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010.0,105.0,Drama,6.0,55.0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [11]:
# Compute Director Success Rate (average rating of past movies)
df_cleaned["Director Success Rate"] = df_cleaned.groupby("Director")["Rating"].transform(lambda x: x.expanding().mean())

# Compute Average Rating of Similar Movies (by genre)
genre_avg_ratings = df_cleaned.groupby("Genre")["Rating"].transform("mean")
df_cleaned["Avg Genre Rating"] = genre_avg_ratings

# Fill NaN values in new features (some directors may have only one movie)
df_cleaned["Director Success Rate"].fillna(df_cleaned["Rating"].median(), inplace=True)
df_cleaned["Avg Genre Rating"].fillna(df_cleaned["Rating"].median(), inplace=True)

df_cleaned[["Director", "Rating", "Director Success Rate", "Avg Genre Rating"]].head(10)


Unnamed: 0,Director,Rating,Director Success Rate,Avg Genre Rating
0,J.S. Randhawa,6.0,6.0,6.149065
1,Gaurav Bakshi,7.0,7.0,6.149065
2,Soumyajit Majumdar,6.0,6.0,6.366667
3,Ovais Khan,4.4,4.4,5.762143
4,Amol Palekar,6.0,6.0,6.149065
5,Rahul Rawail,4.7,4.7,6.215686
6,Shoojit Sircar,7.4,7.4,6.82
7,Anirban Datta,6.0,6.0,6.49364
8,Allyson Patel,5.6,5.6,5.606667
9,Biju Bhaskar Nair,4.0,4.0,5.406061


In [13]:
# One-hot encode 'Genre' and 'Director' (keeping top 10 directors only)
top_10_directors = df_cleaned["Director"].value_counts().nlargest(10).index
df_cleaned["Director"] = df_cleaned["Director"].apply(lambda x: x if x in top_10_directors else "Other")

df_encoded = pd.get_dummies(df_cleaned, columns=["Genre", "Director"], drop_first=True)

df_encoded.head()


Unnamed: 0,Name,Year,Duration,Rating,Votes,Actor 1,Actor 2,Actor 3,Director Success Rate,Avg Genre Rating,...,Director_Babubhai Mistry,Director_Dhirubhai Desai,Director_Jayant Desai,Director_Kanti Shah,Director_Mahesh Bhatt,Director_Master Bhagwan,Director_Mohammed Hussain,Director_Nanabhai Bhatt,Director_Other,Director_Unknown
0,,1991.0,131.0,6.0,55.0,Manmauji,Birbal,Rajendra Bhatia,6.0,6.149065,...,False,False,False,False,False,False,False,False,True,False
1,#Gadhvi (He thought he was Gandhi),2019.0,109.0,7.0,8.0,Rasika Dugal,Vivek Ghamande,Arvind Jangid,7.0,6.149065,...,False,False,False,False,False,False,False,False,True,False
2,#Homecoming,2021.0,90.0,6.0,55.0,Sayani Gupta,Plabita Borthakur,Roy Angana,6.0,6.366667,...,False,False,False,False,False,False,False,False,True,False
3,#Yaaram,2019.0,110.0,4.4,35.0,Prateik,Ishita Raj,Siddhant Kapoor,4.4,5.762143,...,False,False,False,False,False,False,False,False,True,False
4,...And Once Again,2010.0,105.0,6.0,55.0,Rajat Kapoor,Rituparna Sengupta,Antara Mali,6.0,6.149065,...,False,False,False,False,False,False,False,False,True,False


In [15]:
from sklearn.model_selection import train_test_split

# Select relevant features
features = ["Year", "Duration", "Votes", "Director Success Rate", "Avg Genre Rating"]
features += [col for col in df_encoded.columns if col.startswith("Genre_") or col.startswith("Director_")]

X = df_encoded[features]
y = df_encoded["Rating"]  # Target variable

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (12407, 500)
Testing data shape: (3102, 500)


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Initialize model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Mean Absolute Error (MAE): 0.26
Root Mean Squared Error (RMSE): 0.52


In [19]:
import joblib

# Save the trained model
joblib.dump(model, "movie_rating_predictor.pkl")

print("Model saved successfully!")


Model saved successfully!


In [23]:
# Load the trained model
model = joblib.load("movie_rating_predictor.pkl")

# Example: Predict rating for a new movie
new_movie = pd.DataFrame({
    "Year": [2025],
    "Duration": [120],
    "Votes": [5000],
    "Director Success Rate": [7.5],
    "Avg Genre Rating": [6.8],
}, index=[0])

# Add one-hot encoded genre and director columns (fill missing ones with 0)
for col in X.columns:
    if col not in new_movie.columns:
        new_movie[col] = 0

# Predict rating
predicted_rating = model.predict(new_movie)
print(f"Predicted IMDb Rating: {predicted_rating[0]:.1f}")


  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[col] = 0
  new_movie[c

Predicted IMDb Rating: 7.5
