In [1]:
# ---------------------------------------
# 1. Import Required Libraries
# ---------------------------------------
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor
import joblib

In [3]:
# ---------------------------------------
# 2. Load the Cleaned + Feature Engineered Dataset
# ---------------------------------------
# The CSV location is relative to the notebook's working directory.
featured_csv = "../dataset/movie_data_featured.csv"
df = pd.read_csv(featured_csv)

# Preview the first rows as a quick check of the loaded columns.
df.head()

Unnamed: 0,name,year_int,duration,genre,rating,votes,director,actor 1,actor 2,actor 3,genre_split,genre_count,director_avg_rating,actor_popularity
0,#Gadhvi (He thought he was Gandhi),(2019),109.0,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,['Drama'],1,7.0,6.855556
1,#Yaaram,(2019),110.0,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,"['Comedy', ' Romance']",2,4.4,4.838889
2,...Aur Pyaar Ho Gaya,(1997),147.0,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,"['Comedy', ' Drama', ' Musical']",3,5.358824,5.752446
3,...Yahaan,(2005),142.0,"Drama, Romance, War",7.4,35.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,"['Drama', ' Romance', ' War']",3,7.5,5.883036
4,?: A Question Mark,(2012),82.0,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,"['Horror', ' Mystery', ' Thriller']",3,5.6,5.662121


In [5]:
# ---------------------------------------
# 3. Define Target + Features
# ---------------------------------------
# Column the model is trained to predict.
target = "rating"

# Columns excluded from the feature matrix: the target itself plus "votes".
# NOTE(review): in the preview rows several `director_avg_rating` values are
# identical to the row's own `rating`, so that feature looks target-derived —
# confirm how it was computed relative to the train/test split.
drop_cols = [target, "votes"]

y = df[target]
X = df.drop(columns=drop_cols)

# Display both shapes to verify that features and target have the same row count.
X.shape, y.shape

((7919, 12), (7919,))

In [6]:
# ---------------------------------------
# 4. Column Type Identification
# ---------------------------------------
# Object-dtype columns are treated as categorical; every other dtype is
# treated as numerical.
object_view = X.select_dtypes(include="object")
non_object_view = X.select_dtypes(exclude="object")

categorical_cols = object_view.columns.to_list()
numerical_cols = non_object_view.columns.to_list()

print("Categorical Features:", categorical_cols)
print("Numerical Features:", numerical_cols)

Categorical Features: ['name', 'year_int', 'genre', 'director', 'actor 1', 'actor 2', 'actor 3', 'genre_split']
Numerical Features: ['duration', 'genre_count', 'director_avg_rating', 'actor_popularity']


In [7]:
# ---------------------------------------
# 5. Preprocessing Pipeline for Categorical Columns
# ---------------------------------------
# One-hot encode the categorical columns; handle_unknown="ignore" lets
# transform() accept categories that were not present when fitting.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_cols),
    ],
    # Columns not listed above (the numerical ones) are forwarded untouched.
    remainder="passthrough",
)

In [8]:
# ---------------------------------------
# 6. Define ML Pipeline with XGBoost
# ---------------------------------------
# Gradient-boosted regressor; random_state is fixed for repeatable runs.
xgb_regressor = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Chain preprocessing and the regressor so both are fitted and applied together.
xgb_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", xgb_regressor),
    ]
)

In [9]:
# ---------------------------------------
# 6. Define ML Pipeline with XGBoost
# ---------------------------------------
# NOTE(review): this cell rebuilds the same unfitted pipeline (same steps,
# same hyperparameters) that the preceding cell already assigns to
# `xgb_model`; it looks like an accidental copy and is a candidate for removal.
regressor_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

pipeline_steps = [
    ("preprocess", preprocessor),
    ("regressor", XGBRegressor(**regressor_params)),
]
xgb_model = Pipeline(steps=pipeline_steps)

In [10]:
# ---------------------------------------
# 7. Train/Test Split
# ---------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# ---------------------------------------
# 8. Model Training
# ---------------------------------------
# Fits every pipeline step on the training split only.
xgb_model.fit(X=X_train, y=y_train)
print("✅ Model training complete")

✅ Model training complete


In [12]:
# ---------------------------------------
# 9. Model Evaluation
# ---------------------------------------
# Predict on the test split produced by train_test_split.
y_pred = xgb_model.predict(X_test)

# Error metrics: MSE, its square root (RMSE, in the target's units), and R².
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to two decimals.
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

RMSE: 0.72
R² Score: 0.73


In [17]:
# ---------------------------------------
# 10. Save Model to File
# ---------------------------------------
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (exist_ok=True keeps the cell safe to re-run).
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the whole pipeline object (preprocessing + regressor) as one artifact.
joblib.dump(xgb_model, models_dir / "xgb_movie_rating_model.pkl")

# Persist the feature-matrix column names, in order, alongside the model.
joblib.dump(X.columns.tolist(), models_dir / "xgb_feature_columns.pkl")
print("✅ Model and feature columns saved successfully")

✅ Model and feature columns saved successfully


In [1]:
# ---------------------------------------
# 1. Import Required Libraries
# ---------------------------------------
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor
import joblib

from pathlib import Path  # stdlib; used in step 10 to create the output folder

# ---------------------------------------
# 2. Load the Cleaned + Feature Engineered Dataset
# ---------------------------------------
# Only a cell's last expression is displayed, so no mid-cell preview call is
# made here; the print statements below report progress instead.
df = pd.read_csv("../dataset/movie_data_featured.csv")

# ---------------------------------------
# 3. Define Target + Features
# ---------------------------------------
target = "rating"
drop_cols = ["rating", "votes"]  # the target itself plus "votes"

X = df.drop(columns=drop_cols)
y = df[target]

print("✅ Dataset loaded and split into features and target")

# ---------------------------------------
# 4. Column Type Identification
# ---------------------------------------
# Object-dtype columns are treated as categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include="object").columns.tolist()
numerical_cols = X.select_dtypes(exclude="object").columns.tolist()

print("Categorical Features:", categorical_cols)
print("Numerical Features:", numerical_cols)

# ---------------------------------------
# 5. Preprocessing Pipeline for Categorical Columns
# ---------------------------------------
# handle_unknown="ignore" lets transform() accept categories not seen in fit().
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
    remainder="passthrough",  # Keep numerical columns as-is
)

# ---------------------------------------
# 6. Define ML Pipeline with XGBoost
# ---------------------------------------
# Preprocessing and regressor are chained so they are fitted and saved together.
xgb_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "regressor",
            XGBRegressor(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,  # fixed seed for repeatable training
            ),
        ),
    ]
)

# ---------------------------------------
# 7. Train/Test Split
# ---------------------------------------
# 20% of the rows are held out; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("✅ Dataset split into training and test sets")

# ---------------------------------------
# 8. Model Training
# ---------------------------------------
xgb_model.fit(X_train, y_train)
print("✅ Model training complete")

# ---------------------------------------
# 9. Model Evaluation
# ---------------------------------------
y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # back to the target's units
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# ---------------------------------------
# 10. Save Model to File
# ---------------------------------------
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (exist_ok=True keeps the cell safe to re-run).
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(xgb_model, models_dir / "xgb_movie_rating_model.pkl")
joblib.dump(X.columns.tolist(), models_dir / "xgb_feature_columns.pkl")
print("✅ Model and feature columns saved successfully")

✅ Dataset loaded and split into features and target
Categorical Features: ['name', 'year_int', 'genre', 'director', 'actor 1', 'actor 2', 'actor 3', 'genre_split']
Numerical Features: ['duration', 'genre_count', 'director_avg_rating', 'actor_popularity']
✅ Dataset split into training and test sets
✅ Model training complete
RMSE: 0.72
R² Score: 0.73
✅ Model and feature columns saved successfully
