## 🎯 Model Training & Evaluation

### ✅ Steps Covered:
- Load cleaned data
- Build train/test splits
- Setup pipeline for numeric and categorical features
- Train 3 regression models
- Evaluate with MAE, RMSE, and R²
- Compare results
- Visualize feature importances

📁 Output: Trained models ready for predictions.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load processed data
df = pd.read_csv("../dataset/movie_data_featured.csv")
df.head()

Unnamed: 0,name,year_int,duration,rating,votes,genre_split,genre_count,director_avg_rating,actor_popularity
0,#Gadhvi (He thought he was Gandhi),(2019),109 min,7.0,8.0,['Drama'],1,7.0,
1,#Yaaram,(2019),110 min,4.4,35.0,"['Comedy', ' Romance']",2,4.4,
2,...Aur Pyaar Ho Gaya,(1997),147 min,4.7,827.0,"['Comedy', ' Drama', ' Musical']",3,5.358824,
3,...Yahaan,(2005),142 min,7.4,1086.0,"['Drama', ' Romance', ' War']",3,7.5,
4,?: A Question Mark,(2012),82 min,5.6,326.0,"['Horror', ' Mystery', ' Thriller']",3,5.6,


In [2]:
X = df.drop(columns=["rating"])
y = df["rating"]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
numeric_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_cols = X.select_dtypes(include="object").columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['votes', 'genre_count', 'director_avg_rating', 'actor_popularity']
Categorical columns: ['name', 'year_int', 'duration', 'genre_split']


In [5]:
# Numeric pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [6]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

results = {}

In [7]:
for name, model in models.items():
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])

    print(f"\n🔹 Training {name}...")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}

    print(f"{name} ➤ MAE: {mae:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")


🔹 Training Linear Regression...
Linear Regression ➤ MAE: 1.717 | RMSE: 2.277 | R²: -1.789

🔹 Training Random Forest...
Random Forest ➤ MAE: 0.551 | RMSE: 0.818 | R²: 0.640

🔹 Training XGBoost...
XGBoost ➤ MAE: 0.552 | RMSE: 0.809 | R²: 0.648


In [8]:
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values("RMSE")
print("\n📊 Model Performance Summary:")
display(results_df.style.background_gradient(cmap="YlGnBu").format("{:.3f}"))


📊 Model Performance Summary:


Unnamed: 0,MAE,RMSE,R2
XGBoost,0.552,0.809,0.648
Random Forest,0.551,0.818,0.64
Linear Regression,1.717,2.277,-1.789
