## 📤 Final Predictions

### Steps:
- Load the featured dataset and drop the target column (`rating`) if present
- Rebuild and retrain pipeline on full dataset
- Predict movie ratings using RandomForest
- Save predictions for analysis/deployment

📁 Output: `../output/predictions.csv`, ready for demo, email, or production.

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the featured movie table; X keeps every column exactly as stored on disk.
X = pd.read_csv("../dataset/movie_data_featured.csv")

# Feature-only view used for prediction: the target is removed when the file
# carries it, and errors="ignore" makes this a no-op when it does not.
# drop() returns a new frame, so X itself is left untouched.
X_predict = X.drop(columns=["rating"], errors="ignore")


In [3]:
# Split the feature columns by dtype; each group gets its own preprocessing branch.
categorical_cols = list(X_predict.select_dtypes(include=["object"]).columns)
numeric_cols = list(X_predict.select_dtypes(include=["int64", "float64"]).columns)

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: label gaps with the literal "missing", then one-hot encode.
# handle_unknown="ignore" keeps categories unseen during fit from raising at predict time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch; columns in neither list are not passed on.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

# End-to-end estimator: preprocessing followed by a seeded random forest
# (the model family chosen during training, per the original notebook note).
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
])

In [4]:
# Full training (if test split was used earlier)
df_full = pd.read_csv("../dataset/movie_data_featured.csv")
X_full = df_full.drop(columns=["rating"])
y_full = df_full["rating"]

model.fit(X_full, y_full)

# Save the model for deployment
joblib.dump(model, "../models/final_model.pkl")

['../models/final_model.pkl']

In [5]:
# Predict on feature-only dataset
predictions = model.predict(X_predict)

# Store predictions
X_predict["predicted_rating"] = predictions
X_predict.head()

Unnamed: 0,name,year_int,duration,votes,genre_split,genre_count,director_avg_rating,actor_popularity,predicted_rating
0,#Gadhvi (He thought he was Gandhi),(2019),109 min,8.0,['Drama'],1,7.0,,7.007
1,#Yaaram,(2019),110 min,35.0,"['Comedy', ' Romance']",2,4.4,,4.421
2,...Aur Pyaar Ho Gaya,(1997),147 min,827.0,"['Comedy', ' Drama', ' Musical']",3,5.358824,,4.827
3,...Yahaan,(2005),142 min,1086.0,"['Drama', ' Romance', ' War']",3,7.5,,7.453
4,?: A Question Mark,(2012),82 min,326.0,"['Horror', ' Mystery', ' Thriller']",3,5.6,,5.61


In [6]:
# Export only the prediction column, without the index; rows keep the order of
# X_predict.
# NOTE(review): the file carries no identifier column, so consumers must rely
# on row order to match predictions back to movies — confirm this is intended.
predictions_path = "../output/predictions.csv"

# Create the output folder when missing so to_csv does not fail on a fresh checkout.
Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
X_predict[["predicted_rating"]].to_csv(predictions_path, index=False)
print("‚úÖ Predictions saved to '../output/predictions.csv'")

‚úÖ Predictions saved to '../output/predictions.csv'
