# Random Forest Model for API integration

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
import joblib

In [2]:
# Show the current working directory; relative paths (such as the CSV loaded below) resolve from here
import os
os.getcwd()

'/Users/morganpfunder/Desktop/Data Analytics/Homework/Project_4_Grp_1/Random Forest Models'

# Make sure you upload final_data.csv to the **content folder** in Google Colab every time you run this notebook

**Column Definitions:**
- **Rating:** Average IMDb user rating on a scale of 1 to 10
- **Votes:** Total number of user votes that contributed to the IMDb rating
- **Meta Score:** Metacritic score based on critic reviews, ranging from 0-100
- **PR Rating:** Age-based parental rating (e.g., PG, PG-13, R)
- **Duration:** Total runtime of the movie
- **Number Rating:** The numerical equivalent of the parental rating (PR Rating)
- **Netflix top 10:** Indicates if the movie made it into Netflix's top 10 list (1=Yes, 0=No)

In [3]:
# Load final_data.csv into a DataFrame; the path is relative, so the file must
# be in the current working directory
df = pd.read_csv("final_data.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'final_data.csv'

In [None]:
# Preprocessing
# Keep only rows that have both a rating and a vote count.
df = df.dropna(subset=['rating', 'votes']).copy()
# Missing values in cast2-cast4 become the placeholder 'Unknown'.
df[['cast2', 'cast3', 'cast4']] = df[['cast2', 'cast3', 'cast4']].fillna('Unknown')
# Missing weekly hours viewed are treated as zero.
df['weekly_hours_viewed'] = df['weekly_hours_viewed'].fillna(0)
# Split the ', '-separated genre string into a list per row. A missing genre0
# would otherwise stay NaN (a float) after the split, which the later
# MultiLabelBinarizer cannot iterate, so such rows get an empty list instead.
df['all_genres'] = df['genre0'].str.split(', ').apply(
    lambda genres: genres if isinstance(genres, list) else []
)
# Age relative to the hard-coded reference year 2025.
df['movie_age'] = 2025 - df['year']
# Clip to 1 before taking the log so a vote count of 0 maps to 0.0 instead of
# -inf (which StandardScaler rejects); counts >= 1 are unchanged.
df['log_votes'] = np.log(df['votes'].clip(lower=1))

In [None]:
# Remove the columns that are not carried forward into the feature set
df = df.drop(
    [
        'row_id', 'movie_name',
        'genre0', 'genre1', 'genre2', 'genre3',
        'pr_rating', 'cast0', 'weekly_views', 'year',
        'votes',  # replaced by log_votes
    ],
    axis='columns',
)

In [None]:
# Separate the predictors (X) from the target column (y)
X = df.drop('netflix_top_10', axis='columns')
y = df['netflix_top_10']

In [None]:
# Multi-hot encode the per-row genre lists: one indicator column per genre
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    # fit_transform must be evaluated before mlb.classes_ is read below
    data=mlb.fit_transform(X['all_genres']),
    index=X.index,
    columns=[f"genre_{genre}" for genre in mlb.classes_],
)


In [None]:
# One-hot encode the cast and director columns
cat_df = pd.get_dummies(
    X.loc[:, ['cast1', 'cast2', 'cast3', 'cast4', 'director']]
)
# Remember the dummy-column layout produced from the training data
expected_cat_cols = list(cat_df.columns)
# Align to that layout; here the columns already match, so nothing is filled
cat_df = cat_df.reindex(columns=expected_cat_cols, fill_value=0)

In [None]:
# Standardize the numeric features
numeric_columns = [
    'rating', 'meta_score', 'number_rating',
    'weekly_hours_viewed', 'log_votes', 'movie_age',
]
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split further down, so test rows influence the scaling statistics.
scaler = StandardScaler().fit(X[numeric_columns])
X_scaled = pd.DataFrame(
    scaler.transform(X[numeric_columns]),
    index=X.index,
    columns=numeric_columns,
)


In [None]:
# Assemble the model matrix column-wise: scaled numerics, genre indicators,
# then the cast/director dummies
X_final = pd.concat(
    [X_scaled, genre_dummies, cat_df],
    axis='columns',
)

In [None]:
# Record the column names of the assembled feature matrix, in order
feature_columns = list(X_final.columns)

In [None]:
# Train/test split: hold out 20% of the rows for evaluation; the fixed
# random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# target classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

In [None]:
# Train model: a 100-tree random forest with a fixed seed, fitted on the
# training portion only
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows
rf_predictions = rf_model.predict(X_test)

In [None]:
# Save components for API
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(mlb, 'mlb.pkl')
# Persist the post-encoding column order the model was trained on (the columns
# of X_final), not the raw X columns, so inputs can be aligned to the model.
joblib.dump(feature_columns, 'feature_columns.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']