In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [5]:

# Load dataset
df = pd.read_csv("./raw-data/movies_metadata.csv", low_memory=False)
ratings_df = pd.read_csv("./ratings_average.csv")

# 1. Cleaning Genre Data
def parse_genres(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        if isinstance(genres, list) and len(genres) > 0:
            return [genre['name'] for genre in genres if 'name' in genre]
    except (ValueError, SyntaxError):
        pass
    return []

df['parsed_genres'] = df['genres'].apply(parse_genres)

# 2. Cleaning Budget and Revenue
def clean_numeric(value):
    try:
        val = int(value)
        if val == 0:
            return np.nan  # Treat 0 as missing value
        return val
    except (ValueError, TypeError):
        return np.nan

df['budget_cleaned'] = df['budget'].apply(clean_numeric)
df['revenue_cleaned'] = df['revenue'].apply(clean_numeric)

In [9]:
# 3. Cleaning Release Date
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Drop rows where budget or revenue is NaN or release date is missing
df_cleaned = df.dropna(subset=['budget_cleaned', 'revenue_cleaned', 'release_date'])

# Extract month and day from release date
df_cleaned['release_month'] = df_cleaned['release_date'].dt.month
df_cleaned['release_day'] = df_cleaned['release_date'].dt.day

# Step 5: Target variable (revenue/budget ratio)
df_cleaned['revenue_budget_ratio'] = df_cleaned['revenue_cleaned'] / df_cleaned['budget_cleaned']

# Step 6: Add ratings data

# Merge the two DataFrames on movieId and id
df_cleaned['id'] = pd.to_numeric(df_cleaned['id'], errors='coerce')  # Ensure 'id' is numeric to match 'movieId'
merged_df = pd.merge(df_cleaned, ratings_df, left_on='id', right_on='movieId', how='left')

# Function to calculate the weighted vote average
def update_vote_average(row):
    if pd.isna(row['vote_average']) or row['vote_average'] == 0:
        return row['average_rating']  # Replace with average_rating if vote_average is NaN or 0
    if pd.notna(row['average_rating']):  # If both are present, calculate weighted average
        vote_weight = row['vote_count'] if pd.notna(row['vote_count']) else 0
        rating_weight = row['rating_count']
        total_weight = vote_weight + rating_weight
        return ((row['vote_average'] * vote_weight) + (row['average_rating'] * 2 * rating_weight)) / total_weight
    return row['vote_average']  # If no update is needed, return the original vote_average

# Apply the update_vote_average function to each row
merged_df['vote_average_updated'] = merged_df.apply(update_vote_average, axis=1)
merged_df['vote_count'] = merged_df['vote_count'] + merged_df['rating_count']

# Drop extra columns like movieId from the merged DataFrame
merged_df.drop(columns=['movieId', 'average_rating', 'rating_count'], inplace=True)

merged_df.dropna(subset = ["vote_average_updated", "vote_count", "revenue_budget_ratio"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['release_month'] = df_cleaned['release_date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['release_day'] = df_cleaned['release_date'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['revenue_budget_ratio'] = df_cleaned['revenue_cleaned'] / df_clean

In [10]:
# Step 6: Prepare Features and Labels
X = merged_df[['release_month', 'release_day']]  # Use month, day, vote count as initial features

# Prepare the multi-output labels (targets)
y = merged_df[['revenue_budget_ratio', 'vote_average_updated', 'vote_count']]

In [11]:
print(X.shape, y.shape)

(1575, 2) (1575, 3)


In [15]:
# Step 7: Encode genres as categorical variables
encoder = OneHotEncoder()
genre_list = merged_df['parsed_genres'].apply(lambda x: ','.join(x) if isinstance(x, list) else '')
genres_encoded = encoder.fit_transform(genre_list.values.reshape(-1, 1)).toarray()
genre_feature_names = encoder.get_feature_names_out(['parsed_genres'])

genres_encoded_df = pd.DataFrame(genres_encoded, columns=genre_feature_names)

# Concatenate genre-encoded features to X
X = pd.concat([X, genres_encoded_df], axis=1)


In [16]:
# Step 7.1: Drop rows with NaN values
# First, check for any NaN values in X or y
print("Missing values in X before drop:", X.isna().sum().sum())
print("Missing values in y before drop:", y.isna().sum().sum())

# Drop rows with NaN values from both X and y
X_cleaned = X.dropna()
y_cleaned = y.dropna()

# Ensure that X and y have the same number of rows after dropping NaNs
X_cleaned, y_cleaned = X_cleaned.align(y_cleaned, join='inner', axis=0)

print("Missing values in X after drop:", X_cleaned.isna().sum().sum())
print("Missing values in y after drop:", y_cleaned.isna().sum().sum())

Missing values in X before drop: 994728
Missing values in y before drop: 0
Missing values in X after drop: 0
Missing values in y after drop: 0


In [17]:
print(X_cleaned.shape, y_cleaned.shape)

(707, 1146) (707, 3)


In [19]:
X_cleaned.columns

Index(['release_month', 'release_day', 'parsed_genres_',
       'parsed_genres_Action', 'parsed_genres_Action,Adventure',
       'parsed_genres_Action,Adventure,Comedy',
       'parsed_genres_Action,Adventure,Comedy,Crime,Thriller',
       'parsed_genres_Action,Adventure,Comedy,Drama,Mystery',
       'parsed_genres_Action,Adventure,Comedy,Drama,Western',
       'parsed_genres_Action,Adventure,Comedy,Family,Fantasy',
       ...
       'parsed_genres_Thriller,Science Fiction,Mystery,Romance',
       'parsed_genres_War',
       'parsed_genres_War,Crime,Drama,Mystery,Romance,Thriller',
       'parsed_genres_War,Drama', 'parsed_genres_War,Drama,History',
       'parsed_genres_War,Drama,History,Adventure,Romance,Thriller',
       'parsed_genres_War,History,Action,Adventure,Drama,Romance',
       'parsed_genres_Western', 'parsed_genres_Western,Adventure',
       'parsed_genres_Western,Thriller'],
      dtype='object', length=1146)

In [20]:
print(y['vote_count'])

0        5789.0
1        2536.0
3        4231.0
5        2513.0
8       14025.0
         ...   
5351       30.0
5362       93.0
5369       58.0
5370       86.0
5372       95.0
Name: vote_count, Length: 1575, dtype: float64


In [21]:
# Step 8: Train-Test Split with cleaned data
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

In [22]:

# Step 9: Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Step 10: Train a RandomForest model for Multi-Output Regression
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 11: Evaluate the model
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error for each target
mse_revenue = mean_squared_error(y_test['revenue_budget_ratio'], y_pred[:, 0])
mse_vote_average = mean_squared_error(y_test['vote_average_updated'], y_pred[:, 1])
mse_vote_count = mean_squared_error(y_test['vote_count'], y_pred[:, 2])

print(f"Mean Squared Error for Revenue/Budget Ratio: {mse_revenue}")
print(f"Mean Squared Error for Vote Average: {mse_vote_average}")
print(f"Mean Squared Error for Vote Count: {mse_vote_count}")

Mean Squared Error for Revenue/Budget Ratio: 231.22914300458982
Mean Squared Error for Vote Average: 0.7132189925012951
Mean Squared Error for Vote Count: 107418073.22799663


In [24]:
y['revenue_budget_ratio']

0       12.451801
1        4.043035
3        3.123947
5        6.072311
8        0.102218
          ...    
5351     3.237833
5362     4.184134
5369     5.833044
5370     4.000000
5372     1.097910
Name: revenue_budget_ratio, Length: 1575, dtype: float64

In [25]:
y['vote_count']

0        5789.0
1        2536.0
3        4231.0
5        2513.0
8       14025.0
         ...   
5351       30.0
5362       93.0
5369       58.0
5370       86.0
5372       95.0
Name: vote_count, Length: 1575, dtype: float64

In [7]:
# Fine-tuning model
from catboost import CatBoostRegressor

# Step 10: Use CatBoost for Multi-Output Regression
model_catboost = CatBoostRegressor(iterations=500, depth=10, learning_rate=0.1, loss_function='MultiRMSE', random_state=42, verbose=0)
model_catboost.fit(X_train_scaled, y_train)

# Step 11: Evaluate CatBoost model
y_pred_catboost = model_catboost.predict(X_test_scaled)

# Calculate Mean Squared Error for each target
mse_revenue_catboost = mean_squared_error(y_test['revenue_budget_ratio'], y_pred_catboost[:, 0])
mse_vote_average_catboost = mean_squared_error(y_test['vote_average_updated'], y_pred_catboost[:, 1])
mse_vote_count_catboost = mean_squared_error(y_test['vote_count'], y_pred_catboost[:, 2])

print(f"CatBoost - MSE for Revenue/Budget Ratio: {mse_revenue_catboost}")
print(f"CatBoost - MSE for Vote Average: {mse_vote_average_catboost}")
print(f"CatBoost - MSE for Vote Count: {mse_vote_count_catboost}")

ModuleNotFoundError: No module named 'catboost'

In [None]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor

# Step 10: Use XGBoost for Multi-Output Regression
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model_xgb = MultiOutputRegressor(xgb_model)
model_xgb.fit(X_train_scaled, y_train)

# Step 11: Evaluate XGBoost model
y_pred_xgb = model_xgb.predict(X_test_scaled)

# Calculate Mean Squared Error for each target
mse_revenue_xgb = mean_squared_error(y_test['revenue_budget_ratio'], y_pred_xgb[:, 0])
mse_vote_average_xgb = mean_squared_error(y_test['vote_average_updated'], y_pred_xgb[:, 1])
mse_vote_count_xgb = mean_squared_error(y_test['vote_count'], y_pred_xgb[:, 2])

print(f"XGBoost - MSE for Revenue/Budget Ratio: {mse_revenue_xgb}")
print(f"XGBoost - MSE for Vote Average: {mse_vote_average_xgb}")
print(f"XGBoost - MSE for Vote Count: {mse_vote_count_xgb}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid for CatBoost
param_grid = {
    'depth': [6, 8, 10],
    'iterations': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# Random search of parameters, using 3-fold cross-validation
random_search_catboost = RandomizedSearchCV(CatBoostRegressor(loss_function='MultiRMSE', random_state=42), param_grid, n_iter=10, cv=3, verbose=0, random_state=42)

# Fit the random search model
random_search_catboost.fit(X_train_scaled, y_train)

# Best parameters and best score
print("Best Params (CatBoost):", random_search_catboost.best_params_)
print("Best Score (CatBoost):", random_search_catboost.best_score_)

In [None]:
from sklearn.model_selection import cross_val_score

# Evaluate model with cross-validation (e.g., CatBoost)
catboost_model = CatBoostRegressor(iterations=500, depth=10, learning_rate=0.1, loss_function='MultiRMSE', random_state=42, verbose=0)
cv_scores = cross_val_score(catboost_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')

print("Cross-Validation Scores (CatBoost):", -cv_scores)
print("Mean CV Score:", -np.mean(cv_scores))

In [None]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Step 1: Define the Optuna objective function
def objective(trial):
    # Suggest values for hyperparameters
    depth = trial.suggest_int('depth', 4, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    iterations = trial.suggest_int('iterations', 100, 1000)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10)

    # Create the model with suggested hyperparameters
    model = CatBoostRegressor(
        depth=depth,
        learning_rate=learning_rate,
        iterations=iterations,
        l2_leaf_reg=l2_leaf_reg,
        loss_function='MultiRMSE',
        random_state=42,
        verbose=0
    )

    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate the Mean Squared Error for the first target (revenue_budget_ratio)
    mse = mean_squared_error(y_test['revenue_budget_ratio'], y_pred[:, 0])
    
    return mse

# Step 2: Run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Step 3: Print the best parameters
print("Best hyperparameters: ", study.best_params)

# Step 4: Train the final model with the best parameters
best_params = study.best_params
model_catboost = CatBoostRegressor(
    depth=best_params['depth'],
    learning_rate=best_params['learning_rate'],
    iterations=best_params['iterations'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    loss_function='MultiRMSE',
    random_state=42,
    verbose=0
)

model_catboost.fit(X_train_scaled, y_train)
y_pred_best = model_catboost.predict(X_test_scaled)

# Evaluate the final model
mse_final = mean_squared_error(y_test['revenue_budget_ratio'], y_pred_best[:, 0])
print(f"Final MSE: {mse_final}")

In [None]:
import optuna
import xgboost as xgb

def objective(trial):
    # Hyperparameter search space for XGBoost
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }

    # Initialize and train the model
    model_xgb = xgb.XGBRegressor(**param)
    model_xgb.fit(X_train_scaled, y_train)

    # Predict and calculate MSE
    y_pred = model_xgb.predict(X_test_scaled)
    mse = mean_squared_error(y_test['revenue_budget_ratio'], y_pred[:, 0])

    return mse

# Run the study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print the best parameters
print("Best hyperparameters for XGBoost: ", study.best_params)

# Train final model with best parameters
best_params = study.best_params
model_xgb_best = xgb.XGBRegressor(**best_params)
model_xgb_best.fit(X_train_scaled, y_train)

# Evaluate the final model
y_pred_best = model_xgb_best.predict(X_test_scaled)
mse_final_xgb = mean_squared_error(y_test['revenue_budget_ratio'], y_pred_best[:, 0])
print(f"Final MSE for XGBoost: {mse_final_xgb}")

In [None]:
def objective_multi(trial):
    depth = trial.suggest_int('depth', 4, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    iterations = trial.suggest_int('iterations', 100, 1000)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10)

    model = CatBoostRegressor(
        depth=depth,
        learning_rate=learning_rate,
        iterations=iterations,
        l2_leaf_reg=l2_leaf_reg,
        loss_function='MultiRMSE',
        random_state=42,
        verbose=0
    )

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mse_revenue = mean_squared_error(y_test['revenue_budget_ratio'], y_pred[:, 0])
    mse_vote_average = mean_squared_error(y_test['vote_average_updated'], y_pred[:, 1])
    mse_vote_count = mean_squared_error(y_test['vote_count'], y_pred[:, 2])

    # Aggregate the MSE values for multi-objective optimization
    return mse_revenue + mse_vote_average + mse_vote_count

study = optuna.create_study(direction='minimize')
study.optimize(objective_multi, n_trials=50)

print("Best hyperparameters for multi-objective optimization: ", study.best_params)