In [None]:
import os
import random

import numpy as np

# Single seed shared by the random number generators used in this notebook.
seed_value = 1

# 1. Export the seed as the PYTHONHASHSEED environment variable.
# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so this
# presumably only affects processes launched afterwards - confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
random.seed(seed_value)

# 3. Seed NumPy's global pseudo-random generator.
np.random.seed(seed_value)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score, KFold,  train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

from numpy import mean, absolute, std
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, BaggingRegressor, HistGradientBoostingRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from math import sqrt



In [None]:
# Load the dataset into a DataFrame ("........" is presumably a redacted path placeholder).
df = pd.read_csv("........")
# Print the column / dtype / non-null summary of the loaded frame.
df.info()

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with an explicit seed. Without random_state the permutation is
# drawn from NumPy's global stream, so it changes with how much of that stream has
# already been consumed (re-running the cell or running cells out of order gave a
# different row order, and therefore different rows in the train/test split).
df = shuffle(df, random_state=seed_value)

In [None]:
# Columns overwritten in place with log(1 + x), processed in this order.
log_scaled_columns = [
    'Scores', 'ProductionBudget', 'OpeningTheater', 'Difference', 'Duration',
    'FaceNo', 'Female', 'Male', 'AverageAge', 'AvgFaceSize',
]
for column in log_scaled_columns:
    df[column] = np.log1p(df[column])

In [None]:
# Control only features
# NOTE: the following cell assigns train_features again, so when the notebook is
# run top to bottom this control-only list is replaced by the larger one.
train_features = [
    'OpeningTheater', 'ProductionBudget', 'Scores', 'Follows',
    'Action', 'Adventure', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Horror', 'Thriller',
    'topStudio',
    'PG-13', 'R', 'Not Rated', 'PG', 'NC-17', 'G',
    'topMeters', 'topStars', 'AwardStars',
]

In [None]:
#Control + trailer content
# The 23 control columns come first, followed by the 24 trailer-content columns.
control_features = [
    'OpeningTheater', 'ProductionBudget', 'Scores', 'Follows',
    'Action', 'Adventure', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Horror', 'Thriller',
    'topStudio',
    'PG-13', 'R', 'Not Rated', 'PG', 'NC-17', 'G',
    'topMeters', 'topStars', 'AwardStars',
]
trailer_features = [
    'Duration', 'Difference',
    'RatioFaceNo', 'RatioMale', 'RatioFemale',
    'FaceNo', 'Female', 'Male',
    'RatioSad', 'RatioHappy', 'RatioFear', 'RatioAngry',
    'RatioSurprise', 'RatioDisgust', 'RatioNeutral',
    'RatioAsian', 'RatioIndian', 'RatioBlack', 'RatioWhite', 'RatioMiddle', 'RatioHispanic',
    'AvgFaceSize', 'RatioFaceCoverage', 'AverageAge',
]
train_features = control_features + trailer_features

In [None]:
# Feature matrix: only the columns listed in train_features.
X = df[train_features]
# Target: log(1 + OpeningGross) as a NumPy array.
y = np.log1p(df['OpeningGross'].values)

# Show the shapes of the features and of the target.
for array in (X, y):
    print(array.shape)

In [None]:
# Hold out 20% of the rows as the test set. random_state=52 fixes the split; it is a
# separate constant from seed_value, which seeds the other generators in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=52)


In [None]:
pip install optuna

In [None]:
import optuna

In [None]:
# Seed passed as random_state to the estimators below; same value as seed_value.
RANDOM_SEED = seed_value

# 5-fold CV
# No shuffle argument is given, so KFold's default applies; the objectives below
# all share this splitter.
kfolds = KFold(n_splits=5)
# Define the helper function so that it can be reused
def tune(objective, n_trials=500, seed=None):
    """Run an Optuna study that maximises ``objective`` and return the best parameters.

    Parameters
    ----------
    objective : callable
        Function that receives an Optuna trial and returns the score to maximise.
    n_trials : int, default 500
        Number of trials to run.
    seed : int or None, default None
        Seed for the TPE sampler. ``None`` falls back to the notebook-wide
        ``RANDOM_SEED``.

    Returns
    -------
    dict
        ``study.best_params`` of the finished study. The best score and the
        parameters are also printed.
    """
    if seed is None:
        seed = RANDOM_SEED

    # Seed the sampler so the sequence of suggested parameters is repeatable;
    # a study created without a seeded sampler proposes different trials on each run.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")
    return params


In [None]:
##################
# Extra Tree Regressor
##################
def ExTree_objective(trial):
    """Optuna objective for ExtraTreesRegressor.

    Samples five integer hyper-parameters from ``trial``, builds the regressor with
    ``RANDOM_SEED``, and returns the mean R^2 that ``cross_val_score`` reports on
    ``X_train`` / ``y_train`` with the notebook's ``kfolds`` splitter.
    """
    # Inclusive (low, high) bounds; parameters are sampled in this order.
    int_bounds = {
        "n_estimators": (1, 1000),
        "max_depth": (2, 100),
        "min_samples_split": (2, 100),
        "min_samples_leaf": (2, 100),
        "max_features": (2, 23),
    }
    sampled = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in int_bounds.items()
    }

    model = ExtraTreesRegressor(n_jobs=-1, random_state=RANDOM_SEED, **sampled)
    fold_scores = cross_val_score(model, X_train, y_train, cv=kfolds, scoring="r2")
    return fold_scores.mean()

# Run the Optuna search for Extra Trees; tune() returns the best hyper-parameters found.
ExTree_params = tune(ExTree_objective)


In [None]:
# Refit Extra Trees on the whole training split with the tuned hyper-parameters.
ExtraTree = ExtraTreesRegressor(random_state=RANDOM_SEED, **ExTree_params)
ExtraTree.fit(X_train, y_train)

# Held-out test metrics, printed in the order R^2, MSE, MAE.
# The target was log1p-transformed earlier, so the errors are on that log scale.
y_pred = ExtraTree.predict(X_test)
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))


In [None]:

##################
# Light Boosting
##################
def lgb_objective(trial):
    """Optuna objective for LGBMRegressor.

    Samples seven hyper-parameters from ``trial``, builds a regression LGBMRegressor
    with ``RANDOM_SEED``, and returns the mean R^2 that ``cross_val_score`` reports on
    ``X_train`` / ``y_train`` with the notebook's ``kfolds`` splitter.

    NOTE(review): LightGBM's documentation ties row subsampling to a non-zero
    ``subsample_freq``; confirm that tuning ``subsample`` alone has an effect here.
    """
    # Dict values are evaluated top to bottom, which fixes the sampling order.
    sampled = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 1, 100),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float('reg_alpha', 0.01, 10),
        "reg_lambda": trial.suggest_float('reg_lambda', 0.01, 10),
        "subsample": trial.suggest_float('subsample', 0.01, 1),
    }

    booster = LGBMRegressor(
        objective='regression',
        random_state=RANDOM_SEED,
        **sampled,
    )

    fold_scores = cross_val_score(booster, X_train, y_train, cv=kfolds, scoring="r2")
    return fold_scores.mean()

# Run the Optuna search for LightGBM; tune() returns the best hyper-parameters found.
lgb_params = tune(lgb_objective)

In [None]:
# Refit LightGBM on the whole training split with the tuned hyper-parameters.
lgbr = LGBMRegressor(
    objective='regression',
    random_state=RANDOM_SEED,
    **lgb_params,
)
lgbr.fit(X_train, y_train)

# Held-out test metrics, printed in the order R^2, MSE, MAE.
# The target was log1p-transformed earlier, so the errors are on that log scale.
y_pred = lgbr.predict(X_test)
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

In [None]:
##################
# Random Forest
##################
def randomforest_objective(trial):
    """Optuna objective for RandomForestRegressor.

    Samples five hyper-parameters from ``trial``, builds the forest with
    ``RANDOM_SEED``, and returns the mean R^2 that ``cross_val_score`` reports on
    ``X_train`` / ``y_train`` with the notebook's ``kfolds`` splitter.
    """
    _n_estimators = trial.suggest_int("n_estimators", 1, 1000)
    _max_depth = trial.suggest_int("max_depth", 1, 100)
    # NOTE(review): scikit-learn documents a float min_samples_split as a fraction
    # of the samples; confirm whether the 0.0 lower bound should be a small positive value.
    _min_samp_split = trial.suggest_float("min_samples_split", 0.0, 1.0)
    _min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    _max_features = trial.suggest_int("max_features", 2, 23)

    rf = RandomForestRegressor(
        # Use the sampled depth. It used to be fixed at 10 while "max_depth" was still
        # recorded by the study, so a refit with best_params ran at a depth that was
        # never scored during the search.
        max_depth=_max_depth,
        min_samples_split=_min_samp_split,
        min_samples_leaf=_min_samples_leaf,
        max_features=_max_features,
        n_estimators=_n_estimators,
        n_jobs=-1,
        random_state=RANDOM_SEED,
    )

    score = cross_val_score(
        rf, X_train, y_train, cv=kfolds, scoring="r2"
    ).mean()

    return score

# Run the Optuna search for the random forest; tune() returns the best hyper-parameters found.
randomforest_params = tune(randomforest_objective)

In [None]:
# Refit the random forest on the whole training split with the tuned hyper-parameters.
rf = RandomForestRegressor(
    n_jobs=-1,
    random_state=RANDOM_SEED,
    **randomforest_params,
)
rf.fit(X_train, y_train)

# Held-out test metrics, printed in the order R^2, MSE, MAE.
# The target was log1p-transformed earlier, so the errors are on that log scale.
y_pred = rf.predict(X_test)
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

In [None]:
##################
# Bagging Regressor
##################
def Bagging_objective(trial):
    """Optuna objective for BaggingRegressor.

    Samples three integer hyper-parameters from ``trial``, builds the regressor with
    ``RANDOM_SEED``, and returns the mean R^2 that ``cross_val_score`` reports on
    ``X_train`` / ``y_train`` with the notebook's ``kfolds`` splitter.

    NOTE(review): ``max_samples`` is sampled as an absolute count up to 300; confirm
    that each training fold has at least that many rows.
    """
    # Inclusive (low, high) bounds; parameters are sampled in this order.
    int_bounds = (
        ("n_estimators", 1, 1000),
        ("max_samples", 1, 300),
        ("max_features", 1, 23),
    )
    sampled = {}
    for name, low, high in int_bounds:
        sampled[name] = trial.suggest_int(name, low, high)

    ensemble = BaggingRegressor(n_jobs=-1, random_state=RANDOM_SEED, **sampled)
    fold_scores = cross_val_score(ensemble, X_train, y_train, cv=kfolds, scoring="r2")
    return fold_scores.mean()

# Run the Optuna search for bagging; tune() returns the best hyper-parameters found.
Bagging_params = tune(Bagging_objective)

In [None]:
# Refit the bagging ensemble on the whole training split with the tuned hyper-parameters.
Bagging = BaggingRegressor(
    n_jobs=-1,
    random_state=RANDOM_SEED,
    **Bagging_params,
)
Bagging.fit(X_train, y_train)

# Held-out test metrics, printed in the order R^2, MSE, MAE.
# The target was log1p-transformed earlier, so the errors are on that log scale.
y_pred = Bagging.predict(X_test)
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

In [None]:
##################
# Hist Gradient Boosting
##################
def gbr_objective(trial):
    """Optuna objective for HistGradientBoostingRegressor.

    Samples five hyper-parameters from ``trial``, builds the regressor with
    ``RANDOM_SEED``, and returns the mean R^2 that ``cross_val_score`` reports on
    ``X_train`` / ``y_train`` with the notebook's ``kfolds`` splitter.
    """
    _learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.1)
    _max_depth = trial.suggest_int("max_depth", 1,100)
    _min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 100)

    _max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 2,100)
    _max_bins = trial.suggest_int("max_bins",10,255)

    gbr = HistGradientBoostingRegressor(
        learning_rate=_learning_rate,
        max_depth=_max_depth,
        min_samples_leaf=_min_samples_leaf,
        max_leaf_nodes=_max_leaf_nodes,
        # Pass the sampled bin count through. It used to be sampled but never handed
        # to the estimator, so the "max_bins" stored in best_params (and applied by a
        # refit with those parameters) had no influence on the scores.
        max_bins=_max_bins,
        random_state=RANDOM_SEED,
    )

    score = cross_val_score(
        gbr, X_train, y_train, cv=kfolds, scoring="r2"
    ).mean()

    return score

# Run the Optuna search for hist gradient boosting; tune() returns the best hyper-parameters found.
gbr_params = tune(gbr_objective)

In [None]:
# Refit hist gradient boosting on the whole training split with the tuned hyper-parameters.
HistGBR = HistGradientBoostingRegressor(
    random_state=RANDOM_SEED,
    **gbr_params,
)
HistGBR.fit(X_train, y_train)

# Held-out test metrics, printed in the order R^2, MSE, MAE.
# The target was log1p-transformed earlier, so the errors are on that log scale.
y_pred = HistGBR.predict(X_test)
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

In [None]:
##################
# XGB Boosting
##################
def xgb_objective(trial):
    """Optuna objective for xgb.XGBRegressor.

    Samples eight hyper-parameters from ``trial``, builds the regressor with
    ``RANDOM_SEED``, and returns the mean R^2 that ``cross_val_score`` reports on
    ``X_train`` / ``y_train`` with the notebook's ``kfolds`` splitter.
    """
    # (name, sampling method, low, high); rows are sampled top to bottom.
    search_space = (
        ("n_estimators", trial.suggest_int, 1, 1000),
        ("max_depth", trial.suggest_int, 2, 100),
        ("learning_rate", trial.suggest_float, 0.0001, 0.1),
        ("gamma", trial.suggest_float, 0.0001, 100),
        ("min_child_weight", trial.suggest_float, 0.1, 100),
        ('subsample', trial.suggest_float, 0.0001, 1),
        ('reg_alpha', trial.suggest_float, 0.0001, 100),
        ('reg_lambda', trial.suggest_float, 0.0001, 100),
    )
    sampled = {
        name: suggest(name, low, high)
        for name, suggest, low, high in search_space
    }

    booster = xgb.XGBRegressor(random_state=RANDOM_SEED, **sampled)
    fold_scores = cross_val_score(booster, X_train, y_train, cv=kfolds, scoring="r2")
    return fold_scores.mean()

# Run the Optuna search for XGBoost; tune() returns the best hyper-parameters found.
xgb_params = tune(xgb_objective)


In [None]:
# Refit XGBoost on the whole training split with the tuned hyper-parameters.
xgbr = xgb.XGBRegressor(
    random_state=RANDOM_SEED,
    **xgb_params,
)
xgbr.fit(X_train, y_train)

# Held-out test metrics, printed in the order R^2, MSE, MAE.
# The target was log1p-transformed earlier, so the errors are on that log scale.
y_pred = xgbr.predict(X_test)
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))