# Data Preparation

In [2]:
import pandas as pd

In [3]:
# Relative locations of the three raw CSV inputs (movies, financials, opening gross).
movies_data_path, finantial_data_path, opening_data_path = (
    '../dataset/movies.csv',
    '../dataset/finantials.csv',
    '../dataset/opening_gross.csv',
)

In [4]:
# Read each raw CSV into its own DataFrame; the three reads are independent.
movie_data, fin_data, opening_data = (
    pd.read_csv(csv_path)
    for csv_path in (movies_data_path, finantial_data_path, opening_data_path)
)

In [26]:
# Keep only the numeric features plus the title, which is needed as the merge key.
# `select_dtypes(include='number')` matches every integer/float width, whereas
# comparing dtypes against the builtin `int`/`float` only matches the platform
# default width (e.g. int32 on some systems), silently dropping int64 columns.
numeric_columns = movie_data.select_dtypes(include='number').columns.tolist()
movie_data = movie_data[numeric_columns + ['movie_title']]


In [38]:
# Restrict the financial data to the merge key, one feature and the target.
fin_data = fin_data[['movie_title', 'production_budget', 'worldwide_gross']]

# Left-join on the title: every row of `opening_data` is kept, enriched first
# with the financials and then with the numeric movie attributes.
fin_movie_data = pd.merge(fin_data, movie_data, on='movie_title', how='left')
full_movie_data = pd.merge(opening_data, fin_movie_data, on='movie_title', how='left')

# Explicit sanity check on the target: a left join leaves NaN where a title has
# no financial record, and the imputer used later only fills feature columns.
assert full_movie_data['worldwide_gross'].notnull().all(), (
    'worldwide_gross contains missing values after merging'
)

# Drop the join key and the `gross` column so only model inputs and the target remain.
full_movie_data = full_movie_data.drop(['movie_title', 'gross'], axis=1)
print(full_movie_data.columns)
full_movie_data

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'budget', 'imdb_score'],
      dtype='object')


Unnamed: 0,opening_gross,screens,production_budget,worldwide_gross,title_year,aspect_ratio,duration,budget,imdb_score
0,2451.0,10.0,12000000,14616,2015.0,1.85,111.0,12000000.0,7.5
1,8330681.0,2271.0,13000000,60414025,1999.0,1.85,97.0,16000000.0,7.2
2,19883351.0,2704.0,85000000,66941559,2000.0,1.85,100.0,85000000.0,4.8
3,5329240.0,2331.0,20000000,17306648,2009.0,2.35,108.0,22000000.0,5.6
4,923715.0,19.0,20000000,181025343,2013.0,2.35,134.0,20000000.0,8.1
...,...,...,...,...,...,...,...,...,...
2299,24733155.0,3036.0,23600000,102236596,2009.0,2.35,88.0,23600000.0,7.7
2300,20065617.0,3482.0,80000000,170805525,2011.0,2.35,102.0,80000000.0,5.2
2301,15650000.0,3394.0,50000000,55348693,2016.0,2.35,102.0,50000000.0,4.8
2302,4510408.0,,35000000,12506188,2006.0,1.85,83.0,35000000.0,4.2


## Modeling

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

In [41]:
# Target: worldwide gross. Features: every remaining column.
y = full_movie_data['worldwide_gross']
X = full_movie_data.drop(columns=['worldwide_gross'])

In [46]:
# Baseline model: fill missing feature values with the column mean, then fit a
# gradient-boosting regressor. The seed is fixed so that repeated runs of the
# notebook produce the same cross-validation scores.
pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(random_state=42)),
])

In [48]:
# 10-fold cross-validation of the baseline pipeline on the full data set,
# keeping the training-fold scores alongside the held-out ones.
results = cross_validate(
    pipeline,
    X,
    y,
    cv=10,
    return_train_score=True,
)
results

{'fit_time': array([0.47544646, 0.42728972, 0.39944434, 0.3864615 , 0.4178648 ,
        0.41137409, 0.40863633, 0.40267062, 0.39251494, 0.39365816]),
 'score_time': array([0.00200725, 0.0030148 , 0.00200391, 0.00200987, 0.00200629,
        0.00200725, 0.0030098 , 0.00200772, 0.00200725, 0.00200701]),
 'test_score': array([0.66051235, 0.84674979, 0.64125353, 0.78041115, 0.77396554,
        0.86574001, 0.76252594, 0.85538634, 0.67449001, 0.65151735]),
 'train_score': array([0.91105346, 0.9140962 , 0.9183163 , 0.91613755, 0.91832576,
        0.91492969, 0.91866643, 0.91378583, 0.9201035 , 0.91384172])}

In [51]:
# Average the per-fold scores from the baseline cross-validation.
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])

# Minimum acceptable mean scores; stop the notebook if the model falls below them.
assert train_score > 0.7
assert test_score > 0.65

print(f'Train Score: {train_score}')
# Label the held-out score correctly (it was previously printed as "Train Score").
print(f'Test Score: {test_score}')

Train Score: 0.9159256441060745
Train Score: 0.7512552007341219


## Hyperparameter tuning

In [53]:
from sklearn.model_selection import GridSearchCV

In [54]:
# Candidate numbers of boosting stages: 20, 40, ..., 500.
n_estimators_grid = range(20, 501, 20)
# The `core_model__` prefix routes the parameter to the pipeline step of that name.
param_tunning = {'core_model__n_estimators': n_estimators_grid}

In [57]:
# Pipeline to be tuned: mean imputation followed by gradient boosting.
# The seed is fixed so the grid search compares candidates reproducibly and
# selects the same winner on every run.
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(random_state=42)),
])

In [58]:
# Exhaustive search over `param_tunning`, ranking candidates by R^2 with 5-fold CV.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_tunning,
    scoring='r2',
    cv=5,
)

In [59]:
# Hold out 35% of the rows for the final evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)

In [60]:
# Run the search on the training portion only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

In [62]:
# Display the best pipeline found by the grid search.
grid_search.best_estimator_

In [61]:
# Re-score the selected pipeline with 7-fold cross-validation on the training
# split, keeping the training-fold scores as well.
best_pipeline = grid_search.best_estimator_
final_result = cross_validate(
    best_pipeline,
    X_train,
    y_train,
    cv=7,
    return_train_score=True,
)

In [64]:
# Mean score across the folds for the tuned pipeline.
train_score = final_result['train_score'].mean()
test_score = final_result['test_score'].mean()

# Same acceptance thresholds as for the baseline model.
assert train_score > 0.7
assert test_score > 0.65

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.95895885449701
Test Score: 0.7668490511558799


In [65]:
# List every parameter of the selected pipeline, including those of its steps.
best_pipeline = grid_search.best_estimator_
best_pipeline.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=180))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=180),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__keep_empty_features': False,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 'deprecated',
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 180,
 'core_model__n_iter_no_change': None,
 'core_m

In [66]:
# Final model. The number of boosting stages is read from the grid search rather
# than hand-copied, so it cannot drift from what the search actually selected.
best_n_estimators = grid_search.best_params_['core_model__n_estimators']

# All other regressor arguments are written out explicitly; the seed is fixed so
# that the model persisted later can be reproduced exactly.
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(
        n_estimators=best_n_estimators,
        alpha=0.9,
        ccp_alpha=0.0,
        criterion='friedman_mse',
        init=None,
        learning_rate=0.1,
        loss='squared_error',
        max_depth=3,
        max_features=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_iter_no_change=None,
        random_state=42,
        subsample=1.0,
        tol=0.0001,
        validation_fraction=0.1,
        verbose=0,
        warm_start=False,
    )),
])

In [67]:
# Train the final pipeline on the training split.
estimator.fit(X=X_train, y=y_train)

In [68]:
# Score the trained pipeline on the held-out test split.
estimator.score(X=X_test, y=y_test)

0.7284224769966512

## Saving model

In [70]:
from joblib import dump

In [71]:
# Persist the fitted pipeline (imputer + regressor) to disk with joblib.
model_path = '../model/model2.pkl'
dump(estimator, model_path)

['../model/model2.pkl']

In [73]:
# Feature columns, in order, of the training data the saved pipeline was fitted on.
X_train.columns

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'budget', 'imdb_score'],
      dtype='object')