In [43]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor

import logging
import sys
import numpy as np
import pandas as pd


In [44]:
data = pd.read_csv('../dataset/full_data.csv')
data

Unnamed: 0,opening_gross,screens,production_budget,worldwide_gross,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score
0,2451.0,10.0,12000000,14616,2015.0,1.85,111.0,2059,12000000.0,7.5
1,8330681.0,2271.0,13000000,60414025,1999.0,1.85,97.0,37907,16000000.0,7.2
2,19883351.0,2704.0,85000000,66941559,2000.0,1.85,100.0,4182,85000000.0,4.8
3,5329240.0,2331.0,20000000,17306648,2009.0,2.35,108.0,2799,22000000.0,5.6
4,923715.0,19.0,20000000,181025343,2013.0,2.35,134.0,4251,20000000.0,8.1
...,...,...,...,...,...,...,...,...,...,...
2299,24733155.0,3036.0,23600000,102236596,2009.0,2.35,88.0,28011,23600000.0,7.7
2300,20065617.0,3482.0,80000000,170805525,2011.0,2.35,102.0,5392,80000000.0,5.2
2301,15650000.0,3394.0,50000000,55348693,2016.0,2.35,102.0,24107,50000000.0,4.8
2302,4510408.0,,35000000,12506188,2006.0,1.85,83.0,5022,35000000.0,4.2


In [45]:
model = Pipeline([
      ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan))
    , ('core_model', GradientBoostingRegressor())
])

In [46]:
X = data.drop(['worldwide_gross'], axis=1)
y = data['worldwide_gross']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, train_size=42)


In [47]:
param_tuning = {'core_model__n_estimators': range(20, 301, 20)}

grid_search = GridSearchCV(model, param_grid=param_tuning, scoring='r2', cv=6)

grid_search.fit(X_train, y_train)

In [48]:
final_result = cross_validate(grid_search.best_estimator_, X_train, y_train, return_train_score=True, cv=5)

train_score = np.mean(final_result['train_score']) 
test_score = np.mean(final_result['test_score']) 

In [49]:
final_result

{'fit_time': array([0.08061862, 0.05631661, 0.05619812, 0.0556252 , 0.06527758]),
 'score_time': array([0.00800443, 0.00824475, 0.00185251, 0.        , 0.        ]),
 'test_score': array([ 0.62209573, -0.95175095,  0.46159658,  0.89733395,  0.76337575]),
 'train_score': array([0.99981753, 0.99987498, 0.99991825, 0.99970399, 0.99985746])}

In [50]:
print(train_score)
test_score

0.9998344434148899


0.3585302124168096

In [8]:
assert train_score > 0.7
assert test_score > 0.65

AssertionError: 