In [133]:
data = 'https://raw.githubusercontent.com/mariokart345/DS-Unit-2-Applied-Modeling/master/data/Video_Games_Sales_as_at_22_Dec_2016.csv'
import pandas as pd
import numpy as np
df = pd.read_csv(data)

In [134]:
#Commentated out for the purpose of not running extraneous exploratory code
#from pandas_profiling import ProfileReport
#ProfileReport(df)

In [135]:
#Wrangling function
def wrangle(df):
    #Removing all Na
    df = df.dropna().copy()
    #Engineering features
    df['Above_Average_Critic_Score'] = df['Critic_Score']>70
    df['User_Score'] = df['User_Score'].replace('tbd',np.NaN,regex=True)
    df['User_Score'] = df['User_Score'].astype(float)
    df['Above_Average_User_Score'] = df['User_Score']>7
    #Dropping really high variance
    df = df.drop(labels=['Name','Developer'],axis=1)
    #Dropping high NaN columns
    df = df.drop(labels=['Rating','User_Score','Critic_Score','Critic_Count','User_Count'],axis=1)
    #Using log function to created a less skewed distribution
    df['Log_Global_Sales'] = np.log(df['Global_Sales'])
    #Dropping all other Sales columns to prevent leakage
    df = df.drop(labels=['NA_Sales','EU_Sales','JP_Sales','Other_Sales','Global_Sales'],axis=1)
    #Converting 'Year_of_Release' to pandas datetime and sorting by oldest first
    df = df.dropna(subset=['Year_of_Release'])
    df['Year_of_Release'] = pd.to_datetime(df['Year_of_Release'],format='%Y')
    df['Year_of_Release'] = df['Year_of_Release'].dt.year
    df = df.sort_values(by='Year_of_Release') 
    #Removing upper and lower .5 percentile
    df = df[(df['Log_Global_Sales'] >= np.percentile(df['Log_Global_Sales'], 0.5)) & (df['Log_Global_Sales'] <= np.percentile(df['Log_Global_Sales'], 99.5))]
    return df

In [136]:
#Applying wrangle function to dataset
game_sales = wrangle(df)

Linear Regression

In [137]:
#Training, Validation and test datasets
train = game_sales[game_sales['Year_of_Release']<2009]
#Using .query() to prevent user warning
val = game_sales.query('Year_of_Release >= 2009 & Year_of_Release < 2013')
test = game_sales[game_sales['Year_of_Release']>=2013]
#Dropping 'Global_Sales' to prevent leakage and splitting into features and target
y_train = train['Log_Global_Sales']
X_train = train.drop('Log_Global_Sales',axis=1)
y_val = val['Log_Global_Sales']
X_val = val.drop('Log_Global_Sales',axis=1)
y_test = test['Log_Global_Sales']
X_test = test.drop('Log_Global_Sales',axis=1)

In [65]:
#Baseline MAE
from sklearn.metrics import r2_score,mean_absolute_error
base_mean = y_train.mean()
y_train_pred = [base_mean]*len(y_train)
print(f'Baseline MAE:{mean_absolute_error(y_train,y_train_pred)}')

Baseline MAE:1.1099002700982674


In [66]:
#Fitting basic model for R^2 and MAE
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
regress = make_pipeline(ce.OrdinalEncoder(),SimpleImputer(strategy='mean'),LinearRegression())
regress.fit(X_train,y_train)
print(f'Model R^2 Score: {regress.score(X_val,y_val)}\nMAE Score: {mean_absolute_error(y_val,regress.predict(X_val))}')

Model R^2 Score: 0.04153430182747764
MAE Score: 0.9900778714722571


Random Forest

In [67]:
#Basic bagging model
from sklearn.ensemble import RandomForestRegressor
pipeline = make_pipeline(ce.OrdinalEncoder(),SimpleImputer(strategy='mean'),RandomForestRegressor(criterion='mae',n_estimators=100, random_state=63, n_jobs=-1))
pipeline.fit(X_train,y_train)
pipeline.score(X_val,y_val)

0.02173441132756082

In [77]:
#Doing hyperparameter tuning for bagging model
#n_inter=160, cv=5 <-- changed for running in colab
from sklearn.model_selection import RandomizedSearchCV
params = {'randomforestregressor__n_estimators':np.arange(100,500,20),'randomforestregressor__max_depth':np.arange(1,10,1)}
basepipe = make_pipeline(ce.OrdinalEncoder(),SimpleImputer(),RandomForestRegressor(criterion='mae'))
rsearch = RandomizedSearchCV(basepipe,param_distributions=params,n_iter=50, cv=3, scoring='neg_mean_absolute_error', n_jobs=10, verbose=10, random_state=856)
rsearch.fit(X_train,y_train)
print(rsearch.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'randomforestregressor__n_estimators': 380, 'randomforestregressor__max_depth': 9}


In [128]:
#Applying tuned hyperparameters for random forest model
tunedpipe = make_pipeline(ce.OrdinalEncoder(),SimpleImputer(strategy='median'),RandomForestRegressor(criterion='mae',n_estimators=380,max_depth=9,n_jobs=10,random_state=437))
tunedpipe.fit(X_train,y_train)
tunedpipe.score(X_val,y_val)

0.0407930477631675

Decision tree

In [130]:
#Doing basic boost model
from sklearn.tree import DecisionTreeRegressor
pipeline = make_pipeline(ce.OrdinalEncoder(),DecisionTreeRegressor(criterion='mae'))
pipeline.fit(X_train,y_train)
pipeline.score(X_val,y_val)

-0.4297635643028339

In [139]:
#Hyperparameter tuning the decision tree model
decisionbasepipe = make_pipeline(ce.OrdinalEncoder(),SimpleImputer(),DecisionTreeRegressor(criterion='mae'))
decisionparams = {'decisiontreeregressor__max_depth':np.arange(1,10,1)}
decisionsearch = RandomizedSearchCV(decisionbasepipe,param_distributions=decisionparams,n_iter=9, scoring='neg_mean_absolute_error', cv=5,n_jobs=10, verbose=10, random_state=856)
decisionsearch.fit(X_train,y_train)
print(decisionsearch.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
{'decisiontreeregressor__max_depth': 6}


In [140]:
#Applying tuned hyperparameters to decision tree model
decisiontunedpipe = make_pipeline(ce.OrdinalEncoder(),SimpleImputer(strategy='mean'),DecisionTreeRegressor(max_depth=6,criterion='mae'))
decisiontunedpipe.fit(X_train,y_train)
decisiontunedpipe.score(X_val,y_val)

-0.13613960060212826

In [132]:
#The better model seems to be Random Forest tree model
print(f'Random Forest Score: {tunedpipe.score(X_val,y_val)}\nDecision Tree Score: {decisiontunedpipe.score(X_val,y_val)}')

Random Forest Score: 0.0407930477631675
Decision Tree Score: -0.11960613899691452


In [149]:
np.exp(0.0407930477631675)

1.0416365142311101

In [150]:
#Using test dataset with my best model
print(f'Best Model Score with test set:{decisiontunedpipe.score(X_test,y_test)}')

Best Model Score with test set:-1.5205166025118872


In [147]:
print(f'Undid log:{np.exp(-1.5205166025118872)}')

Undid log:0.21859892902172312


Permutation Importance of best model(decision tree)

In [89]:
#Transforming data
transformers = make_pipeline(ce.OrdinalEncoder(), SimpleImputer(strategy='mean'))
X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed = transformers.transform(X_val)
importance = DecisionTreeRegressor(max_depth=6,criterion='mae')
importance.fit(X_train_transformed, y_train)
#Permutation Importance
import eli5
from eli5.sklearn import PermutationImportance
permuter = PermutationImportance(importance,scoring='neg_mean_absolute_error',n_iter=10,random_state=248)
permuter.fit(X_val_transformed,y_val)
feature_names = X_val.columns.to_list()
eli5.show_weights(permuter, top=None, feature_names=feature_names)

Weight,Feature
0.2993  ± 0.0253,Platform
0.1771  ± 0.0363,Above_Average_Critic_Score
0.0835  ± 0.0171,Publisher
0.0013  ± 0.0048,Above_Average_User_Score
0.0004  ± 0.0151,Genre
0  ± 0.0000,Year_of_Release
