# BaseLine Model

## Load Data

In [None]:
import numpy as np
import pandas as pd

hockey = pd.read_csv("/Users/seanmac/Documents/Mod_2/ML_Lab/predict-nhl-player-salaries/train.csv"
                     , sep = ',',encoding = "ISO-8859-1",engine='python')
hockey['Age'] = 117 - pd.to_numeric(hockey['Born'].str[0:2])

## Fit scikit-learn model

In [None]:
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import compose
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn import preprocessing

y_train = hockey['Salary']
X_train = hockey.drop('Salary', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train)

def make_pipeline(regressor=None):
    
    numeric_features = ['Ht', 'Wt', 'DftYr', 'DftRd', 'Ovrl', 
                        'GP', 'G', 'A', 'A1', 'A2', 'PTS', '+/-', 'E+/-', 
                        'PIM', 'Shifts', 'TOI', 'TOIX', 'TOI/GP', 'TOI/GP.1', 
                        'TOI%', 'IPP%', 'SH%', 'SV%', 'PDO', 'F/60', 'A/60', 
                        'Pct%', 'Diff', 'Diff/60', 'iCF', 'iCF.1', 'iFF', 'iSF', 
                        'iSF.1', 'iSF.2', 'ixG', 'iSCF', 'iRB', 'iRS', 'iDS', 
                        'sDist', 'sDist.1', 'Pass', 'iHF', 'iHF.1', 'iHA', 'iHDf', 
                        'iMiss', 'iGVA', 'iTKA', 'iBLK', 'iGVA.1', 'iTKA.1', 
                        'iBLK.1', 'BLK%', 'iFOW', 'iFOL', 'iFOW.1', 'iFOL.1', 
                        'FO%', '%FOT', 'dzFOW', 'dzFOL', 'nzFOW', 'nzFOL', 
                        'ozFOW', 'ozFOL', 'FOW.Up', 'FOL.Up', 'FOW.Down', 
                        'FOL.Down', 'FOW.Close', 'FOL.Close', 'OTG', '1G', 'GWG', 
                        'ENG', 'PSG', 'PSA', 'G.Bkhd', 'G.Dflct', 'G.Slap', 'G.Snap', 
                        'G.Tip', 'G.Wrap', 'G.Wrst', 'CBar ', 'Post', 'Over', 'Wide', 
                        'S.Bkhd', 'S.Dflct', 'S.Slap', 'S.Snap', 'S.Tip', 'S.Wrap', 'S.Wrst', 
                        'iPenT', 'iPenD', 'iPENT', 'iPEND', 'iPenDf', 'NPD', 'Min', 
                        'Maj', 'Match', 'Misc', 'Game', 'CF', 'CA', 'FF', 'FA', 'SF', 
                        'SA', 'xGF', 'xGA', 'SCF', 'SCA', 'GF', 'GA', 'RBF', 'RBA', 
                        'RSF', 'RSA', 'DSF', 'DSA', 'FOW', 'FOL', 'HF', 'HA', 'GVA', 
                        'TKA', 'PENT', 'PEND', 'OPS', 'DPS', 'PS', 'OTOI', 'Grit', 'DAP', 
                        'Pace', 'GS', 'GS/G', 'Age']
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='mean')),
        ('scaler', preprocessing.StandardScaler())])

    categorical_features = ['Pr/St', 'Nat', 'Hand', 'Position', 'Team']
    categorical_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = compose.ColumnTransformer(transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', regressor)])
    
    return pipeline

regressor = linear_model.Ridge(alpha=100, tol=0.001)
pipeline = make_pipeline(regressor)
pipeline.fit(X_train, y_train)

## Evaluation Metric

In [None]:
y_pred = pipeline.predict(X_train)
medae_value_train = metrics.median_absolute_error(y_train, y_pred)
print(f"${medae_value_train:.4f} medae on train dataset")

y_pred = pipeline.predict(X_test)
medae_value_test = metrics.median_absolute_error(y_test, y_pred)
print(f"${medae_value_test:.4f} medae on test dataset")

# Final Model Work

In [2]:
import numpy as np
import pandas as pd

In [5]:
train = pd.read_csv("/Users/seanmac/Documents/Mod_2/ML_Lab/predict-nhl-player-salaries/train.csv"
                     , sep = ',',encoding = "ISO-8859-1",engine='python')

test = pd.read_csv("/Users/seanmac/Documents/Mod_2/ML_Lab/predict-nhl-player-salaries/test.csv"
                     , sep = ',',encoding = "ISO-8859-1",engine='python')

test_y = pd.read_csv('/Users/seanmac/Documents/Mod_2/ML_Lab/predict-nhl-player-salaries/test_salaries.csv',
                     sep = ',',encoding = "ISO-8859-1",engine='python')
                     
def combine_train_and_test(train_df, test_df, test_response):
    test_df = pd.concat([test_df, test_response], axis = 1)
    return pd.concat([train_df, test_df],ignore_index = True, sort = False)

hockey = combine_train_and_test(train, test, test_y)

In [6]:
hockey['Age'] = 117 - pd.to_numeric(hockey['Born'].str[0:2])

In [34]:
def nationality_group(df, nationalityCol):
    # A function to feature engineering the 'Nationality column'
    # Changes it from 16 unique values to 5 to prevent overfitting
    scandanavianNations = ['SWE','NOR','FIN']
    otherNations = ['CHE','CZE','FRA','DEU','SVK','AUT','DNK','LVA','HRV','GBR']
    df.loc[(df[nationalityCol].isin(scandanavianNations)), nationalityCol] = 'Scandanavian'
    df.loc[(df[nationalityCol].isin(otherNations)), nationalityCol] = 'Other'
    return df
hockey = nationality_group(hockey, 'Nat')

In [35]:
# Code used to group and remove provinces and states that are only seen a few times
# Useful to prevent overfitting
prs = hockey.groupby('Pr/St').agg({'Pr/St':['count']}).reset_index()
prs.columns = ['pr/st','count']
extreneousStates = list(prs.loc[(prs['count'] < 10)]['pr/st'])
hockey.loc[(hockey['Pr/St'].isin(extreneousStates)),'Pr/St'] = 'Other'

In [36]:
hockey.head(5)

Unnamed: 0,Salary,Born,City,Pr/St,Cntry,Nat,Ht,Wt,DftYr,DftRd,...,OPS,DPS,PS,OTOI,Grit,DAP,Pace,GS,GS/G,Age
0,925000.0,97-01-30,Sainte-Marie,QC,CAN,CAN,74,190,2015.0,1.0,...,0.0,-0.2,-0.2,40.03,1,0.0,175.7,-0.4,-0.38,20
1,2250000.0,93-12-21,Ottawa,ON,CAN,CAN,74,207,2012.0,1.0,...,-0.2,3.4,3.2,2850.59,290,13.3,112.5,14.1,0.18,24
2,8000000.0,88-04-16,St. Paul,MN,USA,USA,72,218,2006.0,1.0,...,3.7,1.3,5.0,2486.75,102,6.6,114.8,36.8,0.57,29
3,3500000.0,92-01-07,Ottawa,ON,CAN,CAN,77,220,2010.0,1.0,...,0.0,0.4,0.5,1074.41,130,17.5,105.1,5.9,0.2,25
4,1750000.0,94-03-29,Toronto,ON,CAN,CAN,76,217,2012.0,1.0,...,-0.1,1.4,1.3,3459.09,425,8.3,99.5,21.8,0.27,23


In [49]:
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import compose
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

y_train = hockey['Salary']
X_train = hockey.drop('Salary', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train)

def make_pipeline(regressor=None):
    
    numeric_features = ['Ht', 'Wt', 'DftYr', 'DftRd', 'Ovrl', 
                        'GP', 'G', 'A', 'A1', 'A2', 'PTS', '+/-', 'E+/-', 
                        'PIM', 'Shifts', 'TOI', 'TOIX', 'TOI/GP', 'TOI/GP.1', 
                        'TOI%', 'IPP%', 'SH%', 'SV%', 'PDO', 'F/60', 'A/60', 
                        'Pct%', 'Diff', 'Diff/60', 'iCF', 'iCF.1', 'iFF', 'iSF', 
                        'iSF.1', 'iSF.2', 'ixG', 'iSCF', 'iRB', 'iRS', 'iDS', 
                        'sDist', 'sDist.1', 'Pass', 'iHF', 'iHF.1', 'iHA', 'iHDf', 
                        'iMiss', 'iGVA', 'iTKA', 'iBLK', 'iGVA.1', 'iTKA.1', 
                        'iBLK.1', 'BLK%', 'iFOW', 'iFOL', 'iFOW.1', 'iFOL.1', 
                        'FO%', '%FOT', 'dzFOW', 'dzFOL', 'nzFOW', 'nzFOL', 
                        'ozFOW', 'ozFOL', 'FOW.Up', 'FOL.Up', 'FOW.Down', 
                        'FOL.Down', 'FOW.Close', 'FOL.Close', 'OTG', '1G', 'GWG', 
                        'ENG', 'PSG', 'PSA', 'G.Bkhd', 'G.Dflct', 'G.Slap', 'G.Snap', 
                        'G.Tip', 'G.Wrap', 'G.Wrst', 'CBar ', 'Post', 'Over', 'Wide', 
                        'S.Bkhd', 'S.Dflct', 'S.Slap', 'S.Snap', 'S.Tip', 'S.Wrap', 'S.Wrst', 
                        'iPenT', 'iPenD', 'iPENT', 'iPEND', 'iPenDf', 'NPD', 'Min', 
                        'Maj', 'Match', 'Misc', 'Game', 'CF', 'CA', 'FF', 'FA', 'SF', 
                        'SA', 'xGF', 'xGA', 'SCF', 'SCA', 'GF', 'GA', 'RBF', 'RBA', 
                        'RSF', 'RSA', 'DSF', 'DSA', 'FOW', 'FOL', 'HF', 'HA', 'GVA', 
                        'TKA', 'PENT', 'PEND', 'OPS', 'DPS', 'PS', 'OTOI', 'Grit', 'DAP', 
                        'Pace', 'GS', 'GS/G', 'Age']
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='mean')),
        ('scaler', preprocessing.StandardScaler())])

    categorical_features = ['Pr/St', 'Nat', 'Hand', 'Position', 'Team']
    categorical_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = compose.ColumnTransformer(transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', regressor)])
    
    return pipeline

regressor = RandomForestRegressor(n_estimators=200, min_samples_leaf=3)#, random_state=42)
pipeline = make_pipeline(regressor)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                         

In [50]:
y_pred = pipeline.predict(X_train)
medae_value_train = metrics.median_absolute_error(y_train, y_pred)
print(f"${medae_value_train:.4f} medae on train dataset")

y_pred = pipeline.predict(X_test)
medae_value_test = metrics.median_absolute_error(y_test, y_pred)
print(f"${medae_value_test:.4f} medae on test dataset")

$213191.4934 medae on train dataset
$557905.2204 medae on test dataset


In [51]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

In [52]:
median_absolute_error_scorer = make_scorer(metrics.median_absolute_error)
cross_val_score(pipeline, 
                X_train, 
                y_train, 
                scoring=median_absolute_error_scorer,
                cv=10)  

array([671957.9168788 , 655746.98636436, 263071.76693706, 446086.61410985,
       473756.65428842, 440055.18849206, 449005.8546746 , 528915.73645105,
       371525.39586425, 630722.68831169])