# Baseline Model

## Load Data

In [1]:
import numpy as np
import pandas as pd

# Read the cleaned training split, decoded as ISO-8859-1 with the Python parser engine.
hockey = pd.read_csv(
    "data/clean/train.csv",
    sep=',',
    encoding="ISO-8859-1",
    engine='python',
)

# Derive an age feature from the first two characters of 'Born'.
# NOTE(review): 117 - yy presumably means 2017 - 19yy, i.e. it assumes a
# two-digit birth year in the 1900s -- confirm against the data.
birth_year_two_digit = pd.to_numeric(hockey['Born'].str[0:2])
hockey['Age'] = 117 - birth_year_two_digit

## Fit scikit-learn model

In [2]:
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import compose
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn import preprocessing

# Separate the regression target ('Salary') from the predictor columns,
# then hold out a test split with a fixed seed so the split is repeatable.
salary = hockey['Salary']
predictors = hockey.drop(columns='Salary')

X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    salary,
    random_state=42,
)

def make_pipeline(regressor=None):
    """Assemble the preprocessing + regression pipeline for the hockey data.

    Numeric columns are mean-imputed and then standardized. Categorical
    columns are imputed with the constant 'unknown' and then one-hot encoded,
    ignoring categories that were not seen during fitting. Only the columns
    listed in this function are handed to the column transformer.

    Parameters
    ----------
    regressor : estimator or None
        Object installed as the final pipeline step, named 'regressor'.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps 'preprocessor' and 'regressor'.
    """
    categorical_features = ['Pr/St', 'Nat', 'Hand', 'Position', 'Team']

    # The list order determines the column order of the numeric block.
    # NOTE(review): 'CBar ' carries a trailing space; presumably it matches the
    # CSV header exactly -- verify before "fixing" it.
    numeric_features = [
        'Ht', 'Wt', 'DftYr', 'DftRd', 'Ovrl',
        'GP', 'G', 'A', 'A1', 'A2', 'PTS', '+/-', 'E+/-',
        'PIM', 'Shifts', 'TOI', 'TOIX', 'TOI/GP', 'TOI/GP.1',
        'TOI%', 'IPP%', 'SH%', 'SV%', 'PDO', 'F/60', 'A/60',
        'Pct%', 'Diff', 'Diff/60', 'iCF', 'iCF.1', 'iFF', 'iSF',
        'iSF.1', 'iSF.2', 'ixG', 'iSCF', 'iRB', 'iRS', 'iDS',
        'sDist', 'sDist.1', 'Pass', 'iHF', 'iHF.1', 'iHA', 'iHDf',
        'iMiss', 'iGVA', 'iTKA', 'iBLK', 'iGVA.1', 'iTKA.1',
        'iBLK.1', 'BLK%', 'iFOW', 'iFOL', 'iFOW.1', 'iFOL.1',
        'FO%', '%FOT', 'dzFOW', 'dzFOL', 'nzFOW', 'nzFOL',
        'ozFOW', 'ozFOL', 'FOW.Up', 'FOL.Up', 'FOW.Down',
        'FOL.Down', 'FOW.Close', 'FOL.Close', 'OTG', '1G', 'GWG',
        'ENG', 'PSG', 'PSA', 'G.Bkhd', 'G.Dflct', 'G.Slap', 'G.Snap',
        'G.Tip', 'G.Wrap', 'G.Wrst', 'CBar ', 'Post', 'Over', 'Wide',
        'S.Bkhd', 'S.Dflct', 'S.Slap', 'S.Snap', 'S.Tip', 'S.Wrap', 'S.Wrst',
        'iPenT', 'iPenD', 'iPENT', 'iPEND', 'iPenDf', 'NPD', 'Min',
        'Maj', 'Match', 'Misc', 'Game', 'CF', 'CA', 'FF', 'FA', 'SF',
        'SA', 'xGF', 'xGA', 'SCF', 'SCA', 'GF', 'GA', 'RBF', 'RBA',
        'RSF', 'RSA', 'DSF', 'DSA', 'FOW', 'FOL', 'HF', 'HA', 'GVA',
        'TKA', 'PENT', 'PEND', 'OPS', 'DPS', 'PS', 'OTOI', 'Grit', 'DAP',
        'Pace', 'GS', 'GS/G', 'Age',
    ]

    # Numeric branch: fill gaps with the column mean, then standardize.
    mean_imputer = impute.SimpleImputer(strategy='mean')
    standardizer = preprocessing.StandardScaler()
    numeric_transformer = Pipeline(steps=[
        ('imputer', mean_imputer),
        ('scaler', standardizer),
    ])

    # Categorical branch: fill gaps with 'unknown', then one-hot encode.
    constant_imputer = impute.SimpleImputer(strategy='constant', fill_value='unknown')
    one_hot = preprocessing.OneHotEncoder(handle_unknown='ignore')
    categorical_transformer = Pipeline(steps=[
        ('imputer', constant_imputer),
        ('onehot', one_hot),
    ])

    preprocessor = compose.ColumnTransformer(transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ])

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])

# Baseline estimator: ridge regression with alpha=100, fitted on the training split.
regressor = linear_model.Ridge(
    alpha=100,
    tol=0.001,
)
pipeline = make_pipeline(regressor=regressor)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                         

## Evaluation Metric

In [3]:
def add_commas(number):
    """Return `number` rendered as a string with ',' as thousands separator."""
    return format(number, ",")

In [4]:
# Median absolute error on the data the model was fitted on.
y_pred = pipeline.predict(X_train)
train_medae = metrics.median_absolute_error(y_train, y_pred)
medae_value_train = add_commas(round(train_medae, 2))
print(f"${medae_value_train} medae on train dataset")

# Same metric on the held-out split; `y_pred` is left holding these
# test-set predictions after this cell.
y_pred = pipeline.predict(X_test)
test_medae = metrics.median_absolute_error(y_test, y_pred)
medae_value_test = add_commas(round(test_medae, 2))
print(f"${medae_value_test} medae on test dataset")

$594,350.7 medae on train dataset
$717,794.25 medae on test dataset


In [5]:
def mape_metric(y_test, y_pred):
    """Mean absolute percentage error, expressed as a fraction (0.10 == 10%).

    Computes the mean of ``|(y_test - y_pred) / y_test|`` along the first
    axis using vectorized numpy operations.

    Parameters
    ----------
    y_test : array-like
        True target values. Entries equal to zero make the corresponding
        term undefined (numpy yields inf/nan with a RuntimeWarning).
    y_pred : array-like
        Predicted values; must have the same shape as `y_test`.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for 1-D input; for higher-dimensional input, the error
        averaged over the first axis.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ. The shape check stops
        extra predictions from being silently ignored or broadcast.
    """
    y_true = np.atleast_1d(np.asarray(y_test, dtype=float))
    y_hat = np.atleast_1d(np.asarray(y_pred, dtype=float))

    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y_test and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_hat.shape}"
        )
    if y_true.size == 0:
        raise ValueError("mape_metric requires at least one sample")

    relative_error = np.abs((y_true - y_hat) / y_true)
    return np.mean(relative_error, axis=0)

In [6]:
# `y_pred` here is the test-set prediction left over from the evaluation cell above.
mape_metric(y_test=y_test, y_pred=y_pred)

0.7340679041195847