# KNN Model

### Group Name: The Big One

### Group Members: Nicholas Parker, Matthew King, and Sean Sturtevant

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import compose
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

#### Read CSV's into pandas dataframes

In [2]:
hockey = pd.read_csv("data/processed/hockey.csv")

#### Split into train and test sets.

In [3]:
y_train = hockey['Salary']
X_train = hockey.drop('Salary', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train)

#### Select nonpredictive_feature and separate numerical and categorical variables.

In [4]:
# nonpredictive_features = []
# negImportanceDropList = ['Position', 'Team', 'PEND', 'TOIX', 'GA', 'xGA', 'iSCF', 'iPEND', 'sDist.1', 'iHF.1', 'PDO', 'Hand', 'SCA', 'iTKA.1', 'SA', 'IPP%', 'ixG', 'FA', 'Pace', 'iGVA.1', 'SV%', 'RBF', 'PENT', 'F/60', 'GVA', 'TKA', 'FOW', 'Diff/60']
nonpredictive_features = ['ENG', 'Wide', 'Over', 'PSG', 'PSA', 'S.Dflct', 'G.Bkhd', 'Post', 'G.Dflct', 'CBar ', 'G.Slap', 'G.Snap', 'G.Wrst', 'G.Wrap', 'G.Tip', 'S.Bkhd', 'Min', 'S.Slap', 'Misc', 'Noise', 'DAP', 'Grit', 'PS', 'DPS', 'OPS', 'DSA', 'DSF', 'Game', 'Match', 'S.Snap', 'Maj', '1G', 'NPD', 'iPenDf', 'iPenD', 'iPenT', 'S.Wrst', 'S.Wrap', 'S.Tip', 'GWG', 'FOL.Down', 'OTG', 'PIM', 'iSF.1', 'iCF.1', 'Diff', 'Pct%', 'FOL.Close', 'TOI/GP.1', 'TOI/GP', 'TOI', 'Shifts', 'E+/-', 'sDist', '+/-', 'PTS', 'A2', 'A1', 'A', 'G', 'GP', 'Wt', 'Ht', 'iSF.2', 'Age', 'iFOW', 'iBLK', 'iFOL', 'dzFOL', 'nzFOW', 'nzFOL', 'ozFOW', 'ozFOL', 'dzFOW', 'FOL.Up', 'FOW.Up', 'iTKA', 'iGVA', 'iMiss', 'FOW.Down', 'iHF', 'FOW.Close', 'FO%', 'Position', 'Team', 'PEND', 'TOIX', 'GA', 'xGA', 'iSCF', 'iPEND', 'sDist.1', 'iHF.1', 'PDO', 'Hand', 'SCA', 'iTKA.1', 'SA', 'IPP%', 'ixG', 'FA', 'Pace', 'iGVA.1', 'SV%', 'RBF', 'PENT', 'F/60', 'GVA', 'TKA', 'FOW', 'Diff/60']
# RedundantColDropList = ['TOI/GP', 'iCF', 'iSF', 'iSF.1', 'sDist', 'iHF', 'iGVA', 'iTKA', 'iBLK', 'iFOW', 'iFOL']

numeric = [feature for feature in X_train.columns if np.issubdtype(X_train[feature], np.number) 
                      and feature not in nonpredictive_features]
categorical = [feature for feature in X_train.columns if feature not in numeric
                       and feature not in nonpredictive_features]

In [5]:
def make_pipeline(knn=None):
    """
    Creates pipeline that performs separate transformations on the categorical and numerical features.
    """
    
    numeric_features = numeric
    numeric_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='median')),
        ('normalizer', preprocessing.Normalizer())])

    preprocessor = compose.ColumnTransformer(transformers=[
        ('numerical', numeric_transformer, numeric_features)])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('knn', knn)])
    
    return pipeline

knn = KNeighborsRegressor()
pipeline = make_pipeline(knn)

In [6]:
def make_random_cv():
    """
    Define hyperparameter search space
    Instantiate RandomizedSearchCV with the pipeline.
    """
    
    algo = ['ball_tree', 'kd_tree', 'auto']
    weights = ['distance', 'uniform']
    neighbors = [8, 9, 10, 15]
    hyperparameters = dict(knn__algorithm=algo,
                          knn__n_neighbors=neighbors,
                          knn__weights=weights)
    
    reg_random_cv = RandomizedSearchCV(pipeline, 
                                       hyperparameters, 
                                       cv=5, 
                                       n_iter=15, 
                                       verbose=1,
                                       random_state=42)
    
    return reg_random_cv

model = make_random_cv()
model.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('numerical',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                               

#### Report and and observe evaluation metrics

In [7]:
def place_commas(number): 
    return ("{:,}".format(number)) 

In [8]:
def mape_metric(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    n = len(y_test)
    running_sum = 0
    for i in range(n):
        running_sum += abs((y_test[i] - y_pred[i])/y_test[i])
    return running_sum/n

#### Median Absolute Error

In [9]:
median_absolute_error_scorer = make_scorer(metrics.median_absolute_error)
cross_val_score(model.best_estimator_, 
                X_train, 
                y_train, 
                scoring=median_absolute_error_scorer,
                cv=10)

array([ 706111.11111111,  585277.77777778,  447083.33333333,
        979722.22222222,  515277.77777778,  694444.44444444,
        415744.44444444,  511388.88888889, 1165277.77777778,
        613055.55555556])

In [10]:
y_pred = model.best_estimator_.predict(X_train)
medae_value_train = place_commas(round(metrics.median_absolute_error(y_train, y_pred), 2))
print(f"${medae_value_train} medae on train dataset")

y_pred = model.best_estimator_.predict(X_test)
medae_value_test = place_commas(round(metrics.median_absolute_error(y_test, y_pred), 2))
print(f"${medae_value_test:} medae on test dataset")

$529,722.22 medae on train dataset
$588,888.89 medae on test dataset


#### Root Mean Squared Error

In [11]:
y_pred = model.best_estimator_.predict(X_train)
rmse_value_train = place_commas(round(np.sqrt(metrics.mean_squared_error(y_train, y_pred)), 2))
print(f"${rmse_value_train} rmse on train dataset")

y_pred = model.best_estimator_.predict(X_test)
rmse_value_test = place_commas(round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 2))
print(f"${rmse_value_test} rmse on test dataset")

$1,513,596.24 rmse on train dataset
$1,511,360.12 rmse on test dataset


#### Mean Absolute Percentage Error

In [12]:
y_pred = model.best_estimator_.predict(X_train)
mape_value_train = round(mape_metric(y_train, y_pred)*100, 2)
print(f"{mape_value_train}% mape on train dataset")

y_pred = model.best_estimator_.predict(X_test)
mape_value_test = round(mape_metric(y_test, y_pred)*100, 2)
print(f"{mape_value_test}% mape on test dataset")

60.17% mape on train dataset
59.68% mape on test dataset
