In [8]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", 25)
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFECV
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import feature_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')


import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from featureModelling import get_features_with_cleaned_results

In [9]:
# Load the engineered feature table and collect every column whose name starts
# with 'feature_'.
feature_df = get_features_with_cleaned_results()
feature_columns = [col for col in feature_df if col.startswith('feature_')]

# Columns handed to the model: 'game' plus all feature_ columns.
# NOTE(review): 'game' is passed to the estimator unscaled alongside the scaled
# features -- confirm it is meant to be a predictor and not just an identifier.
model_columns = ['game'] + feature_columns

# Season masks: everything before 2018 is training data; 2018 and 2019 are
# kept as two separate hold-out seasons.
is_train = feature_df.season < 2018
is_2018 = feature_df.season == 2018
is_2019 = feature_df.season == 2019

# Create our test and train sets.
# Explicit .copy() so the in-place scaling below writes to independent frames
# and can never alias feature_df or rely on a suppressed SettingWithCopyWarning.
X = feature_df.loc[is_train, model_columns].copy()
y = feature_df.loc[is_train, 'result'].copy()

test_x_2018 = feature_df.loc[is_2018, model_columns].copy()
test_y_2018 = feature_df.loc[is_2018, 'result'].copy()

test_x_2019 = feature_df.loc[is_2019, model_columns].copy()
test_y_2019 = feature_df.loc[is_2019, 'result'].copy()

# Scale features: the scaler is fitted on the training seasons only and then
# applied unchanged to both hold-out seasons, so no test statistics leak in.
scaler = StandardScaler()
X[feature_columns] = scaler.fit_transform(X[feature_columns])
test_x_2018[feature_columns] = scaler.transform(test_x_2018[feature_columns])
test_x_2019[feature_columns] = scaler.transform(test_x_2019[feature_columns])



In [10]:
%%time
chosenAlgorithm = LogisticRegression()
chosenAlgorithm.fit(X, y)

final_predictions = chosenAlgorithm.predict(test_x_2018)
final_predictions2 = chosenAlgorithm.predict(test_x_2019)

accuracy2018 = (final_predictions == test_y_2018).mean() * 100
accuracy2019 = (final_predictions2 == test_y_2019).mean() * 100
print("Accuracy in predicting the 2019 season is: {:.2f}%".format(accuracy2019))
print("Accuracy in predicting the 2018 season is: {:.2f}%".format(accuracy2018))

# print(confusion_matrix(test_y_2018,final_predictions))
# print(classification_report(test_y_2018,final_predictions))


Our accuracy in predicting the 2019 season is: 58.94%
Our accuracy in predicting the 2018 season is: 56.04%
CPU times: user 98.1 ms, sys: 49.8 ms, total: 148 ms
Wall time: 73.3 ms


Next we want to tune our parameters.

Interestingly, fitting a logistic regression is an unconstrained nonlinear optimization problem, and it will be solved with either Newton (`newton-cg`) or quasi-Newton (L-BFGS) methods. As expected, Newton's method is better (but slower), because it uses exact second-order information, whereas a quasi-Newton method only builds an approximation of the Hessian from successive gradients.

In [25]:
%%timeit
def hyperparameterTuning(X, y, algorithm, nfolds):
    kfold = StratifiedKFold(n_splits=nfolds)
    Cs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5,  0.2, 1, 10]
    solvers = ["newton-cg", "lbfgs", "liblinear", "bfgs", "sag", "saga"]
    param_grid = {'C': Cs, 'solver': solvers}
    #param_grid = {'C' : Cs, 'solver' : solvers}
    grid_search = GridSearchCV(algorithm, param_grid, cv=kfold)
    grid_search.fit(X,y)
    grid_search.best_params_
    return grid_search

optimal_parameters = hyperparameterTuning(X,y,chosenAlgorithm,5).best_params_

1min 26s ± 10.6 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Show the baseline estimator's settings followed by the grid-search winner.
print("Default parameters are: ", chosenAlgorithm.get_params(), sep="\n")
print("\noptimal parameters are:", optimal_parameters, sep="\n")

Default parameters are: 
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

optimal parameters are:
{'C': 0.05, 'solver': 'newton-cg'}


In [19]:
# Refit a logistic regression with the grid-search winner and score it on the
# 2018 hold-out season.
TunedAlgorithm = LogisticRegression(**optimal_parameters).fit(X, y)
tunedPredictions = TunedAlgorithm.predict(test_x_2018)

# Percentage of 2018 games whose predicted result matches the actual result.
tunedAccuracy = (tunedPredictions == test_y_2018).mean() * 100

print(
    "Tuned Accuracy in predicting the 2018 season is: {:.2f}%".format(tunedAccuracy)
)

Our accuracy in predicting the 2018 season is: 63.77%
