In [3]:
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from typing import TextIO


In [4]:
# Load the tab-separated game table (the file carries a .csv extension but
# is read with a tab separator) and preview its first rows.
raw_dataset = pd.read_csv(
    'resources/datasets/data.csv',
    sep='\t',
)

raw_dataset.head()

Unnamed: 0,GAME_ID,MATCHUP,OUTCOME,TEAM_ID_home,TEAM_ABBREVIATION_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,OREB_home,...,DREB_away,AST_away,STL_away,BLK_away,OFF_RATING_away,DEF_RATING_away,TS_PCT_away,TOV_away,ELO_away,SENTIMENT_away
0,22300848,LAC vs. LAL,0,1610612746,LAC,0.535503,0.354497,0.625187,0.22449,0.36,...,0.371429,0.592593,0.3125,0.333333,0.502571,0.532857,0.698895,0.6,0.510972,0.0
1,22300995,MIN vs. DEN,0,1610612750,MIN,0.449704,0.483245,0.544228,0.244898,0.32,...,0.571429,0.518519,0.25,0.066667,0.520566,0.532857,0.676796,0.7,0.750784,0.0
2,22300928,ORL vs. IND,0,1610612753,ORL,0.147929,0.319224,0.638681,0.428571,0.52,...,0.657143,0.296296,0.25,0.666667,0.437018,0.3,0.582873,0.45,0.529781,0.432975
3,22301112,NYK vs. SAC,1,1610612752,NYK,0.66568,0.634921,0.535232,0.285714,0.36,...,0.228571,0.518519,0.3125,0.2,0.470437,0.724286,0.574586,0.65,0.589342,0.569431
4,22300786,DEN vs. SAC,0,1610612743,DEN,0.201183,0.488536,0.796102,0.408163,0.48,...,0.571429,0.333333,0.375,0.333333,0.281491,0.285714,0.372928,0.55,0.547022,0.436185


In [5]:
# Identifier columns (game, matchup and team ids/abbreviations) are removed;
# the remaining columns, including OUTCOME, form the modelling table.
dataset = raw_dataset.drop(
    columns=[
        'GAME_ID',
        'MATCHUP',
        'TEAM_ID_home',
        'TEAM_ABBREVIATION_home',
        'TEAM_ID_away',
        'TEAM_ABBREVIATION_away',
    ]
)

In [11]:
# 90 % of the rows are sampled (seeded) for training; every row whose index
# label was not sampled becomes the held-out test set.
train_dataset = dataset.sample(frac=0.9, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

In [12]:
# Work on copies so popping the label column leaves train_dataset and
# test_dataset untouched.
train_features, test_features = train_dataset.copy(), test_dataset.copy()

# pop() removes OUTCOME from the feature frames and returns it as the labels.
train_labels, test_labels = train_features.pop('OUTCOME'), test_features.pop('OUTCOME')

# Models

In [13]:
# Base estimators handed to the hyper-parameter searches below.
# The estimators that draw random numbers while fitting (MLP weight
# initialisation, random-forest bootstrapping/feature sampling, and the
# stochastic logistic-regression solvers) are seeded with the same value as
# the train/test split so tuning results are reproducible between runs.
models = {
    "mlp": MLPClassifier(max_iter=5000, random_state=0),
    "svm": svm.SVC(),
    "lgr": LogisticRegression(random_state=0),
    "rf": RandomForestClassifier(random_state=0),
    "gnb": GaussianNB(),
}

## MLP parameters

In [14]:
def hiddenLayer_generator(n_columns=None):
    """Build the candidate ``hidden_layer_sizes`` values for the MLP grid.

    For each width multiplier ``i`` in 1..4 and each depth in 1..20, one
    candidate is produced: a list of ``depth`` layers that all have
    ``(n_columns + 1) * i`` units. Candidates are ordered by multiplier
    first, then by depth, giving 80 entries in total.

    Args:
        n_columns: Column count the layer width is derived from. When
            omitted, ``len(train_dataset.columns)`` of the notebook-level
            training frame is used (that frame still contains the OUTCOME
            column at this point).

    Returns:
        A list of lists of ``int`` layer widths.
    """
    if n_columns is None:
        n_columns = len(train_dataset.columns)

    result = []
    for i in range(1, 5):
        # Integer arithmetic on purpose: the previous "/ 2 * 2" form went
        # through true division and produced floats (e.g. 32.0), which are
        # not valid layer sizes for MLPClassifier.
        width = (n_columns + 1) * i
        for no_layers in range(1, 21):
            result.append([width] * no_layers)
    return result

In [15]:
def hypertune_mlp(x, y, model, grid, file: TextIO):
    """Grid-search MLP hyper-parameters and write one TSV row per candidate.

    A ``GridSearchCV`` over ``grid`` is fitted on ``(x, y)`` using
    ``KFold(n_splits=10)``, accuracy scoring, all available cores, and a
    score of 0 for candidates whose fit fails. For every candidate a
    tab-separated line is written to ``file`` containing
    hidden_layer_sizes, activation, solver, alpha, learning_rate,
    learning_rate_init and the mean test accuracy rounded to 5 decimals.

    Args:
        x: Training features.
        y: Training labels.
        model: Estimator to tune.
        grid: Parameter grid; every candidate must contain the six keys
            listed above.
        file: Writable text stream receiving the rows.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        n_jobs=-1,
        cv=KFold(n_splits=10),
        scoring='accuracy',
        error_score=0,
        verbose=1,
    )
    cv_results = search.fit(x, y).cv_results_

    for score, candidate in zip(cv_results['mean_test_score'], cv_results['params']):
        layers = candidate['hidden_layer_sizes']
        act = candidate["activation"]
        slv = candidate["solver"]
        reg = candidate["alpha"]
        schedule = candidate["learning_rate"]
        initial_rate = candidate["learning_rate_init"]

        file.write(f"{layers}\t{act}\t{slv}\t{reg}\t{schedule}\t{initial_rate}\t{round(score,5)}\n")

In [16]:
# Search space for the MLP: every combination of the lists below is tried.
activation = ['identity', 'logistic', 'tanh', 'relu']
solver = ['lbfgs', 'sgd', 'adam']

# Regularisation strengths 10^-5 ... 10^5 (one per decade).
alpha = [10**exponent for exponent in range(-5, 6)]
learning_rate_init = [0.1, 0.01, 0.001, 0.0001]
learning_rate = ['constant', 'adaptive', 'invscaling']

hidden_layer_sizes = hiddenLayer_generator()
mlp_grid = {
    'hidden_layer_sizes': hidden_layer_sizes,
    'activation': activation,
    'solver': solver,
    'alpha': alpha,
    'learning_rate': learning_rate,
    'learning_rate_init': learning_rate_init,
}

## SVM parameters

In [22]:
def hypertune_svm(x, y, model, grid, file: TextIO):
    """Grid-search SVM hyper-parameters and write one TSV row per candidate.

    A ``GridSearchCV`` over ``grid`` is fitted on ``(x, y)`` using
    ``KFold(n_splits=10)``, accuracy scoring, all available cores, and a
    score of 0 for candidates whose fit fails. Each written line holds
    C, kernel, degree and the mean test accuracy rounded to 5 decimals;
    the degree is written as "-" for every kernel other than "poly".

    Args:
        x: Training features.
        y: Training labels.
        model: Estimator to tune.
        grid: Parameter grid; every candidate must contain "C", "degree"
            and "kernel".
        file: Writable text stream receiving the rows.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        n_jobs=-1,
        cv=KFold(n_splits=10),
        scoring='accuracy',
        error_score=0,
    )
    cv_results = search.fit(x, y).cv_results_

    for score, candidate in zip(cv_results['mean_test_score'], cv_results['params']):
        penalty_c = candidate["C"]
        poly_degree = candidate["degree"]
        kernel_name = candidate["kernel"]
        shown_degree = poly_degree if kernel_name == "poly" else "-"
        file.write(f"{penalty_c}\t{kernel_name}\t{shown_degree}\t{round(score,5)}\n")

In [18]:
# Search space for the SVM: kernel type, regularisation strength C and
# polynomial degree (1..19).
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [1000, 100, 50, 10, 1.0, 0.1, 0.01]
degree = range(1, 20)
svm_grid = {'kernel': kernel, 'C': C, 'degree': degree}

In [24]:
# Exclusive-create mode: the run aborts instead of overwriting an existing
# results file.
with open('resources/models/hyper-tuning/svm.tsv', mode='x') as file:
    # Header row, then one row per grid candidate.
    file.write('C\tkernel\tdegree\taccuracy\n')
    hypertune_svm(train_features, train_labels, models["svm"], svm_grid, file)

## Logistic regression parameters

In [31]:
def hypertune_lgr(x, y, model, grid, file: TextIO):
    """Grid-search logistic-regression hyper-parameters into a TSV stream.

    A ``GridSearchCV`` over ``grid`` is fitted on ``(x, y)`` using
    ``KFold(n_splits=10)``, accuracy scoring, all available cores, and a
    score of 0 for candidates whose fit fails. Each written line holds
    C, penalty, solver and the mean test accuracy rounded to 5 decimals.

    Args:
        x: Training features.
        y: Training labels.
        model: Estimator to tune.
        grid: Parameter grid; every candidate must contain "C", "penalty"
            and "solver".
        file: Writable text stream receiving the rows.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        n_jobs=-1,
        cv=KFold(n_splits=10),
        scoring='accuracy',
        error_score=0,
    )
    cv_results = search.fit(x, y).cv_results_

    for score, candidate in zip(cv_results['mean_test_score'], cv_results['params']):
        strength = candidate["C"]
        regularization = candidate["penalty"]
        solver_name = candidate["solver"]
        file.write(f"{strength}\t{regularization}\t{solver_name}\t{round(score,5)}\n")

In [26]:
# Two separate grids because the solver list differs per penalty type.

# Grid 1: L1 penalty.
penalty1 = ['l1']
solvers1 = ['saga', 'liblinear']
c_values1 = [100, 10, 1.0, 0.1, 0.01]
lgr_grid1 = {'solver': solvers1, 'penalty': penalty1, 'C': c_values1}

# Grid 2: L2 penalty.
penalty2 = ['l2']
solvers2 = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
c_values2 = [100, 10, 1.0, 0.1, 0.01]
lgr_grid2 = {'solver': solvers2, 'penalty': penalty2, 'C': c_values2}

In [32]:
# Exclusive-create mode: the run aborts instead of overwriting an existing
# results file. Both penalty grids are appended to the same table.
with open('resources/models/hyper-tuning/lgr.tsv', mode='x') as file:
    file.write('C\tregularization\tsolver\taccuracy\n')
    for lgr_grid in (lgr_grid1, lgr_grid2):
        hypertune_lgr(train_features, train_labels, models["lgr"], lgr_grid, file)

## Random forest parameters

In [33]:
def hypertune_rf(x, y, model, grid, file: TextIO):
    """Grid-search random-forest hyper-parameters into a TSV stream.

    A ``GridSearchCV`` over ``grid`` is fitted on ``(x, y)`` using
    ``KFold(n_splits=10)``, accuracy scoring, all available cores, and a
    score of 0 for candidates whose fit fails. Each written line holds
    n_estimators, max_features and the mean test accuracy rounded to
    5 decimals.

    Args:
        x: Training features.
        y: Training labels.
        model: Estimator to tune.
        grid: Parameter grid; every candidate must contain "n_estimators"
            and "max_features".
        file: Writable text stream receiving the rows.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        n_jobs=-1,
        cv=KFold(n_splits=10),
        scoring='accuracy',
        error_score=0,
    )
    cv_results = search.fit(x, y).cv_results_

    for score, candidate in zip(cv_results['mean_test_score'], cv_results['params']):
        tree_count = candidate["n_estimators"]
        feature_cap = candidate["max_features"]
        file.write(f"{tree_count}\t{feature_cap}\t{round(score,5)}\n")

In [34]:
# Search space for the random forest: 1..100 trees and 1..30 features
# considered per split.
n_estimators = range(1, 101)
max_features = range(1, 31)

rf_grid = {'n_estimators': n_estimators, 'max_features': max_features}

In [None]:
# Exclusive-create mode: the run aborts instead of overwriting an existing
# results file.
with open('resources/models/hyper-tuning/rf.tsv', 'x') as file:
    file.write('n_estimators\tmax_features\taccuracy\n')
    # Must use the random-forest writer: hypertune_lgr looks up the
    # "C"/"penalty"/"solver" keys, which rf_grid does not contain, and would
    # raise KeyError once the whole search had already finished.
    hypertune_rf(train_features, train_labels, models["rf"], rf_grid, file)

## Naive Bayes parameters

In [None]:
def hypertune_gnb(x, y, model, grid, file: TextIO):
    """Grid-search Gaussian naive Bayes smoothing into a TSV stream.

    A ``GridSearchCV`` over ``grid`` is fitted on ``(x, y)`` using
    ``KFold(n_splits=10)``, accuracy scoring, all available cores, and a
    score of 0 for candidates whose fit fails. Each written line holds
    var_smoothing and the mean test accuracy rounded to 5 decimals.

    Args:
        x: Training features.
        y: Training labels.
        model: Estimator to tune.
        grid: Parameter grid; every candidate must contain "var_smoothing".
        file: Writable text stream receiving the rows.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        n_jobs=-1,
        cv=KFold(n_splits=10),
        scoring='accuracy',
        error_score=0,
    )
    cv_results = search.fit(x, y).cv_results_

    for score, candidate in zip(cv_results['mean_test_score'], cv_results['params']):
        smoothing = candidate["var_smoothing"]
        file.write(f"{smoothing}\t{round(score,5)}\n")

In [None]:
# Search space for Gaussian naive Bayes: smoothing values 10^-11 ... 10^-8.
var_smoothing = [10**exponent for exponent in range(-11, -7)]

gnb_grid = {'var_smoothing': var_smoothing}

In [None]:
# Exclusive-create mode: the run aborts instead of overwriting an existing
# results file.
with open('resources/models/hyper-tuning/gnb.tsv', 'x') as file:
    file.write('var_smoothing\taccuracy\n')
    # Must use the naive-Bayes writer: hypertune_lgr looks up the
    # "C"/"penalty"/"solver" keys, which gnb_grid does not contain, and would
    # raise KeyError once the whole search had already finished.
    hypertune_gnb(train_features, train_labels, models["gnb"], gnb_grid, file)

===================================================

In [74]:
# Summary statistics of the training feature columns (labels already popped).
train_features.describe()

Unnamed: 0,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,OREB_home,DREB_home,AST_home,STL_home,BLK_home,OFF_RATING_home,DEF_RATING_home,TS_PCT_home,TOV_home,ELO_home,SENTIMENT_home,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,OREB_away,DREB_away,AST_away,STL_away,BLK_away,OFF_RATING_away,DEF_RATING_away,TS_PCT_away,TOV_away,ELO_away,SENTIMENT_away
count,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0
mean,0.448232,0.518559,0.68622,0.38621,0.419266,0.432702,0.452784,0.416714,0.322821,0.532267,0.445452,0.479173,0.390307,0.488478,0.41121,0.519696,0.460158,0.772956,0.439093,0.387833,0.418938,0.527778,0.414994,0.334251,0.445452,0.532267,0.513362,0.490023,0.491764,0.440125
std,0.16446,0.143584,0.157707,0.136977,0.147673,0.145422,0.172909,0.172112,0.171258,0.161469,0.15815,0.157384,0.165117,0.231713,0.221155,0.187437,0.167892,0.11525,0.177679,0.178699,0.151008,0.189833,0.17901,0.173608,0.15815,0.161469,0.174807,0.175408,0.231488,0.227778
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.343195,0.425044,0.585457,0.285714,0.32,0.342105,0.344828,0.3125,0.1875,0.424286,0.338689,0.372796,0.26087,0.348653,0.341192,0.395134,0.345309,0.714,0.333333,0.238095,0.314286,0.407407,0.3125,0.2,0.338689,0.424286,0.392265,0.35,0.357759,0.395137
50%,0.446746,0.521164,0.70015,0.387755,0.4,0.421053,0.448276,0.375,0.3125,0.53,0.460154,0.478589,0.391304,0.53645,0.463029,0.533557,0.45509,0.789,0.416667,0.380952,0.4,0.518519,0.4375,0.333333,0.460154,0.53,0.516575,0.5,0.547806,0.493502
75%,0.565089,0.607143,0.8006,0.47449,0.52,0.526316,0.551724,0.5,0.4375,0.637143,0.551735,0.59194,0.478261,0.648177,0.560592,0.657718,0.562874,0.846,0.555556,0.47619,0.514286,0.62963,0.5,0.466667,0.551735,0.637143,0.629834,0.6,0.660266,0.592908
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.913043,1.0,0.987073,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9375,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [75]:
# Persist the MLP results the same way the other models do. Passing None as
# the file would raise AttributeError on the first file.write inside
# hypertune_mlp, i.e. only after the entire grid search had completed.
# Exclusive-create mode aborts instead of overwriting an existing file.
with open('resources/models/hyper-tuning/mlp.tsv', 'x') as file:
    file.write('hidden_layer_sizes\tactivation\tsolver\talpha\tlearning_rate\tlearning_rate_init\taccuracy\n')
    hypertune_mlp(train_features, train_labels, models["mlp"], mlp_grid, file)

Fitting 10 folds for each of 120384 candidates, totalling 1203840 fits


KeyboardInterrupt: 