In [5]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [6]:
raw_dataset = pd.read_csv('resources/datasets/data.csv', sep='\t')

raw_dataset.head()

Unnamed: 0,GAME_ID,MATCHUP,OUTCOME,TEAM_ID_home,TEAM_ABBREVIATION_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,OREB_home,...,DREB_away,AST_away,STL_away,BLK_away,OFF_RATING_away,DEF_RATING_away,TS_PCT_away,TOV_away,ELO_away,SENTIMENT_away
0,22300848,LAC vs. LAL,0,1610612746,LAC,0.535503,0.354497,0.625187,0.22449,0.36,...,0.371429,0.592593,0.3125,0.333333,0.502571,0.532857,0.698895,0.6,0.510972,0.0
1,22300995,MIN vs. DEN,0,1610612750,MIN,0.449704,0.483245,0.544228,0.244898,0.32,...,0.571429,0.518519,0.25,0.066667,0.520566,0.532857,0.676796,0.7,0.750784,0.0
2,22300928,ORL vs. IND,0,1610612753,ORL,0.147929,0.319224,0.638681,0.428571,0.52,...,0.657143,0.296296,0.25,0.666667,0.437018,0.3,0.582873,0.45,0.529781,0.432975
3,22301112,NYK vs. SAC,1,1610612752,NYK,0.66568,0.634921,0.535232,0.285714,0.36,...,0.228571,0.518519,0.3125,0.2,0.470437,0.724286,0.574586,0.65,0.589342,0.569431
4,22300786,DEN vs. SAC,0,1610612743,DEN,0.201183,0.488536,0.796102,0.408163,0.48,...,0.571429,0.333333,0.375,0.333333,0.281491,0.285714,0.372928,0.55,0.547022,0.436185


In [7]:
dataset = raw_dataset.drop(columns=['GAME_ID', 'MATCHUP', 'TEAM_ID_home', 'TEAM_ABBREVIATION_home','TEAM_ID_away', 'TEAM_ABBREVIATION_away'])

In [8]:
train_dataset = dataset.sample(frac=0.9, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

In [9]:
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('OUTCOME')
test_labels = test_features.pop('OUTCOME')

# Models

In [10]:
models = {
    "mlp": MLPClassifier(max_iter=5000),
    "svm": svm.SVC(),
    "lgr": LogisticRegression(),
    "rf": RandomForestClassifier(),
    "gnb": GaussianNB(),
}

# Hyper tuning 

In [11]:
def hypertune(x, y, model, grid):
    cv = KFold(n_splits=10)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0, verbose=2)
    grid_result = grid_search.fit(x, y)
    means = grid_result.cv_results_['mean_test_score']
    params = grid_result.cv_results_['params']
    return zip(means, params)

## MLP parameters

In [12]:
def hiddenLayer_generator():
    result = []
    for i in range(1,5):
        for no_layers in range(1,11):
            layers = []
            for _ in range(no_layers):
                layers.append((len(train_dataset.columns) + 1)/2 *  2 * i )
            result.append(layers)
    return result

In [13]:
activation = ['logistic', 'tanh', 'relu']
solver = ['sgd', 'adam']

alpha = [10**i for i in range(-5,6)]
learning_rate_init = [1e-1, 1e-2, 1e-3, 1e-4]
learning_rate = ['adaptive', 'invscaling']

hidden_layer_sizes = hiddenLayer_generator()
mlp_grid = dict(
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation,
    solver=solver,
    alpha=alpha,
    learning_rate=learning_rate,
    learning_rate_init = learning_rate_init
)

In [16]:
with open('resources/models/hyper-tuning/deep-neural-network.tsv', 'x') as file:
    file.write(f'hidden_layer_sizes\tactivation\tsolver\talpha\tlearning_rate\tlearning_rate_init\taccuracy\n')
    combinations = hypertune(train_features, train_labels, models["mlp"], mlp_grid)
    for mean, param in combinations:
        hidden_layer_sizes = param['hidden_layer_sizes']
        activation = param["activation"]
        solver = param["solver"]
        alpha = param["alpha"]
        learning_rate = param["learning_rate"]
        learning_rate_init = param["learning_rate_init"]

        file.write(f"{hidden_layer_sizes}\t{activation}\t{solver}\t{alpha}\t{learning_rate}\t{learning_rate_init}\t{round(mean,5)}\n")

Fitting 10 folds for each of 126720 candidates, totalling 1267200 fits



KeyboardInterrupt



## SVM parameters

In [12]:
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [1000, 100, 50, 10, 1.0, 0.1, 0.01]
degree = range(1,20)
svm_grid = dict(kernel=kernel,C=C,degree=degree)

In [24]:
with open('resources/models/hyper-tuning/svm.tsv', 'x') as file:
    file.write(f'C\tkernel\tdegree\taccuracy\n')
    combinations = hypertune(train_features, train_labels, models["svm"], svm_grid)
    for mean, param in combinations:
        c,degree, kernel = param["C"], param["degree"], param["kernel"]
        if kernel != "poly":
            degree = "-"
        file.write(f"{c}\t{kernel}\t{degree}\t{round(mean,5)}\n")

## Logistic regression parameters

In [26]:
penalty1 = ['l1']
solvers1 = ['saga', 'liblinear']
c_values1 = [100, 10, 1.0, 0.1, 0.01]
lgr_grid1 = dict(solver=solvers1, penalty=penalty1, C=c_values1)

penalty2 = ['l2']
solvers2 = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
c_values2 = [100, 10, 1.0, 0.1, 0.01]
lgr_grid2 = dict(solver=solvers2, penalty=penalty2, C=c_values2)

In [32]:
with open('resources/models/hyper-tuning/logistic-regression.tsv', 'x') as file:
    file.write(f'C\tregularization\tsolver\taccuracy\n')
    combinations1 = hypertune(train_features, train_labels, models["lgr"], lgr_grid1)
    for mean, param in combinations1:
        c, penalty ,solver = param["C"], param["penalty"], param["solver"]
        file.write(f"{c}\t{penalty}\t{solver}\t{round(mean,5)}\n")
        
    combinations2 = hypertune(train_features, train_labels, models["lgr"], lgr_grid2)
    for mean, param in combinations2:
        c, penalty ,solver = param["C"], param["penalty"], param["solver"]
        file.write(f"{c}\t{penalty}\t{solver}\t{round(mean,5)}\n")

## Random forest parameters

In [3]:
n_estimators = range(1,101)
max_features = range(1, 31)

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] + [None]
min_samples_split = range(1,11)
min_samples_leaf = range(1,5)
bootstrap = [True, False]


rf_grid = dict(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, bootstrap=bootstrap)

NameError: name 'np' is not defined

In [None]:
with open('resources/models/hyper-tuning/random-forest.tsv', 'x') as file:
    file.write(f'n_estimators\tmax_features\tmax_depth\tmin_samples_split\tmin_samples_leaf\tbagging\taccuracy\n')
    combinations = hypertune(train_features, train_labels, models["rf"], rf_grid)
    for mean, param in combinations:
        n_estimators = param["n_estimators"]
        max_features = param["max_features"]
        max_depth = param["max_depth"]
        min_samples_split = param["min_samples_split"]
        min_samples_leaf = param["min_samples_leaf"]
        bootstrap = param["bootstrap"]
        
        file.write(f"{n_estimators}\t{max_features}\t{max_depth}\t{min_samples_split}\t{min_samples_leaf}\t{bootstrap}\t{round(mean,5)}\n")

## Naive Bayes parameters

In [14]:
var_smoothing = [10**i for i in range(-11, -7)]
gnb_grid = dict(var_smoothing=var_smoothing)

In [15]:
with open('resources/models/hyper-tuning/gaussian-naive-bayes.tsv', 'x') as file:
    file.write(f'var_smoothing\taccuracy\n')
    combinations = hypertune(train_features, train_labels, models["gnb"], gnb_grid)
    for mean, param in combinations:
        var_smoothing = param["var_smoothing"]
        file.write(f"{var_smoothing}\t{round(mean,5)}\n")

Fitting 10 folds for each of 4 candidates, totalling 40 fits
