# Experiments

## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#pd.options.display.max_columns = None
#pd.set_option("display.max_colwidth", None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#pd.set_option("display.max_rows", None)
import model.train as train
import model.config as model_config
import utils
import model.dataset.game_matchup as gm

## Load Dataset

In [2]:
# Load the game-matchup dataset through the project's `model.dataset.game_matchup` helper.
# NOTE(review): presumably a pandas DataFrame with one row per game (later cells group and
# filter it by SEASON) — confirm against the helper's implementation.
gm_df = gm.load_game_matchup_dataset()

## Experiment using TimeSeriesSplit

### Specific imports

In [9]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score

### Dataset

Usar `n_splits` y `max_train_size` en TimeSeriesSplit permite dividir el dataset por cantidad de filas, por lo que vamos a escoger 
solo las temporadas que tienen la misma cantidad de partidos.


In [10]:
# Games recorded per season: non-null GAME_DATE_EST entries within each SEASON group.
gm_df.groupby("SEASON")["GAME_DATE_EST"].count()

SEASON
2004    1230
2005    1230
2006    1230
2007    1230
2008    1230
2009    1230
2010    1230
2011     990
2012    1229
2013    1230
2014    1230
2015    1230
2016    1230
2017    1230
2018    1230
Name: GAME_DATE_EST, dtype: int64

Podemos observar que no todas las temporadas tienen la misma cantidad de partidos; esto se debe a las siguientes razones:

- 2011: Los jugadores hicieron una huelga por no estar de acuerdo con sus salarios ni con el límite salarial de las franquicias.
- 2012: Un partido entre los equipos de Boston e Indiana fue suspendido y después no fue reprogramado; al final de la temporada se decidió ya no reprogramarlo debido a que la clasificación a playoffs ya estaba decidida y el resultado no la afectaba.

Por tanto, se seleccionarán solo las temporadas a partir de 2013 (inclusive).

In [11]:
# Restrict the working set to seasons 2013 and later.
df = gm_df.loc[gm_df["SEASON"] >= 2013]

Prueba de cómo será dividido el dataset usando TimeSeriesSplit

In [15]:
# Dry run of the splitter: one fold fewer than the number of distinct seasons, with every
# training window capped at 1230 rows. Prints the row indices assigned to each fold.
season_count = len(df["SEASON"].unique())
tscv = TimeSeriesSplit(n_splits=season_count - 1, max_train_size=1230)
X, y = train.X_y_values(df, model_config.X_columns, model_config.y_columns[-1:])
print(len(X))
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

7380
TRAIN: [   0    1    2 ... 1227 1228 1229] TEST: [1230 1231 1232 ... 2457 2458 2459]
TRAIN: [1230 1231 1232 ... 2457 2458 2459] TEST: [2460 2461 2462 ... 3687 3688 3689]
TRAIN: [2460 2461 2462 ... 3687 3688 3689] TEST: [3690 3691 3692 ... 4917 4918 4919]
TRAIN: [3690 3691 3692 ... 4917 4918 4919] TEST: [4920 4921 4922 ... 6147 6148 6149]
TRAIN: [4920 4921 4922 ... 6147 6148 6149] TEST: [6150 6151 6152 ... 7377 7378 7379]


### Algorithms 

In [16]:
# Candidate classifiers as (label, estimator) pairs, in the order they will be evaluated.
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
    ('SVM', SVC(kernel='linear', random_state=0)),
    ('KSVM', SVC(kernel='rbf', random_state=0)),
    ('NB', GaussianNB()),
    ('DT', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    (
        "RF",
        RandomForestClassifier(
            n_estimators=500,
            max_features="sqrt",
            max_depth=15,
            n_jobs=-1,
            random_state=0,
        ),
    ),
    (
        "GB",
        GradientBoostingClassifier(
            n_estimators=500,
            max_depth=15,
            max_features="sqrt",
            random_state=0,
        ),
    ),
]

### Run experiments

In [18]:
# Evaluate each candidate model with time-series cross-validation and collect, per model,
# the dict returned by `cross_validate` (fit/score times plus one test score per fold).
results = []
names = []
train_splits = len(df.SEASON.unique()) - 1
print("Start experiments using TimeSeriesSplit")

# The splitter and the feature/target arrays do not depend on the model being evaluated,
# so they are built once here instead of on every loop iteration.
tscv = TimeSeriesSplit(n_splits=train_splits, max_train_size=1230)
X, y = train.X_y_values(df, model_config.X_columns, model_config.y_columns[-1:])

for name, model in models:
    cv_results = cross_validate(
        model,
        X,
        y.ravel(),  # flatten the target to the 1-D shape the estimators expect
        cv=tscv,
        scoring=['balanced_accuracy', 'precision', "recall"],
    )

    # Tag every fold's row with the model label and a training-season label.
    # NOTE(review): assumes the rows of `df` are ordered chronologically by season, so that
    # fold i is trained on the i-th distinct season — TODO confirm.
    cv_results["model"] = [name] * train_splits
    cv_results["season_train"] = df.SEASON.unique()[:-1]
    results.append(cv_results)

    names.append(name)
print("Done")

Start experiment using TimeSeriesSplit
Done


In [19]:
# Combine the per-model cross-validation results into one table and persist it.
# All frames are built first and concatenated in a single call; concatenating inside a loop
# re-copies the accumulated table on every iteration.
results_df = pd.concat(
    [pd.DataFrame(result) for result in results],
    ignore_index=True,
)
# NOTE: `to_pickle` does not create missing directories; ./experiments must already exist.
results_df.to_pickle("./experiments/tscv_exp.pkl")