# NBA Point Spread Regressor Experiments

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#pd.options.display.max_columns = None
#pd.set_option("display.max_colwidth", None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#pd.set_option("display.max_rows", None)
import model.train as train
import model.config as model_config
import utils
import model.dataset.game_matchup as gm
import experiments as exp
import qgrid
from yellowbrick import classifier, features, regressor
import yellowbrick.model_selection as ms
from pandas_profiling import ProfileReport

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn theme applied to the plots created after this cell.
sns.set(style="whitegrid", palette="husl")

### Utils functions

In [None]:
def show_qgrid(df_):
    """Build an interactive qgrid widget for ``df_`` and return it.

    The widget is returned (rather than left as a bare expression inside the
    function, which is silently discarded) so that calling this function as
    the last statement of a notebook cell actually displays the grid.

    Args:
        df_: DataFrame to show in the grid.

    Returns:
        The widget created by ``qgrid.show_grid``.
    """
    # Columns keep a fixed default width instead of being squeezed to fit.
    grid_options = {'forceFitColumns': False, 'defaultColumnWidth': 200}
    return qgrid.show_grid(df_, show_toolbar=True, grid_options=grid_options)

In [None]:
def plot_results(experiment_name, results, figsize=(20,10), metric="balanced_accuracy"):
    """Plot one metric per test season as a point plot, one line per model.

    Args:
        experiment_name: Label used as the prefix of the plot title.
        results: Raw experiment results; converted to a DataFrame with
            ``exp.map_results_to_df``.
        figsize: Size of the matplotlib figure, as ``(width, height)``.
        metric: Column of the results DataFrame plotted on the y axis.
            Defaults to ``"balanced_accuracy"``, the metric this function
            always plotted before the parameter existed.
    """
    plt.figure(figsize=figsize)
    results_df = exp.map_results_to_df(results)
    # NOTE: ``kind`` is a ``catplot`` argument, not a ``pointplot`` one, so it
    # is intentionally not passed here.
    ax = sns.pointplot(data=results_df, x="season_test", y=metric, hue="model")
    ax.set_title(f"{experiment_name}-{metric}")
    # Anchor the legend outside the axes so it does not cover the lines.
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
def get_results_df(algorithm_type='clf'):
    """Return the accumulated experiment results as a DataFrame.

    Args:
        algorithm_type: ``'reg'`` selects ``exp.reg_exp_results``; any other
            value selects ``exp.exp_results``.

    Returns:
        A new ``pandas.DataFrame`` built from the selected results store.
    """
    records = exp.reg_exp_results if algorithm_type == 'reg' else exp.exp_results
    return pd.DataFrame(records)

In [None]:
metrics = ['precision', 'balanced_accuracy', 'recall', 'f1', 'roc_auc']

def print_prettier_exp_results(exp_name, metric='roc_auc'):
    """Summarise one experiment's results for a single metric.

    Args:
        exp_name: Value of the ``exp_name`` column to keep.
        metric: Metric prefix; the ``<metric>_mean`` and ``<metric>_std``
            columns are shown.

    Returns:
        DataFrame with the experiment name, model, mean and std columns,
        sorted by the mean in descending order.
    """
    mean_col = f'{metric}_mean'
    std_col = f'{metric}_std'
    summary = get_results_df()[['exp_name', 'model', mean_col, std_col]]
    selected = summary[summary.exp_name == exp_name]
    return selected.sort_values(by=[mean_col], ascending=False)


reg_metrics = ['mae', 'mse', 'rmse']

def print_prettier_reg_exp_results(exp_name, metric='mse'):
    """Return one regression experiment's results, best model first.

    Args:
        exp_name: Value of the ``exp_name`` column to keep.
        metric: Metric prefix; rows are ordered by the ``<metric>_mean``
            column.

    Returns:
        DataFrame with every results column for the requested experiment,
        sorted by the mean of ``metric`` in ascending order.
    """
    reg_df = get_results_df('reg')
    selected = reg_df[reg_df.exp_name == exp_name]
    # The regression metrics listed above are all errors (lower is better),
    # so ascending order puts the best-performing model at the top.
    return selected.sort_values(by=[f"{metric}_mean"], ascending=True)

## Load Dataset

In [None]:
# Load the game matchup dataset through the project's dataset module.
gm_df = gm.load_game_matchup_dataset()

## EDA

In [None]:
# Work on an independent copy: the EDA cells add helper columns (e.g. "WIN"),
# and a plain alias would also add them to gm_df, which is later used to
# build the modelling dataset.
eda_df = gm_df.copy()

In [None]:
# Column names, dtypes and non-null counts of the EDA frame.
eda_df.info()

In [None]:
# Summary statistics of the EDA frame.
eda_df.describe()

In [None]:
# Label each game by its winner: "HOME" where HOME_TEAM_WINS equals 1,
# "VISITOR" for every other value.
eda_df["WIN"] = np.where(eda_df['HOME_TEAM_WINS'] == 1, "HOME", "VISITOR")

In [None]:
# Number of games per "WIN" label.
eda_df["WIN"].value_counts()

### Pandas Profiling

In [None]:
# Per-measure switches passed to the report: Pearson, Spearman and Kendall
# are turned on; phi_k and Cramér's V are turned off.
correlation_settings = {
    "pearson": {"calculate": True},
    "spearman": {"calculate": True},
    "kendall": {"calculate": True},
    "phi_k": {"calculate": False},
    "cramers": {"calculate": False},
}

profile = ProfileReport(
    eda_df,
    title='Pandas Profiling Report',
    pool_size=4,
    minimal=True,
    explorative=True,
    correlations=correlation_settings,
)
profile

### By Team

In [None]:
# Restrict the EDA frame to the rows whose SEASON is 2018 and preview them.
last_season = eda_df.loc[eda_df["SEASON"] == 2018]
last_season.head()

In [None]:
# Per-team record for games played at home. For each home team the games are
# counted and HOME_TEAM_WINS is summed (presumably a 0/1 flag, so the sum is
# the number of home wins); losses are the difference between the two.
home_wins_df = (
    last_season[["HOME_TEAM_NAME", "HOME_TEAM_WINS"]]
    .groupby(by="HOME_TEAM_NAME")
    .agg({'HOME_TEAM_NAME': 'count', 'HOME_TEAM_WINS': 'sum'})
    .sort_values(by=["HOME_TEAM_WINS"], ascending=False)
    .assign(TEAM_LOSS=lambda t: t["HOME_TEAM_NAME"] - t["HOME_TEAM_WINS"])
    # The games-played count was only needed to derive the losses.
    .drop(columns=["HOME_TEAM_NAME"])
    .rename(columns={"HOME_TEAM_WINS": "TEAM_WINS"})
    .rename_axis("TEAM_NAME")
)
home_wins_df

In [None]:
# Per-team record for games played as the visitor. Summing HOME_TEAM_WINS per
# visiting team gives the games the visitor lost (presumably a 0/1 flag), so
# the visitor's wins are the games played minus that sum.
visitor_wins_df = (
    last_season[["VISITOR_TEAM_NAME", "HOME_TEAM_WINS"]]
    .groupby(by="VISITOR_TEAM_NAME")
    .agg({'VISITOR_TEAM_NAME': 'count', 'HOME_TEAM_WINS': 'sum'})
    .sort_values(by=["HOME_TEAM_WINS"], ascending=False)
    .assign(TEAM_WINS=lambda t: t["VISITOR_TEAM_NAME"] - t["HOME_TEAM_WINS"])
    .rename(columns={"HOME_TEAM_WINS": "TEAM_LOSS"})
    .rename_axis("TEAM_NAME")
    # Keep only the two result columns, in the same order as home_wins_df.
    .loc[:, ["TEAM_WINS", "TEAM_LOSS"]]
)
visitor_wins_df

In [None]:
# Add the home and visitor tallies column by column to get one combined
# record per team, ordered by total wins (highest first).
home_wins_df.combine(visitor_wins_df, lambda s1, s2: s1 + s2).sort_values(by="TEAM_WINS", ascending=False)

In [None]:
# Games of the selected season in which 'MIL' played, either as the visitor
# or as the home team. The explicit copy makes mil_df an independent frame,
# so adding a column below does not write into a slice of last_season
# (which triggers pandas' SettingWithCopyWarning).
mil_df = last_season[(last_season.VISITOR_TEAM_NAME == 'MIL') | (last_season.HOME_TEAM_NAME == 'MIL')].copy()
# Number the games 1..N in row order. N comes from the data instead of a
# hard-coded 82, so the assignment cannot fail on a length mismatch.
mil_df['GAME_N'] = range(1, len(mil_df) + 1)
mil_df

In [None]:
#sns.regplot(x=mil_df["GAME_N"], y=mil_df["GAME_N"])
#sns.catplot(data=mil_df[["HOME_TEAM_NAME", "GAME_N", "WIN"]],  x="GAME_N", y="HOME_TEAM_NAME", kind="bar", hue="WIN", palette="Set2")
#sns.displot(data=mil_df, x="GAME_N", hue="WIN", multiple="stack")
#plt.show()


### Scatter plot

## Dataset


In [None]:
# Non-null GAME_DATE_EST entries per season, i.e. the games recorded for each one.
gm_df.groupby(by="SEASON")["GAME_DATE_EST"].count()

Podemos observar que no todas las temporadas tienen la misma cantidad de partidos. Esto se debe a las siguientes razones:

- 2011: Los jugadores hicieron una huelga por no estar de acuerdo con sus salarios y con el límite salarial de las franquicias.
- 2012: Un partido entre los equipos de Boston e Indiana fue suspendido y, al final de la temporada, se decidió no reprogramarlo porque la clasificación a playoffs ya estaba decidida y su resultado no la afectaba.

Por tanto, se seleccionarán solo las temporadas a partir de 2013 (inclusive).

In [None]:
# Keep the seasons from 2013 onwards (inclusive) and record which seasons
# remain and how many there are.
df = gm_df[gm_df.SEASON >= 2013]
seasons = list(df.SEASON.unique())
seasons_size = len(seasons)

## Experiments

### Setup

In [None]:
# Labels used to name the experiments run from this notebook.
exp_prefix = "reg"
exp_group_name = "reg_exp"
# (experiment_name, results) tuples collected by the cells below.
reg_results_total = []
# Reset both result stores of the experiments module so that re-running this
# cell starts from a clean slate. The regression store is the one this
# notebook reads through get_results_df('reg'); presumably run_experiment
# appends to it -- confirm against the experiments module.
exp.exp_results = []
exp.reg_exp_results = []
TARGET = "HOME_TEAM_WINS"
exp_X_columns = model_config.X_COLUMNS
# Every configured target column except the last one.
exp_y_columns = model_config.Y_COLUMNS[:-1]

reg_models = exp.get_reg_models()

# Season-based splitter; features and targets are taken from the frame it
# exposes rather than from df directly.
sscv = utils.SeasonSeriesSplit(df)
df_sscv = sscv.get_df()
X = df_sscv[exp_X_columns]
y = df_sscv[exp_y_columns]

# Standardize the configured numeric columns and pass the remaining columns
# through unchanged.
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('numerical', num_pipeline, model_config.X_NUM_COLS)
], remainder='passthrough')

### Models evaluation

In [None]:
# Experiment label built from the notebook-wide prefix.
experiment_name = f"{exp_prefix}1_season"

# Request splits with train_size=1 and test_size=1 from the season splitter.
folds, train_seasons, test_seasons = sscv.split(train_size=1, test_size=1)
# Positional arguments for exp.run_experiment, gathered in one tuple.
params = (experiment_name, reg_models, folds, train_seasons, test_seasons, X, y, preprocessor
         ,exp.calculate_reg_metrics, 'reg'
         )
names, results = exp.run_experiment(*params)
# Keep the raw results of this run next to its experiment name.
reg_results_total.append((experiment_name, results))

# Show the sorted results table for this experiment.
print_prettier_reg_exp_results(experiment_name)

In [None]:
# Show the results table of the latest experiment again (default metric).
print_prettier_reg_exp_results(experiment_name)