# Unparameterized K-Fold Cross Validation

### Load Packages

In [97]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

### Read in Data

Data is from 2009-2010 season. 998 total observations consisting of 39 variables. Among the variables, 1 is a binary win/loss variable stating whether the home team won or lost the given game. The remaining 38 variables are broken down into team averages and team performance metrics. There are 34 variables referring to team averages (17 for home, 17 for away). Team averages are calculated from the teams last 10 games. 

So for the 11th game of the season, team average variables will be the averaged box score stats from games 1-10. For the 12th, team average variables will be calculated from 2-11. And so on. In doing so, this leaves the first 10 games unpredictable, as one can not take average of last 10 games.

The team metric variables consist of ELO, which is a continous metric that defines overall team strength. A team that is conistently good will show high levels of ELO. The idea is that a team gets ELO points for good performances. A playoff win will equate to more ELO points than a win in game 4 of the season. Blowouts, upsets, and road victories receive more points as well. Along with ELO, teams winning percentage in the last 10 games is also considered and recorded as a variable. This well help give a feel of overall recent performance.

The remaining variables consist of 2 dummy variables specifying whether each team is playing in a back to back. For instance, if Cleveland is playing Miami on December 25th, and Cleveland played last night on December 24th, they will receive a 1 for the back to back variable. 

In [75]:
os.chdir('/Users/johnoliver/Downloads/grad-nba-wins/data/mod_data')
# data from 2010
df10 = pd.read_csv("mod10.csv")
# get rid of first variable (unique identifier)
df10 = df10.iloc[:,1:]

### Format Data

Data is split into training (70%) and testing (30%) data. Home win is predicted by the 38 average and team metric variables

In [76]:
# define x and y variables
feature_cols = ["h_avg_points","a_avg_points","h_avg_fg","a_avg_fg",
                "h_avg_fga","a_avg_fga","h_avg_3p","a_avg_3p",
                "h_avg_3pa","a_avg_3pa","h_avg_ft","a_avg_ft",
                "h_avg_orb","a_avg_orb","h_avg_drb" ,"a_avg_drb",
                "h_avg_ast","a_avg_ast","h_avg_stl", "a_avg_stl", 
                "h_avg_blk","a_avg_blk","h_avg_tov","a_avg_tov",
                "h_avg_pf","a_avg_pf" ,"h_avg_tsp",  "a_avg_tsp",
                "h_avg_ortg","a_avg_ortg","h_avg_drtg","a_avg_drtg",
                "h_win_perc","a_win_perc","h_back","a_back",
                "home_elo", "away_elo"]

X = df10[feature_cols]
Y = df10.win_status

scaler = StandardScaler()
scale_X = scaler.fit_transform(X)

# split data into training and testing
X_train,X_test,y_train,y_test=train_test_split(X,Y,
                                               test_size=0.3,
                                               random_state=0)
# scaled data
scale_X_train,scale_X_test,y_train,y_test=train_test_split(scale_X,Y,
                                               test_size=0.3,
                                               random_state=0)


# Logistic Regression

In [None]:
%%time
# set up k fold cross validation 
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 3, random_state=1)
lr_model = LogisticRegression()
lr_model.fit(X_train,y_train)
lr_scores = cross_val_score(lr_model, X_train, y_train,
                            scoring = 'accuracy', cv = cv,
                            n_jobs = 1)

lr_ypred = lr_model.predict(X_test)

In [105]:
print("Cross Validation Score: %s" % lr_scores.mean())
print("Nonparameterized Run Time: 5.11 s s")

Cross Validation Score: 0.6893838383838384
Nonparameterized Run Time: 5.11 s s


# Random Forest Classifier

In [None]:
%%time
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)
rf_scores = cross_val_score(rf_model, X_train, y_train,
                            scoring = 'accuracy', cv = cv,
                            n_jobs = 1)

rf_ypred = rf_model.predict(X_test)

In [99]:
print("Cross Validation Score: %s" % rf_scores.mean())
print("Nonparameterized Run Time: 32.6 s s")

Cross Validation Score: 0.6623326432022085
Nonparameterized Run Time: 32.6 s s


# Support Vector Machine

In [None]:
%%time
svc_model = SVC()
svc_model.fit(scale_X_train,y_train)
svc_scores = cross_val_score(svc_model, scale_X_train, y_train,
                            scoring = 'accuracy', cv = cv,
                            n_jobs = 1)

svc_ypred = svc_model.predict(scale_X_test)

In [100]:
print("Cross Validation Score: %s" % svc_scores.mean())
print("Nonparameterized Run Time: 3.58 s")

Cross Validation Score: 0.6590200138026225
Nonparameterized Run Time: 3.58 s


# Naive Bayes

In [None]:
%%time
nb_model = GaussianNB()
nb_model.fit(scale_X_train,y_train)
nb_scores = cross_val_score(nb_model, scale_X_train, y_train,
                            scoring = 'accuracy', cv = cv,
                            n_jobs = 1)

nb_ypred = nb_model.predict(scale_X_test)

In [101]:
print("Cross Validation Score: %s" % nb_scores.mean())
print("Nonparameterized Run Time: 374 ms")

Cross Validation Score: 0.6580676328502417
Nonparameterized Run Time: 374 ms


# Artificial Neural Network

In [None]:
%%time
nn_model = MLPClassifier()
nn_model.fit(scale_X_train, y_train)
nn_scores = cross_val_score(nn_model, scale_X_train, y_train,
                            scoring = 'accuracy', cv = cv,
                            n_jobs = 1)

nn_ypred = nn_model.predict(scale_X_test)

In [102]:
print("Cross Validation Score: %s" % nn_scores.mean())
print("Nonparameterized Run Time: 1min 19s")

Cross Validation Score: 0.6322636300897171
Nonparameterized Run Time: 1min 19s
