# Modeling
Our classes (win versus loss) are almost perfectly balanced, meaning accuracy is a good way to see how well we've done. For completion, however, we include precision, recall, and F1. 

Method: Teach the machine to predict who will win given a game. Then, run the model on 2018 round 1, take those predicted wins, run the model on those games, etc. until we have an entire bracket. 

To Add:
    1. Correlation Heat Map
    2. Feature Importance Graphs given each model
    3. Actual predictions for 2018

In [245]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import pprint

# Model imports
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Suppressing warnings
import warnings
warnings.filterwarnings("ignore")


In [246]:
X = pd.read_csv("/Users/Neil/Desktop/X.csv")
y = pd.read_csv("/Users/Neil/Desktop/y.csv")

In [247]:
del X['f_Seed_region'], X['s_Seed_region'], X['WTeamID'], X['LTeamID'], X['Season']

In [248]:
X.shape

(981, 44)

In [249]:
#Creating train and test data
X_train = X[:713]
y_train = y[:713]
X_test = X[713:]
y_test = y[713:]

In [250]:
model_svm = LinearSVC()
model_gnb = GaussianNB()
model_lrc = LogisticRegressionCV()
model_knn = KNeighborsClassifier()
model_rfc = RandomForestClassifier()
model_xgb = XGBClassifier()
model_gbc = GradientBoostingClassifier()

model_gsc = GridSearchCV(model_gbc, {"learning_rate": [.1, .01, .001], "n_estimators": [100, 1000, 5000], 
                                'max_depth': [1, 2, 3]}, scoring='accuracy')

model_svm.fit(X_train,y_train)
model_gnb.fit(X_train,y_train)
model_lrc.fit(X_train,y_train)
model_knn.fit(X_train,y_train)
model_rfc.fit(X_train,y_train)
model_xgb.fit(X_train,y_train)
model_gsc.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 1000, 5000], 'max_depth': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [251]:
svm_pred = model_svm.predict(X_test)
gnb_pred = model_gnb.predict(X_test)
lrc_pred = model_lrc.predict(X_test)
knn_pred = model_knn.predict(X_test)
rfc_pred = model_rfc.predict(X_test)
xgb_pred = model_xgb.predict(X_test)
gsc_pred = model_gsc.predict(X_test)

In [252]:
svm_preds = pd.DataFrame(svm_pred)
gnb_preds = pd.DataFrame(gnb_pred)
lrc_preds = pd.DataFrame(lrc_pred)
knn_preds = pd.DataFrame(knn_pred)
rfc_preds = pd.DataFrame(rfc_pred)
xgb_preds = pd.DataFrame(xgb_pred)
gsc_preds = pd.DataFrame(gsc_pred)

In [253]:
print("SVM test acc: {}".format(accuracy_score(y_test, svm_preds)))

SVM test acc: 0.5111940298507462


In [254]:
print("Gaussian NB test acc: {}".format(accuracy_score(y_test, gnb_preds)))

Gaussian NB test acc: 0.6082089552238806


In [255]:
print("Logistic Regression test acc: {}".format(accuracy_score(y_test, lrc_preds)))

Logistic Regression test acc: 0.585820895522388


In [256]:
print("K-Nearest Neighbors test acc: {}".format(accuracy_score(y_test, knn_preds)))

K-Nearest Neighbors test acc: 0.5074626865671642


In [257]:
print("Random Forest test acc: {}".format(accuracy_score(y_test, rfc_preds)))

Random Forest test acc: 0.5671641791044776


In [258]:
print("XGBoost test acc: {}".format(accuracy_score(y_test, xgb_preds)))

XGBoost test acc: 0.5708955223880597


In [259]:
print("Gradient Boosting test acc: {}".format(accuracy_score(y_test, gsc_preds)))

Gradient Boosting test acc: 0.5746268656716418


In [260]:
confusion_matrix(y_test, svm_preds)

array([[137,   0],
       [131,   0]])

In [261]:
confusion_matrix(y_test, gnb_preds)

array([[81, 56],
       [49, 82]])

In [262]:
confusion_matrix(y_test, lrc_preds)

array([[76, 61],
       [50, 81]])

In [263]:
confusion_matrix(y_test, knn_preds)

array([[64, 73],
       [59, 72]])

In [264]:
confusion_matrix(y_test, rfc_preds)

array([[94, 43],
       [73, 58]])

In [265]:
confusion_matrix(y_test, xgb_preds)

array([[83, 54],
       [61, 70]])

In [266]:
confusion_matrix(y_test, gsc_preds)

array([[96, 41],
       [73, 58]])

In [267]:
print("SVM Test:")
pprint.pprint(precision_recall_fscore_support(y_test, svm_preds))

SVM Test:
(array([0.51119403, 0.        ]),
 array([1., 0.]),
 array([0.67654321, 0.        ]),
 array([137, 131]))


In [268]:
print("GNB Test:")
pprint.pprint(precision_recall_fscore_support(y_test, gnb_preds))

GNB Test:
(array([0.62307692, 0.5942029 ]),
 array([0.59124088, 0.6259542 ]),
 array([0.60674157, 0.60966543]),
 array([137, 131]))


In [269]:
print("LRC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, lrc_preds))

LRC Test:
(array([0.6031746 , 0.57042254]),
 array([0.55474453, 0.61832061]),
 array([0.57794677, 0.59340659]),
 array([137, 131]))


In [270]:
print("KNN Test:")
pprint.pprint(precision_recall_fscore_support(y_test, knn_preds))

KNN Test:
(array([0.5203252 , 0.49655172]),
 array([0.46715328, 0.54961832]),
 array([0.49230769, 0.52173913]),
 array([137, 131]))


In [271]:
print("RFC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, rfc_preds))

RFC Test:
(array([0.56287425, 0.57425743]),
 array([0.68613139, 0.44274809]),
 array([0.61842105, 0.5       ]),
 array([137, 131]))


In [272]:
print("XGB Test:")
pprint.pprint(precision_recall_fscore_support(y_test, xgb_preds))

XGB Test:
(array([0.57638889, 0.56451613]),
 array([0.60583942, 0.53435115]),
 array([0.59074733, 0.54901961]),
 array([137, 131]))


In [273]:
print("GBC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, gsc_preds))

GBC Test:
(array([0.56804734, 0.58585859]),
 array([0.70072993, 0.44274809]),
 array([0.62745098, 0.50434783]),
 array([137, 131]))


In [274]:
feature_importances = pd.DataFrame(model_xgb.feature_importances_, index = X_train.columns, 
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
s_PF,0.046296
s_FTM,0.040123
f_FGA,0.03858
s_EScore,0.03858
LTeamElo,0.037037
WTeamElo,0.037037
s_Ast,0.033951
f_FGM3,0.032407
s_Blk,0.032407
f_Stl,0.029321


In [276]:
y['0'].value_counts()


0    501
1    480
Name: 0, dtype: int64