# Modeling
Our classes (win versus loss) are almost perfectly balanced, meaning accuracy is a good way to see how well we've done. For completion, however, we include precision, recall, and F1. 

Method: Teach the machine to predict who will win given a game. Then, run the model on 2018 round 1, take those predicted wins, run the model on those games, etc. until we have an entire bracket. 

To Add:
    1. Correlation Heat Map
    2. Feature Importance Graphs given each model
    3. Actual predictions for 2018

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import pprint

# Model imports
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Suppressing warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
X = pd.read_csv("/Users/Neil/Desktop/X.csv")
y = pd.read_csv("/Users/Neil/Desktop/y.csv")

In [3]:
del X['f_Seed_region'], X['s_Seed_region'], X['WTeamID'], X['LTeamID'], X['Season']

seed_diff = pd.DataFrame(X['f_Seed_no'] - X['s_Seed_no'])
elo_diff = pd.DataFrame(X['WTeamElo'] - X['LTeamElo'])
X['seed_diff'] = seed_diff
X['elo_diff'] = elo_diff

In [4]:
X.shape

(981, 46)

In [5]:
#Creating train and test data
X_train = X[:713]
y_train = y[:713]
X_test = X[713:]
y_test = y[713:]

In [6]:
model_svm = LinearSVC()
model_gnb = GaussianNB()
model_lrc = LogisticRegressionCV()
model_knn = KNeighborsClassifier()
model_rfc = RandomForestClassifier()
model_xgb = XGBClassifier()
model_gbc = GradientBoostingClassifier()

model_gsc = GridSearchCV(model_gbc, {"learning_rate": [.1, .01, .001], "n_estimators": [100, 1000, 5000], 
                                'max_depth': [1, 2, 3]}, scoring='accuracy')

model_svm.fit(X_train,y_train)
model_gnb.fit(X_train,y_train)
model_lrc.fit(X_train,y_train)
model_knn.fit(X_train,y_train)
model_rfc.fit(X_train,y_train)
model_xgb.fit(X_train,y_train)
model_gsc.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 1000, 5000], 'max_depth': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [7]:
svm_pred = model_svm.predict(X_test)
gnb_pred = model_gnb.predict(X_test)
lrc_pred = model_lrc.predict(X_test)
knn_pred = model_knn.predict(X_test)
rfc_pred = model_rfc.predict(X_test)
xgb_pred = model_xgb.predict(X_test)
gsc_pred = model_gsc.predict(X_test)

In [8]:
svm_preds = pd.DataFrame(svm_pred)
gnb_preds = pd.DataFrame(gnb_pred)
lrc_preds = pd.DataFrame(lrc_pred)
knn_preds = pd.DataFrame(knn_pred)
rfc_preds = pd.DataFrame(rfc_pred)
xgb_preds = pd.DataFrame(xgb_pred)
gsc_preds = pd.DataFrame(gsc_pred)

In [9]:
print("SVM test acc: {}".format(accuracy_score(y_test, svm_preds)))

SVM test acc: 0.6007462686567164


In [10]:
print("Gaussian NB test acc: {}".format(accuracy_score(y_test, gnb_preds)))

Gaussian NB test acc: 0.6902985074626866


In [11]:
print("Logistic Regression test acc: {}".format(accuracy_score(y_test, lrc_preds)))

Logistic Regression test acc: 0.7164179104477612


In [12]:
print("K-Nearest Neighbors test acc: {}".format(accuracy_score(y_test, knn_preds)))

K-Nearest Neighbors test acc: 0.7947761194029851


In [13]:
print("Random Forest test acc: {}".format(accuracy_score(y_test, rfc_preds)))

Random Forest test acc: 0.7574626865671642


In [14]:
print("XGBoost test acc: {}".format(accuracy_score(y_test, xgb_preds)))

XGBoost test acc: 0.8097014925373134


In [15]:
print("Gradient Boosting test acc: {}".format(accuracy_score(y_test, gsc_preds)))

Gradient Boosting test acc: 0.7723880597014925


In [16]:
confusion_matrix(y_test, svm_preds)

array([[ 32, 105],
       [  2, 129]])

In [17]:
confusion_matrix(y_test, gnb_preds)

array([[92, 45],
       [38, 93]])

In [18]:
confusion_matrix(y_test, lrc_preds)

array([[97, 40],
       [36, 95]])

In [19]:
confusion_matrix(y_test, knn_preds)

array([[110,  27],
       [ 28, 103]])

In [20]:
confusion_matrix(y_test, rfc_preds)

array([[109,  28],
       [ 37,  94]])

In [21]:
confusion_matrix(y_test, xgb_preds)

array([[114,  23],
       [ 28, 103]])

In [22]:
confusion_matrix(y_test, gsc_preds)

array([[112,  25],
       [ 36,  95]])

In [23]:
print("SVM Test:")
pprint.pprint(precision_recall_fscore_support(y_test, svm_preds))

SVM Test:
(array([0.94117647, 0.55128205]),
 array([0.23357664, 0.98473282]),
 array([0.37426901, 0.70684932]),
 array([137, 131]))


In [24]:
print("GNB Test:")
pprint.pprint(precision_recall_fscore_support(y_test, gnb_preds))

GNB Test:
(array([0.70769231, 0.67391304]),
 array([0.67153285, 0.70992366]),
 array([0.68913858, 0.69144981]),
 array([137, 131]))


In [25]:
print("LRC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, lrc_preds))

LRC Test:
(array([0.72932331, 0.7037037 ]),
 array([0.7080292 , 0.72519084]),
 array([0.71851852, 0.71428571]),
 array([137, 131]))


In [26]:
print("KNN Test:")
pprint.pprint(precision_recall_fscore_support(y_test, knn_preds))

KNN Test:
(array([0.79710145, 0.79230769]),
 array([0.80291971, 0.78625954]),
 array([0.8       , 0.78927203]),
 array([137, 131]))


In [27]:
print("RFC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, rfc_preds))

RFC Test:
(array([0.74657534, 0.7704918 ]),
 array([0.79562044, 0.71755725]),
 array([0.77031802, 0.743083  ]),
 array([137, 131]))


In [28]:
print("XGB Test:")
pprint.pprint(precision_recall_fscore_support(y_test, xgb_preds))

XGB Test:
(array([0.8028169 , 0.81746032]),
 array([0.83211679, 0.78625954]),
 array([0.8172043 , 0.80155642]),
 array([137, 131]))


In [29]:
print("GBC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, gsc_preds))

GBC Test:
(array([0.75675676, 0.79166667]),
 array([0.81751825, 0.72519084]),
 array([0.78596491, 0.75697211]),
 array([137, 131]))


In [30]:
feature_importances = pd.DataFrame(model_xgb.feature_importances_, index = X_train.columns, 
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
elo_diff,0.192308
seed_diff,0.075321
WTeamElo,0.060897
f_rank,0.057692
s_rank,0.040064
f_diff,0.038462
s_Blk,0.035256
s_DR,0.027244
f_PF,0.025641
s_EScore,0.024038


In [31]:
y['winning'].value_counts()


0    502
1    479
Name: winning, dtype: int64