# Modelling
Our classes (winner, loser) are perfectly balanced, meaning accuracy is a good way to see how well we've done. For completion, however, we include precision, recall, and F1. 

In [96]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import pprint

# Model imports
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Suppressing warnings
import warnings
warnings.filterwarnings("ignore")


In [97]:
X = pd.read_csv("/Users/Neil/Desktop/X.csv")
y = pd.read_csv("/Users/Neil/Desktop/y.csv")

In [98]:
del X['f_Seed_region'], X['s_Seed_region'], X['WTeamID'], X['LTeamID'], X['Season']

In [99]:
X.shape

(981, 44)

In [100]:
#Creating train and test data
X_train = X[:713]
y_train = y[:713]
X_test = X[713:]
y_test = y[713:]

In [101]:
model_svm = LinearSVC()
model_gnb = GaussianNB()
model_lrc = LogisticRegressionCV()
model_knn = KNeighborsClassifier()
model_rfc = RandomForestClassifier()
model_xgb = XGBClassifier()
model_gbc = GradientBoostingClassifier()

model_svm.fit(X_train,y_train)
model_gnb.fit(X_train,y_train)
model_lrc.fit(X_train,y_train)
model_knn.fit(X_train,y_train)
model_rfc.fit(X_train,y_train)
model_xgb.fit(X_train,y_train)
model_gbc.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [102]:
svm_pred = model_svm.predict(X_test)
gnb_pred = model_gnb.predict(X_test)
lrc_pred = model_lrc.predict(X_test)
knn_pred = model_knn.predict(X_test)
rfc_pred = model_rfc.predict(X_test)
xgb_pred = model_xgb.predict(X_test)
gbc_pred = model_gbc.predict(X_test)

In [103]:
svm_preds = pd.DataFrame(svm_pred)
gnb_preds = pd.DataFrame(gnb_pred)
lrc_preds = pd.DataFrame(lrc_pred)
knn_preds = pd.DataFrame(knn_pred)
rfc_preds = pd.DataFrame(rfc_pred)
xgb_preds = pd.DataFrame(xgb_pred)
gbc_preds = pd.DataFrame(gbc_pred)

In [104]:
print("SVM test acc: {}".format(accuracy_score(y_test, svm_preds)))

SVM test acc: 0.5111940298507462


In [105]:
print("Gaussian NB test acc: {}".format(accuracy_score(y_test, gnb_preds)))

Gaussian NB test acc: 0.6082089552238806


In [106]:
print("Logistic Regression test acc: {}".format(accuracy_score(y_test, lrc_preds)))

Logistic Regression test acc: 0.585820895522388


In [107]:
print("K-Nearest Neighbors test acc: {}".format(accuracy_score(y_test, knn_preds)))

K-Nearest Neighbors test acc: 0.5074626865671642


In [108]:
print("Random Forest test acc: {}".format(accuracy_score(y_test, rfc_preds)))

Random Forest test acc: 0.5410447761194029


In [109]:
print("XGBoost test acc: {}".format(accuracy_score(y_test, xgb_preds)))

XGBoost test acc: 0.5708955223880597


In [110]:
print("Gradient Boosting test acc: {}".format(accuracy_score(y_test, gbc_preds)))

Gradient Boosting test acc: 0.5634328358208955


In [111]:
confusion_matrix(y_test, svm_preds)

array([[137,   0],
       [131,   0]])

In [112]:
confusion_matrix(y_test, gnb_preds)

array([[81, 56],
       [49, 82]])

In [113]:
confusion_matrix(y_test, lrc_preds)

array([[76, 61],
       [50, 81]])

In [114]:
confusion_matrix(y_test, knn_preds)

array([[64, 73],
       [59, 72]])

In [115]:
confusion_matrix(y_test, rfc_preds)

array([[83, 54],
       [69, 62]])

In [116]:
confusion_matrix(y_test, xgb_preds)

array([[83, 54],
       [61, 70]])

In [117]:
confusion_matrix(y_test, gbc_preds)

array([[74, 63],
       [54, 77]])

In [118]:
print("SVM Test:")
pprint.pprint(precision_recall_fscore_support(y_test, svm_preds))

SVM Test:
(array([0.51119403, 0.        ]),
 array([1., 0.]),
 array([0.67654321, 0.        ]),
 array([137, 131]))


In [119]:
print("GNB Test:")
pprint.pprint(precision_recall_fscore_support(y_test, gnb_preds))

GNB Test:
(array([0.62307692, 0.5942029 ]),
 array([0.59124088, 0.6259542 ]),
 array([0.60674157, 0.60966543]),
 array([137, 131]))


In [120]:
print("LRC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, lrc_preds))

LRC Test:
(array([0.6031746 , 0.57042254]),
 array([0.55474453, 0.61832061]),
 array([0.57794677, 0.59340659]),
 array([137, 131]))


In [121]:
print("KNN Test:")
pprint.pprint(precision_recall_fscore_support(y_test, knn_preds))

KNN Test:
(array([0.5203252 , 0.49655172]),
 array([0.46715328, 0.54961832]),
 array([0.49230769, 0.52173913]),
 array([137, 131]))


In [122]:
print("RFC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, rfc_preds))

RFC Test:
(array([0.54605263, 0.53448276]),
 array([0.60583942, 0.47328244]),
 array([0.57439446, 0.50202429]),
 array([137, 131]))


In [123]:
print("XGB Test:")
pprint.pprint(precision_recall_fscore_support(y_test, xgb_preds))

XGB Test:
(array([0.57638889, 0.56451613]),
 array([0.60583942, 0.53435115]),
 array([0.59074733, 0.54901961]),
 array([137, 131]))


In [124]:
print("GBC Test:")
pprint.pprint(precision_recall_fscore_support(y_test, gbc_preds))

GBC Test:
(array([0.578125, 0.55    ]),
 array([0.54014599, 0.58778626]),
 array([0.55849057, 0.56826568]),
 array([137, 131]))
