In [1]:
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib


from keras.models import load_model


pd.options.display.max_columns = None

Using TensorFlow backend.


In [2]:
data = pd.read_hdf("processed.h5", key='df')

In [3]:
Y = data['winner']
X = data.drop(columns=['winner', 'TeamID_x','TeamID_y'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42)

In [4]:
# Random Forest Classifier

rand_forest_clf = RandomForestClassifier(n_estimators=100)
rand_forest_clf = rand_forest_clf.fit(X_train, y_train)
forest_predictions = rand_forest_clf.predict(X_test)
forest_predictions[forest_predictions > .5] = 1
forest_predictions[forest_predictions <= .5] = 0
forest_accuracy = accuracy_score(y_test, forest_predictions)
print(forest_accuracy)

0.7258883248730964


In [5]:
# XGBoost Classifier

xgbCla = XGBClassifier(n_jobs=4)
xgbCla.fit(X_train, y_train, verbose=True)

xgb_predictions = xgbCla.predict(X_test)
xgb_predictions[xgb_predictions > .5] = 1
xgb_predictions[xgb_predictions <= .5] = 0
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(xgb_accuracy)

0.751269035532995


In [6]:
# Linear SVC Classifier

linear_svc_clf = LinearSVC(random_state=0, tol=1e-5, max_iter=2000)
linear_svc_clf.fit(X_train, y_train)

lin_svc_predictions = linear_svc_clf.predict(X_test)
lin_svc_predictions[lin_svc_predictions > .5] = 1
lin_svc_predictions[lin_svc_predictions <= .5] = 0
lin_svc_accuracy = accuracy_score(y_test, lin_svc_predictions)
print(lin_svc_accuracy)

0.751269035532995




In [7]:
# SVC Classifier

svc_clf = SVC(random_state=0, tol=1e-5)
svc_clf.fit(X_train, y_train)

svc_predictions = svc_clf.predict(X_test)
svc_predictions[svc_predictions > .5] = 1
svc_predictions[svc_predictions <= .5] = 0
svc_accuracy = accuracy_score(y_test, svc_predictions)
print(svc_accuracy)

0.6700507614213198




In [8]:
# Logistic Regressor (Classifier)

log_reg_clf = LogisticRegression(random_state=0, solver='liblinear', max_iter=2000)
log_reg_clf.fit(X_train, y_train)

log_predictions = log_reg_clf.predict(X_test)
log_predictions[log_predictions > .5] = 1
log_predictions[log_predictions <= .5] = 0
log_accuracy = accuracy_score(y_test, log_predictions)
print(log_accuracy)

0.7563451776649747


In [31]:
ann = load_model('ncaa_neural.h5')


y_pred = ann.predict(X_test)
y_pred[y_pred > .5] = 1
y_pred[y_pred <= .5] = 0
ann_accuracy = accuracy_score(y_test, y_pred)
print(ann_accuracy)

0.7411167512690355


## Predicting 2019 ##

In [10]:
this_year_data = pd.read_csv('../Data/DataFiles/tourneyData.csv')
teams = pd.read_csv('../Data/DataFiles/Teams.csv')
teamsConfs = pd.read_csv('../Data/DataFiles/TeamConferences.csv')

In [11]:
teams = teams.drop(columns=['FirstD1Season','LastD1Season'])

In [12]:
this_year_data = pd.merge(this_year_data, teams, left_on='Team', right_on='TeamName', how='left')

In [13]:
this_year_data = this_year_data.drop(columns=['OldFGM','Old No. Games','OldFGA','OldFGM3','OldFTM','OldFTA','OldFGA3'])

In [14]:
this_year_data.head()

Unnamed: 0,Team,Seed,OR,DR,Ast,TO,Stl,Blk,PF,FTM,FTA,FGM3,FGA3,FGM,FGA,PA,TeamID,TeamName
0,F Dickinson,16,9.46,23.54,13.9,13.7,7.5,3.8,16.5,14.542857,20.057143,7.685714,19.2,26.314286,55.4,71.9,1192.0,F Dickinson
1,Gonzaga,1,9.95,29.05,18.3,10.5,7.5,5.5,16.3,15.540541,20.405405,7.756757,21.351351,31.810811,60.513514,64.9,1211.0,Gonzaga
2,Baylor,9,13.24,24.24,13.9,13.1,6.1,4.7,18.8,12.647059,18.676471,8.058824,23.617647,25.558824,57.823529,67.7,1124.0,Baylor
3,Syracuse,8,10.79,23.79,12.0,12.5,8.2,4.8,17.3,14.117647,20.617647,8.058824,24.235294,23.764706,56.088235,66.1,1393.0,Syracuse
4,Vermont,13,9.24,25.88,11.7,11.1,5.5,4.0,16.7,14.911765,19.941176,8.029412,22.382353,25.411765,55.5,63.0,1436.0,Vermont


In [15]:
this_year_data['Score'] = 0
for row in range(len(this_year_data)):
    this_year_data.at[row, 'Score'] = this_year_data.at[row, 'FTM'] + (this_year_data.at[row, 'FGM'] - this_year_data.at[row, 'FGM3'])*2 + this_year_data.at[row,'FGM3']*3

In [16]:
teamsConfs.head()

Unnamed: 0,Season,TeamID,ConfAbbrev
0,1985,1114,a_sun
1,1985,1147,a_sun
2,1985,1204,a_sun
3,1985,1209,a_sun
4,1985,1215,a_sun


In [17]:
teamsConfs = teamsConfs[teamsConfs['Season'] == 2018]
teamsConfs = teamsConfs.drop(columns=['Season'], axis=1)
this_year_data = pd.merge(this_year_data, teamsConfs, on='TeamID')

In [18]:
this_year_data.head()

Unnamed: 0,Team,Seed,OR,DR,Ast,TO,Stl,Blk,PF,FTM,FTA,FGM3,FGA3,FGM,FGA,PA,TeamID,TeamName,Score,ConfAbbrev
0,F Dickinson,16,9.46,23.54,13.9,13.7,7.5,3.8,16.5,14.542857,20.057143,7.685714,19.2,26.314286,55.4,71.9,1192.0,F Dickinson,74,nec
1,Gonzaga,1,9.95,29.05,18.3,10.5,7.5,5.5,16.3,15.540541,20.405405,7.756757,21.351351,31.810811,60.513514,64.9,1211.0,Gonzaga,86,wcc
2,Baylor,9,13.24,24.24,13.9,13.1,6.1,4.7,18.8,12.647059,18.676471,8.058824,23.617647,25.558824,57.823529,67.7,1124.0,Baylor,71,big_twelve
3,Syracuse,8,10.79,23.79,12.0,12.5,8.2,4.8,17.3,14.117647,20.617647,8.058824,24.235294,23.764706,56.088235,66.1,1393.0,Syracuse,69,acc
4,Vermont,13,9.24,25.88,11.7,11.1,5.5,4.0,16.7,14.911765,19.941176,8.029412,22.382353,25.411765,55.5,63.0,1436.0,Vermont,73,aec


In [19]:
this_year_data = this_year_data.drop(columns=['TeamName', 'TeamID'])
this_year_data['Seed'] = this_year_data['Seed'].astype(str)
for row in range(0,len(this_year_data)):
    if this_year_data.at[row,'Seed'] == '1':
        this_year_data.at[row,'Seed'] = '01'
    elif this_year_data.at[row,'Seed'] == '2':
        this_year_data.at[row,'Seed'] = '02'
    elif this_year_data.at[row,'Seed'] == '3':
        this_year_data.at[row,'Seed'] = '03'
    elif this_year_data.at[row,'Seed'] == '4':
        this_year_data.at[row,'Seed'] = '04'
    elif this_year_data.at[row,'Seed'] == '5':
        this_year_data.at[row,'Seed'] = '05'
    elif this_year_data.at[row,'Seed'] == '6':
        this_year_data.at[row,'Seed'] = '06'
    elif this_year_data.at[row,'Seed'] == '7':
        this_year_data.at[row,'Seed'] = '07'
    elif this_year_data.at[row,'Seed'] == '8':
        this_year_data.at[row,'Seed'] = '08'
    elif this_year_data.at[row,'Seed'] == '9':
        this_year_data.at[row,'Seed'] = '09'
this_year_data.head()

Unnamed: 0,Team,Seed,OR,DR,Ast,TO,Stl,Blk,PF,FTM,FTA,FGM3,FGA3,FGM,FGA,PA,Score,ConfAbbrev
0,F Dickinson,16,9.46,23.54,13.9,13.7,7.5,3.8,16.5,14.542857,20.057143,7.685714,19.2,26.314286,55.4,71.9,74,nec
1,Gonzaga,1,9.95,29.05,18.3,10.5,7.5,5.5,16.3,15.540541,20.405405,7.756757,21.351351,31.810811,60.513514,64.9,86,wcc
2,Baylor,9,13.24,24.24,13.9,13.1,6.1,4.7,18.8,12.647059,18.676471,8.058824,23.617647,25.558824,57.823529,67.7,71,big_twelve
3,Syracuse,8,10.79,23.79,12.0,12.5,8.2,4.8,17.3,14.117647,20.617647,8.058824,24.235294,23.764706,56.088235,66.1,69,acc
4,Vermont,13,9.24,25.88,11.7,11.1,5.5,4.0,16.7,14.911765,19.941176,8.029412,22.382353,25.411765,55.5,63.0,73,aec


In [20]:
conf_enc = joblib.load('conf_enc_file.joblib')
seed_enc = joblib.load('seed_enc_file.joblib')
scl = joblib.load('scl_file.joblib')

this_year_data['ConfAbbrev'] = conf_enc.transform(this_year_data['ConfAbbrev'])
this_year_data['Seed'] = seed_enc.transform(this_year_data['Seed'])

In [32]:
def predict_game(team1, team2):
    x = this_year_data[this_year_data['Team'] == team1]
    y = this_year_data[this_year_data['Team'] == team2]
    
    x.columns = [str(col) + '_x' for col in x.columns]
    y.columns = [str(col) + '_y' for col in y.columns]
    
    game_info = x.iloc[0,:]
    game_info = game_info.append(y.iloc[0,:])
    
    game_info = game_info.drop(labels=['Team_x','Team_y'])
    
    #separate data
    
    cat = game_info[['ConfAbbrev_x','ConfAbbrev_y', 'Seed_x', 'Seed_y']]
    cols = game_info.index
    cols_quan = cols.drop(['ConfAbbrev_x','ConfAbbrev_y', 'Seed_x', 'Seed_y'])
    
    #encode/scale
    
#     cat['ConfAbbrev_x'] = conf_enc.transform(cat[0])
#     cat['ConfAbbrev_y'] = conf_enc.transform(cat['ConfAbbrev_y'])
#     cat['Seed_x'] = seed_enc.transform(cat['Seed_x'])
#     cat['Seed_y'] = seed_enc.transform(cat['Seed_y'])
    
    quan = game_info[cols_quan]
    quan = quan.values.reshape(1, -1)
    quan = scl.transform(quan)
    
    
    game_info_final = cat.append(pd.DataFrame(quan).transpose())
    
#     game_info_final = game_info_final.drop(labels=[0,15])
        
#     return game_info_final

    
    return ann.predict(game_info_final.values.reshape(1,-1))


def double_check(x, y):
    temp_one = predict_game(x,y)
    temp_two = predict_game(y,x)
    
    if temp_one > temp_two:
        return 1
    else:
        return 2

In [112]:
print(double_check("Michigan", "Virginia"))

2


