In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import autograd.numpy as np
import scipy.linalg as la
import scipy.optimize as opt
from autograd import jacobian as J

In [2]:
cities = pd.read_csv('Desktop/NCAA_Data/Cities.csv')
conf = pd.read_csv('Desktop/NCAA_Data/Conferences.csv')
game_cit = pd.read_csv('Desktop/NCAA_Data/GameCities.csv')
ncaa_tourn_comp = pd.read_csv('Desktop/NCAA_Data/NCAATourneyCompactResults.csv')
ncaa_tourn_det = pd.read_csv('Desktop/NCAA_Data/NCAATourneyDetailedResults.csv')
ncaa_tourn_seed_round = pd.read_csv('Desktop/NCAA_Data/NCAATourneySeedRoundSlots.csv')
ncaa_tourn_seed = pd.read_csv('Desktop/NCAA_Data/NCAATourneySeeds.csv')
ncaa_tourn_slot = pd.read_csv('Desktop/NCAA_Data/NCAATourneySlots.csv')
reg_seas_comp = pd.read_csv('Desktop/NCAA_Data/RegularSeasonCompactResults.csv')
reg_seas_det = pd.read_csv('Desktop/NCAA_Data/RegularSeasonDetailedResults.csv')
seas = pd.read_csv('Desktop/NCAA_Data/Seasons.csv')
secon_comp = pd.read_csv('Desktop/NCAA_Data/SecondaryTourneyCompactResults.csv')
secon_teams = pd.read_csv('Desktop/NCAA_Data/SecondaryTourneyTeams.csv')
team_coaches = pd.read_csv('Desktop/NCAA_Data/TeamCoaches.csv')
team_conf = pd.read_csv('Desktop/NCAA_Data/TeamConferences.csv')
teams = pd.read_csv('Desktop/NCAA_Data/Teams.csv')

In [3]:
other_lab =  ['fgp', '3pp','ftp']
w_labels = ['WFGM', 'WFGA', 'WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',
          'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
l_labels = ['LFGM', 'LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',
          'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']

In [4]:
year = 2017
team_dic = {}
this_year_team = list(ncaa_tourn_seed[ncaa_tourn_seed['Season'] == year]['TeamID'])
for count, val in enumerate(ncaa_tourn_seed[ncaa_tourn_seed['Season'] == year]['Seed']):
    team_dic[val] = this_year_team[count]
team_dic_names = {}
team_names = list(teams['TeamName'])
for count, val in enumerate(teams['TeamID']):
    team_dic_names[val] =  team_names[count]

## Gather Data and Train

In [5]:
x_data = []
y_data = []
for i in range(2003,year):
    for win, lose in np.array(ncaa_tourn_det[ncaa_tourn_det['Season'] == i][['WTeamID','LTeamID']]):
        get_data = reg_seas_det[reg_seas_det['Season'] == i]
        win_data_wins = get_data[get_data['WTeamID'] == win][w_labels].mean()
        win_data_loses = get_data[get_data['LTeamID'] == win][l_labels].mean()
        win_win_perc = len(get_data[get_data['WTeamID'] == win])/(len(get_data[get_data['WTeamID'] == win]) + len(get_data[get_data['LTeamID'] == win]))
        lose_data_wins = get_data[get_data['WTeamID'] == lose][w_labels].mean()
        lose_data_loses = get_data[get_data['LTeamID'] == lose][l_labels].mean()
        lose_win_perc = len(get_data[get_data['WTeamID'] == lose])/(len(get_data[get_data['WTeamID'] == lose]) + len(get_data[get_data['LTeamID'] == lose]))
        win_data_wins = np.array(win_data_wins)
        win_data_loses = np.array(win_data_loses)
        lose_data_wins = np.array(lose_data_wins)
        lose_data_loses = np.array(lose_data_loses)
        win_data_wins[np.isnan(win_data_wins)] = 0
        win_data_loses[np.isnan(win_data_loses)] = 0
        lose_data_wins[np.isnan(lose_data_wins)] = 0
        lose_data_loses[np.isnan(lose_data_loses)] = 0
        wins_data = list((win_data_wins+ win_data_loses)/2)
        lose_data = list((lose_data_wins+lose_data_loses)/2)
        wins_data.append(win_win_perc)
        lose_data.append(lose_win_perc)
        x_data.append(wins_data+lose_data)
        x_data.append(lose_data+wins_data)
        y_data.append(1)
        y_data.append(0)
x_data = np.array(x_data)
x_data = np.hstack((np.ones((len(x_data),1)),x_data))
y_data = np.array(y_data)

In [6]:
def nonlinear_conjugate_gradient(f, df, x0, tol=1e-5, maxiter=100):
    """Compute the minimizer of f using the nonlinear conjugate gradient
    algorithm.

    Parameters:
        f (function): The objective function. Accepts a NumPy array of shape
            (n,) and returns a float.
        Df (function): The first derivative of f. Accepts and returns a NumPy
            array of shape (n,).
        x0 ((n,) ndarray): The initial guess.
        tol (float): The stopping tolerance.
        maxiter (int): The maximum number of iterations to compute.

    Returns:
        ((n,) ndarray): The approximate minimum of f.
        (bool): Whether or not the algorithm converged.
        (int): The number of iterations computed.
    """
    #Set all of the initial values for a, r, and x
    r0 = -1*df(x0).T
    d0 = r0
    xk= x0
    af = lambda alpha: f(xk + alpha*d0)
    a0 = opt.minimize_scalar(af).x
    xk = x0 + a0*d0
    x0 = xk
    rk = r0
    converged=False
    for i in range(maxiter):
        #Calcluate all of the new values for the a, r, and x
        rk = -1*df(xk).T
        Bk = (rk.T @ rk) / (r0.T @ r0)
        dk = rk + Bk*d0
        d0 = dk
        ak = opt.minimize_scalar(af).x
        r0 = rk
        xk = xk + ak*dk
        #Check to see if the value converged
        if (la.norm(xk-x0) < tol):
            converged=True
            break
        x0 = xk
    #Return the value, whether or not it converged, and the number of iterations
    return(xk, converged, i+1)

In [7]:
guess = np.ones(len(x_data[0]))*.5
f = lambda beta: np.sum(np.log(1+np.exp(-1*(x_data @ beta)))+(1-y_data).flatten() * (x_data@beta).flatten())
df = J(f)
betas, conv, my_iter = nonlinear_conjugate_gradient(f,df,guess,maxiter=3000)
#My function is faster, but you can use the function below to do the same thing
#betas=opt.fmin_cg(f, guess)

  tmp2 = (x - v) * (fx - fw)
  p = (x - v) * tmp2 - (x - w) * tmp1


In [8]:
conv

True

## Prediction on the Data

In [9]:
comp_teams = np.array(ncaa_tourn_slot[ncaa_tourn_slot['Season'] == year][['StrongSeed','WeakSeed']])
slot = list(ncaa_tourn_slot[ncaa_tourn_slot['Season'] == year]['Slot'])

In [10]:
x_test = []
f_test = lambda x: 1/(1+ np.exp(-1*(betas[0]+betas[1:]*x)))
for count, (weak, strong) in enumerate(comp_teams):
        team1 = team_dic[strong]
        team2 = team_dic[weak]
        get_data = reg_seas_det[reg_seas_det['Season'] == year]
        strong_data_wins = get_data[get_data['WTeamID'] == team1][w_labels].mean()
        strong_data_loses = get_data[get_data['LTeamID'] == team1][l_labels].mean()
        strong_win_perc = len(get_data[get_data['WTeamID'] == team1])/(len(get_data[get_data['WTeamID'] == team1]) + len(get_data[get_data['LTeamID'] == team1]))
        weak_data_wins = get_data[get_data['WTeamID'] == team2][w_labels].mean()
        weak_data_loses = get_data[get_data['LTeamID'] == team2][l_labels].mean()
        weak_win_perc = len(get_data[get_data['WTeamID'] == team2])/(len(get_data[get_data['WTeamID'] == team2]) + len(get_data[get_data['LTeamID'] == team2]))
        strong_data_wins = np.array(strong_data_wins)
        strong_data_loses = np.array(strong_data_loses)
        weak_data_wins = np.array(weak_data_wins)
        weak_data_loses = np.array(weak_data_loses)
        strong_data_wins[np.isnan(strong_data_wins)] = 0
        strong_data_loses[np.isnan(strong_data_loses)] = 0
        weak_data_wins[np.isnan(weak_data_wins)] = 0
        weak_data_loses[np.isnan(weak_data_loses)] = 0
        strong_data = list((strong_data_wins+ strong_data_loses)/2)
        weak_data = list((weak_data_wins+weak_data_loses)/2)
        strong_data.append(win_win_perc)
        weak_data.append(lose_win_perc)
        strong_testers = np.array(strong_data+weak_data)
        weak_testers = np.array(weak_data+strong_data)
        x_test.append(strong_testers)
        x_test.append(weak_testers)
        two_vals = (f_test(strong_testers), f_test(weak_testers))
        if np.argmax(two_vals)== 0:
            team_dic[slot[count]] = team1
            print(team_dic_names[team1] + ' wins and ' + team_dic_names[team2]+  ' loses!')
        else:
            team_dic[slot[count]] = team2
            print(team_dic_names[team2] + ' wins and ' + team_dic_names[team1] + ' loses!')
print(team_dic_names[team_dic['R6CH']]+' won March Madness!!')


Providence wins and USC loses!
Mt St Mary's wins and New Orleans loses!
NC Central wins and UC Davis loses!
Kansas St wins and Wake Forest loses!
Villanova wins and Mt St Mary's loses!
Duke wins and Troy loses!
Baylor wins and New Mexico St loses!
Florida wins and ETSU loses!
Virginia wins and UNC Wilmington loses!
SMU wins and Providence loses!
South Carolina wins and Marquette loses!
Wisconsin wins and Virginia Tech loses!
Gonzaga wins and S Dakota St loses!
Arizona wins and North Dakota loses!
Florida St wins and FL Gulf Coast loses!
West Virginia wins and Bucknell loses!
Notre Dame wins and Princeton loses!
Maryland wins and Xavier loses!
St Mary's CA wins and VA Commonwealth loses!
Northwestern wins and Vanderbilt loses!
Kansas wins and NC Central loses!
Louisville wins and Jacksonville St loses!
Oregon wins and Iona loses!
Purdue wins and Vermont loses!
Iowa St wins and Nevada loses!
Creighton wins and Rhode Island loses!
Michigan wins and Oklahoma St loses!
Miami FL wins and Mic

## Same Problem, Random Forests

In [36]:
from sklearn.ensemble import RandomForestRegressor

In [37]:
rf = RandomForestRegressor(n_estimators=10000,random_state = 42)
rf.fit(x_data[:,1:], y_data)
for count, (weak, strong) in enumerate(comp_teams):
        team1 = team_dic[strong]
        team2 = team_dic[weak]
        get_data = reg_seas_det[reg_seas_det['Season'] == year]
        strong_data_wins = get_data[get_data['WTeamID'] == team1][w_labels].mean()
        strong_data_loses = get_data[get_data['LTeamID'] == team1][l_labels].mean()
        strong_win_perc = len(get_data[get_data['WTeamID'] == team1])/(len(get_data[get_data['WTeamID'] == team1]) + len(get_data[get_data['LTeamID'] == team1]))
        weak_data_wins = get_data[get_data['WTeamID'] == team2][w_labels].mean()
        weak_data_loses = get_data[get_data['LTeamID'] == team2][l_labels].mean()
        weak_win_perc = len(get_data[get_data['WTeamID'] == team2])/(len(get_data[get_data['WTeamID'] == team2]) + len(get_data[get_data['LTeamID'] == team2]))
        strong_data_wins = np.array(strong_data_wins)
        strong_data_loses = np.array(strong_data_loses)
        weak_data_wins = np.array(weak_data_wins)
        weak_data_loses = np.array(weak_data_loses)
        strong_data_wins[np.isnan(strong_data_wins)] = 0
        strong_data_loses[np.isnan(strong_data_loses)] = 0
        weak_data_wins[np.isnan(weak_data_wins)] = 0
        weak_data_loses[np.isnan(weak_data_loses)] = 0
        strong_data = list((strong_data_wins+ strong_data_loses)/2)
        weak_data = list((weak_data_wins+weak_data_loses)/2)
        strong_data.append(win_win_perc)
        weak_data.append(lose_win_perc)
        strong_testers = np.array(strong_data+weak_data)
        weak_testers = np.array(weak_data+strong_data)
        x_test.append(strong_testers)
        x_test.append(weak_testers)
        two_vals = (rf.predict(strong_testers.reshape(1,-1)), rf.predict(weak_testers.reshape(1,-1)))
        if np.argmax(two_vals)== 0:
            team_dic[slot[count]] = team1
            print(team_dic_names[team1] + ' wins and ' + team_dic_names[team2]+  ' loses!')
        else:
            team_dic[slot[count]] = team2
            print(team_dic_names[team2] + ' wins and ' + team_dic_names[team1] + ' loses!')
print(team_dic_names[team_dic['R6CH']]+' won March Madness!!')
#Significantly worse predictions

Belmont wins and Temple loses!
N Dakota St wins and NC Central loses!
St John's wins and Arizona St loses!
F Dickinson wins and Prairie View loses!
Duke wins and N Dakota St loses!
Michigan St wins and Bradley loses!
LSU wins and Yale loses!
St Louis wins and Virginia Tech loses!
Mississippi St wins and Liberty loses!
Belmont wins and Maryland loses!
Louisville wins and Minnesota loses!
VA Commonwealth wins and UCF loses!
Gonzaga wins and F Dickinson loses!
Montana wins and Michigan loses!
N Kentucky wins and Texas Tech loses!
Florida St wins and Vermont loses!
Murray St wins and Marquette loses!
Buffalo wins and St John's loses!
Florida wins and Nevada loses!
Baylor wins and Syracuse loses!
North Carolina wins and Iona loses!
Abilene Chr wins and Kentucky loses!
Georgia St wins and Houston loses!
Kansas wins and Northeastern loses!
New Mexico St wins and Auburn loses!
Iowa St wins and Ohio St loses!
Wofford wins and Seton Hall loses!
Washington wins and Utah St loses!
Virginia wins an

## Support Vector

In [84]:
from sklearn.svm import SVR
clf = SVR(gamma = 'scale', C =50, epsilon = .1)
clf.fit(x_data[:,1:],y_data)
for count, (weak, strong) in enumerate(comp_teams):
        team1 = team_dic[strong]
        team2 = team_dic[weak]
        get_data = reg_seas_det[reg_seas_det['Season'] == year]
        strong_data_wins = get_data[get_data['WTeamID'] == team1][w_labels].mean()
        strong_data_loses = get_data[get_data['LTeamID'] == team1][l_labels].mean()
        strong_win_perc = len(get_data[get_data['WTeamID'] == team1])/(len(get_data[get_data['WTeamID'] == team1]) + len(get_data[get_data['LTeamID'] == team1]))
        weak_data_wins = get_data[get_data['WTeamID'] == team2][w_labels].mean()
        weak_data_loses = get_data[get_data['LTeamID'] == team2][l_labels].mean()
        weak_win_perc = len(get_data[get_data['WTeamID'] == team2])/(len(get_data[get_data['WTeamID'] == team2]) + len(get_data[get_data['LTeamID'] == team2]))
        strong_data_wins = np.array(strong_data_wins)
        strong_data_loses = np.array(strong_data_loses)
        weak_data_wins = np.array(weak_data_wins)
        weak_data_loses = np.array(weak_data_loses)
        strong_data_wins[np.isnan(strong_data_wins)] = 0
        strong_data_loses[np.isnan(strong_data_loses)] = 0
        weak_data_wins[np.isnan(weak_data_wins)] = 0
        weak_data_loses[np.isnan(weak_data_loses)] = 0
        strong_data = list((strong_data_wins+ strong_data_loses)/2)
        weak_data = list((weak_data_wins+weak_data_loses)/2)
        strong_data.append(win_win_perc)
        weak_data.append(lose_win_perc)
        strong_testers = np.array(strong_data+weak_data)
        weak_testers = np.array(weak_data+strong_data)
        x_test.append(strong_testers)
        x_test.append(weak_testers)
        two_vals = (clf.predict(strong_testers.reshape(1,-1)), clf.predict(weak_testers.reshape(1,-1)))
        if np.argmax(two_vals)== 0:
            team_dic[slot[count]] = team1
            print(team_dic_names[team1] + ' wins and ' + team_dic_names[team2]+  ' loses!')
        else:
            team_dic[slot[count]] = team2
            print(team_dic_names[team2] + ' wins and ' + team_dic_names[team1] + ' loses!')
print(team_dic_names[team_dic['R6CH']]+' won March Madness!!')
#Significantly worse predictions

St Bonaventure wins and UCLA loses!
Radford wins and Long Island loses!
Syracuse wins and Arizona St loses!
NC Central wins and TX Southern loses!
Villanova wins and Radford loses!
Purdue wins and CS Fullerton loses!
Texas Tech wins and SF Austin loses!
Wichita St wins and Marshall loses!
West Virginia wins and Murray St loses!
Florida wins and St Bonaventure loses!
Butler wins and Arkansas loses!
Alabama wins and Virginia Tech loses!
Kansas wins and Penn loses!
Duke wins and Iona loses!
Michigan St wins and Bucknell loses!
Auburn wins and Col Charleston loses!
New Mexico St wins and Clemson loses!
TCU wins and Syracuse loses!
Rhode Island wins and Oklahoma loses!
Seton Hall wins and NC State loses!
Virginia wins and UMBC loses!
Cincinnati wins and Georgia St loses!
Tennessee wins and Wright St loses!
Arizona wins and Buffalo loses!
Kentucky wins and Davidson loses!
Miami FL wins and Loyola-Chicago loses!
Nevada wins and Texas loses!
Creighton wins and Kansas St loses!
Xavier wins and 