## Making NIT Predictions using Model from NCAA Tourney

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from sklearn.linear_model import LogisticRegression, LinearRegression

os.chdir("../../data/men")

### Data



In [2]:
### Changes since Rd 1: 
##   - use 
## Provided
teams_df            = pd.read_csv("MTeams.csv")
team_spellings_1_df = pd.read_csv("MTeamSpellings.csv")
team_spellings_2_df = pd.read_csv("MTeamSpellings_2.csv")
team_spellings_df   = pd.concat([team_spellings_1_df, team_spellings_2_df])
tourney_results_df  = pd.read_csv("MNCAATourneyCompactResults.csv")

## Kenpom
#kenpom_df = pd.read_csv("kenpom/pre-tourney-eff/kp-pre-tourney-2002-21-combined.csv")
#kenpom_df = pd.read_csv("kenpom/pre-tourney-eff/kp-pre-tourney-2002-21-mod-21.csv")
kenpom_df  = pd.read_csv("kenpom/pre-tourney-eff/kp-pre-tourney-2002-21-mod-21.csv")
updated_2021_df  = pd.read_csv("kenpom/pre-tourney-eff/kenpom_summary_2021-03-19.csv")
kenpom_df = kenpom_df[kenpom_df['Season'] != 2021]
kenpom_df = pd.concat([kenpom_df, updated_2021_df], sort=True).reset_index()
kenpom_df['TeamName'] = kenpom_df['TeamName'].apply(lambda x: x.lower())
kenpom_df = kenpom_df.merge(team_spellings_df, left_on = 'TeamName', right_on = 'TeamNameSpelling')


## NIT Teams to make a "sample submission" with all possible matchups
nit_seeds_df = pd.read_csv("NIT-Seeds.csv")

In [3]:
### Create a training dataset
tourney_results_df['Team1'] = tourney_results_df.apply(lambda x: x['WTeamID'] if x['WTeamID'] < x['LTeamID']
                                                 else x['LTeamID'], axis = 1)
tourney_results_df['Team2'] = tourney_results_df.apply(lambda x: x['WTeamID'] if x['WTeamID'] > x['LTeamID']
                                                 else x['LTeamID'], axis = 1)
## Targets
tourney_results_df['win']   = tourney_results_df.apply(lambda x: 1 if x['Team1'] == x['WTeamID']
                                                 else 0, axis = 1)
tourney_results_df['spread'] = tourney_results_df.apply(lambda x: x['WScore'] - x['LScore'] if x['Team1'] == x['WTeamID']
                                                 else x['LScore'] - x['WScore'], axis = 1)

### Merge the team names 
tourney_results_df = tourney_results_df.merge(teams_df, left_on='Team1', right_on='TeamID', suffixes = ('', '_1'))
tourney_results_df = tourney_results_df.merge(teams_df, left_on='Team2', right_on='TeamID', suffixes = ('', '_2'))

### Merge KP Data by Team Name and Season and subtract Team1 - Team2 for 1st variable
## For Team 1
kenpom_df.columns = [c + '_1' if c[len(c)-2:] not in ['_1', '_2'] else c[:-2] + '_1' for c in kenpom_df.columns]
tourney_results_df = tourney_results_df.merge(kenpom_df, left_on = ['Team1', 'Season'],
                                              right_on = ['TeamID_1', 'Season_1'])
## For Team 2
kenpom_df.columns = [c[:-2] + '_2' for c in kenpom_df.columns]
tourney_results_df = tourney_results_df.merge(kenpom_df, left_on = ['Team2', 'Season'],
                                              right_on = ['TeamID_2', 'Season_2'])

## Take differences between team1 and team2 for various stats
tourney_results_df['diff_AdjEM'] = tourney_results_df['AdjEM_1'] - tourney_results_df['AdjEM_2']
tourney_results_df['diff_AdjOE'] = tourney_results_df['AdjOE_1'] - tourney_results_df['AdjOE_2']
tourney_results_df['diff_AdjDE'] = tourney_results_df['AdjDE_1'] - tourney_results_df['AdjDE_2']
tourney_results_df['diff_AdjTempo'] = tourney_results_df['AdjEM_1'] - tourney_results_df['AdjEM_2']


In [4]:
train_df = tourney_results_df[tourney_results_df['Season'] < 2021].copy() ## basically all data

### Binary outcome

Starting with logistic regression

In [5]:
X = train_df['diff_AdjEM'].values.reshape(-1, 1)
clf = LogisticRegression(random_state=0, solver='lbfgs',
                          multi_class='ovr').fit(X, train_df['win'])

probs = clf.predict_proba(X) ### first in the array is probability of 0 (loss), second is prob of win (1)

train_df['fitted_probabilities'] = probs[:,1]

### Submission

Make a "submission" file with all combinations of NIT teams

In [6]:
game_ids = []
for team1 in nit_seeds_df.TeamID.unique():
    for team2 in nit_seeds_df.TeamID.unique():
        if team1 < team2:
            game_id = f"2021_{team1}_{team2}"
            game_ids.append(game_id)
        elif team1 > team2:
            game_id = f"2021_{team2}_{team1}"
            game_ids.append(game_id)

nit_sub_df = pd.DataFrame({'ID': game_ids})
## Split out the ID
nit_sub_df['Season'] = nit_sub_df['ID'].apply(lambda x: x.split('_')[0]).astype(int)
nit_sub_df['Team1']  = nit_sub_df['ID'].apply(lambda x: x.split('_')[1]).astype(int)
nit_sub_df['Team2']  = nit_sub_df['ID'].apply(lambda x: x.split('_')[2]).astype(int)

In [7]:
kenpom_df

Unnamed: 0,index_2,AdjDE_2,AdjEM_2,AdjOE_2,AdjTempo_2,DE_2,OE_2,RankAdjDE_2,RankAdjEM_2,RankAdjOE_2,RankAdjTempo_2,RankDE_2,RankOE_2,RankTempo_2,Season_2,TeamName_2,Tempo_2,seed_2,TeamNameSpelling_2,TeamID_2
0,0,87.3522,34.02200,121.3740,73.9016,90.2544,116.3730,2,1,1,10,4,1,8,2002,duke,77.0734,1.0,duke,1181
1,336,93.4116,22.80180,116.2130,71.0877,95.6622,111.2750,23,10,9,33,43,15,29,2003,duke,73.2713,3.0,duke,1181
2,724,88.2990,31.72790,120.0270,67.6352,91.8003,113.2080,3,1,3,112,12,8,91,2004,duke,69.9636,1.0,duke,1181
3,982,88.7397,28.69570,117.4350,68.6081,92.6727,112.3150,3,3,10,74,19,17,52,2005,duke,70.7990,1.0,duke,1181
4,1310,92.6239,28.56560,121.1890,69.6792,96.1668,115.4450,21,1,1,24,55,1,41,2006,duke,71.3370,1.0,duke,1181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6506,238,110.7680,-10.08320,100.6850,69.8383,109.6010,103.9370,324,273,203,109,318,114,156,2021,purdue fort wayne,69.5840,,purdue fort wayne,1236
6507,20,108.6540,-1.17919,107.4740,64.6667,104.3400,111.0080,300,183,89,332,245,23,322,2021,bellarmine,65.4292,,bellarmine,1468
6508,167,101.7120,-10.79090,90.9216,66.8334,97.1913,96.6076,163,281,334,259,68,265,229,2021,merrimack,67.9539,,merrimack,1467
6509,187,97.3609,13.06310,110.4240,68.0152,102.4390,107.2040,84,64,51,196,204,59,217,2021,n.c. state,68.1175,,n.c. state,1301


In [8]:
nit_sub_df

Unnamed: 0,ID,Season,Team1,Team2
0,2021_1138_1161,2021,1138,1161
1,2021_1161_1172,2021,1161,1172
2,2021_1161_1301,2021,1161,1301
3,2021_1161_1272,2021,1161,1272
4,2021_1161_1173,2021,1161,1173
...,...,...,...,...
235,2021_1350_1443,2021,1350,1443
236,2021_1405_1443,2021,1405,1443
237,2021_1279_1443,2021,1279,1443
238,2021_1256_1443,2021,1256,1443


In [9]:
## Merge in the predictor variables
## For Team 1
kenpom_df.columns = [c + '_1' if c[len(c)-2:] not in ['_1', '_2'] else c[:-2] + '_1' for c in kenpom_df.columns]
nit_sub_df = nit_sub_df.merge(kenpom_df, left_on = ['Team1', 'Season'],
                              right_on = ['TeamID_1', 'Season_1'])
## For Team 2
kenpom_df.columns = [c[:-2] + '_2' for c in kenpom_df.columns]
nit_sub_df = nit_sub_df.merge(kenpom_df, left_on = ['Team2', 'Season'],
                                              right_on = ['TeamID_2', 'Season_2'])
## Difference
nit_sub_df['diff_AdjEM'] = nit_sub_df['AdjEM_1'] - nit_sub_df['AdjEM_2']

Make predictions using each model and save output.

In [10]:
## Submission matrix
X = nit_sub_df['diff_AdjEM'].values.reshape(-1, 1)

## Win probability predictions
nit_sub_df['Pred'] = clf.predict_proba(X)[:,1]

nit_sub_df.head()

Unnamed: 0,ID,Season,Team1,Team2,index_1,AdjDE_1,AdjEM_1,AdjOE_1,AdjTempo_1,DE_1,...,RankOE_2,RankTempo_2,Season_2,TeamName_2,Tempo_2,seed_2,TeamNameSpelling_2,TeamID_2,diff_AdjEM,Pred
0,2021_1138_1161,2021,1138,1161,32,95.1415,12.0547,107.196,73.5752,97.1363,...,67,173,2021,colorado st.,69.2228,,colorado st.,1161,-1.0106,0.453546
1,2021_1138_1161,2021,1138,1161,32,95.1415,12.0547,107.196,73.5752,97.1363,...,67,173,2021,colorado st.,69.2228,,colorado st.,1161,-1.0106,0.453546
2,2021_1129_1161,2021,1129,1161,24,96.4702,13.6842,110.154,69.3927,96.5853,...,67,173,2021,colorado st.,69.2228,,colorado st.,1161,0.6189,0.502902
3,2021_1129_1161,2021,1129,1161,24,96.4702,13.6842,110.154,69.3927,96.5853,...,67,173,2021,colorado st.,69.2228,,colorado st.,1161,0.6189,0.502902
4,2021_1138_1172,2021,1138,1172,32,95.1415,12.0547,107.196,73.5752,97.1363,...,28,345,2021,davidson,63.3666,,davidson,1172,-0.7928,0.460111


In [12]:
(nit_sub_df[['ID', 'Pred', 'TeamName_1', 'TeamName_2']]
   .to_csv(f"subs/nit_probs_{datetime.today().strftime('%Y-%m-%d')}.csv", index = False))