## NCAAM March ML Mania 

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
# Apply the FiveThirtyEight matplotlib style to any figures drawn in this notebook.
plt.style.use('fivethirtyeight')

from sklearn.linear_model import LogisticRegression, LinearRegression

# Every relative path used below (input CSVs, kenpom/, subs/) is resolved against this directory.
# NOTE(review): the path is relative to the current working directory, so re-running this cell
# in the same kernel moves up two more levels and will likely fail -- restart the kernel first.
os.chdir("../../data/men")

### Data

- [Provided Data](https://www.kaggle.com/c/ncaam-march-mania-2021/data): 20 provided files including basic team info, box scores, and rankings
- [KenPom data](https://kenpom.com/): Pre-tournament efficiency from KenPom
   - Possession, player, and lineup level data is available, but not used.
   - Possession-level efficiency metrics: 
      - `AdjOE`: Expected points scored per 100 possessions, adjusted for opponent strength and location.
      - `AdjDE`: Expected points allowed per 100 possessions, adjusted for opponent strength and location.
   - Note: Adjusted Efficiency Margin (`AdjEM`) is a linear combination of `AdjDE` and `AdjOE` (`AdjOE`-`AdjDE`), so we should not include all 3 in our model.
   
   
#### Changes since Rd. 1:

- Use `MSampleSubmissionStage2.csv`
- Update KenPom data with season 2021 ratings

In [2]:
### Changes since Rd 1: use the Stage 2 sample submission and the KenPom file with 2021 ratings
## Provided competition files (paths are relative to the working directory set by os.chdir)
teams_df            = pd.read_csv("MTeams.csv")
team_spellings_1_df = pd.read_csv("MTeamSpellings.csv")
team_spellings_2_df = pd.read_csv("MTeamSpellings_2.csv")
## Stack both spelling tables and drop exact duplicate rows, so a spelling listed in both
## files cannot fan out the KenPom merge below into repeated team-season rows.
team_spellings_df   = (pd.concat([team_spellings_1_df, team_spellings_2_df], ignore_index = True)
                         .drop_duplicates())
tourney_results_df  = pd.read_csv("MNCAATourneyCompactResults.csv")
sample_submission   = pd.read_csv("MSampleSubmissionStage2.csv")

## Kenpom
## NOTE(review): "kp-pre-tourney-2002-21-combined.csv" was the file read here previously
## (presumably the Round 1 version) -- confirm before switching back.
kenpom_df = pd.read_csv("kenpom/pre-tourney-eff/kp-pre-tourney-2002-21-mod-21.csv")
## Lower-case the KenPom names so they can be matched against TeamNameSpelling;
## .str.lower() leaves missing names as NaN instead of raising.
kenpom_df['TeamName'] = kenpom_df['TeamName'].str.lower()
## Inner merge: KenPom rows whose name has no matching spelling are dropped here.
kenpom_df = kenpom_df.merge(team_spellings_df, left_on = 'TeamName', right_on = 'TeamNameSpelling')

Training data criteria:
  - For Round 2, take all tournament data for past years and train the winning model from Stage 1

In [3]:
### Create a training dataset
## Order every matchup so that Team1 holds the lower TeamID and Team2 the higher one.
matchup_ids = tourney_results_df[['WTeamID', 'LTeamID']]
tourney_results_df['Team1'] = matchup_ids.min(axis = 1)
tourney_results_df['Team2'] = matchup_ids.max(axis = 1)

## Targets, both expressed from Team1's point of view
team1_won = tourney_results_df['Team1'] == tourney_results_df['WTeamID']
winning_margin = tourney_results_df['WScore'] - tourney_results_df['LScore']
tourney_results_df['win']    = team1_won.astype(int)
## Positive spread when Team1 won, negative when Team1 lost
tourney_results_df['spread'] = winning_margin.where(team1_won, -winning_margin)

### Merge the team names
tourney_results_df = tourney_results_df.merge(teams_df, left_on='Team1', right_on='TeamID', suffixes = ('', '_1'))
tourney_results_df = tourney_results_df.merge(teams_df, left_on='Team2', right_on='TeamID', suffixes = ('', '_2'))

### Merge KP Data by Team Name and Season and subtract Team1 - Team2 for 1st variable
## For Team 1: give every KenPom column a '_1' suffix (replacing an existing '_1'/'_2' suffix).
## kenpom_df is renamed in place, so its column names change as this cell runs.
kenpom_df.columns = [(c[:-2] if c[-2:] in ['_1', '_2'] else c) + '_1' for c in kenpom_df.columns]
tourney_results_df = tourney_results_df.merge(kenpom_df, left_on = ['Team1', 'Season'],
                                              right_on = ['TeamID_1', 'Season_1'])
## For Team 2: swap the '_1' suffix applied just above for '_2'.
## NOTE(review): the teams_df merge above presumably already created TeamID_2 / TeamName_2, so
## pandas will add _x/_y suffixes to those overlapping columns in this merge -- verify.
kenpom_df.columns = [c[:-2] + '_2' for c in kenpom_df.columns]
tourney_results_df = tourney_results_df.merge(kenpom_df, left_on = ['Team2', 'Season'],
                                              right_on = ['TeamID_2', 'Season_2'])

## Take differences between team1 and team2 for various stats
tourney_results_df['diff_AdjEM'] = tourney_results_df['AdjEM_1'] - tourney_results_df['AdjEM_2']
tourney_results_df['diff_AdjOE'] = tourney_results_df['AdjOE_1'] - tourney_results_df['AdjOE_2']
tourney_results_df['diff_AdjDE'] = tourney_results_df['AdjDE_1'] - tourney_results_df['AdjDE_2']
## Fixed: this previously subtracted the AdjEM columns, making it a copy of diff_AdjEM.
tourney_results_df['diff_AdjTempo'] = tourney_results_df['AdjTempo_1'] - tourney_results_df['AdjTempo_2']


In [4]:
## Training rows: every tournament game from seasons before 2021 (basically all data)
is_past_season = tourney_results_df['Season'] < 2021
train_df = tourney_results_df.loc[is_past_season].copy()

### Binary outcome

Starting with logistic regression

In [5]:
## Single predictor: Team1 - Team2 difference in adjusted efficiency margin, as an (n, 1) matrix
X = train_df['diff_AdjEM'].values.reshape(-1, 1)

## `multi_class='ovr'` is no longer passed: the argument is deprecated in recent scikit-learn
## releases and makes no difference for a binary target such as `win`.
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X, train_df['win'])

## predict_proba columns follow clf.classes_: column 0 is P(win == 0), column 1 is P(win == 1)
probs = clf.predict_proba(X)

train_df['fitted_probabilities'] = probs[:,1]

In [6]:
## exp(coefficient) turns the fitted log-odds slope into an odds ratio
adjem_odds_ratio = np.exp(clf.coef_[0])[0]
print(f"AdjEM Odds Ratio: {np.round(adjem_odds_ratio, 3)}")

AdjEM Odds Ratio: 1.129


Interpretation: For every 1-point increase in a team's AdjEM margin over their opponent, the odds that the team wins are multiplied by about 1.13 (roughly a 13% increase in the odds of winning).

### Spread

Starting with simple linear regression.

In [7]:
## Ordinary least squares of the Team1 point spread on the AdjEM difference (same X as above)
spread_target = train_df['spread']
reg = LinearRegression()
reg.fit(X, spread_target)

## In-sample fit quality
r_squared = reg.score(X, spread_target)
print(f"Coefficients {reg.coef_}")
print(f"R-Squared {np.round(r_squared, 3)}")

Coefficients [0.71443931]
R-Squared 0.414


Interpretation: For every 1-point increase in a team's AdjEM margin over their opponent, the predicted point spread increases by about 0.71 points in favor of that team.

### Submission

Manipulate the sample submission file to add predictors.

In [8]:
## Split the underscore-separated ID into its three integer parts: Season, Team1, Team2
id_parts = sample_submission['ID'].str.split('_')
for position, column in enumerate(['Season', 'Team1', 'Team2']):
    sample_submission[column] = id_parts.str[position].astype(int)

## Merge in the predictor variables
## Team 1: force a '_1' suffix onto every KenPom column, replacing any '_1'/'_2' already there
kenpom_df.columns = [
    (name[:-2] if name[-2:] in ('_1', '_2') else name) + '_1'
    for name in kenpom_df.columns
]
sample_submission = sample_submission.merge(
    kenpom_df,
    left_on = ['Team1', 'Season'],
    right_on = ['TeamID_1', 'Season_1'],
)
## Team 2: same table again, with the suffix switched to '_2'
kenpom_df.columns = [name[:-2] + '_2' for name in kenpom_df.columns]
sample_submission = sample_submission.merge(
    kenpom_df,
    left_on = ['Team2', 'Season'],
    right_on = ['TeamID_2', 'Season_2'],
)
## Model feature: Team1 minus Team2 adjusted efficiency margin
sample_submission['diff_AdjEM'] = sample_submission['AdjEM_1'] - sample_submission['AdjEM_2']

Make predictions using each model and save output.

In [9]:
## Feature matrix for the submission rows, shaped (n, 1) like the training matrix
X = sample_submission['diff_AdjEM'].to_numpy().reshape(-1, 1)

## Win probability: second predict_proba column is the probability of class 1 (Team1 wins)
win_probabilities = clf.predict_proba(X)
sample_submission['Pred_prob'] = win_probabilities[:, 1]

## Point spread from the linear model
predicted_spread = reg.predict(X)
sample_submission['Pred_spread'] = predicted_spread

sample_submission.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,Season_1,TeamName_1,Tempo_1,RankTempo_1,AdjTempo_1,...,AdjDE_2,RankAdjDE_2,AdjEM_2,RankAdjEM_2,seed_2,TeamNameSpelling_2,TeamID_2,diff_AdjEM,Pred_prob,Pred_spread
0,2021_1101_1104,0.5,2021,1101,1104,2021,abilene christian,71.6065,74,69.9579,...,86.0017,2,26.4162,8,,alabama,1104,-14.8603,0.133666,-11.095298
1,2021_1101_1111,0.5,2021,1101,1111,2021,abilene christian,71.6065,74,69.9579,...,103.006,205,-2.90939,209,,appalachian st.,1111,14.46529,0.844721,9.856056
2,2021_1104_1111,0.5,2021,1104,1111,2021,alabama,75.1471,10,73.9322,...,103.006,205,-2.90939,209,,appalachian st.,1111,29.32559,0.970663,20.472838
3,2021_1101_1116,0.5,2021,1101,1116,2021,abilene christian,71.6065,74,69.9579,...,89.2402,14,22.9578,18,,arkansas,1116,-11.4019,0.190191,-8.624481
4,2021_1104_1116,0.5,2021,1104,1116,2021,alabama,75.1471,10,73.9322,...,89.2402,14,22.9578,18,,arkansas,1116,3.4584,0.588211,1.992301


In [10]:
## Untrimmed win probabilities, written as ID/Pred with today's date in the file name
probs_out = sample_submission[['ID', 'Pred_prob']].rename(columns = {'Pred_prob': 'Pred'})
probs_out.to_csv(f"subs/submission_probs_untrimmed_{datetime.today().strftime('%Y-%m-%d')}.csv", index = False)

## Predicted point spreads, same ID/Pred layout
spread_out = sample_submission[['ID', 'Pred_spread']].rename(columns = {'Pred_spread': 'Pred'})
spread_out.to_csv(f"subs/submission_spread_{datetime.today().strftime('%Y-%m-%d')}.csv", index = False)

In [11]:
### Trimming probabilities based on residual analysis for round 1:
### predictions are capped to the range [0.05, 0.95].
sample_submission['Pred_prob_trim'] = sample_submission['Pred_prob'].clip(lower = 0.05, upper = 0.95)

trimmed_out = sample_submission[['ID', 'Pred_prob_trim']].rename(columns = {'Pred_prob_trim': 'Pred'})
trimmed_out.to_csv(f"subs/submission_probs_trimmed_{datetime.today().strftime('%Y-%m-%d')}.csv", index = False)


In [12]:
### Writing out one with names to throw into a searchable table
## Recompute the capped probabilities so this cell does not depend on the previous one
sample_submission['Pred_prob_trim'] = sample_submission['Pred_prob'].clip(lower = 0.05, upper = 0.95)

table_columns = ['ID', 'TeamName_1', 'TeamName_2', 'Pred_prob', 'Pred_prob_trim', 'Pred_spread']
friendly_names = {'Pred_prob_trim': 'trimmed_prob',
                  'Pred_prob': 'prob',
                  'Pred_spread': 'spread'}
named_table = sample_submission[table_columns].rename(columns = friendly_names)
## NOTE(review): the date in this file name is hard-coded, unlike the other exports -- confirm intent.
named_table.to_csv("subs/submission_w_names_probs_trimmed_2021-03-17.csv", index = False)