## NCAAW March ML Mania

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.linear_model import LogisticRegression, LinearRegression

plt.style.use('fivethirtyeight')

## Remember the directory the kernel started in. Every relative path below is
## anchored to it, so re-running this cell does not chdir a second time from
## the data directory (which would fail or land in the wrong place).
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()

## Utility Script from Kaggle
## Registered as an absolute path: a relative "../" entry in sys.path would be
## resolved against the data directory once the chdir below has run.
utils_dir = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
import mm_data_manipulation as dm

## Work from the women's data directory so later cells can use bare file names.
os.chdir(os.path.join(NOTEBOOK_DIR, "..", "..", "data", "women"))

### Data

There is far less data available on the women's side: there is no KenPom data, and fewer data files are provided... but I guess that's an opportunity!

- [Provided Data](https://www.kaggle.com/c/ncaaw-march-mania-2021/data): 13 provided files including basic team info, box score stats, cities, etc.
- Play-by-play data worth scraping using [wncaahoopR](https://github.com/snestler/wncaahoopR)
   - This has descriptions of plays that we can potentially use to add player info.

In [2]:
## NOTE(review): every line in this cell is commented out, so it defines nothing.
## The frames used by the rest of the notebook come from dm.prepare_data('women')
## and from the explicit read of WSampleSubmissionStage1.csv further down.
## Provided
#teams_df              = pd.read_csv("WTeams.csv")
#seasons_df            = pd.read_csv("WSeasons.csv")
#conferences_df        = pd.read_csv("WTeamConferences.csv") ## could be useful as proxy for SOS?
#team_spellings_df     = pd.read_csv("WTeamSpellings.csv")
#tourney_results_df    = pd.read_csv("WNCAATourneyCompactResults.csv")
#reg_season_results_df = pd.read_csv("WRegularSeasonDetailedResults.csv")
#sample_submission     = pd.read_csv("WSampleSubmissionStage1.csv")
#
#                                  
### Merge in "DayZero" from season, could help link to wncaahoopR data: https://github.com/snestler/wncaahoopR_data/
#tourney_results_df = tourney_results_df.merge(seasons_df[['Season', 'DayZero']], on = 'Season')
#tourney_results_df['date'] = tourney_results_df.apply(lambda x: datetime.strptime(x['DayZero'], "%m/%d/%Y") + 
#                                                       timedelta(days = x['DayNum']), axis = 1)

## Feature Engineering

In [3]:
## Use Kaggle starter script to get off/def ratings, etc.
## prepare_data returns two frames: per-game tournament rows (the modelling
## table used below) and regular-season team stats (merged into the sample
## submission later on).
## NOTE(review): the exact schema is defined in mm_data_manipulation — confirm there.
tourney_results_df, reg_season_stats_df = dm.prepare_data('women')
## Preview the first rows of the modelling table.
tourney_results_df.head()

Unnamed: 0,Season,DayNum,Team1,Team2,target,target_points,T1_Loc,T2_Loc,ID,T1_Ast,...,delta_FGM3_perc,delta_FT_perc,delta_FGM_no_ast_perc,delta_True_shooting_perc,delta_Opp_True_shooting_perc,delta_OT_win_perc,delta_Seed,delta_off_edge,delta_def_edge,delta_od_margin
0,2010,138,3124,3201,1,14,0,0,2010_3124_3201,14.6875,...,-0.069679,0.01726,0.003363,0.00923,-0.057436,-1.0,-9.0,15.506357,22.033034,1.603907
1,2010,138,3173,3395,1,1,0,0,2010_3173_3395,14.461538,...,-0.044864,-0.048387,0.045179,-0.008815,-0.000125,-0.5,-1.0,15.091164,16.37041,-0.43023
2,2010,138,3181,3214,1,35,1,-1,2010_3181_3214,14.25,...,0.010987,-0.052773,-0.008327,0.033501,-0.013925,-1.0,-13.0,18.300842,14.533827,9.110554
3,2010,138,3199,3256,1,14,1,-1,2010_3199_3256,15.333333,...,0.06282,0.028984,-0.046366,0.030904,-0.000103,-0.5,-11.0,16.225874,16.129244,4.787915
4,2010,138,3207,3265,1,20,0,0,2010_3207_3265,15.866667,...,-0.028334,-0.08157,-0.048658,-0.039596,0.066505,0.0,-7.0,11.551791,17.504158,-1.229263


In [4]:
### TODO: Add player info (ex. missing key players that use a large % of possessions?)

### Model

#### Binary outcome
Starting with logistic regression

In [5]:
## Time-based split: tournaments before FIRST_TEST_SEASON train the models,
## that season and everything after it is held out. The cutoff is named once so
## the two masks cannot drift apart.
## NOTE(review): test_df is not referenced again in the visible cells.
FIRST_TEST_SEASON = 2015

train_df = tourney_results_df[tourney_results_df['Season'] < FIRST_TEST_SEASON].copy()
test_df  = tourney_results_df[tourney_results_df['Season'] >= FIRST_TEST_SEASON].copy()


In [6]:
## Single-feature design matrix; sklearn expects 2-D input, hence the reshape.
#X = train_df[['delta_off_edge', 'delta_def_edge']]
X = train_df['delta_od_margin'].values.reshape(-1, 1)

## Win/loss classifier. `multi_class='ovr'` was dropped: the parameter is
## deprecated in recent scikit-learn releases, and for a two-class target the
## default ('auto') already resolves to the same one-vs-rest fit.
## NOTE(review): assumes `target` only takes two values — confirm in dm.prepare_data.
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X, train_df['target'])

## In-sample probability of the second class in clf.classes_ (column 1 of predict_proba).
train_df['fitted_probabilities'] = clf.predict_proba(X)[:,1]


#### Point spread outcome
Starting with linear regression

In [7]:
## Regress the point margin on the same single feature used for the classifier
## (X is the training design matrix built in the logistic-regression cell).
y_points = train_df['target_points']
reg = LinearRegression()
reg.fit(X, y_points)

## In-sample fit quality: slope and R^2 on the training rows.
r_squared = np.round(reg.score(X, y_points), 3)
print(f"Coefficients {reg.coef_}")
print(f"R-Squared {r_squared}")


Coefficients [0.87437913]
R-Squared 0.461


### Submission

Augment the sample submission file with the predictor variables so that the fitted models can score every matchup in it.



In [9]:
## Load the sample submission; the bare file name resolves against the working
## directory set by the os.chdir call in the first cell.
sample_submission = pd.read_csv("WSampleSubmissionStage1.csv")

In [10]:
## Split out the ID
sample_submission['Season'] = sample_submission['ID'].apply(lambda x: x.split('_')[0]).astype(int)
sample_submission['Team1'] = sample_submission['ID'].apply(lambda x: x.split('_')[1]).astype(int)
sample_submission['Team2'] = sample_submission['ID'].apply(lambda x: x.split('_')[2]).astype(int)



In [11]:
## Merge in the predictor variables
def _suffix_stats_columns(stats_df, suffix):
    """Return a relabelled copy of `stats_df` whose column names all end in `suffix`.

    A trailing '_1' or '_2' already present on a column name is replaced rather
    than stacked, so the helper gives the same labels whether or not the frame
    was suffixed by an earlier run. The input frame itself is left untouched.
    """
    relabelled = {
        col: (col[:-2] if col[-2:] in ('_1', '_2') else col) + suffix
        for col in stats_df.columns
    }
    return stats_df.rename(columns=relabelled)


## Per-team copies; reg_season_stats_df keeps its own column names.
stats_team1 = _suffix_stats_columns(reg_season_stats_df, '_1')
stats_team2 = _suffix_stats_columns(reg_season_stats_df, '_2')

## Drop stat columns left behind by a previous run of this cell, so re-running
## it does not produce pandas' '_x' / '_y' duplicates.
stale_cols = sample_submission.columns.intersection(
    stats_team1.columns.union(stats_team2.columns))
sample_submission = sample_submission.drop(columns=stale_cols)
n_matchups = len(sample_submission)

## For Team 1
sample_submission = sample_submission.merge(stats_team1, left_on = ['Team1', 'Season'],
                                            right_on = ['TeamID_1', 'Season_1'])
## For Team 2
sample_submission = sample_submission.merge(stats_team2, left_on = ['Team2', 'Season'],
                                            right_on = ['TeamID_2', 'Season_2'])

## The merges are inner joins: a team-season with no stats row would silently
## drop matchups, and a duplicated stats row would multiply them. Either way the
## submission would no longer contain exactly one row per ID.
assert len(sample_submission) == n_matchups, (
    f"merge changed the number of matchups: {n_matchups} -> {len(sample_submission)}")

## Difference
## Only computed when the stats actually carry an AdjEM column; without it the
## original unconditional lookup would raise a KeyError.
if {'AdjEM_1', 'AdjEM_2'}.issubset(sample_submission.columns):
    sample_submission['diff_AdjEM'] = sample_submission['AdjEM_1'] - sample_submission['AdjEM_2']


In [16]:
## Offense-minus-defense rating for each side, then Team1's edge over Team2.
## NOTE(review): presumably the same definition dm.prepare_data uses for the
## training column of this name — verify against the utility script.
net_rating_1 = sample_submission['off_rating_1'] - sample_submission['def_rating_1']
net_rating_2 = sample_submission['off_rating_2'] - sample_submission['def_rating_2']
sample_submission['delta_od_margin'] = net_rating_1 - net_rating_2

In [18]:
## Design matrix for the submission rows: one column, same feature as training.
## Note this rebinds X, which held the training matrix until now.
X = sample_submission[['delta_od_margin']].values

## Win probability: second column of predict_proba from the fitted classifier.
class_probs = clf.predict_proba(X)
sample_submission['Pred_prob'] = class_probs[:, 1]

## Point spread from the fitted linear model.
sample_submission['Pred_spread'] = reg.predict(X)

In [19]:
## Write one (ID, Pred) file per model into subs/, stamped with today's date.
## to_csv does not create missing directories, so make sure subs/ exists first.
os.makedirs("subs", exist_ok=True)

## Stamp once so both files always share the same date.
date_stamp = datetime.today().strftime('%Y-%m-%d')

for pred_col, label in [('Pred_prob', 'probs'), ('Pred_spread', 'spread')]:
    (sample_submission[['ID', pred_col]].rename(columns = {pred_col: 'Pred'})
       .to_csv(f"subs/submission_{label}_{date_stamp}.csv", index = False))

### Scores:

#### Win probabilities:
 - Basic logistic regression, 1 predictor (`delta_od_margin`): 0.56587

#### Spread:
 - Basic linear regression, 1 predictor (`delta_od_margin`): 16.26094