In [243]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import random
import math
import heapq
from scipy.stats import norm
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
from datetime import datetime
import pytz
import swifter
from collections import defaultdict 

In [2]:
api = KaggleApi()
api.authenticate()

In [3]:
api.competitions_list(search='march')

[ncaam-march-mania-2021,
 ncaaw-march-mania-2021,
 ncaam-march-mania-2021-spread,
 ncaaw-march-mania-2021-spread,
 march-machine-learning-mania-2016,
 march-machine-learning-mania-2017,
 march-machine-learning-mania-2014,
 march-machine-learning-mania-2015,
 mens-machine-learning-competition-2018,
 mens-machine-learning-competition-2019,
 womens-machine-learning-competition-2018,
 womens-machine-learning-competition-2019,
 march-madness-analytics-2020,
 google-cloud-ncaa-march-madness-2020-division-1-mens-tournament,
 google-cloud-ncaa-march-madness-2020-division-1-womens-tournament]

In [66]:
api.competition_list_files('ncaaw-march-mania-2021')



[WDataFiles_Stage1/WRegularSeasonCompactResults.csv,
 WDataFiles_Stage1/WNCAATourneyDetailedResults.csv,
 WDataFiles_Stage1/WTeamSpellings.csv,
 WDataFiles_Stage1/WTeams.csv,
 WDataFiles_Stage1/WNCAATourneySlots.csv,
 WDataFiles_Stage1/Conferences.csv,
 WDataFiles_Stage1/WSampleSubmissionStage1.csv,
 WDataFiles_Stage1/Cities.csv,
 WDataFiles_Stage1/WNCAATourneySeeds.csv,
 WDataFiles_Stage1/WNCAATourneyCompactResults.csv,
 WDataFiles_Stage1/WSeasons.csv,
 WDataFiles_Stage1/WRegularSeasonDetailedResults.csv,
 WDataFiles_Stage1/WTeamConferences.csv,
 WDataFiles_Stage1/WGameCities.csv,
 WDataFiles_Stage2/WNCAATourneySeeds.csv,
 WDataFiles_Stage2/WTeamSpellings.csv,
 WDataFiles_Stage2/WRegularSeasonCompactResults.csv,
 WDataFiles_Stage2/Cities.csv,
 WDataFiles_Stage2/WSeasons.csv,
 WDataFiles_Stage2/Conferences.csv,
 WDataFiles_Stage2/WTeams.csv,
 WDataFiles_Stage2/WTeamConferences.csv,
 WDataFiles_Stage2/WNCAATourneyDetailedResults.csv,
 WDataFiles_Stage2/WSampleSubmissionStage2.csv,
 WDat

In [67]:
api.competition_download_files('ncaaw-march-mania-2021')

In [68]:
zf = ZipFile('ncaaw-march-mania-2021.zip')
zf.extractall('WData/') #save files in selected folder
zf.close()

In [7]:
season_data = pd.read_csv('WData/WRegularSeasonDetailedResults.csv')

In [8]:
season_data.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [39]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR',\
           'TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst',\
           'OTO','OStl','OBlk','OPF']

all_cols = ['Season','TeamID','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR','TAst','TTO',\
            'TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst','OTO','OStl',\
            'OBlk','OPF','Wins','G','WP','AWP','ANWP','Min','TFGP','TFGP3','TFTP','TFG3R','TFTR','TEFG','TTFG','OFGP',\
            'OFGP3','OFTP','OFG3R','OFTR','OEFG','OTFG','TPoss','TPace','OPoss','OPace','TTRP','TORP','TAstP','TStlP',\
            'TBlkP','TTOP','OTRP','OORP','OAstP','OStlP','OBlkP','OTOP','OffRtg','DefRtg','DayNum']

season_data_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2010,2021):
    max_daynum = max(season_data[(season_data['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['Wins'] = 1
            tm2['Wins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['AWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['AWins'] = 0
            tm1['ALosses'] = 0
            tm2['ALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['NWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['NWins'] = 0
            tm1['NLosses'] = 0
            tm2['NLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['G'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['G','Wins',\
                                                                'AWins','ALosses','NWins','NLosses'], agg_funcs)))

            # Game statistics
            tm['WP'] = tm['Wins']/tm['G']
            tm['AWP'] = tm['AWins']/(tm['AWins']+tm['ALosses'])
            tm['ANWP'] = (tm['AWins']+tm['NWins'])/(tm['AWins']+tm['ALosses']+tm['NWins']+tm['NLosses'])
            tm['Min'] = 40*tm['G']+5*tm['NumOT']

            # Team shooting percentages
            tm['TFGP'] = tm['TFGM']/tm['TFGA']
            tm['TFGP3'] = tm['TFGM3']/tm['TFGA3']
            tm['TFTP'] = tm['TFTM']/tm['TFTA']
            tm['TFG3R'] = tm['TFGA3']/tm['TFGA']
            tm['TFTR'] = tm['TFTA']/tm['TFGA']
            tm['TEFG'] = (0.5*tm['TFGM3']+tm['TFGM'])/tm['TFGA']
            tm['TTFG'] = tm['TScore']/(2*(0.44*tm['TFTA']+tm['TFGA']))

            # Opponent shooting percentages
            tm['OFGP'] = tm['OFGM']/tm['OFGA']
            tm['OFGP3'] = tm['OFGM3']/tm['OFGA3']
            tm['OFTP'] = tm['OFTM']/tm['OFTA']
            tm['OFG3R'] = tm['OFGA3']/tm['OFGA']
            tm['OFTR'] = tm['OFTA']/tm['OFGA']
            tm['OEFG'] = (0.5*tm['OFGM3']+tm['OFGM'])/tm['OFGA']
            tm['OTFG'] = tm['OScore']/(2*(0.44*tm['OFTA']+tm['OFGA']))

            # Team possession stats
            tm['TPoss'] = tm['TFGA']-tm['TOR']+tm['TTO']+0.4*tm['TFTA']
            tm['TPace'] = 40*tm['TPoss']/tm['Min']

            # Opponent possession stats
            tm['OPoss'] = tm['OFGA']-tm['OOR']+tm['OTO']+0.4*tm['OFTA']
            tm['OPace'] = 40*tm['OPoss']/tm['Min']

            # Team stat percentages
            tm['TTRP'] = (tm['TOR']+tm['TDR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['TORP'] = tm['TOR']/(tm['TOR']+tm['ODR'])
            tm['TAstP'] = tm['TAst']/tm['TFGM']
            tm['TStlP'] = tm['TStl']/tm['OPoss']
            tm['TBlkP'] = tm['TBlk']/(tm['OFGA']-tm['OFGA3'])
            tm['TTOP'] = tm['TTO']/tm['TPoss']

            # Opponent stat percentages
            tm['OTRP'] = (tm['OOR']+tm['ODR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['OORP'] = tm['OOR']/(tm['OOR']+tm['TDR'])
            tm['OAstP'] = tm['OAst']/tm['OFGM']
            tm['OStlP'] = tm['OStl']/tm['TPoss']
            tm['OBlkP'] = tm['OBlk']/(tm['TFGA']-tm['TFGA3'])
            tm['OTOP'] = tm['OTO']/tm['OPoss']

            # Ratings
            tm['OffRtg'] = tm['TScore']/tm['TPoss']
            tm['DefRtg'] = tm['OScore']/tm['OPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['AWins','ALosses','NWins','NLosses'])

            season_data_continuous = season_data_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

season_data_continuous

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2010,3102,0,46,15,47,5,17,11,20,...,0.367647,0.576923,0.500000,0.520000,0.147059,0.033333,0.263930,0.676471,0.953079,12
1,2010,3103,0,63,23,54,5,9,12,19,...,0.258621,0.513514,0.297297,0.550000,0.100575,0.133333,0.328571,0.905172,0.700000,12
2,2010,3104,0,73,26,62,5,12,16,28,...,0.259067,0.459770,0.311111,0.280000,0.051813,0.040000,0.250627,0.945596,0.852130,12
3,2010,3105,0,61,22,57,3,12,14,21,...,0.344828,0.448718,0.289474,0.633333,0.198939,0.133333,0.198939,0.809019,0.954907,12
4,2010,3107,0,56,22,57,5,17,7,9,...,0.326087,0.538462,0.321429,0.448276,0.190217,0.125000,0.261708,0.760870,1.046832,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,2020,3463,0.037037,68.851852,25.444444,58.925926,5.62963,16.518519,12.333333,18.37037,...,0.203836,0.509277,0.307910,0.559690,0.101656,0.066376,0.233985,0.974114,0.871464,129
347,2020,3464,0.107143,63.285714,22.071429,57.357143,6.964286,22.857143,12.178571,15.214286,...,0.212313,0.522366,0.306324,0.534188,0.093439,0.112836,0.189716,0.919850,1.005598,129
348,2020,3465,0.071429,72.214286,25.214286,63.5,9.678571,29.785714,12.107143,16.821429,...,0.175750,0.559695,0.317848,0.588462,0.083201,0.097458,0.169882,0.945125,0.976000,129
349,2020,3466,0.071429,74.785714,26.428571,58.857143,8.642857,25.142857,13.285714,17.607143,...,0.223656,0.508557,0.298893,0.469954,0.099942,0.064619,0.239367,1.015913,0.878811,129


In [40]:
# Get matchup history of all teams on any given day
matchup_dict = {}
for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)]
        d_team = {}
        for t in range(3101,3500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

# print(matchup_dict[2020][20])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


In [41]:
# Get matchup history of all teams' opponents on any given day
opp_matchup_dict = {}

for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(3101,3500):
            opp_matchups = []
            for o in matchup_dict[seas][daynum][t]:
                opp_matchups += matchup_dict[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    opp_matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

# print(opp_matchup_dict[2020][20][1101])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


In [42]:
wp_dict = {h: {g: f.groupby('TeamID')['WP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in season_data_continuous.groupby('Season')}

In [43]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

season_data_continuous['OWP'] = season_data_continuous.apply(lambda row: 
    np.mean([wp_dict[row['Season']][row['DayNum']][x] \
             for x in matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_continuous['OOWP'] = season_data_continuous.apply(lambda row: 
    np.mean([wp_dict[row['Season']][row['DayNum']][x] \
             for x in opp_matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_continuous['SOS'] = (2*season_data_continuous['OWP']+season_data_continuous['OOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

season_data_continuous

START: 2021-03-18 04:00:41.474552-07:00
FINISH: 2021-03-18 04:18:18.847123-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS
0,2010,3102,0,46,15,47,5,17,11,20,...,0.520000,0.147059,0.033333,0.263930,0.676471,0.953079,12,1.000000,0.000000,0.666667
1,2010,3103,0,63,23,54,5,9,12,19,...,0.550000,0.100575,0.133333,0.328571,0.905172,0.700000,12,0.000000,1.000000,0.333333
2,2010,3104,0,73,26,62,5,12,16,28,...,0.280000,0.051813,0.040000,0.250627,0.945596,0.852130,12,0.000000,1.000000,0.333333
3,2010,3105,0,61,22,57,3,12,14,21,...,0.633333,0.198939,0.133333,0.198939,0.809019,0.954907,12,1.000000,0.000000,0.666667
4,2010,3107,0,56,22,57,5,17,7,9,...,0.448276,0.190217,0.125000,0.261708,0.760870,1.046832,12,1.000000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,2020,3463,0.037037,68.851852,25.444444,58.925926,5.62963,16.518519,12.333333,18.37037,...,0.559690,0.101656,0.066376,0.233985,0.974114,0.871464,129,0.529047,0.506280,0.521458
347,2020,3464,0.107143,63.285714,22.071429,57.357143,6.964286,22.857143,12.178571,15.214286,...,0.534188,0.093439,0.112836,0.189716,0.919850,1.005598,129,0.475375,0.470670,0.473807
348,2020,3465,0.071429,72.214286,25.214286,63.5,9.678571,29.785714,12.107143,16.821429,...,0.588462,0.083201,0.097458,0.169882,0.945125,0.976000,129,0.429284,0.457384,0.438651
349,2020,3466,0.071429,74.785714,26.428571,58.857143,8.642857,25.142857,13.285714,17.607143,...,0.469954,0.099942,0.064619,0.239367,1.015913,0.878811,129,0.423349,0.495569,0.447422


In [44]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR',\
        'LTDR','LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA',\
        'LOOR','LODR','LOAst','LOTO','LOStl','LOBlk','LOPF']

all_cols = ['Season','TeamID','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR','LTDR',\
            'LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA','LOOR',\
            'LODR','LOAst','LOTO','LOStl','LOBlk','LOPF','LWins','LG','LWP','LAWP','LANWP','LMin','LTFGP','LTFGP3',\
            'LTFTP','LTFG3R','LTFTR','LTEFG','LTTFG','LOFGP','LOFGP3','LOFTP','LOFG3R','LOFTR','LOEFG','LOTFG',\
            'LTPoss','LTPace','LOPoss','LOPace','LTTRP','LTORP','LTAstP','LTStlP','LTBlkP','LTTOP','LOTRP','LOORP',\
            'LOAstP','LOStlP','LOBlkP','LOTOP','LOffRtg','LDefRtg','DayNum']

l30_data_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2010,2021):
    max_daynum = max(season_data[(season_data['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)\
                         &(season_data['DayNum']>=daynum-30)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['LWins'] = 1
            tm2['LWins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['LAWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['LAWins'] = 0
            tm1['LALosses'] = 0
            tm2['LALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['LNWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['LNWins'] = 0
            tm1['LNLosses'] = 0
            tm2['LNLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['LG'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['LG','LWins',\
                                                                'LAWins','LALosses','LNWins','LNLosses'], agg_funcs)))

            # Game statistics
            tm['LWP'] = tm['LWins']/tm['LG']
            tm['LAWP'] = tm['LAWins']/(tm['LAWins']+tm['LALosses'])
            tm['LANWP'] = (tm['LAWins']+tm['LNWins'])/(tm['LAWins']+tm['LALosses']+tm['LNWins']+tm['LNLosses'])
            tm['LMin'] = 40*tm['LG']+5*tm['LNumOT']

            # Team shooting percentages
            tm['LTFGP'] = tm['LTFGM']/tm['LTFGA']
            tm['LTFGP3'] = tm['LTFGM3']/tm['LTFGA3']
            tm['LTFTP'] = tm['LTFTM']/tm['LTFTA']
            tm['LTFG3R'] = tm['LTFGA3']/tm['LTFGA']
            tm['LTFTR'] = tm['LTFTA']/tm['LTFGA']
            tm['LTEFG'] = (0.5*tm['LTFGM3']+tm['LTFGM'])/tm['LTFGA']
            tm['LTTFG'] = tm['LTScore']/(2*(0.44*tm['LTFTA']+tm['LTFGA']))

            # Opponent shooting percentages
            tm['LOFGP'] = tm['LOFGM']/tm['LOFGA']
            tm['LOFGP3'] = tm['LOFGM3']/tm['LOFGA3']
            tm['LOFTP'] = tm['LOFTM']/tm['LOFTA']
            tm['LOFG3R'] = tm['LOFGA3']/tm['LOFGA']
            tm['LOFTR'] = tm['LOFTA']/tm['LOFGA']
            tm['LOEFG'] = (0.5*tm['LOFGM3']+tm['LOFGM'])/tm['LOFGA']
            tm['LOTFG'] = tm['LOScore']/(2*(0.44*tm['LOFTA']+tm['LOFGA']))

            # Team possession stats
            tm['LTPoss'] = tm['LTFGA']-tm['LTOR']+tm['LTTO']+0.4*tm['LTFTA']
            tm['LTPace'] = 40*tm['LTPoss']/tm['LMin']

            # Opponent possession stats
            tm['LOPoss'] = tm['LOFGA']-tm['LOOR']+tm['LOTO']+0.4*tm['LOFTA']
            tm['LOPace'] = 40*tm['LOPoss']/tm['LMin']

            # Team stat percentages
            tm['LTTRP'] = (tm['LTOR']+tm['LTDR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LTORP'] = tm['LTOR']/(tm['LTOR']+tm['LODR'])
            tm['LTAstP'] = tm['LTAst']/tm['LTFGM']
            tm['LTStlP'] = tm['LTStl']/tm['LOPoss']
            tm['LTBlkP'] = tm['LTBlk']/(tm['LOFGA']-tm['LOFGA3'])
            tm['LTTOP'] = tm['LTTO']/tm['LTPoss']

            # Opponent stat percentages
            tm['LOTRP'] = (tm['LOOR']+tm['LODR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LOORP'] = tm['LOOR']/(tm['LOOR']+tm['LTDR'])
            tm['LOAstP'] = tm['LOAst']/tm['LOFGM']
            tm['LOStlP'] = tm['LOStl']/tm['LTPoss']
            tm['LOBlkP'] = tm['LOBlk']/(tm['LTFGA']-tm['LTFGA3'])
            tm['LOTOP'] = tm['LOTO']/tm['LOPoss']

            # Ratings
            tm['LOffRtg'] = tm['LTScore']/tm['LTPoss']
            tm['LDefRtg'] = tm['LOScore']/tm['LOPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['LAWins','LALosses','LNWins','LNLosses'])

            l30_data_continuous = l30_data_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

l30_data_continuous

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LTTOP,LOTRP,LOORP,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum
0,2010,3102,0,46,15,47,5,17,11,20,...,0.367647,0.576923,0.500000,0.520000,0.147059,0.033333,0.263930,0.676471,0.953079,12
1,2010,3103,0,63,23,54,5,9,12,19,...,0.258621,0.513514,0.297297,0.550000,0.100575,0.133333,0.328571,0.905172,0.700000,12
2,2010,3104,0,73,26,62,5,12,16,28,...,0.259067,0.459770,0.311111,0.280000,0.051813,0.040000,0.250627,0.945596,0.852130,12
3,2010,3105,0,61,22,57,3,12,14,21,...,0.344828,0.448718,0.289474,0.633333,0.198939,0.133333,0.198939,0.809019,0.954907,12
4,2010,3107,0,56,22,57,5,17,7,9,...,0.326087,0.538462,0.321429,0.448276,0.190217,0.125000,0.261708,0.760870,1.046832,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,2020,3463,0.0,57.5,21.75,55.375,4.875,15.875,9.125,14.625,...,0.207554,0.540670,0.339286,0.545977,0.115931,0.056962,0.215373,0.860135,0.837356,129
347,2020,3464,0.0,63.666667,22.5,57.666667,6.666667,21.833333,12.0,14.166667,...,0.224390,0.502439,0.279188,0.524691,0.090244,0.097674,0.191934,0.931707,1.064140,129
348,2020,3465,0.0,81.0,28.0,62.0,11.833333,30.5,13.166667,18.0,...,0.161077,0.558282,0.334559,0.598765,0.077229,0.058201,0.167261,1.072374,0.990187,129
349,2020,3466,0.142857,72.0,25.285714,55.857143,8.142857,23.857143,13.285714,17.428571,...,0.218011,0.515213,0.312030,0.470199,0.105949,0.049107,0.240620,1.026895,0.897227,129


In [45]:
# Get matchup history of all teams on any given day
l30_matchup_dict = {}
for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)&\
                         (season_data['DayNum']>=daynum-30)]
        d_team = {}
        for t in range(3101,3500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    l30_matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_matchup_dict[2020][20])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
{3101: [3460], 3102: [3312, 3113, 3119, 3360, 3294, 3339], 3103: [3382, 3236, 3464, 3178, 3276], 3104: [3155, 3273, 3375, 3214], 3105: [3361, 3304, 3465, 3316], 3106: [3240, 3376, 3155], 3107: [3162, 3165, 3265, 3393, 3131, 3248], 3108: [3398, 3272, 3372, 3317, 3431], 3109: [], 3110: [3437, 3219, 3325, 3206], 3111: [3433, 3189, 3376, 3205], 3112: [3315, 3365, 3152, 3400, 3341], 3113: [3102, 3119, 3278], 3114: [3146, 3401, 3349, 3261, 3194], 3115: [3157, 3454, 3228], 3116: [3309, 3270, 3331, 3392, 3125], 3117: [3409, 3369, 3177, 3299], 3118: [], 3119: [3102, 3456, 3113, 3383, 3162], 3120: [3459, 3330, 3412], 3121: [], 3122: [3290, 3191, 3228], 3123: [3236, 3462, 3139, 3237, 3153], 3124: [3306, 3212, 3223, 3378, 3249], 3125: [3151, 3416, 3

In [46]:
# Get matchup history of all teams' opponents on any given day
l30_opp_matchup_dict = {}

for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(3101,3500):
            opp_matchups = []
            for o in l30_matchup_dict[seas][daynum][t]:
                opp_matchups += l30_matchup_dict[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    l30_opp_matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_opp_matchup_dict[2020][20][3101])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
[3167, 3462, 3101]


In [47]:
l30_wp_dict = {h: {g: f.groupby('TeamID')['LWP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in l30_data_continuous.groupby('Season')}

In [48]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_continuous['LOWP'] = l30_data_continuous.apply(lambda row: 
    np.mean([l30_wp_dict[row['Season']][row['DayNum']][x] \
             for x in l30_matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_continuous['LOOWP'] = l30_data_continuous.apply(lambda row: 
    np.mean([l30_wp_dict[row['Season']][row['DayNum']][x] \
             for x in l30_opp_matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_continuous['LSOS'] = (2*l30_data_continuous['LOWP']+l30_data_continuous['LOOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_continuous

START: 2021-03-18 04:47:08.552982-07:00
FINISH: 2021-03-18 04:49:58.604095-07:00


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum,LOWP,LOOWP,LSOS
0,2010,3102,0,46,15,47,5,17,11,20,...,0.520000,0.147059,0.033333,0.263930,0.676471,0.953079,12,1.000000,0.000000,0.666667
1,2010,3103,0,63,23,54,5,9,12,19,...,0.550000,0.100575,0.133333,0.328571,0.905172,0.700000,12,0.000000,1.000000,0.333333
2,2010,3104,0,73,26,62,5,12,16,28,...,0.280000,0.051813,0.040000,0.250627,0.945596,0.852130,12,0.000000,1.000000,0.333333
3,2010,3105,0,61,22,57,3,12,14,21,...,0.633333,0.198939,0.133333,0.198939,0.809019,0.954907,12,1.000000,0.000000,0.666667
4,2010,3107,0,56,22,57,5,17,7,9,...,0.448276,0.190217,0.125000,0.261708,0.760870,1.046832,12,1.000000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,2020,3463,0.0,57.5,21.75,55.375,4.875,15.875,9.125,14.625,...,0.545977,0.115931,0.056962,0.215373,0.860135,0.837356,129,0.631944,0.443627,0.569172
347,2020,3464,0.0,63.666667,22.5,57.666667,6.666667,21.833333,12.0,14.166667,...,0.524691,0.090244,0.097674,0.191934,0.931707,1.064140,129,0.461310,0.495859,0.472826
348,2020,3465,0.0,81.0,28.0,62.0,11.833333,30.5,13.166667,18.0,...,0.598765,0.077229,0.058201,0.167261,1.072374,0.990187,129,0.476190,0.514286,0.488889
349,2020,3466,0.142857,72.0,25.285714,55.857143,8.142857,23.857143,13.285714,17.428571,...,0.470199,0.105949,0.049107,0.240620,1.026895,0.897227,129,0.448980,0.499008,0.465656


In [49]:
season_data_c = pd.read_csv('WData/WRegularSeasonCompactResults.csv')
tourney_data_c = pd.read_csv('WData/WNCAATourneyCompactResults.csv')

cols = ['Season','DayNum','WTeamID','WScore','LTeamID','LScore','WLoc']

season_game_data = pd.merge(season_data_c[cols], season_data_continuous, left_on=['Season','DayNum','WTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])
season_game_data = pd.merge(season_game_data, l30_data_continuous, left_on=['Season','DayNum','WTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])
season_game_data = pd.merge(season_game_data, season_data_continuous, left_on=['Season','DayNum','LTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])
season_game_data = pd.merge(season_game_data, l30_data_continuous, left_on=['Season','DayNum','LTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])

season_game_data['Tourney'] = 0

season_data_final = pd.merge(season_data_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], season_data_continuous).drop(columns='DayNum')
l30_data_final = pd.merge(l30_data_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], l30_data_continuous).drop(columns='DayNum')

tourney_game_data = pd.merge(tourney_data_c[cols], season_data_final, left_on=['Season','WTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_game_data = pd.merge(tourney_game_data, l30_data_final, left_on=['Season','WTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_game_data = pd.merge(tourney_game_data, season_data_final, left_on=['Season','LTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_game_data = pd.merge(tourney_game_data, l30_data_final, left_on=['Season','LTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

tourney_game_data['Tourney'] = 1

all_game_data = season_game_data.append(tourney_game_data)

cols = []
for c in list(all_game_data.columns[7:-1]):
    if c[-1:] == 'x':
        cols += [c[:-1]+'y']
    else:
        cols += [c[:-1]+'x']

drop_cols1 = ['DayNum','LTeamID','LScore','WLoc']
drop_cols2 = ['DayNum','WTeamID','WScore','WLoc']

all_game_data1 = all_game_data.drop(columns=drop_cols1)
all_game_data2 = all_game_data.rename(columns=dict(zip(all_game_data.columns[7:-1], cols))).drop(columns=drop_cols2)

all_game_data = all_game_data1.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
    .append(all_game_data2.rename(columns={'LTeamID':'TeamID','LScore':'Score'})).fillna(0)

all_game_data

Unnamed: 0,Season,TeamID,Score,NumOT_x,TScore_x,TFGM_x,TFGA_x,TFGM3_x,TFGA3_x,TFTM_x,...,LOAstP_y,LOStlP_y,LOBlkP_y,LOTOP_y,LOffRtg_y,LDefRtg_y,LOWP_y,LOOWP_y,LSOS_y,Tourney
0,2010,3198,64,0.000000,71.000000,25.000000,51.000000,13.000000,23.000000,8.000000,...,0.518519,0.082192,0.108108,0.303867,0.890411,1.091160,1.000000,0.000000,0.666667,0
1,2010,3219,73,1.000000,67.000000,24.000000,59.000000,4.000000,12.000000,15.000000,...,0.545455,0.072993,0.087719,0.321839,0.802920,0.988506,1.000000,0.000000,0.666667,0
2,2010,3298,68,1.000000,70.000000,26.000000,63.000000,7.000000,19.000000,11.000000,...,0.444444,0.114943,0.045455,0.158151,0.988506,0.802920,0.000000,1.000000,0.333333,0
3,2010,3394,96,0.000000,65.000000,25.000000,64.000000,2.000000,18.000000,13.000000,...,0.280000,0.051813,0.040000,0.250627,0.945596,0.852130,0.000000,1.000000,0.333333,0
4,2010,3399,81,0.000000,68.000000,25.000000,63.000000,4.000000,21.000000,14.000000,...,0.520000,0.147059,0.033333,0.263930,0.676471,0.953079,1.000000,0.000000,0.666667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,2019,3140,63,0.064516,69.225806,25.483871,59.580645,6.967742,20.193548,11.290323,...,0.520000,0.115568,0.089347,0.175379,1.033311,0.824621,0.459656,0.521259,0.480190,1
626,2019,3283,46,0.064516,72.580645,27.225806,63.774194,4.354839,13.548387,13.774194,...,0.520000,0.115568,0.089347,0.175379,1.033311,0.824621,0.459656,0.521259,0.480190,1
627,2019,3200,49,0.030303,60.545455,22.121212,56.090909,7.787879,23.969697,8.515152,...,0.533040,0.090393,0.082569,0.205718,1.116625,1.042538,0.435714,0.565986,0.479138,1
628,2019,3397,77,0.032258,74.548387,28.064516,65.387097,5.258065,16.096774,13.161290,...,0.615385,0.071429,0.079404,0.234035,1.086081,0.997823,0.557292,0.491071,0.535218,1


In [50]:
def sum_sq_err(y_obs, y_pred):
    """
    inputs: y_obs, array of observed target values
            y_pred, array of predicted target values
    output: sse, sum of squared errors
    """
    return sum((y_obs-y_pred)**2)

def aic(y_obs, y_pred, k):
    """
    inputs: y_obs, array of observed target values
            y_pred, array of predicted target values
            k, number of features in model
    output: AIC (Akaike Information Criterion) for OLS, measure that rewards simple models
    """
    sse = sum_sq_err(y_obs, y_pred)
    n = len(y_pred)
    return 2*k + n*math.log(sse/n)

def selection(df_x, df_y, parents, threshold):
    """
    inputs: df_x, dataframe of dependent variable observations
            df_y, dataframe of independent variable observations
            parents, list of 'parent' vectors which determine features being used
            threshold, float on [0,1] which determines what portion of the population 'survives'
    output: portion of parents which are deemed most fit (based on AIC)
    """
    fitness = {}
    th = int(len(parents)*threshold)
    for s in parents:
        p = []
        for i in range(len(s)):
            if s[i]:
                p += [i]
        X = df_x.iloc[:,p].values
        y = df_y.values
        lm = LinearRegression().fit(X, y)
        fitness[aic(y, lm.predict(X), len(p))] = s
    best = heapq.nsmallest(th, fitness.keys())
    return [fitness[x] for x in best]

def crossover(parents, O):
    """
    inputs: parents, list of 'parent' vectors which determine features being used
            O, number of offspring generated
    output: offspring, list of 'child' vectors generated from parents
    """
    if len(parents) < 2:
        return
    l = len(parents[0])
    offspring = []
    for i in range(O):
        inds = np.random.choice(len(parents), 2, replace=False)
        cutoff = np.random.randint(1,l-1)
        offspring += [np.append(parents[inds[0]][:cutoff],parents[inds[1]][cutoff:])]
    return offspring

def mutation(offspring, r, P):
    """
    inputs: offspring, new generation of vectors which determine features being used
            r, float on [0,1] - rate at which mutation occurs
            P, number of parameters to choose from
    output: new list of features which has some random removals and additions
    """
    if len(offspring) < 1 or r > 1 or r < 0:
        return
    mutated = []
    for c in offspring:
        m = np.copy(c)
        for i in range(P):
            # Mutate with rate, r
            if np.random.choice([True,False], 1, p=[r,1-r]):
                m[i] = 1-c[i]
        mutated += [m]
    return mutated

def evolution(df_x, df_y, G, N = 500, t = 0.2, vocal=False, vocal_int=100):
    """
    inputs: df_x, dataframe of dependent variable observations
            df_y, dataframe of independent variable observations
            G, number of generations
    output: most fit feature vector after G generations
    """
    P = len(df_x.columns)
    r = 1/P
    
    # Initialize G0 of parents (randomly select features for N parents)
    ps = np.random.randint(2, size=(N,P))
    
    # Go through evolutionary process (selection, crossover, mutation) for G generations
    gen = ps
    num_children = int(N-N*t)
    for g in range(G):
        survivors = selection(df_x, df_y, gen, t)
        children = mutation(crossover(survivors, num_children), r, P)
        gen = survivors + children
        if vocal and g%vocal_int==0:
            print('GENERATION {} COMPLETED: {}'.format(g, datetime.now(pytz.timezone('US/Pacific'))))
    
    best = selection(df_x, df_y, gen, 1/N)[0]
    return best

In [51]:
only_tourney_data = all_game_data[all_game_data['Tourney']==1]
train_data, test_data = train_test_split(only_tourney_data, test_size=0.1, stratify=only_tourney_data[['Tourney']])

train_data = train_data.drop(columns=['Min_x','Min_y','LMin_x','LMin_y','G_x','G_y','LG_x','LG_y',\
                                     'Wins_x','Wins_y','LWins_x','LWins_y'])

# Set number of generations for genetic algorithm
num_gen = 10000

DFx = train_data.iloc[:,3:]
DFy = train_data['Score']

fs = DFx.columns

# Run evolutionary algorithm for desired nnumber of generations
use = evolution(DFx, DFy, num_gen, vocal=True, vocal_int=500)

features = []
for i in range(len(fs)):
    if use[i]:
        features += [fs[i]]
print(features)

pd.DataFrame(features).rename(columns={0:'feature'}).to_csv('WFeatures/featuresG'+str(num_gen)+'.txt', index=None)

GENERATION 0 COMPLETED: 2021-03-18 04:50:58.929228-07:00
GENERATION 500 COMPLETED: 2021-03-18 05:25:39.300035-07:00
GENERATION 1000 COMPLETED: 2021-03-18 05:59:51.132854-07:00
GENERATION 1500 COMPLETED: 2021-03-18 06:34:20.963894-07:00
GENERATION 2000 COMPLETED: 2021-03-18 07:08:57.275445-07:00
GENERATION 2500 COMPLETED: 2021-03-18 07:43:12.714655-07:00
GENERATION 3000 COMPLETED: 2021-03-18 08:17:29.771289-07:00
GENERATION 3500 COMPLETED: 2021-03-18 08:51:53.932332-07:00
GENERATION 4000 COMPLETED: 2021-03-18 09:26:16.324732-07:00
GENERATION 4500 COMPLETED: 2021-03-18 10:07:23.712870-07:00
GENERATION 5000 COMPLETED: 2021-03-18 10:51:00.280807-07:00
GENERATION 5500 COMPLETED: 2021-03-18 11:41:16.824981-07:00
GENERATION 6000 COMPLETED: 2021-03-18 12:36:31.596163-07:00
GENERATION 6500 COMPLETED: 2021-03-18 13:30:37.031254-07:00
GENERATION 7000 COMPLETED: 2021-03-18 14:19:52.081823-07:00
GENERATION 7500 COMPLETED: 2021-03-18 15:07:09.660108-07:00
GENERATION 8000 COMPLETED: 2021-03-18 15:49:

In [301]:
train_data, test_data = train_test_split(all_game_data, test_size=0.1, stratify=all_game_data[['Tourney']])

train_data = train_data.drop(columns=['Min_x','Min_y','LMin_x','LMin_y','G_x','G_y','LG_x','LG_y',\
                                     'Wins_x','Wins_y','LWins_x','LWins_y','TPF_x','OPF_x','LTPF_x',\
                                     'LOPF_x','TPF_y','OPF_y','LTPF_y','LOPF_y'])

# Set number of generations for genetic algorithm
num_gen = 1000

DFx = train_data.iloc[:,3:]
DFy = train_data['Score']

fs = DFx.columns

# Run evolutionary algorithm for desired nnumber of generations
use = evolution(DFx, DFy, num_gen, N=100, vocal=True, vocal_int=5)

features = []
for i in range(len(fs)):
    if use[i]:
        features += [fs[i]]
print(features)

pd.DataFrame(features).rename(columns={0:'feature'}).to_csv('WFeatures/allg_featuresG'+str(num_gen)+'.txt', index=None)

GENERATION 0 COMPLETED: 2021-03-19 17:16:43.509424-07:00
GENERATION 5 COMPLETED: 2021-03-19 17:22:21.675181-07:00
GENERATION 10 COMPLETED: 2021-03-19 17:28:32.678198-07:00
GENERATION 15 COMPLETED: 2021-03-19 17:35:09.501680-07:00
GENERATION 20 COMPLETED: 2021-03-19 17:49:33.820008-07:00
GENERATION 25 COMPLETED: 2021-03-19 17:55:50.927896-07:00
GENERATION 30 COMPLETED: 2021-03-19 18:01:37.879004-07:00
GENERATION 35 COMPLETED: 2021-03-19 18:07:49.192937-07:00
GENERATION 40 COMPLETED: 2021-03-19 18:57:18.843217-07:00
GENERATION 45 COMPLETED: 2021-03-19 19:03:43.381850-07:00
GENERATION 50 COMPLETED: 2021-03-19 19:09:58.523713-07:00
GENERATION 55 COMPLETED: 2021-03-19 19:15:44.776226-07:00
GENERATION 60 COMPLETED: 2021-03-19 19:21:30.467564-07:00
GENERATION 65 COMPLETED: 2021-03-19 19:27:09.995820-07:00
GENERATION 70 COMPLETED: 2021-03-19 19:32:35.492461-07:00
GENERATION 75 COMPLETED: 2021-03-19 19:37:58.684720-07:00
GENERATION 80 COMPLETED: 2021-03-19 19:41:56.615329-07:00
GENERATION 85 CO

GENERATION 700 COMPLETED: 2021-03-22 12:43:07.919864-07:00
GENERATION 705 COMPLETED: 2021-03-22 12:47:19.775834-07:00
GENERATION 710 COMPLETED: 2021-03-22 12:51:24.771360-07:00
GENERATION 715 COMPLETED: 2021-03-22 12:55:18.695014-07:00
GENERATION 720 COMPLETED: 2021-03-22 12:59:19.615328-07:00
GENERATION 725 COMPLETED: 2021-03-22 13:03:37.915082-07:00
GENERATION 730 COMPLETED: 2021-03-22 13:08:00.193026-07:00
GENERATION 735 COMPLETED: 2021-03-22 13:12:30.840480-07:00
GENERATION 740 COMPLETED: 2021-03-22 13:17:40.788697-07:00
GENERATION 745 COMPLETED: 2021-03-22 13:22:28.126303-07:00
GENERATION 750 COMPLETED: 2021-03-22 13:27:25.461456-07:00
GENERATION 755 COMPLETED: 2021-03-22 13:31:51.559196-07:00
GENERATION 760 COMPLETED: 2021-03-22 13:36:16.645436-07:00
GENERATION 765 COMPLETED: 2021-03-22 13:41:03.912859-07:00
GENERATION 770 COMPLETED: 2021-03-22 13:45:54.738265-07:00
GENERATION 775 COMPLETED: 2021-03-22 13:51:18.418672-07:00
GENERATION 780 COMPLETED: 2021-03-22 13:57:14.010486-07:

In [302]:
len(features)

107

In [53]:
features = pd.read_csv('WFeatures/featuresG10000.txt')
fs = features['feature'].values

X = train_data.iloc[:,3:][fs].values
y = train_data['Score'].values

lm = LinearRegression().fit(X, y)

train_data['PScore'] = lm.intercept_

for i in range(len(fs)):
    train_data['PScore'] = train_data.apply(lambda row: row['PScore'] + row[fs[i]]*lm.coef_[i], axis=1)
    
sse = sum_sq_err(y, lm.predict(X))
rmse = np.sqrt(sse/len(y))

print('TRAIN RMSE: {}'.format(rmse))

test_data['PScore'] = lm.intercept_

for i in range(len(fs)):
    test_data['PScore'] = test_data.apply(lambda row: row['PScore'] + row[fs[i]]*lm.coef_[i], axis=1)
    
sse = sum_sq_err(test_data['Score'].values, test_data['PScore'].values)
rmse = np.sqrt(sse/len(test_data['Score'].values))

print('TEST RMSE: {}'.format(rmse))

test_data[['Season','TeamID','Score','PScore']]

TRAIN RMSE: 9.155544076196652
TEST RMSE: 3.561555824644083


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Season,TeamID,Score,PScore
429,2016,3390,66,73.107268
603,2019,3323,81,78.807467
414,2016,3393,51,66.527035
51,2010,3246,68,69.198305
561,2018,3195,70,70.791167
...,...,...,...,...
185,2012,3382,66,59.433366
134,2012,3343,64,62.212737
543,2018,3346,46,54.000150
620,2019,3333,80,73.363786


In [57]:
season_data_std_score = season_data[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

tourney_data_sc = pd.DataFrame({0:[],1:[],2:[]})

tourney_teams = only_tourney_data.groupby('Season')['TeamID'].apply(set).apply(list).to_dict()
for s in sorted(tourney_teams.keys()):
    teams = sorted(tourney_teams[s])
    for i in range(len(teams)):
        for j in range(i+1, len(teams)):
            tourney_data_sc = tourney_data_sc.append(pd.DataFrame([s, teams[i], teams[j]]).T)

tourney_data_sc = tourney_data_sc.rename(columns={0:'Season',1:'TeamID1',2:'TeamID2'})
tourney_data_sc = tourney_data_sc[tourney_data_sc['Season']>=2015]

predict_data_1 = pd.merge(tourney_data_sc, season_data_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_1 = pd.merge(predict_data_1, l30_data_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_1 = pd.merge(predict_data_1, season_data_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_1 = pd.merge(predict_data_1, l30_data_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

In [59]:
predict_data_1['PScore1'] = lm.intercept_

for i in range(len(fs)):
    predict_data_1['PScore1'] = predict_data_1.apply(lambda row: row['PScore1'] + row[fs[i]]*lm.coef_[i], axis=1)

predict_data_1['PScore2'] = lm.intercept_

fs2 = [x[:-1]+'y' if x[-1] == 'x' else x[:-1]+'x' for x in fs]
for i in range(len(fs2)):
    predict_data_1['PScore2'] = predict_data_1.apply(lambda row: row['PScore2'] + row[fs2[i]]*lm.coef_[i], axis=1)
    
predictions_1 = pd.merge(predict_data_1[['Season','TeamID1','TeamID2','PScore1','PScore2']], season_data_std_score, \
        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
predictions_1 = pd.merge(predictions_1, season_data_std_score, \
        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

predictions_1['m'] = predictions_1['PScore1']-predictions_1['PScore2']
predictions_1['sd'] = predictions_1.apply(lambda row: math.sqrt(row['StdScore_x']*row['StdScore_y']), axis=1)

predictions_1['ID'] = predictions_1.apply(lambda row: '{}_{}_{}'.format(int(row['Season']), int(row['TeamID1']), \
                                                                        int(row['TeamID2'])), axis=1)
predictions_1['Pred'] = norm.cdf(predictions_1['m'], 0, predictions_1['sd'])

predictions_1[['ID','Pred']].to_csv('SubmissionStage1.csv', index=None)

predictions_1[['ID','Pred']]

Unnamed: 0,ID,Pred
0,2015_3106_3107,0.000791
1,2015_3106_3110,0.000581
2,2015_3107_3110,0.630924
3,2015_3106_3113,0.000006
4,2015_3107_3113,0.164908
...,...,...
10075,2019_3401_3460,0.657996
10076,2019_3406_3460,0.279117
10077,2019_3413_3460,0.593355
10078,2019_3416_3460,0.545082


## Predict upcoming tournament results on current season data

In [69]:
season_data_2021 = pd.read_csv('WData/WDataFiles_Stage2/WRegularSeasonDetailedResults.csv')

In [70]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR',\
           'TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst',\
           'OTO','OStl','OBlk','OPF']

all_cols = ['Season','TeamID','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR','TAst','TTO',\
            'TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst','OTO','OStl',\
            'OBlk','OPF','Wins','G','WP','AWP','ANWP','Min','TFGP','TFGP3','TFTP','TFG3R','TFTR','TEFG','TTFG','OFGP',\
            'OFGP3','OFTP','OFG3R','OFTR','OEFG','OTFG','TPoss','TPace','OPoss','OPace','TTRP','TORP','TAstP','TStlP',\
            'TBlkP','TTOP','OTRP','OORP','OAstP','OStlP','OBlkP','OTOP','OffRtg','DefRtg','DayNum']

season_data_2021_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2021,2022):
    max_daynum = max(season_data_2021[(season_data_2021['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['Wins'] = 1
            tm2['Wins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['AWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['AWins'] = 0
            tm1['ALosses'] = 0
            tm2['ALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['NWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['NWins'] = 0
            tm1['NLosses'] = 0
            tm2['NLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['G'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['G','Wins',\
                                                                'AWins','ALosses','NWins','NLosses'], agg_funcs)))

            # Game statistics
            tm['WP'] = tm['Wins']/tm['G']
            tm['AWP'] = tm['AWins']/(tm['AWins']+tm['ALosses'])
            tm['ANWP'] = (tm['AWins']+tm['NWins'])/(tm['AWins']+tm['ALosses']+tm['NWins']+tm['NLosses'])
            tm['Min'] = 40*tm['G']+5*tm['NumOT']

            # Team shooting percentages
            tm['TFGP'] = tm['TFGM']/tm['TFGA']
            tm['TFGP3'] = tm['TFGM3']/tm['TFGA3']
            tm['TFTP'] = tm['TFTM']/tm['TFTA']
            tm['TFG3R'] = tm['TFGA3']/tm['TFGA']
            tm['TFTR'] = tm['TFTA']/tm['TFGA']
            tm['TEFG'] = (0.5*tm['TFGM3']+tm['TFGM'])/tm['TFGA']
            tm['TTFG'] = tm['TScore']/(2*(0.44*tm['TFTA']+tm['TFGA']))

            # Opponent shooting percentages
            tm['OFGP'] = tm['OFGM']/tm['OFGA']
            tm['OFGP3'] = tm['OFGM3']/tm['OFGA3']
            tm['OFTP'] = tm['OFTM']/tm['OFTA']
            tm['OFG3R'] = tm['OFGA3']/tm['OFGA']
            tm['OFTR'] = tm['OFTA']/tm['OFGA']
            tm['OEFG'] = (0.5*tm['OFGM3']+tm['OFGM'])/tm['OFGA']
            tm['OTFG'] = tm['OScore']/(2*(0.44*tm['OFTA']+tm['OFGA']))

            # Team possession stats
            tm['TPoss'] = tm['TFGA']-tm['TOR']+tm['TTO']+0.4*tm['TFTA']
            tm['TPace'] = 40*tm['TPoss']/tm['Min']

            # Opponent possession stats
            tm['OPoss'] = tm['OFGA']-tm['OOR']+tm['OTO']+0.4*tm['OFTA']
            tm['OPace'] = 40*tm['OPoss']/tm['Min']

            # Team stat percentages
            tm['TTRP'] = (tm['TOR']+tm['TDR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['TORP'] = tm['TOR']/(tm['TOR']+tm['ODR'])
            tm['TAstP'] = tm['TAst']/tm['TFGM']
            tm['TStlP'] = tm['TStl']/tm['OPoss']
            tm['TBlkP'] = tm['TBlk']/(tm['OFGA']-tm['OFGA3'])
            tm['TTOP'] = tm['TTO']/tm['TPoss']

            # Opponent stat percentages
            tm['OTRP'] = (tm['OOR']+tm['ODR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['OORP'] = tm['OOR']/(tm['OOR']+tm['TDR'])
            tm['OAstP'] = tm['OAst']/tm['OFGM']
            tm['OStlP'] = tm['OStl']/tm['TPoss']
            tm['OBlkP'] = tm['OBlk']/(tm['TFGA']-tm['TFGA3'])
            tm['OTOP'] = tm['OTO']/tm['OPoss']

            # Ratings
            tm['OffRtg'] = tm['TScore']/tm['TPoss']
            tm['DefRtg'] = tm['OScore']/tm['OPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['AWins','ALosses','NWins','NLosses'])

            season_data_2021_continuous = season_data_2021_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

season_data_2021_continuous

2021 season completed.


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2021,3102,0,56,21,66,2,22,12,18,...,0.170604,0.573171,0.242424,0.521739,0.104987,0.000000,0.268097,0.734908,0.965147,24
1,2021,3104,0,83,31,63,11,27,10,11,...,0.094637,0.460317,0.294118,0.555556,0.031546,0.111111,0.190058,1.309148,0.994152,24
2,2021,3108,0,65,24,56,2,13,15,22,...,0.162242,0.620000,0.352941,0.588235,0.103245,0.046512,0.234807,0.958702,1.367403,24
3,2021,3111,0,74,24,72,9,31,17,25,...,0.177215,0.481928,0.315789,0.454545,0.037975,0.170732,0.228426,0.936709,0.862944,24
4,2021,3113,0,56,20,58,4,15,12,18,...,0.323034,0.455696,0.297297,0.388889,0.168539,0.139535,0.303468,0.786517,0.679191,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,2021,3467,0.133333,64.666667,23.4,59.066667,7.866667,22.533333,10.0,14.0,...,0.236228,0.508459,0.318015,0.550409,0.131653,0.109489,0.229418,0.905696,0.953550,132
339,2021,3468,0.045455,62.772727,21.909091,60.454545,5.409091,18.363636,13.545455,18.409091,...,0.211527,0.547766,0.325373,0.548183,0.117719,0.103672,0.216687,0.846720,1.057285,132
340,2021,3469,0.0,48.5,17.0,55.5,5.0,18.5,9.5,16.0,...,0.248580,0.541096,0.289855,0.607843,0.163352,0.067568,0.161517,0.688920,1.004213,132
341,2021,3470,0.0,58.571429,18.428571,54.333333,5.047619,19.761905,16.666667,23.238095,...,0.204456,0.550036,0.308271,0.521905,0.105285,0.077135,0.213922,0.835484,0.943535,132


In [71]:
# Get matchup history of all teams on any given day
matchup_dict_2021 = {}
for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)]
        d_team = {}
        for t in range(3101,3500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(matchup_dict_2021[2021][100][3101])

2021 season completed.
[3294, 3391, 3394, 3368, 3146, 3249, 3401, 3412, 3358, 3372, 3249]


In [72]:
# Get matchup history of all teams' opponents on any given day
opp_matchup_dict_2021 = {}

for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(3101,3500):
            opp_matchups = []
            for o in matchup_dict_2021[seas][daynum][t]:
                opp_matchups += matchup_dict_2021[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    opp_matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(opp_matchup_dict_2021[2021][100][3101])

2021 season completed.
[3176, 3285, 3286, 3225, 3186, 3186, 3319, 3242, 3303, 3101, 3226, 3226, 3461, 3285, 3286, 3225, 3319, 3381, 3316, 3468, 3378, 3412, 3101, 3316, 3468, 3195, 3195, 3252, 3252, 3309, 3455, 3242, 3410, 3410, 3402, 3368, 3372, 3101, 3249, 3108, 3394, 3358, 3322, 3309, 3407, 3408, 3311, 3101, 3270, 3146, 3115, 3115, 3311, 3368, 3322, 3309, 3124, 3395, 3243, 3116, 3114, 3280, 3270, 3358, 3372, 3101, 3470, 3311, 3394, 3223, 3101, 3401, 3361, 3201, 3401, 3395, 3402, 3400, 3358, 3372, 3101, 3249, 3177, 3249, 3400, 3114, 3101, 3358, 3349, 3322, 3196, 3246, 3116, 3280, 3281, 3120, 3208, 3261, 3116, 3261, 3151, 3212, 3391, 3101, 3379, 3379, 3273, 3317, 3317, 3150, 3150, 3292, 3292, 3431, 3431, 3427, 3146, 3249, 3101, 3223, 3311, 3307, 3401, 3311, 3368, 3372, 3388, 3433, 3238, 3426, 3120, 3411, 3309, 3394, 3146, 3230, 3322, 3249, 3101, 3358, 3309, 3113, 3222, 3470, 3311, 3394, 3223, 3101, 3401, 3361, 3201, 3401, 3395, 3402, 3400, 3358, 3372, 3101]


In [73]:
wp_dict_2021 = {h: {g: f.groupby('TeamID')['WP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in season_data_2021_continuous.groupby('Season')}

In [74]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

season_data_2021_continuous['OWP'] = season_data_2021_continuous.apply(lambda row: 
    np.mean([wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_2021_continuous['OOWP'] = season_data_2021_continuous.apply(lambda row: 
    np.mean([wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in opp_matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_2021_continuous['SOS'] = (2*season_data_2021_continuous['OWP']+season_data_2021_continuous['OOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

season_data_2021_continuous

START: 2021-03-18 20:18:03.959561-07:00
FINISH: 2021-03-18 20:18:49.969017-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS
0,2021,3102,0,56,21,66,2,22,12,18,...,0.521739,0.104987,0.000000,0.268097,0.734908,0.965147,24,1.000000,0.000000,0.666667
1,2021,3104,0,83,31,63,11,27,10,11,...,0.555556,0.031546,0.111111,0.190058,1.309148,0.994152,24,0.000000,1.000000,0.333333
2,2021,3108,0,65,24,56,2,13,15,22,...,0.588235,0.103245,0.046512,0.234807,0.958702,1.367403,24,1.000000,0.000000,0.666667
3,2021,3111,0,74,24,72,9,31,17,25,...,0.454545,0.037975,0.170732,0.228426,0.936709,0.862944,24,0.000000,1.000000,0.333333
4,2021,3113,0,56,20,58,4,15,12,18,...,0.388889,0.168539,0.139535,0.303468,0.786517,0.679191,24,0.000000,1.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,2021,3467,0.133333,64.666667,23.4,59.066667,7.866667,22.533333,10.0,14.0,...,0.550409,0.131653,0.109489,0.229418,0.905696,0.953550,132,0.526277,0.470121,0.507558
339,2021,3468,0.045455,62.772727,21.909091,60.454545,5.409091,18.363636,13.545455,18.409091,...,0.548183,0.117719,0.103672,0.216687,0.846720,1.057285,132,0.464929,0.470568,0.466809
340,2021,3469,0.0,48.5,17.0,55.5,5.0,18.5,9.5,16.0,...,0.607843,0.163352,0.067568,0.161517,0.688920,1.004213,132,0.467703,0.458501,0.464636
341,2021,3470,0.0,58.571429,18.428571,54.333333,5.047619,19.761905,16.666667,23.238095,...,0.521905,0.105285,0.077135,0.213922,0.835484,0.943535,132,0.460916,0.476870,0.466234


In [75]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR',\
        'LTDR','LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA',\
        'LOOR','LODR','LOAst','LOTO','LOStl','LOBlk','LOPF']

all_cols = ['Season','TeamID','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR','LTDR',\
            'LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA','LOOR',\
            'LODR','LOAst','LOTO','LOStl','LOBlk','LOPF','LWins','LG','LWP','LAWP','LANWP','LMin','LTFGP','LTFGP3',\
            'LTFTP','LTFG3R','LTFTR','LTEFG','LTTFG','LOFGP','LOFGP3','LOFTP','LOFG3R','LOFTR','LOEFG','LOTFG',\
            'LTPoss','LTPace','LOPoss','LOPace','LTTRP','LTORP','LTAstP','LTStlP','LTBlkP','LTTOP','LOTRP','LOORP',\
            'LOAstP','LOStlP','LOBlkP','LOTOP','LOffRtg','LDefRtg','DayNum']

l30_data_2021_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2021,2022):
    max_daynum = max(season_data_2021[(season_data_2021['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)\
                         &(season_data_2021['DayNum']>=daynum-30)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['LWins'] = 1
            tm2['LWins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['LAWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['LAWins'] = 0
            tm1['LALosses'] = 0
            tm2['LALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['LNWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['LNWins'] = 0
            tm1['LNLosses'] = 0
            tm2['LNLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['LG'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['LG','LWins',\
                                                                'LAWins','LALosses','LNWins','LNLosses'], agg_funcs)))

            # Game statistics
            tm['LWP'] = tm['LWins']/tm['LG']
            tm['LAWP'] = tm['LAWins']/(tm['LAWins']+tm['LALosses'])
            tm['LANWP'] = (tm['LAWins']+tm['LNWins'])/(tm['LAWins']+tm['LALosses']+tm['LNWins']+tm['LNLosses'])
            tm['LMin'] = 40*tm['LG']+5*tm['LNumOT']

            # Team shooting percentages
            tm['LTFGP'] = tm['LTFGM']/tm['LTFGA']
            tm['LTFGP3'] = tm['LTFGM3']/tm['LTFGA3']
            tm['LTFTP'] = tm['LTFTM']/tm['LTFTA']
            tm['LTFG3R'] = tm['LTFGA3']/tm['LTFGA']
            tm['LTFTR'] = tm['LTFTA']/tm['LTFGA']
            tm['LTEFG'] = (0.5*tm['LTFGM3']+tm['LTFGM'])/tm['LTFGA']
            tm['LTTFG'] = tm['LTScore']/(2*(0.44*tm['LTFTA']+tm['LTFGA']))

            # Opponent shooting percentages
            tm['LOFGP'] = tm['LOFGM']/tm['LOFGA']
            tm['LOFGP3'] = tm['LOFGM3']/tm['LOFGA3']
            tm['LOFTP'] = tm['LOFTM']/tm['LOFTA']
            tm['LOFG3R'] = tm['LOFGA3']/tm['LOFGA']
            tm['LOFTR'] = tm['LOFTA']/tm['LOFGA']
            tm['LOEFG'] = (0.5*tm['LOFGM3']+tm['LOFGM'])/tm['LOFGA']
            tm['LOTFG'] = tm['LOScore']/(2*(0.44*tm['LOFTA']+tm['LOFGA']))

            # Team possession stats
            tm['LTPoss'] = tm['LTFGA']-tm['LTOR']+tm['LTTO']+0.4*tm['LTFTA']
            tm['LTPace'] = 40*tm['LTPoss']/tm['LMin']

            # Opponent possession stats
            tm['LOPoss'] = tm['LOFGA']-tm['LOOR']+tm['LOTO']+0.4*tm['LOFTA']
            tm['LOPace'] = 40*tm['LOPoss']/tm['LMin']

            # Team stat percentages
            tm['LTTRP'] = (tm['LTOR']+tm['LTDR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LTORP'] = tm['LTOR']/(tm['LTOR']+tm['LODR'])
            tm['LTAstP'] = tm['LTAst']/tm['LTFGM']
            tm['LTStlP'] = tm['LTStl']/tm['LOPoss']
            tm['LTBlkP'] = tm['LTBlk']/(tm['LOFGA']-tm['LOFGA3'])
            tm['LTTOP'] = tm['LTTO']/tm['LTPoss']

            # Opponent stat percentages
            tm['LOTRP'] = (tm['LOOR']+tm['LODR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LOORP'] = tm['LOOR']/(tm['LOOR']+tm['LTDR'])
            tm['LOAstP'] = tm['LOAst']/tm['LOFGM']
            tm['LOStlP'] = tm['LOStl']/tm['LTPoss']
            tm['LOBlkP'] = tm['LOBlk']/(tm['LTFGA']-tm['LTFGA3'])
            tm['LOTOP'] = tm['LOTO']/tm['LOPoss']

            # Ratings
            tm['LOffRtg'] = tm['LTScore']/tm['LTPoss']
            tm['LDefRtg'] = tm['LOScore']/tm['LOPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['LAWins','LALosses','LNWins','LNLosses'])

            l30_data_2021_continuous = l30_data_2021_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

l30_data_2021_continuous

2021 season completed.


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LTTOP,LOTRP,LOORP,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum
0,2021,3102,0,56,21,66,2,22,12,18,...,0.170604,0.573171,0.242424,0.521739,0.104987,0.000000,0.268097,0.734908,0.965147,24
1,2021,3104,0,83,31,63,11,27,10,11,...,0.094637,0.460317,0.294118,0.555556,0.031546,0.111111,0.190058,1.309148,0.994152,24
2,2021,3108,0,65,24,56,2,13,15,22,...,0.162242,0.620000,0.352941,0.588235,0.103245,0.046512,0.234807,0.958702,1.367403,24
3,2021,3111,0,74,24,72,9,31,17,25,...,0.177215,0.481928,0.315789,0.454545,0.037975,0.170732,0.228426,0.936709,0.862944,24
4,2021,3113,0,56,20,58,4,15,12,18,...,0.323034,0.455696,0.297297,0.388889,0.168539,0.139535,0.303468,0.786517,0.679191,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,2021,3466,0.0,73.0,25.333333,60.666667,10.0,28.5,12.333333,16.166667,...,0.230190,0.478836,0.202247,0.444444,0.106242,0.077720,0.196166,0.969456,1.045475,132
325,2021,3467,0.142857,69.285714,24.285714,61.714286,8.857143,23.142857,11.857143,14.571429,...,0.208416,0.508841,0.312500,0.525140,0.093291,0.122222,0.229266,0.962684,0.944976,132
326,2021,3468,0.0,67.428571,24.571429,60.857143,6.857143,18.0,11.428571,16.142857,...,0.211051,0.549065,0.328283,0.579439,0.128550,0.126667,0.216037,0.905602,1.080187,132
327,2021,3470,0.0,57.75,17.5,50.125,4.875,18.875,17.875,23.0,...,0.196683,0.526423,0.291498,0.524691,0.098342,0.072000,0.235690,0.890860,0.856715,132


In [78]:
# Get matchup history of all teams on any given day
l30_matchup_dict_2021 = {}
for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)&\
                         (season_data_2021['DayNum']>=daynum-30)]
        d_team = {}
        for t in range(3101,3500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    l30_matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_matchup_dict_2021[2021][100][3101])

2021 season completed.
[3368, 3146, 3249, 3358, 3372, 3249]


In [79]:
# Get matchup history of all teams' opponents on any given day
l30_opp_matchup_dict_2021 = {}

for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(3101,3500):
            opp_matchups = []
            for o in l30_matchup_dict_2021[seas][daynum][t]:
                opp_matchups += l30_matchup_dict_2021[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    l30_opp_matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_opp_matchup_dict_2021[2021][100][3101])

2021 season completed.
[3322, 3309, 3101, 3270, 3146, 3311, 3368, 3322, 3309, 3372, 3101, 3394, 3223, 3101, 3358, 3372, 3101, 3249, 3101, 3223, 3311, 3372, 3146, 3230, 3322, 3249, 3101, 3358, 3309, 3394, 3223, 3101, 3358, 3372, 3101]


In [80]:
l30_wp_dict_2021 = {h: {g: f.groupby('TeamID')['LWP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in l30_data_2021_continuous.groupby('Season')}

In [81]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_2021_continuous['LOWP'] = l30_data_2021_continuous.apply(lambda row: 
    np.mean([l30_wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in l30_matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_2021_continuous['LOOWP'] = l30_data_2021_continuous.apply(lambda row: 
    np.mean([l30_wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in l30_opp_matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_2021_continuous['LSOS'] = (2*l30_data_2021_continuous['LOWP']+l30_data_2021_continuous['LOOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_2021_continuous

START: 2021-03-18 20:23:38.696961-07:00
FINISH: 2021-03-18 20:23:57.776215-07:00


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum,LOWP,LOOWP,LSOS
0,2021,3102,0,56,21,66,2,22,12,18,...,0.521739,0.104987,0.000000,0.268097,0.734908,0.965147,24,1.000000,0.000000,0.666667
1,2021,3104,0,83,31,63,11,27,10,11,...,0.555556,0.031546,0.111111,0.190058,1.309148,0.994152,24,0.000000,1.000000,0.333333
2,2021,3108,0,65,24,56,2,13,15,22,...,0.588235,0.103245,0.046512,0.234807,0.958702,1.367403,24,1.000000,0.000000,0.666667
3,2021,3111,0,74,24,72,9,31,17,25,...,0.454545,0.037975,0.170732,0.228426,0.936709,0.862944,24,0.000000,1.000000,0.333333
4,2021,3113,0,56,20,58,4,15,12,18,...,0.388889,0.168539,0.139535,0.303468,0.786517,0.679191,24,0.000000,1.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,2021,3466,0.0,73.0,25.333333,60.666667,10.0,28.5,12.333333,16.166667,...,0.444444,0.106242,0.077720,0.196166,0.969456,1.045475,132,0.420304,0.468915,0.436508
325,2021,3467,0.142857,69.285714,24.285714,61.714286,8.857143,23.142857,11.857143,14.571429,...,0.525140,0.093291,0.122222,0.229266,0.962684,0.944976,132,0.402597,0.537479,0.447558
326,2021,3468,0.0,67.428571,24.571429,60.857143,6.857143,18.0,11.428571,16.142857,...,0.579439,0.128550,0.126667,0.216037,0.905602,1.080187,132,0.559524,0.507708,0.542252
327,2021,3470,0.0,57.75,17.5,50.125,4.875,18.875,17.875,23.0,...,0.524691,0.098342,0.072000,0.235690,0.890860,0.856715,132,0.380556,0.490523,0.417211


In [92]:
season_data_2021_final = pd.merge(season_data_2021_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], season_data_2021_continuous).drop(columns='DayNum')
l30_data_2021_final = pd.merge(l30_data_2021_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], l30_data_2021_continuous).drop(columns='DayNum')

season_data_std_score_2021 = season_data_2021[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

tourney_data_2021_sc = pd.DataFrame({0:[],1:[],2:[]})

tourney_teams = pd.read_csv('WData/WDataFiles_Stage2/WNCAATourneySeeds.csv')
tourney_teams_2021 = tourney_teams[tourney_teams['Season']==2021][['Season','TeamID']]
tourney_teams_2021 = tourney_teams_2021.groupby('Season')['TeamID'].apply(set).apply(list).to_dict()

for s in sorted(tourney_teams_2021.keys()):
    teams = sorted(tourney_teams_2021[s])
    for i in range(len(teams)):
        for j in range(i+1, len(teams)):
            tourney_data_2021_sc = tourney_data_2021_sc.append(pd.DataFrame([s, teams[i], teams[j]]).T)

tourney_data_2021_sc = tourney_data_2021_sc.rename(columns={0:'Season',1:'TeamID1',2:'TeamID2'})
tourney_data_2021_sc = tourney_data_2021_sc[tourney_data_2021_sc['Season']==2021]

predict_data_2 = pd.merge(tourney_data_2021_sc, season_data_2021_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_2 = pd.merge(predict_data_2, l30_data_2021_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_2 = pd.merge(predict_data_2, season_data_2021_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_2 = pd.merge(predict_data_2, l30_data_2021_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

In [93]:
predict_data_2['PScore1'] = lm.intercept_

for i in range(len(fs)):
    predict_data_2['PScore1'] = predict_data_2.apply(lambda row: row['PScore1'] + row[fs[i]]*lm.coef_[i], axis=1)

predict_data_2['PScore2'] = lm.intercept_

fs2 = [x[:-1]+'y' if x[-1] == 'x' else x[:-1]+'x' for x in fs]
for i in range(len(fs2)):
    predict_data_2['PScore2'] = predict_data_2.apply(lambda row: row['PScore2'] + row[fs2[i]]*lm.coef_[i], axis=1)
    
predictions_2 = pd.merge(predict_data_2[['Season','TeamID1','TeamID2','PScore1','PScore2']], season_data_std_score_2021, \
        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
predictions_2 = pd.merge(predictions_2, season_data_std_score_2021, \
        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

predictions_2['m'] = predictions_2['PScore1']-predictions_2['PScore2']
predictions_2['sd'] = predictions_2.apply(lambda row: math.sqrt(row['StdScore_x']*row['StdScore_y']), axis=1)

predictions_2['ID'] = predictions_2.apply(lambda row: '{}_{}_{}'.format(int(row['Season']), int(row['TeamID1']), \
                                                                        int(row['TeamID2'])), axis=1)
predictions_2['Pred'] = norm.cdf(predictions_2['m'], 0, predictions_2['sd'])

predictions_2[['ID','Pred']].sort_values(by='ID').to_csv('SubmissionStage2.csv', index=None)

predictions_2[['ID','Pred']]

Unnamed: 0,ID,Pred
0,2021_3104_3112,0.343700
1,2021_3104_3116,0.366654
2,2021_3112_3116,0.532083
3,2021_3104_3124,0.117903
4,2021_3112_3124,0.208765
...,...,...
2011,2021_3439_3461,0.972400
2012,2021_3448_3461,0.965676
2013,2021_3450_3461,0.971094
2014,2021_3452_3461,0.999619


In [96]:
team_index = pd.read_csv('WData/WDataFiles_Stage2/WTeams.csv')
results_2 = pd.merge(team_index[['TeamID','TeamName']], predictions_2, left_on='TeamID', right_on='TeamID2')
results_2 = pd.merge(team_index[['TeamID','TeamName']], results_2, left_on='TeamID', right_on='TeamID1')

results_2.drop(columns=['Season','TeamID1','TeamID2','m','sd','ID'])[['TeamName_x','TeamName_y','Pred']].to_csv('WPredictions/2021_game_predictions.csv')

In [116]:
tourney_seeds = pd.read_csv('WData/WDataFiles_Stage2/WNCAATourneySeeds.csv')
bracket = pd.read_csv('MData/MDataFiles_Stage2/MNCAATourneySeedRoundSlots.csv')

seeds_2021 = tourney_seeds[tourney_seeds['Season']==2021]

Unnamed: 0,Slot,Season,Seed_x,TeamID_x,Seed_y,TeamID_y
0,R1W1,2021,W01,3390,W16,3430
1,R1W2,2021,W02,3257,W15,3265
2,R1W3,2021,W14,3180,W03,3208
3,R1W4,2021,W04,3116,W13,3460
4,R1W5,2021,W05,3283,W12,3413
5,R1W6,2021,W06,3332,W11,3377
6,R1W7,2021,W07,3321,W10,3416
7,R1W8,2021,W08,3329,W09,3448
8,R1X1,2021,X16,3273,X01,3376
9,R1X2,2021,X02,3268,X15,3291


In [195]:
bracket_lookup = pd.merge(bracket[['Seed','GameSlot','GameRound']], bracket[['Seed','GameSlot','GameRound']],\
              on=['GameSlot','GameRound'])
bracket_lookup['Seed_xy'] = bracket_lookup['Seed_x']+bracket_lookup['Seed_y']
bracket_lookup = bracket_lookup.groupby('Seed_xy', as_index=False).min()

In [217]:
bracket_results = pd.merge(results_2, tourney_seeds, left_on=['Season','TeamID_x'], \
                           right_on=['Season','TeamID']).drop(columns='TeamID')
bracket_results = pd.merge(bracket_results, tourney_seeds, left_on=['Season','TeamID_y'], \
                           right_on=['Season','TeamID']).drop(columns='TeamID')
bracket_results['Seed_xy'] = bracket_results['Seed_x']+bracket_results['Seed_y']

bracket_results = pd.merge(bracket_results, bracket_lookup[['GameSlot','Seed_xy','GameRound']], on=['Seed_xy'])

In [289]:
team_list = list(set(bracket_results['TeamID_x'].append(bracket_results['TeamID_y'])))

winner_dict = {1:dict(zip(team_list, len(team_list)*[0])), 2:dict(zip(team_list, len(team_list)*[0])), \
               3:dict(zip(team_list, len(team_list)*[0])), 4:dict(zip(team_list, len(team_list)*[0])), \
               5:dict(zip(team_list, len(team_list)*[0])), 6:dict(zip(team_list, len(team_list)*[0]))}

N = 1000

for i in range(N):
    
    winners = team_list

    for r in range(1,7):
        sim = bracket_results.copy()
        sim = sim = sim[(sim['GameRound']==r)&(sim['TeamID_x'].isin(winners))&(sim['TeamID_y'].isin(winners))]

        sim['Sim'] = sim.apply(lambda row: np.random.uniform(0,1), axis=1)
        sim['Winner'] = sim.apply(lambda row: row['TeamID_x'] if row['Sim']<row['Pred'] else row['TeamID_y'], axis=1)

        winners = list(sim['Winner'].values)

        for w in winners:
            winner_dict[r][w] += 1
    
win_rate = {}

for d in winner_dict:
    win_rate[d] = {k:(v/N) for k, v in winner_dict[d].items()}
    
print(win_rate[1])

{3329: 0.811, 3332: 0.81, 3333: 0.048, 3460: 0.082, 3461: 0.007, 3208: 1.0, 3210: 0.204, 3211: 0.739, 3219: 0.001, 3353: 0.854, 3226: 0.103, 3355: 0.072, 3231: 0.637, 3104: 0.358, 3234: 0.568, 3235: 0.087, 3238: 0.014, 3112: 0.999, 3116: 0.918, 3372: 0.796, 3246: 0.897, 3376: 0.997, 3377: 0.19, 3250: 0.0, 3378: 0.736, 3124: 0.986, 3125: 0.261, 3257: 0.96, 3133: 0.106, 3390: 1.0, 3392: 0.001, 3265: 0.04, 3266: 0.353, 3393: 0.928, 3140: 0.146, 3141: 0.432, 3268: 0.994, 3397: 0.865, 3400: 0.894, 3273: 0.003, 3401: 0.996, 3276: 0.316, 3277: 0.913, 3407: 0.004, 3283: 0.968, 3413: 0.032, 3416: 0.025, 3417: 0.993, 3163: 0.999, 3291: 0.006, 3292: 0.135, 3299: 0.043, 3301: 0.957, 3430: 0.0, 3433: 0.363, 3180: 0.0, 3439: 0.647, 3314: 0.642, 3448: 0.189, 3321: 0.975, 3450: 0.264, 3195: 0.684, 3452: 1.0, 3199: 0.952}


In [297]:
team_odds = tourney_seeds[tourney_seeds['Season']==2021]
team_odds = pd.merge(team_odds, team_index, on='TeamID')
team_odds['R1'] = team_odds.apply(lambda row: win_rate[1][row['TeamID']], axis=1)
team_odds['R2'] = team_odds.apply(lambda row: win_rate[2][row['TeamID']], axis=1)
team_odds['R3'] = team_odds.apply(lambda row: win_rate[3][row['TeamID']], axis=1)
team_odds['R4'] = team_odds.apply(lambda row: win_rate[4][row['TeamID']], axis=1)
team_odds['R5'] = team_odds.apply(lambda row: win_rate[5][row['TeamID']], axis=1)
team_odds['R6'] = team_odds.apply(lambda row: win_rate[6][row['TeamID']], axis=1)

team_odds.to_csv('WPredictions/team_odds_2021.csv', index=False)

team_odds

Unnamed: 0,Season,Seed,TeamID,TeamName,R1,R2,R3,R4,R5,R6
0,2021,W01,3390,Stanford,1.000,0.982,0.965,0.652,0.329,0.152
1,2021,W02,3257,Louisville,0.960,0.688,0.221,0.047,0.010,0.001
2,2021,W03,3208,Georgia,1.000,0.865,0.681,0.293,0.130,0.048
3,2021,W04,3116,Arkansas,0.918,0.730,0.022,0.001,0.000,0.000
4,2021,W05,3283,Missouri St,0.968,0.244,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...
59,2021,Z12,3141,C Michigan,0.432,0.062,0.000,0.000,0.000,0.000
60,2021,Z13,3226,Idaho St,0.103,0.046,0.000,0.000,0.000,0.000
61,2021,Z14,3292,MTSU,0.135,0.023,0.003,0.000,0.000,0.000
62,2021,Z15,3238,Jackson St,0.014,0.003,0.000,0.000,0.000,0.000
