In [27]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import random
import math
import heapq
from scipy.stats import norm
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
from datetime import datetime
import pytz
import swifter
import itertools

In [3]:
api = KaggleApi()
api.authenticate()

In [4]:
api.competitions_list(search='march')

[ncaam-march-mania-2021,
 ncaaw-march-mania-2021,
 ncaam-march-mania-2021-spread,
 ncaaw-march-mania-2021-spread,
 march-machine-learning-mania-2016,
 march-machine-learning-mania-2017,
 march-machine-learning-mania-2014,
 march-machine-learning-mania-2015,
 mens-machine-learning-competition-2018,
 mens-machine-learning-competition-2019,
 womens-machine-learning-competition-2018,
 womens-machine-learning-competition-2019,
 march-madness-analytics-2020,
 google-cloud-ncaa-march-madness-2020-division-1-mens-tournament,
 google-cloud-ncaa-march-madness-2020-division-1-womens-tournament]

In [75]:
api.competition_list_files('ncaam-march-mania-2021')



[MDataFiles_Stage2/MConferenceTourneyGames.csv,
 MDataFiles_Stage2/MTeamConferences.csv,
 MDataFiles_Stage2/MRegularSeasonCompactResults.csv,
 MDataFiles_Stage2/Cities.csv,
 MDataFiles_Stage2/MNCAATourneyCompactResults.csv,
 MDataFiles_Stage2/MTeamCoaches.csv,
 MDataFiles_Stage2/MSeasons.csv,
 MDataFiles_Stage2/MMasseyOrdinals.csv,
 MDataFiles_Stage2/MRegularSeasonDetailedResults.csv,
 MDataFiles_Stage2/MNCAATourneySlots.csv,
 MDataFiles_Stage2/MSampleSubmissionStage2.csv,
 MDataFiles_Stage2/MGameCities.csv,
 MDataFiles_Stage2/MTeams.csv,
 MDataFiles_Stage2/MNCAATourneySeeds.csv,
 MDataFiles_Stage2/MSecondaryTourneyCompactResults.csv,
 MDataFiles_Stage2/MNCAATourneyDetailedResults.csv,
 MDataFiles_Stage2/MSecondaryTourneyTeams.csv,
 MDataFiles_Stage2/Conferences.csv,
 MDataFiles_Stage2/MNCAATourneySeedRoundSlots.csv,
 MDataFiles_Stage2/MTeamSpellings.csv,
 MDataFiles_Stage1/MConferenceTourneyGames.csv,
 MDataFiles_Stage1/MTeamConferences.csv,
 MDataFiles_Stage1/MRegularSeasonCompactRes

In [76]:
# api.competition_download_files('ncaam-march-mania-2021')

In [77]:
# zf = ZipFile('ncaam-march-mania-2021.zip')
# zf.extractall('MData/') #save files in selected folder
# zf.close()

In [8]:
season_data = pd.read_csv('Data/MRegularSeasonDetailedResults.csv')

In [9]:
season_data.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [99]:
def get_continuous_data(data, min_season=2010, max_season=2020, last_n=None, vocal=True):
    cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
    cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

    cols = ['Season','DayNum','TeamID','Loc','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR',\
           'TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst',\
           'OTO','OStl','OBlk','OPF']

    all_cols = ['Season','TeamID','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR','TAst','TTO',\
            'TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst','OTO','OStl',\
            'OBlk','OPF','Wins','G','WP','AWP','ANWP','Min','TFGP','TFGP3','TFTP','TFG3R','TFTR','TEFG','TTFG','OFGP',\
            'OFGP3','OFTP','OFG3R','OFTR','OEFG','OTFG','TPoss','TPace','OPoss','OPace','TTRP','TORP','TAstP','TStlP',\
            'TBlkP','TTOP','OTRP','OORP','OAstP','OStlP','OBlkP','OTOP','OffRtg','DefRtg','DayNum']

    dfc = pd.DataFrame(columns=all_cols)

    for seas in range(min_season, max_season+1):
        max_daynum = max(data[(data['Season']==seas)]['DayNum'])
        for daynum in range(1,max_daynum+1):
            days_back = last_n
            if days_back is None:
                days_back = daynum

            df = data[(data['Season']==seas)&(data['DayNum']<daynum)&(season_data['DayNum']>=daynum-days_back)]

            if len(df) > 0:
                tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
                tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

                # Calculate total wins
                tm1['Wins'] = 1
                tm2['Wins'] = 0

                # Calculate total away wins and losses (Loc will be H for the losing team)
                tm1['AWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
                tm2['AWins'] = 0
                tm1['ALosses'] = 0
                tm2['ALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

                # Calculate total neutral wins and losses
                tm1['NWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
                tm2['NWins'] = 0
                tm1['NLosses'] = 0
                tm2['NLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

                tm = tm1.append(tm2)

                tm['G'] = 1

                agg_funcs = 29*['mean']+6*['sum']

                tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['G','Wins',\
                                                                    'AWins','ALosses','NWins','NLosses'], agg_funcs)))

                # Game statistics
                tm['WP'] = tm['Wins']/tm['G']
                tm['AWP'] = tm['AWins']/(tm['AWins']+tm['ALosses'])
                tm['ANWP'] = (tm['AWins']+tm['NWins'])/(tm['AWins']+tm['ALosses']+tm['NWins']+tm['NLosses'])
                tm['Min'] = 40*tm['G']+5*tm['NumOT']

                # Team shooting percentages
                tm['TFGP'] = tm['TFGM']/tm['TFGA']
                tm['TFGP3'] = tm['TFGM3']/tm['TFGA3']
                tm['TFTP'] = tm['TFTM']/tm['TFTA']
                tm['TFG3R'] = tm['TFGA3']/tm['TFGA']
                tm['TFTR'] = tm['TFTA']/tm['TFGA']
                tm['TEFG'] = (0.5*tm['TFGM3']+tm['TFGM'])/tm['TFGA']
                tm['TTFG'] = tm['TScore']/(2*(0.44*tm['TFTA']+tm['TFGA']))

                # Opponent shooting percentages
                tm['OFGP'] = tm['OFGM']/tm['OFGA']
                tm['OFGP3'] = tm['OFGM3']/tm['OFGA3']
                tm['OFTP'] = tm['OFTM']/tm['OFTA']
                tm['OFG3R'] = tm['OFGA3']/tm['OFGA']
                tm['OFTR'] = tm['OFTA']/tm['OFGA']
                tm['OEFG'] = (0.5*tm['OFGM3']+tm['OFGM'])/tm['OFGA']
                tm['OTFG'] = tm['OScore']/(2*(0.44*tm['OFTA']+tm['OFGA']))

                # Team possession stats
                tm['TPoss'] = tm['TFGA']-tm['TOR']+tm['TTO']+0.4*tm['TFTA']
                tm['TPace'] = 40*tm['TPoss']/tm['Min']

                # Opponent possession stats
                tm['OPoss'] = tm['OFGA']-tm['OOR']+tm['OTO']+0.4*tm['OFTA']
                tm['OPace'] = 40*tm['OPoss']/tm['Min']

                # Team stat percentages
                tm['TTRP'] = (tm['TOR']+tm['TDR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
                tm['TORP'] = tm['TOR']/(tm['TOR']+tm['ODR'])
                tm['TAstP'] = tm['TAst']/tm['TFGM']
                tm['TStlP'] = tm['TStl']/tm['OPoss']
                tm['TBlkP'] = tm['TBlk']/(tm['OFGA']-tm['OFGA3'])
                tm['TTOP'] = tm['TTO']/tm['TPoss']

                # Opponent stat percentages
                tm['OTRP'] = (tm['OOR']+tm['ODR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
                tm['OORP'] = tm['OOR']/(tm['OOR']+tm['TDR'])
                tm['OAstP'] = tm['OAst']/tm['OFGM']
                tm['OStlP'] = tm['OStl']/tm['TPoss']
                tm['OBlkP'] = tm['OBlk']/(tm['TFGA']-tm['TFGA3'])
                tm['OTOP'] = tm['OTO']/tm['OPoss']

                # Ratings
                tm['OffRtg'] = tm['TScore']/tm['TPoss']
                tm['DefRtg'] = tm['OScore']/tm['OPoss']

                tm['DayNum'] = daynum

                tm = tm.drop(columns=['AWins','ALosses','NWins','NLosses'])

                dfc = dfc.append(tm)
                
        if vocal:
            print('{} season completed.'.format(seas))
            
    return dfc

def get_matchup_dict(data, min_season=2010, max_season=2020, max_daynum=132, min_team=1101, max_team=1499):
    # Get matchup history of all teams on any given day
    matchup_d = {}
    for seas in range(min_season, max_season+1):
        d_day = {}
        for daynum in range(max_daynum+1):
            df = data[(data['Season']==seas)&(data['DayNum']<daynum)]
            d_team = {}
            for t in range(min_team, max_team+1):
                d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
            d_day[daynum] = d_team
        matchup_d[seas] = d_day
        print('{} season completed.'.format(seas))
    return matchup_d

def get_opp_matchup_dict(matchup_d, min_season=2010, max_season=2020, max_daynum=132, min_team=1101, max_team=1499):
    # Get matchup history of all teams' opponents on any given day
    opp_matchup_d = {}
    for seas in range(min_season, max_season+1):
        d_day = {}
        for daynum in range(max_daynum+1):
            d_team = {}
            for t in range(min_team, max_team+1):
                opp_matchups = []
                for o in matchup_d[seas][daynum][t]:
                    opp_matchups += matchup_d[seas][daynum][o]
                d_team[t] = opp_matchups
            d_day[daynum] = d_team
        opp_matchup_d[seas] = d_day
        print('{} season completed.'.format(seas))
    return opp_matchup_d

def add_sos_var(data, wp_d, matchup_d, opp_matchup_d):
    print('START:', datetime.now(pytz.timezone('US/Pacific')))

    data['OWP'] = data.apply(lambda row: np.mean([wp_d[row['Season']][row['DayNum']][x] \
                 for x in matchup_d[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

    data['OOWP'] = data.apply(lambda row: np.mean([wp_d[row['Season']][row['DayNum']][x] \
                 for x in opp_matchup_d[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

    data['SOS'] = (2*data['OWP']+data['OOWP'])/3

    print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

    return data

In [100]:
get_continuous_data(season_data)

Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2010,1107,0,43,15,55,5,28,8,14,...,0.397022,0.541176,0.341463,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8
1,2010,1108,0,60,21,61,7,17,11,20,...,0.183099,0.573171,0.351351,0.743590,0.112676,0.136364,0.084746,0.845070,1.412429,8
2,2010,1143,0,75,24,52,5,12,22,32,...,0.188088,0.492063,0.366667,0.423077,0.109718,0.075000,0.265625,1.175549,1.093750,8
3,2010,1198,0,72,25,68,8,23,14,17,...,0.244499,0.602740,0.428571,0.676471,0.122249,0.177778,0.314770,0.880196,1.065375,8
4,2010,1293,0,70,26,52,8,21,10,15,...,0.265625,0.507937,0.393939,0.625000,0.125000,0.032258,0.188088,1.093750,1.175549,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2020,1463,0.285714,74.535714,26.392857,57.285714,8.892857,24.071429,12.857143,17.892857,...,0.188855,0.462860,0.213397,0.503726,0.098578,0.094624,0.168748,1.082806,0.959335,128
349,2020,1464,0.096774,71.193548,25.580645,61.548387,7.193548,23.064516,12.83871,18.451613,...,0.181270,0.475546,0.295186,0.490148,0.086570,0.088013,0.169077,1.055577,1.068589,128
350,2020,1465,0.111111,76.222222,25.296296,58.925926,9.259259,24.555556,16.37037,20.777778,...,0.182171,0.465431,0.255566,0.476124,0.087852,0.108836,0.143071,1.109195,1.035259,128
351,2020,1466,0.035714,67.464286,22.892857,54.678571,6.642857,19.821429,15.035714,21.714286,...,0.219134,0.492598,0.263050,0.431718,0.081720,0.110656,0.198371,0.983240,1.032575,128


In [372]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR',\
           'TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst',\
           'OTO','OStl','OBlk','OPF']

all_cols = ['Season','TeamID','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR','TAst','TTO',\
            'TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst','OTO','OStl',\
            'OBlk','OPF','Wins','G','WP','AWP','ANWP','Min','TFGP','TFGP3','TFTP','TFG3R','TFTR','TEFG','TTFG','OFGP',\
            'OFGP3','OFTP','OFG3R','OFTR','OEFG','OTFG','TPoss','TPace','OPoss','OPace','TTRP','TORP','TAstP','TStlP',\
            'TBlkP','TTOP','OTRP','OORP','OAstP','OStlP','OBlkP','OTOP','OffRtg','DefRtg','DayNum']

season_data_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2010,2021):
    max_daynum = max(season_data[(season_data['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['Wins'] = 1
            tm2['Wins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['AWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['AWins'] = 0
            tm1['ALosses'] = 0
            tm2['ALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['NWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['NWins'] = 0
            tm1['NLosses'] = 0
            tm2['NLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['G'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['G','Wins',\
                                                                'AWins','ALosses','NWins','NLosses'], agg_funcs)))

            # Game statistics
            tm['WP'] = tm['Wins']/tm['G']
            tm['AWP'] = tm['AWins']/(tm['AWins']+tm['ALosses'])
            tm['ANWP'] = (tm['AWins']+tm['NWins'])/(tm['AWins']+tm['ALosses']+tm['NWins']+tm['NLosses'])
            tm['Min'] = 40*tm['G']+5*tm['NumOT']

            # Team shooting percentages
            tm['TFGP'] = tm['TFGM']/tm['TFGA']
            tm['TFGP3'] = tm['TFGM3']/tm['TFGA3']
            tm['TFTP'] = tm['TFTM']/tm['TFTA']
            tm['TFG3R'] = tm['TFGA3']/tm['TFGA']
            tm['TFTR'] = tm['TFTA']/tm['TFGA']
            tm['TEFG'] = (0.5*tm['TFGM3']+tm['TFGM'])/tm['TFGA']
            tm['TTFG'] = tm['TScore']/(2*(0.44*tm['TFTA']+tm['TFGA']))

            # Opponent shooting percentages
            tm['OFGP'] = tm['OFGM']/tm['OFGA']
            tm['OFGP3'] = tm['OFGM3']/tm['OFGA3']
            tm['OFTP'] = tm['OFTM']/tm['OFTA']
            tm['OFG3R'] = tm['OFGA3']/tm['OFGA']
            tm['OFTR'] = tm['OFTA']/tm['OFGA']
            tm['OEFG'] = (0.5*tm['OFGM3']+tm['OFGM'])/tm['OFGA']
            tm['OTFG'] = tm['OScore']/(2*(0.44*tm['OFTA']+tm['OFGA']))

            # Team possession stats
            tm['TPoss'] = tm['TFGA']-tm['TOR']+tm['TTO']+0.4*tm['TFTA']
            tm['TPace'] = 40*tm['TPoss']/tm['Min']

            # Opponent possession stats
            tm['OPoss'] = tm['OFGA']-tm['OOR']+tm['OTO']+0.4*tm['OFTA']
            tm['OPace'] = 40*tm['OPoss']/tm['Min']

            # Team stat percentages
            tm['TTRP'] = (tm['TOR']+tm['TDR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['TORP'] = tm['TOR']/(tm['TOR']+tm['ODR'])
            tm['TAstP'] = tm['TAst']/tm['TFGM']
            tm['TStlP'] = tm['TStl']/tm['OPoss']
            tm['TBlkP'] = tm['TBlk']/(tm['OFGA']-tm['OFGA3'])
            tm['TTOP'] = tm['TTO']/tm['TPoss']

            # Opponent stat percentages
            tm['OTRP'] = (tm['OOR']+tm['ODR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['OORP'] = tm['OOR']/(tm['OOR']+tm['TDR'])
            tm['OAstP'] = tm['OAst']/tm['OFGM']
            tm['OStlP'] = tm['OStl']/tm['TPoss']
            tm['OBlkP'] = tm['OBlk']/(tm['TFGA']-tm['TFGA3'])
            tm['OTOP'] = tm['OTO']/tm['OPoss']

            # Ratings
            tm['OffRtg'] = tm['TScore']/tm['TPoss']
            tm['DefRtg'] = tm['OScore']/tm['OPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['AWins','ALosses','NWins','NLosses'])

            season_data_continuous = season_data_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

season_data_continuous

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2010,1107,0,43,15,55,5,28,8,14,...,0.397022,0.541176,0.341463,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8
1,2010,1108,0,60,21,61,7,17,11,20,...,0.183099,0.573171,0.351351,0.743590,0.112676,0.136364,0.084746,0.845070,1.412429,8
2,2010,1143,0,75,24,52,5,12,22,32,...,0.188088,0.492063,0.366667,0.423077,0.109718,0.075000,0.265625,1.175549,1.093750,8
3,2010,1198,0,72,25,68,8,23,14,17,...,0.244499,0.602740,0.428571,0.676471,0.122249,0.177778,0.314770,0.880196,1.065375,8
4,2010,1293,0,70,26,52,8,21,10,15,...,0.265625,0.507937,0.393939,0.625000,0.125000,0.032258,0.188088,1.093750,1.175549,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2020,1463,0.285714,74.535714,26.392857,57.285714,8.892857,24.071429,12.857143,17.892857,...,0.188855,0.462860,0.213397,0.503726,0.098578,0.094624,0.168748,1.082806,0.959335,128
349,2020,1464,0.096774,71.193548,25.580645,61.548387,7.193548,23.064516,12.83871,18.451613,...,0.181270,0.475546,0.295186,0.490148,0.086570,0.088013,0.169077,1.055577,1.068589,128
350,2020,1465,0.111111,76.222222,25.296296,58.925926,9.259259,24.555556,16.37037,20.777778,...,0.182171,0.465431,0.255566,0.476124,0.087852,0.108836,0.143071,1.109195,1.035259,128
351,2020,1466,0.035714,67.464286,22.892857,54.678571,6.642857,19.821429,15.035714,21.714286,...,0.219134,0.492598,0.263050,0.431718,0.081720,0.110656,0.198371,0.983240,1.032575,128


In [373]:
# Get matchup history of all teams on any given day
matchup_dict = {}
for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)]
        d_team = {}
        for t in range(1101,1500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

# print(matchup_dict[2020][20])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


In [374]:
# Get matchup history of all teams' opponents on any given day
opp_matchup_dict = {}

for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(1101,1500):
            opp_matchups = []
            for o in matchup_dict[seas][daynum][t]:
                opp_matchups += matchup_dict[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    opp_matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

# print(opp_matchup_dict[2020][20][1101])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


In [375]:
wp_dict = {h: {g: f.groupby('TeamID')['WP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in season_data_continuous.groupby('Season')}

In [376]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

season_data_continuous['OWP'] = season_data_continuous.apply(lambda row: 
    np.mean([wp_dict[row['Season']][row['DayNum']][x] \
             for x in matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_continuous['OOWP'] = season_data_continuous.apply(lambda row: 
    np.mean([wp_dict[row['Season']][row['DayNum']][x] \
             for x in opp_matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_continuous['SOS'] = (2*season_data_continuous['OWP']+season_data_continuous['OOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

season_data_continuous

START: 2021-03-18 03:56:13.138126-07:00
FINISH: 2021-03-18 04:15:16.688090-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS
0,2010,1107,0,43,15,55,5,28,8,14,...,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8,1.000000,0.000000,0.666667
1,2010,1108,0,60,21,61,7,17,11,20,...,0.743590,0.112676,0.136364,0.084746,0.845070,1.412429,8,1.000000,0.000000,0.666667
2,2010,1143,0,75,24,52,5,12,22,32,...,0.423077,0.109718,0.075000,0.265625,1.175549,1.093750,8,0.000000,1.000000,0.333333
3,2010,1198,0,72,25,68,8,23,14,17,...,0.676471,0.122249,0.177778,0.314770,0.880196,1.065375,8,1.000000,0.000000,0.666667
4,2010,1293,0,70,26,52,8,21,10,15,...,0.625000,0.125000,0.032258,0.188088,1.093750,1.175549,8,1.000000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2020,1463,0.285714,74.535714,26.392857,57.285714,8.892857,24.071429,12.857143,17.892857,...,0.503726,0.098578,0.094624,0.168748,1.082806,0.959335,128,0.466754,0.504394,0.479301
349,2020,1464,0.096774,71.193548,25.580645,61.548387,7.193548,23.064516,12.83871,18.451613,...,0.490148,0.086570,0.088013,0.169077,1.055577,1.068589,128,0.452354,0.480587,0.461765
350,2020,1465,0.111111,76.222222,25.296296,58.925926,9.259259,24.555556,16.37037,20.777778,...,0.476124,0.087852,0.108836,0.143071,1.109195,1.035259,128,0.427844,0.467879,0.441189
351,2020,1466,0.035714,67.464286,22.892857,54.678571,6.642857,19.821429,15.035714,21.714286,...,0.431718,0.081720,0.110656,0.198371,0.983240,1.032575,128,0.442535,0.470572,0.451881


In [377]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR',\
        'LTDR','LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA',\
        'LOOR','LODR','LOAst','LOTO','LOStl','LOBlk','LOPF']

all_cols = ['Season','TeamID','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR','LTDR',\
            'LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA','LOOR',\
            'LODR','LOAst','LOTO','LOStl','LOBlk','LOPF','LWins','LG','LWP','LAWP','LANWP','LMin','LTFGP','LTFGP3',\
            'LTFTP','LTFG3R','LTFTR','LTEFG','LTTFG','LOFGP','LOFGP3','LOFTP','LOFG3R','LOFTR','LOEFG','LOTFG',\
            'LTPoss','LTPace','LOPoss','LOPace','LTTRP','LTORP','LTAstP','LTStlP','LTBlkP','LTTOP','LOTRP','LOORP',\
            'LOAstP','LOStlP','LOBlkP','LOTOP','LOffRtg','LDefRtg','DayNum']

l30_data_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2010,2021):
    max_daynum = max(season_data[(season_data['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)\
                         &(season_data['DayNum']>=daynum-30)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['LWins'] = 1
            tm2['LWins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['LAWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['LAWins'] = 0
            tm1['LALosses'] = 0
            tm2['LALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['LNWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['LNWins'] = 0
            tm1['LNLosses'] = 0
            tm2['LNLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['LG'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['LG','LWins',\
                                                                'LAWins','LALosses','LNWins','LNLosses'], agg_funcs)))

            # Game statistics
            tm['LWP'] = tm['LWins']/tm['LG']
            tm['LAWP'] = tm['LAWins']/(tm['LAWins']+tm['LALosses'])
            tm['LANWP'] = (tm['LAWins']+tm['LNWins'])/(tm['LAWins']+tm['LALosses']+tm['LNWins']+tm['LNLosses'])
            tm['LMin'] = 40*tm['LG']+5*tm['LNumOT']

            # Team shooting percentages
            tm['LTFGP'] = tm['LTFGM']/tm['LTFGA']
            tm['LTFGP3'] = tm['LTFGM3']/tm['LTFGA3']
            tm['LTFTP'] = tm['LTFTM']/tm['LTFTA']
            tm['LTFG3R'] = tm['LTFGA3']/tm['LTFGA']
            tm['LTFTR'] = tm['LTFTA']/tm['LTFGA']
            tm['LTEFG'] = (0.5*tm['LTFGM3']+tm['LTFGM'])/tm['LTFGA']
            tm['LTTFG'] = tm['LTScore']/(2*(0.44*tm['LTFTA']+tm['LTFGA']))

            # Opponent shooting percentages
            tm['LOFGP'] = tm['LOFGM']/tm['LOFGA']
            tm['LOFGP3'] = tm['LOFGM3']/tm['LOFGA3']
            tm['LOFTP'] = tm['LOFTM']/tm['LOFTA']
            tm['LOFG3R'] = tm['LOFGA3']/tm['LOFGA']
            tm['LOFTR'] = tm['LOFTA']/tm['LOFGA']
            tm['LOEFG'] = (0.5*tm['LOFGM3']+tm['LOFGM'])/tm['LOFGA']
            tm['LOTFG'] = tm['LOScore']/(2*(0.44*tm['LOFTA']+tm['LOFGA']))

            # Team possession stats
            tm['LTPoss'] = tm['LTFGA']-tm['LTOR']+tm['LTTO']+0.4*tm['LTFTA']
            tm['LTPace'] = 40*tm['LTPoss']/tm['LMin']

            # Opponent possession stats
            tm['LOPoss'] = tm['LOFGA']-tm['LOOR']+tm['LOTO']+0.4*tm['LOFTA']
            tm['LOPace'] = 40*tm['LOPoss']/tm['LMin']

            # Team stat percentages
            tm['LTTRP'] = (tm['LTOR']+tm['LTDR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LTORP'] = tm['LTOR']/(tm['LTOR']+tm['LODR'])
            tm['LTAstP'] = tm['LTAst']/tm['LTFGM']
            tm['LTStlP'] = tm['LTStl']/tm['LOPoss']
            tm['LTBlkP'] = tm['LTBlk']/(tm['LOFGA']-tm['LOFGA3'])
            tm['LTTOP'] = tm['LTTO']/tm['LTPoss']

            # Opponent stat percentages
            tm['LOTRP'] = (tm['LOOR']+tm['LODR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LOORP'] = tm['LOOR']/(tm['LOOR']+tm['LTDR'])
            tm['LOAstP'] = tm['LOAst']/tm['LOFGM']
            tm['LOStlP'] = tm['LOStl']/tm['LTPoss']
            tm['LOBlkP'] = tm['LOBlk']/(tm['LTFGA']-tm['LTFGA3'])
            tm['LOTOP'] = tm['LOTO']/tm['LOPoss']

            # Ratings
            tm['LOffRtg'] = tm['LTScore']/tm['LTPoss']
            tm['LDefRtg'] = tm['LOScore']/tm['LOPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['LAWins','LALosses','LNWins','LNLosses'])

            l30_data_continuous = l30_data_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

l30_data_continuous

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LTTOP,LOTRP,LOORP,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum
0,2010,1107,0,43,15,55,5,28,8,14,...,0.397022,0.541176,0.341463,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8
1,2010,1108,0,60,21,61,7,17,11,20,...,0.183099,0.573171,0.351351,0.743590,0.112676,0.136364,0.084746,0.845070,1.412429,8
2,2010,1143,0,75,24,52,5,12,22,32,...,0.188088,0.492063,0.366667,0.423077,0.109718,0.075000,0.265625,1.175549,1.093750,8
3,2010,1198,0,72,25,68,8,23,14,17,...,0.244499,0.602740,0.428571,0.676471,0.122249,0.177778,0.314770,0.880196,1.065375,8
4,2010,1293,0,70,26,52,8,21,10,15,...,0.265625,0.507937,0.393939,0.625000,0.125000,0.032258,0.188088,1.093750,1.175549,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2020,1463,0.25,74.5,26.875,59.0,8.0,23.0,12.75,17.0,...,0.188400,0.431734,0.175373,0.470297,0.099741,0.079861,0.163265,1.100850,1.035250,128
349,2020,1464,0.0,72.0,26.0,60.75,7.75,23.25,12.25,16.375,...,0.185602,0.487544,0.318519,0.495098,0.091864,0.100000,0.178434,1.079865,1.093630,128
350,2020,1465,0.333333,74.166667,24.5,59.166667,9.166667,23.833333,16.0,18.666667,...,0.162642,0.454545,0.200000,0.468750,0.088714,0.108491,0.125492,1.096599,1.075295,128
351,2020,1466,0.166667,69.666667,24.666667,55.5,6.833333,17.833333,13.5,20.5,...,0.208963,0.443005,0.183333,0.475177,0.105740,0.119469,0.188583,1.052367,1.072885,128


In [378]:
# Get matchup history of all teams on any given day
l30_matchup_dict = {}
for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        df = season_data[(season_data['Season']==seas)&(season_data['DayNum']<daynum)&\
                         (season_data['DayNum']>=daynum-30)]
        d_team = {}
        for t in range(1101,1500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    l30_matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_matchup_dict[2020][20][1101])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
{1101: [1180, 1337, 1424], 1102: [1402, 1119, 1226, 1395, 1258, 1182], 1103: [1300, 1367, 1464, 1452], 1104: [1194, 1202, 1335, 1348], 1105: [1398, 1412, 1153, 1275, 1155], 1106: [1211, 1283, 1222, 1397], 1107: [1145, 1357, 1250, 1264, 1346], 1108: [1177, 1419, 1272, 1301], 1109: [], 1110: [1203, 1373, 1456, 1384], 1111: [1187, 1399, 1150, 1276, 1286, 1422], 1112: [1319, 1228, 1363, 1308, 1355], 1113: [1148, 1351, 1385, 1160], 1114: [1283, 1146, 1229, 1272, 1301], 1115: [1378, 1211, 1213, 1243, 1338], 1116: [1349, 1317, 1285, 1411, 1377], 1117: [1440, 1413, 1225, 1161, 1279], 1118: [], 1119: [1192, 1437, 1102, 1127], 1120: [1204, 1172, 1375, 1169, 1159], 1121: [], 1122: [1368, 1443, 1409, 1435], 1123: [1227, 1232, 1224, 1191, 1297], 1124

In [379]:
# Get matchup history of all teams' opponents on any given day
l30_opp_matchup_dict = {}

for seas in range(2010,2021):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(1101,1500):
            opp_matchups = []
            for o in l30_matchup_dict[seas][daynum][t]:
                opp_matchups += l30_matchup_dict[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    l30_opp_matchup_dict[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_opp_matchup_dict[2020][20][1101])

2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
[1310, 1101, 1136, 1396, 1353, 1372, 1414, 1169, 1101, 1143, 1425, 1170, 1236, 1101, 1243, 1143, 1417, 1402, 1374]


In [380]:
l30_wp_dict = {h: {g: f.groupby('TeamID')['LWP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in l30_data_continuous.groupby('Season')}

In [381]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_continuous['LOWP'] = l30_data_continuous.apply(lambda row: 
    np.mean([l30_wp_dict[row['Season']][row['DayNum']][x] \
             for x in l30_matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_continuous['LOOWP'] = l30_data_continuous.apply(lambda row: 
    np.mean([l30_wp_dict[row['Season']][row['DayNum']][x] \
             for x in l30_opp_matchup_dict[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_continuous['LSOS'] = (2*l30_data_continuous['LOWP']+l30_data_continuous['LOOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_continuous

START: 2021-03-18 04:44:37.266593-07:00
FINISH: 2021-03-18 04:47:40.555970-07:00


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum,LOWP,LOOWP,LSOS
0,2010,1107,0,43,15,55,5,28,8,14,...,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8,1.000000,0.000000,0.666667
1,2010,1108,0,60,21,61,7,17,11,20,...,0.743590,0.112676,0.136364,0.084746,0.845070,1.412429,8,1.000000,0.000000,0.666667
2,2010,1143,0,75,24,52,5,12,22,32,...,0.423077,0.109718,0.075000,0.265625,1.175549,1.093750,8,0.000000,1.000000,0.333333
3,2010,1198,0,72,25,68,8,23,14,17,...,0.676471,0.122249,0.177778,0.314770,0.880196,1.065375,8,1.000000,0.000000,0.666667
4,2010,1293,0,70,26,52,8,21,10,15,...,0.625000,0.125000,0.032258,0.188088,1.093750,1.175549,8,1.000000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2020,1463,0.25,74.5,26.875,59.0,8.0,23.0,12.75,17.0,...,0.470297,0.099741,0.079861,0.163265,1.100850,1.035250,128,0.468750,0.515625,0.484375
349,2020,1464,0.0,72.0,26.0,60.75,7.75,23.25,12.25,16.375,...,0.495098,0.091864,0.100000,0.178434,1.079865,1.093630,128,0.480655,0.482035,0.481115
350,2020,1465,0.333333,74.166667,24.5,59.166667,9.166667,23.833333,16.0,18.666667,...,0.468750,0.088714,0.108491,0.125492,1.096599,1.075295,128,0.533333,0.490179,0.518948
351,2020,1466,0.166667,69.666667,24.666667,55.5,6.833333,17.833333,13.5,20.5,...,0.475177,0.105740,0.119469,0.188583,1.052367,1.072885,128,0.375000,0.510629,0.420210


## Select features with evolutionary algorithm

In [382]:
season_data_c = pd.read_csv('Data/MRegularSeasonCompactResults.csv')
tourney_data_c = pd.read_csv('Data/MNCAATourneyCompactResults.csv')

cols = ['Season','DayNum','WTeamID','WScore','LTeamID','LScore','WLoc']

season_game_data = pd.merge(season_data_c[cols], season_data_continuous, left_on=['Season','DayNum','WTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])
season_game_data = pd.merge(season_game_data, l30_data_continuous, left_on=['Season','DayNum','WTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])
season_game_data = pd.merge(season_game_data, season_data_continuous, left_on=['Season','DayNum','LTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])
season_game_data = pd.merge(season_game_data, l30_data_continuous, left_on=['Season','DayNum','LTeamID'], \
        right_on=['Season','DayNum','TeamID']).drop(columns=['TeamID'])

season_game_data['Tourney'] = 0

season_data_final = pd.merge(season_data_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], season_data_continuous).drop(columns='DayNum')
l30_data_final = pd.merge(l30_data_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], l30_data_continuous).drop(columns='DayNum')

tourney_game_data = pd.merge(tourney_data_c[cols], season_data_final, left_on=['Season','WTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_game_data = pd.merge(tourney_game_data, l30_data_final, left_on=['Season','WTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_game_data = pd.merge(tourney_game_data, season_data_final, left_on=['Season','LTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_game_data = pd.merge(tourney_game_data, l30_data_final, left_on=['Season','LTeamID'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

tourney_game_data['Tourney'] = 1

all_game_data = season_game_data.append(tourney_game_data)

cols = []
for c in list(all_game_data.columns[7:-1]):
    if c[-1:] == 'x':
        cols += [c[:-1]+'y']
    else:
        cols += [c[:-1]+'x']

drop_cols1 = ['DayNum','LTeamID','LScore','WLoc']
drop_cols2 = ['DayNum','WTeamID','WScore','WLoc']

all_game_data1 = all_game_data.drop(columns=drop_cols1)
all_game_data2 = all_game_data.rename(columns=dict(zip(all_game_data.columns[7:-1], cols))).drop(columns=drop_cols2)

all_game_data = all_game_data1.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
    .append(all_game_data2.rename(columns={'LTeamID':'TeamID','LScore':'Score'})).fillna(0)

all_game_data

Unnamed: 0,Season,TeamID,Score,NumOT_x,TScore_x,TFGM_x,TFGA_x,TFGM3_x,TFGA3_x,TFTM_x,...,LOAstP_y,LOStlP_y,LOBlkP_y,LOTOP_y,LOffRtg_y,LDefRtg_y,LOWP_y,LOOWP_y,LSOS_y,Tourney
0,2010,1161,57,0.0,91.000000,29.000000,52.000000,14.000000,25.000000,19.000000,...,0.593750,0.165746,0.275862,0.154494,0.593923,1.320225,1.000000,0.000000,0.666667,0
1,2010,1170,70,0.0,51.000000,18.000000,53.000000,6.000000,23.000000,9.000000,...,0.629630,0.077519,0.125000,0.223684,0.826873,1.000000,1.000000,0.000000,0.666667,0
2,2010,1245,72,0.0,69.000000,26.000000,63.000000,3.000000,21.000000,14.000000,...,0.500000,0.068259,0.046512,0.214521,1.160410,0.924092,0.000000,1.000000,0.333333,0
3,2010,1258,83,0.0,87.000000,33.000000,68.000000,8.000000,17.000000,13.000000,...,0.523810,0.139860,0.120000,0.206186,0.646853,1.048110,1.000000,0.000000,0.666667,0
4,2010,1273,80,0.0,89.000000,30.000000,58.000000,8.000000,22.000000,21.000000,...,0.527778,0.171958,0.292683,0.125698,0.701058,1.340782,1.000000,0.000000,0.666667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,2019,1120,62,0.0,78.727273,27.060606,60.333333,11.303030,29.666667,13.303030,...,0.418478,0.083302,0.098485,0.163560,1.190837,0.969951,0.426631,0.509277,0.454180,1
663,2019,1403,77,0.0,73.093750,26.156250,55.468750,7.250000,19.718750,13.531250,...,0.418478,0.083302,0.098485,0.163560,1.190837,0.969951,0.426631,0.509277,0.454180,1
664,2019,1387,52,0.0,67.411765,23.941176,57.088235,5.794118,18.529412,13.735294,...,0.553672,0.082197,0.085470,0.177165,1.110666,1.029528,0.537500,0.495062,0.523354,1
665,2019,1251,58,0.0,72.193548,26.387097,54.548387,8.387097,22.903226,11.032258,...,0.553672,0.082197,0.085470,0.177165,1.110666,1.029528,0.537500,0.495062,0.523354,1


In [565]:
def sum_sq_err(y_obs, y_pred):
    """
    inputs: y_obs, array of observed target values
            y_pred, array of predicted target values
    output: sse, sum of squared errors
    """
    return sum((y_obs-y_pred)**2)

def aic(y_obs, y_pred, k):
    """
    inputs: y_obs, array of observed target values
            y_pred, array of predicted target values
            k, number of features in model
    output: AIC (Akaike Information Criterion) for OLS, measure that rewards simple models
    """
    sse = sum_sq_err(y_obs, y_pred)
    n = len(y_pred)
    return 2*k + n*math.log(sse/n)

def selection(df_x, df_y, parents, threshold):
    """
    inputs: df_x, dataframe of dependent variable observations
            df_y, dataframe of independent variable observations
            parents, list of 'parent' vectors which determine features being used
            threshold, float on [0,1] which determines what portion of the population 'survives'
    output: portion of parents which are deemed most fit (based on AIC)
    """
    fitness = {}
    th = int(len(parents)*threshold)
    for s in parents:
        p = []
        for i in range(len(s)):
            if s[i]:
                p += [i]
        X = df_x.iloc[:,p].values
        y = df_y.values
        lm = LinearRegression().fit(X, y)
        fitness[aic(y, lm.predict(X), len(p))] = s
    best = heapq.nsmallest(th, fitness.keys())
    return [fitness[x] for x in best]

def crossover(parents, O):
    """
    inputs: parents, list of 'parent' vectors which determine features being used
            O, number of offspring generated
    output: offspring, list of 'child' vectors generated from parents
    """
    if len(parents) < 2:
        return
    l = len(parents[0])
    offspring = []
    for i in range(O):
        inds = np.random.choice(len(parents), 2, replace=False)
        cutoff = np.random.randint(1,l-1)
        offspring += [np.append(parents[inds[0]][:cutoff],parents[inds[1]][cutoff:])]
    return offspring

def mutation(offspring, r, P):
    """
    inputs: offspring, new generation of vectors which determine features being used
            r, float on [0,1] - rate at which mutation occurs
            P, number of parameters to choose from
    output: new list of features which has some random removals and additions
    """
    if len(offspring) < 1 or r > 1 or r < 0:
        return
    mutated = []
    for c in offspring:
        m = np.copy(c)
        for i in range(P):
            # Mutate with rate, r
            if np.random.choice([True,False], 1, p=[r,1-r]):
                m[i] = 1-c[i]
        mutated += [m]
    return mutated

def evolution(df_x, df_y, G, N=500, t=0.2, vocal=False, vocal_int=100):
    """
    inputs: df_x, dataframe of dependent variable observations
            df_y, dataframe of independent variable observations
            G, number of generations
    output: most fit feature vector after G generations
    """
    print('START: {}'.format(datetime.now(pytz.timezone('US/Pacific'))))
    P = len(df_x.columns)
    r = 1/P
    
    # Initialize G0 of parents (randomly select features for N parents)
    ps = np.random.randint(2, size=(N,P))
    
    # Go through evolutionary process (selection, crossover, mutation) for G generations
    gen = ps
    num_children = int(N-N*t)
    for g in range(G):
        survivors = selection(df_x, df_y, gen, t)
        children = mutation(crossover(survivors, num_children), r, P)
        gen = survivors + children
        if vocal and g%vocal_int==0:
            print('GENERATION {} COMPLETED: {}'.format(g, datetime.now(pytz.timezone('US/Pacific'))))
    
    best = selection(df_x, df_y, gen, 1/N)[0]
    return best

In [549]:
only_tourney_data = all_game_data[all_game_data['Tourney']==1]
train_data, test_data = train_test_split(only_tourney_data, test_size=0.1, stratify=only_tourney_data[['Tourney']])

train_data = train_data.drop(columns=['Min_x','Min_y','LMin_x','LMin_y','G_x','G_y','LG_x','LG_y',\
                                     'Wins_x','Wins_y','LWins_x','LWins_y','TPF_x','OPF_x','LTPF_x',\
                                     'LOPF_x','TPF_y','OPF_y','LTPF_y','LOPF_y'])

# Set number of generations for genetic algorithm
num_gen = 10000

DFx = train_data.iloc[:,3:]
DFy = train_data['Score']

fs = DFx.columns

# Run evolutionary algorithm for desired nnumber of generations
use = evolution(DFx, DFy, num_gen, vocal=True, vocal_int=500)

features = []
for i in range(len(fs)):
    if use[i]:
        features += [fs[i]]
print(features)

pd.DataFrame(features).rename(columns={0:'feature'}).to_csv('MFeatures/featuresG'+str(num_gen)+'.txt', index=None)

GENERATION 0 COMPLETED: 2021-03-19 11:59:25.527246-07:00
['TFGA3_x', 'TAst_x', 'TBlk_x', 'OFGM_x', 'OFGM3_x', 'OFTM_x', 'OOR_x', 'OTO_x', 'OStl_x', 'OBlk_x', 'OFTP_x', 'OPace_x', 'TTRP_x', 'TAstP_x', 'TStlP_x', 'TBlkP_x', 'OStlP_x', 'OTOP_x', 'DefRtg_x', 'OOWP_x', 'LTFGM_x', 'LTFGM3_x', 'LTFTA_x', 'LTTO_x', 'LOFGM_x', 'LOFGM3_x', 'LOFGA3_x', 'LTFTP_x', 'LTFG3R_x', 'LTFTR_x', 'LOFGP3_x', 'LTStlP_x', 'LOAstP_x', 'LDefRtg_x', 'TAst_y', 'TBlk_y', 'OScore_y', 'OFTA_y', 'ODR_y', 'OStl_y', 'OBlk_y', 'TFTR_y', 'OFGP_y', 'OFTR_y', 'TPoss_y', 'TPace_y', 'OPoss_y', 'OPace_y', 'TTRP_y', 'TORP_y', 'TTOP_y', 'OORP_y', 'OAstP_y', 'DefRtg_y', 'OWP_y', 'OOWP_y', 'LTFGM_y', 'LTFGA_y', 'LTFGM3_y', 'LTFGA3_y', 'LTOR_y', 'LTDR_y', 'LTStl_y', 'LOFGA_y', 'LOFGM3_y', 'LOFGA3_y', 'LOOR_y', 'LWP_y', 'LTFGP_y', 'LTEFG_y', 'LTTFG_y', 'LOFGP3_y', 'LOFTR_y', 'LOTFG_y', 'LTPoss_y', 'LTBlkP_y', 'LOTRP_y', 'LOTOP_y', 'LOffRtg_y', 'LDefRtg_y', 'LOOWP_y']


In [572]:
train_data, test_data = train_test_split(all_game_data, test_size=0.1, stratify=all_game_data[['Tourney']])

train_data = train_data.drop(columns=['Min_x','Min_y','LMin_x','LMin_y','G_x','G_y','LG_x','LG_y',\
                                     'Wins_x','Wins_y','LWins_x','LWins_y','TPF_x','OPF_x','LTPF_x',\
                                     'LOPF_x','TPF_y','OPF_y','LTPF_y','LOPF_y'])

# Set number of generations for genetic algorithm
num_gen = 1000

DFx = train_data.iloc[:,3:]
DFy = train_data['Score']

fs = DFx.columns

# Run evolutionary algorithm for desired nnumber of generations
use = evolution(DFx, DFy, num_gen, N=100, vocal=True, vocal_int=5)

features = []
for i in range(len(fs)):
    if use[i]:
        features += [fs[i]]
print(features)

pd.DataFrame(features).rename(columns={0:'feature'}).to_csv('MFeatures/allg_featuresG'+str(num_gen)+'.txt', index=None)

START: 2021-03-19 17:12:14.900384-07:00
GENERATION 0 COMPLETED: 2021-03-19 17:12:52.571903-07:00
GENERATION 5 COMPLETED: 2021-03-19 17:16:31.156606-07:00
GENERATION 10 COMPLETED: 2021-03-19 17:22:19.279020-07:00
GENERATION 15 COMPLETED: 2021-03-19 17:28:33.057850-07:00
GENERATION 20 COMPLETED: 2021-03-19 17:35:04.686217-07:00
GENERATION 25 COMPLETED: 2021-03-19 17:49:03.596107-07:00
GENERATION 30 COMPLETED: 2021-03-19 17:54:54.334009-07:00
GENERATION 35 COMPLETED: 2021-03-19 18:00:40.004429-07:00
GENERATION 40 COMPLETED: 2021-03-19 18:06:27.402849-07:00
GENERATION 45 COMPLETED: 2021-03-19 18:12:35.739486-07:00
GENERATION 50 COMPLETED: 2021-03-19 19:01:11.659694-07:00
GENERATION 55 COMPLETED: 2021-03-19 19:06:50.038889-07:00
GENERATION 60 COMPLETED: 2021-03-19 19:12:16.985978-07:00
GENERATION 65 COMPLETED: 2021-03-19 19:17:20.243479-07:00
GENERATION 70 COMPLETED: 2021-03-19 19:22:12.234707-07:00
GENERATION 75 COMPLETED: 2021-03-19 19:27:09.767369-07:00
GENERATION 80 COMPLETED: 2021-03-1

GENERATION 695 COMPLETED: 2021-03-22 12:02:12.839278-07:00
GENERATION 700 COMPLETED: 2021-03-22 12:05:50.293273-07:00
GENERATION 705 COMPLETED: 2021-03-22 12:09:22.169257-07:00
GENERATION 710 COMPLETED: 2021-03-22 12:12:51.650119-07:00
GENERATION 715 COMPLETED: 2021-03-22 12:16:26.731344-07:00
GENERATION 720 COMPLETED: 2021-03-22 12:20:06.159449-07:00
GENERATION 725 COMPLETED: 2021-03-22 12:23:45.092399-07:00
GENERATION 730 COMPLETED: 2021-03-22 12:27:27.861454-07:00
GENERATION 735 COMPLETED: 2021-03-22 12:31:10.968940-07:00
GENERATION 740 COMPLETED: 2021-03-22 12:34:55.164546-07:00
GENERATION 745 COMPLETED: 2021-03-22 12:38:36.598014-07:00
GENERATION 750 COMPLETED: 2021-03-22 12:42:16.507822-07:00
GENERATION 755 COMPLETED: 2021-03-22 12:46:11.579667-07:00
GENERATION 760 COMPLETED: 2021-03-22 12:50:04.505672-07:00
GENERATION 765 COMPLETED: 2021-03-22 12:53:49.055450-07:00
GENERATION 770 COMPLETED: 2021-03-22 12:57:31.260347-07:00
GENERATION 775 COMPLETED: 2021-03-22 13:01:26.941349-07:

In [573]:
len(features)

103

In [571]:
features = pd.read_csv('MFeatures/allg_featuresG100.txt')
fs = features['feature'].values
fs = [x for x in fs if x not in ['Min_x','Min_y','LMin_x','LMin_y','G_x','G_y','LG_x','LG_y',\
                                 'Wins_x','Wins_y','LWins_x','LWins_y','TPF_x','OPF_x','LTPF_x',\
                                 'LOPF_x','TPF_y','OPF_y','LTPF_y','LOPF_y']]

X = train_data.iloc[:,3:][fs].values
y = train_data['Score'].values

lm = LinearRegression().fit(X, y)

train_data['PScore'] = lm.intercept_

for i in range(len(fs)):
    train_data['PScore'] = train_data.apply(lambda row: row['PScore'] + row[fs[i]]*lm.coef_[i], axis=1)
    
sse = sum_sq_err(y, lm.predict(X))
rmse = np.sqrt(sse/len(y))

print('TRAIN RMSE: {}'.format(rmse))

test_data['PScore'] = lm.intercept_

for i in range(len(fs)):
    test_data['PScore'] = test_data.apply(lambda row: row['PScore'] + row[fs[i]]*lm.coef_[i], axis=1)
    
sse = sum_sq_err(test_data['Score'].values, test_data['PScore'].values)
rmse = np.sqrt(sse/len(test_data['PScore'].values))

print('TEST RMSE: {}'.format(rmse))

test_data[['Season','TeamID','Score','PScore']]

TRAIN RMSE: 10.448684012743614


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


TEST RMSE: 10.557755572748599


Unnamed: 0,Season,TeamID,Score,PScore
4743,2010,1406,91,70.980931
5624,2011,1152,65,63.211143
44542,2018,1436,81,73.713172
5598,2011,1153,68,62.135753
25557,2015,1197,49,56.093298
...,...,...,...,...
24626,2014,1104,67,68.577288
48064,2019,1427,75,72.415344
33166,2016,1207,50,64.231936
56344,2020,1426,62,75.056692


In [420]:
season_data_std_score = season_data[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

tourney_data_sc = pd.DataFrame({0:[],1:[],2:[]})

tourney_teams = only_tourney_data.groupby('Season')['TeamID'].apply(set).apply(list).to_dict()
for s in sorted(tourney_teams.keys()):
    teams = sorted(tourney_teams[s])
    for i in range(len(teams)):
        for j in range(i+1, len(teams)):
            tourney_data_sc = tourney_data_sc.append(pd.DataFrame([s, teams[i], teams[j]]).T)

tourney_data_sc = tourney_data_sc.rename(columns={0:'Season',1:'TeamID1',2:'TeamID2'})
tourney_data_sc = tourney_data_sc[tourney_data_sc['Season']>=2015]

predict_data_1 = pd.merge(tourney_data_sc, season_data_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_1 = pd.merge(predict_data_1, l30_data_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_1 = pd.merge(predict_data_1, season_data_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_1 = pd.merge(predict_data_1, l30_data_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

In [421]:
predict_data_1['PScore1'] = lm.intercept_

for i in range(len(fs)):
    predict_data_1['PScore1'] = predict_data_1.apply(lambda row: row['PScore1'] + row[fs[i]]*lm.coef_[i], axis=1)

predict_data_1['PScore2'] = lm.intercept_

fs2 = [x[:-1]+'y' if x[-1] == 'x' else x[:-1]+'x' for x in fs]
for i in range(len(fs2)):
    predict_data_1['PScore2'] = predict_data_1.apply(lambda row: row['PScore2'] + row[fs2[i]]*lm.coef_[i], axis=1)
    
predictions_1 = pd.merge(predict_data_1[['Season','TeamID1','TeamID2','PScore1','PScore2']], season_data_std_score, \
        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
predictions_1 = pd.merge(predictions_1, season_data_std_score, \
        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

predictions_1['m'] = predictions_1['PScore1']-predictions_1['PScore2']
predictions_1['sd'] = predictions_1.apply(lambda row: math.sqrt(row['StdScore_x']*row['StdScore_y']), axis=1)

predictions_1['ID'] = predictions_1.apply(lambda row: '{}_{}_{}'.format(int(row['Season']), int(row['TeamID1']), \
                                                                        int(row['TeamID2'])), axis=1)
predictions_1['Pred'] = norm.cdf(0, predictions_1['m'], predictions_1['sd'])

predictions_1[['ID','Pred']].to_csv('SubmissionStage1.csv', index=None)

predictions_1[['ID','Pred']]

Unnamed: 0,ID,Pred
0,2015_1107_1112,0.901100
1,2015_1107_1116,0.652349
2,2015_1112_1116,0.219407
3,2015_1107_1124,0.827487
4,2015_1112_1124,0.346858
...,...,...
11385,2019_1438_1463,0.129898
11386,2019_1439_1463,0.119681
11387,2019_1449_1463,0.171653
11388,2019_1458_1463,0.394827


## Predict upcoming tournament results on current season data

In [422]:
season_data_2021 = pd.read_csv('MData/MDataFiles_Stage2/MRegularSeasonDetailedResults.csv')

In [423]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR',\
           'TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst',\
           'OTO','OStl','OBlk','OPF']

all_cols = ['Season','TeamID','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR','TAst','TTO',\
            'TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR','OAst','OTO','OStl',\
            'OBlk','OPF','Wins','G','WP','AWP','ANWP','Min','TFGP','TFGP3','TFTP','TFG3R','TFTR','TEFG','TTFG','OFGP',\
            'OFGP3','OFTP','OFG3R','OFTR','OEFG','OTFG','TPoss','TPace','OPoss','OPace','TTRP','TORP','TAstP','TStlP',\
            'TBlkP','TTOP','OTRP','OORP','OAstP','OStlP','OBlkP','OTOP','OffRtg','DefRtg','DayNum']

season_data_2021_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2021,2022):
    max_daynum = max(season_data_2021[(season_data_2021['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['Wins'] = 1
            tm2['Wins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['AWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['AWins'] = 0
            tm1['ALosses'] = 0
            tm2['ALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['NWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['NWins'] = 0
            tm1['NLosses'] = 0
            tm2['NLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['G'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['G','Wins',\
                                                                'AWins','ALosses','NWins','NLosses'], agg_funcs)))

            # Game statistics
            tm['WP'] = tm['Wins']/tm['G']
            tm['AWP'] = tm['AWins']/(tm['AWins']+tm['ALosses'])
            tm['ANWP'] = (tm['AWins']+tm['NWins'])/(tm['AWins']+tm['ALosses']+tm['NWins']+tm['NLosses'])
            tm['Min'] = 40*tm['G']+5*tm['NumOT']

            # Team shooting percentages
            tm['TFGP'] = tm['TFGM']/tm['TFGA']
            tm['TFGP3'] = tm['TFGM3']/tm['TFGA3']
            tm['TFTP'] = tm['TFTM']/tm['TFTA']
            tm['TFG3R'] = tm['TFGA3']/tm['TFGA']
            tm['TFTR'] = tm['TFTA']/tm['TFGA']
            tm['TEFG'] = (0.5*tm['TFGM3']+tm['TFGM'])/tm['TFGA']
            tm['TTFG'] = tm['TScore']/(2*(0.44*tm['TFTA']+tm['TFGA']))

            # Opponent shooting percentages
            tm['OFGP'] = tm['OFGM']/tm['OFGA']
            tm['OFGP3'] = tm['OFGM3']/tm['OFGA3']
            tm['OFTP'] = tm['OFTM']/tm['OFTA']
            tm['OFG3R'] = tm['OFGA3']/tm['OFGA']
            tm['OFTR'] = tm['OFTA']/tm['OFGA']
            tm['OEFG'] = (0.5*tm['OFGM3']+tm['OFGM'])/tm['OFGA']
            tm['OTFG'] = tm['OScore']/(2*(0.44*tm['OFTA']+tm['OFGA']))

            # Team possession stats
            tm['TPoss'] = tm['TFGA']-tm['TOR']+tm['TTO']+0.4*tm['TFTA']
            tm['TPace'] = 40*tm['TPoss']/tm['Min']

            # Opponent possession stats
            tm['OPoss'] = tm['OFGA']-tm['OOR']+tm['OTO']+0.4*tm['OFTA']
            tm['OPace'] = 40*tm['OPoss']/tm['Min']

            # Team stat percentages
            tm['TTRP'] = (tm['TOR']+tm['TDR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['TORP'] = tm['TOR']/(tm['TOR']+tm['ODR'])
            tm['TAstP'] = tm['TAst']/tm['TFGM']
            tm['TStlP'] = tm['TStl']/tm['OPoss']
            tm['TBlkP'] = tm['TBlk']/(tm['OFGA']-tm['OFGA3'])
            tm['TTOP'] = tm['TTO']/tm['TPoss']

            # Opponent stat percentages
            tm['OTRP'] = (tm['OOR']+tm['ODR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR'])
            tm['OORP'] = tm['OOR']/(tm['OOR']+tm['TDR'])
            tm['OAstP'] = tm['OAst']/tm['OFGM']
            tm['OStlP'] = tm['OStl']/tm['TPoss']
            tm['OBlkP'] = tm['OBlk']/(tm['TFGA']-tm['TFGA3'])
            tm['OTOP'] = tm['OTO']/tm['OPoss']

            # Ratings
            tm['OffRtg'] = tm['TScore']/tm['TPoss']
            tm['DefRtg'] = tm['OScore']/tm['OPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['AWins','ALosses','NWins','NLosses'])

            season_data_2021_continuous = season_data_2021_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

season_data_2021_continuous

2021 season completed.


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2021,1101,0,70,20,49,13,23,17,27,...,0.218023,0.558824,0.314286,0.333333,0.087209,0.076923,0.319767,1.017442,0.683140,24
1,2021,1104,0,81,30,77,7,31,14,20,...,0.131579,0.479167,0.367347,0.526316,0.065789,0.043478,0.257069,1.065789,0.732648,24
2,2021,1108,0,50,19,63,3,16,9,12,...,0.277045,0.507937,0.217391,0.354839,0.171504,0.212766,0.138191,0.659631,1.243719,24
3,2021,1111,0,81,32,62,7,27,10,13,...,0.297927,0.357143,0.226415,0.500000,0.168394,0.000000,0.110565,1.049223,0.749386,24
4,2021,1113,0,94,27,61,6,20,34,46,...,0.153061,0.485294,0.241379,0.482759,0.102041,0.048780,0.202532,1.198980,1.113924,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,2021,1467,0.055556,66.277778,24.111111,53.722222,7.0,22.111111,11.055556,16.666667,...,0.192276,0.551786,0.293073,0.578824,0.099425,0.072056,0.209921,0.980279,0.972605,132
343,2021,1468,0.0,72.555556,27.277778,54.5,6.333333,17.055556,11.666667,15.5,...,0.167152,0.515213,0.272912,0.573427,0.080576,0.075668,0.230795,1.119493,1.046670,132
344,2021,1469,0.0,67.631579,23.368421,57.842105,5.842105,19.684211,15.052632,20.473684,...,0.207350,0.551645,0.309262,0.541199,0.101530,0.102069,0.186889,0.918776,1.068143,132
345,2021,1470,0.0,63.866667,22.0,50.866667,5.466667,14.6,14.4,19.4,...,0.167792,0.578465,0.310502,0.718266,0.053202,0.117647,0.228873,0.980151,1.058409,132


In [424]:
# Get matchup history of all teams on any given day
matchup_dict_2021 = {}
for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)]
        d_team = {}
        for t in range(1101,1500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(matchup_dict_2021[2021][100][1101])

2021 season completed.
[1190, 1122, 1303, 1470, 1249, 1223, 1368, 1146, 1372, 1230, 1249, 1394, 1403, 1116, 1358]


In [425]:
# Get matchup history of all teams' opponents on any given day
opp_matchup_dict_2021 = {}

for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(1101,1500):
            opp_matchups = []
            for o in matchup_dict_2021[seas][daynum][t]:
                opp_matchups += matchup_dict_2021[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    opp_matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(opp_matchup_dict_2021[2021][100][1101])

2021 season completed.
[1292, 1205, 1441, 1422, 1440, 1441, 1202, 1154, 1273, 1101, 1122, 1412, 1104, 1202, 1459, 1151, 1303, 1190, 1293, 1398, 1183, 1399, 1398, 1188, 1184, 1101, 1293, 1197, 1184, 1240, 1125, 1287, 1404, 1292, 1188, 1122, 1101, 1166, 1179, 1242, 1160, 1461, 1243, 1331, 1331, 1295, 1295, 1377, 1377, 1442, 1442, 1430, 1401, 1101, 1213, 1213, 1465, 1465, 1451, 1430, 1419, 1368, 1223, 1270, 1222, 1408, 1252, 1102, 1379, 1256, 1427, 1101, 1311, 1358, 1372, 1230, 1101, 1322, 1230, 1395, 1113, 1374, 1349, 1349, 1317, 1328, 1101, 1249, 1358, 1311, 1270, 1465, 1212, 1394, 1270, 1146, 1309, 1236, 1412, 1261, 1465, 1256, 1401, 1249, 1358, 1311, 1101, 1322, 1270, 1309, 1272, 1114, 1387, 1116, 1279, 1280, 1124, 1358, 1372, 1311, 1101, 1368, 1322, 1270, 1270, 1309, 1146, 1230, 1322, 1249, 1358, 1394, 1124, 1419, 1101, 1322, 1270, 1394, 1249, 1394, 1349, 1461, 1402, 1403, 1309, 1372, 1223, 1101, 1419, 1368, 1223, 1270, 1222, 1408, 1252, 1102, 1379, 1256, 1427, 1101, 1311, 1358, 1372

In [426]:
wp_dict_2021 = {h: {g: f.groupby('TeamID')['WP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in season_data_2021_continuous.groupby('Season')}

In [427]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

season_data_2021_continuous['OWP'] = season_data_2021_continuous.apply(lambda row: 
    np.mean([wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_2021_continuous['OOWP'] = season_data_2021_continuous.apply(lambda row: 
    np.mean([wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in opp_matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

season_data_2021_continuous['SOS'] = (2*season_data_2021_continuous['OWP']+season_data_2021_continuous['OOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

season_data_2021_continuous

START: 2021-03-19 03:13:56.151741-07:00
FINISH: 2021-03-19 03:14:36.440588-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS
0,2021,1101,0,70,20,49,13,23,17,27,...,0.333333,0.087209,0.076923,0.319767,1.017442,0.683140,24,0.000000,1.000000,0.333333
1,2021,1104,0,81,30,77,7,31,14,20,...,0.526316,0.065789,0.043478,0.257069,1.065789,0.732648,24,0.000000,1.000000,0.333333
2,2021,1108,0,50,19,63,3,16,9,12,...,0.354839,0.171504,0.212766,0.138191,0.659631,1.243719,24,1.000000,0.000000,0.666667
3,2021,1111,0,81,32,62,7,27,10,13,...,0.500000,0.168394,0.000000,0.110565,1.049223,0.749386,24,0.000000,1.000000,0.333333
4,2021,1113,0,94,27,61,6,20,34,46,...,0.482759,0.102041,0.048780,0.202532,1.198980,1.113924,24,0.000000,1.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,2021,1467,0.055556,66.277778,24.111111,53.722222,7.0,22.111111,11.055556,16.666667,...,0.578824,0.099425,0.072056,0.209921,0.980279,0.972605,132,0.472773,0.488673,0.478073
343,2021,1468,0.0,72.555556,27.277778,54.5,6.333333,17.055556,11.666667,15.5,...,0.573427,0.080576,0.075668,0.230795,1.119493,1.046670,132,0.431418,0.474196,0.445678
344,2021,1469,0.0,67.631579,23.368421,57.842105,5.842105,19.684211,15.052632,20.473684,...,0.541199,0.101530,0.102069,0.186889,0.918776,1.068143,132,0.466698,0.456639,0.463345
345,2021,1470,0.0,63.866667,22.0,50.866667,5.466667,14.6,14.4,19.4,...,0.718266,0.053202,0.117647,0.228873,0.980151,1.058409,132,0.504998,0.451744,0.487246


In [428]:
cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR',\
            'WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst',\
            'LTO','LStl','LBlk','LPF']
cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR',\
            'LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst',\
            'WTO','WStl','WBlk','WPF']

cols = ['Season','DayNum','TeamID','Loc','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR',\
        'LTDR','LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA',\
        'LOOR','LODR','LOAst','LOTO','LOStl','LOBlk','LOPF']

all_cols = ['Season','TeamID','LNumOT','LTScore','LTFGM','LTFGA','LTFGM3','LTFGA3','LTFTM','LTFTA','LTOR','LTDR',\
            'LTAst','LTTO','LTStl','LTBlk','LTPF','LOScore','LOFGM','LOFGA','LOFGM3','LOFGA3','LOFTM','LOFTA','LOOR',\
            'LODR','LOAst','LOTO','LOStl','LOBlk','LOPF','LWins','LG','LWP','LAWP','LANWP','LMin','LTFGP','LTFGP3',\
            'LTFTP','LTFG3R','LTFTR','LTEFG','LTTFG','LOFGP','LOFGP3','LOFTP','LOFG3R','LOFTR','LOEFG','LOTFG',\
            'LTPoss','LTPace','LOPoss','LOPace','LTTRP','LTORP','LTAstP','LTStlP','LTBlkP','LTTOP','LOTRP','LOORP',\
            'LOAstP','LOStlP','LOBlkP','LOTOP','LOffRtg','LDefRtg','DayNum']

l30_data_2021_continuous = pd.DataFrame(columns=all_cols)

for seas in range(2021,2022):
    max_daynum = max(season_data_2021[(season_data_2021['Season']==seas)]['DayNum'])
    for daynum in range(1,max_daynum+1):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)\
                         &(season_data_2021['DayNum']>=daynum-30)]
            
        if len(df) > 0:
            tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
            tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

            # Calculate total wins
            tm1['LWins'] = 1
            tm2['LWins'] = 0

            # Calculate total away wins and losses (Loc will be H for the losing team)
            tm1['LAWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
            tm2['LAWins'] = 0
            tm1['LALosses'] = 0
            tm2['LALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

            # Calculate total neutral wins and losses
            tm1['LNWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
            tm2['LNWins'] = 0
            tm1['LNLosses'] = 0
            tm2['LNLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

            tm = tm1.append(tm2)

            tm['LG'] = 1

            agg_funcs = 29*['mean']+6*['sum']

            tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['LG','LWins',\
                                                                'LAWins','LALosses','LNWins','LNLosses'], agg_funcs)))

            # Game statistics
            tm['LWP'] = tm['LWins']/tm['LG']
            tm['LAWP'] = tm['LAWins']/(tm['LAWins']+tm['LALosses'])
            tm['LANWP'] = (tm['LAWins']+tm['LNWins'])/(tm['LAWins']+tm['LALosses']+tm['LNWins']+tm['LNLosses'])
            tm['LMin'] = 40*tm['LG']+5*tm['LNumOT']

            # Team shooting percentages
            tm['LTFGP'] = tm['LTFGM']/tm['LTFGA']
            tm['LTFGP3'] = tm['LTFGM3']/tm['LTFGA3']
            tm['LTFTP'] = tm['LTFTM']/tm['LTFTA']
            tm['LTFG3R'] = tm['LTFGA3']/tm['LTFGA']
            tm['LTFTR'] = tm['LTFTA']/tm['LTFGA']
            tm['LTEFG'] = (0.5*tm['LTFGM3']+tm['LTFGM'])/tm['LTFGA']
            tm['LTTFG'] = tm['LTScore']/(2*(0.44*tm['LTFTA']+tm['LTFGA']))

            # Opponent shooting percentages
            tm['LOFGP'] = tm['LOFGM']/tm['LOFGA']
            tm['LOFGP3'] = tm['LOFGM3']/tm['LOFGA3']
            tm['LOFTP'] = tm['LOFTM']/tm['LOFTA']
            tm['LOFG3R'] = tm['LOFGA3']/tm['LOFGA']
            tm['LOFTR'] = tm['LOFTA']/tm['LOFGA']
            tm['LOEFG'] = (0.5*tm['LOFGM3']+tm['LOFGM'])/tm['LOFGA']
            tm['LOTFG'] = tm['LOScore']/(2*(0.44*tm['LOFTA']+tm['LOFGA']))

            # Team possession stats
            tm['LTPoss'] = tm['LTFGA']-tm['LTOR']+tm['LTTO']+0.4*tm['LTFTA']
            tm['LTPace'] = 40*tm['LTPoss']/tm['LMin']

            # Opponent possession stats
            tm['LOPoss'] = tm['LOFGA']-tm['LOOR']+tm['LOTO']+0.4*tm['LOFTA']
            tm['LOPace'] = 40*tm['LOPoss']/tm['LMin']

            # Team stat percentages
            tm['LTTRP'] = (tm['LTOR']+tm['LTDR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LTORP'] = tm['LTOR']/(tm['LTOR']+tm['LODR'])
            tm['LTAstP'] = tm['LTAst']/tm['LTFGM']
            tm['LTStlP'] = tm['LTStl']/tm['LOPoss']
            tm['LTBlkP'] = tm['LTBlk']/(tm['LOFGA']-tm['LOFGA3'])
            tm['LTTOP'] = tm['LTTO']/tm['LTPoss']

            # Opponent stat percentages
            tm['LOTRP'] = (tm['LOOR']+tm['LODR'])/(tm['LTOR']+tm['LTDR']+tm['LOOR']+tm['LODR'])
            tm['LOORP'] = tm['LOOR']/(tm['LOOR']+tm['LTDR'])
            tm['LOAstP'] = tm['LOAst']/tm['LOFGM']
            tm['LOStlP'] = tm['LOStl']/tm['LTPoss']
            tm['LOBlkP'] = tm['LOBlk']/(tm['LTFGA']-tm['LTFGA3'])
            tm['LOTOP'] = tm['LOTO']/tm['LOPoss']

            # Ratings
            tm['LOffRtg'] = tm['LTScore']/tm['LTPoss']
            tm['LDefRtg'] = tm['LOScore']/tm['LOPoss']

            tm['DayNum'] = daynum

            tm = tm.drop(columns=['LAWins','LALosses','LNWins','LNLosses'])

            l30_data_2021_continuous = l30_data_2021_continuous.append(tm)
    
    print('{} season completed.'.format(seas))

l30_data_2021_continuous

2021 season completed.


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LTTOP,LOTRP,LOORP,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum
0,2021,1101,0,70,20,49,13,23,17,27,...,0.218023,0.558824,0.314286,0.333333,0.087209,0.076923,0.319767,1.017442,0.683140,24
1,2021,1104,0,81,30,77,7,31,14,20,...,0.131579,0.479167,0.367347,0.526316,0.065789,0.043478,0.257069,1.065789,0.732648,24
2,2021,1108,0,50,19,63,3,16,9,12,...,0.277045,0.507937,0.217391,0.354839,0.171504,0.212766,0.138191,0.659631,1.243719,24
3,2021,1111,0,81,32,62,7,27,10,13,...,0.297927,0.357143,0.226415,0.500000,0.168394,0.000000,0.110565,1.049223,0.749386,24
4,2021,1113,0,94,27,61,6,20,34,46,...,0.153061,0.485294,0.241379,0.482759,0.102041,0.048780,0.202532,1.198980,1.113924,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,2021,1467,0.0,66.0,23.142857,51.857143,7.285714,22.0,12.428571,18.428571,...,0.212044,0.548780,0.307317,0.514451,0.125106,0.047847,0.194236,0.979644,0.996241,132
340,2021,1468,0.0,75.25,28.0,52.25,6.5,17.0,12.75,17.25,...,0.173344,0.567164,0.350000,0.634615,0.053929,0.063830,0.271493,1.159476,1.108597,132
341,2021,1469,0.0,68.333333,22.333333,55.0,7.0,21.166667,16.666667,20.333333,...,0.223881,0.538462,0.323671,0.583851,0.125933,0.137931,0.179426,0.956157,1.064593,132
342,2021,1470,0.0,62.333333,22.5,50.5,4.166667,13.0,13.166667,17.5,...,0.161804,0.558074,0.331579,0.683761,0.058355,0.093333,0.243644,0.992042,0.956038,132


In [429]:
# Get matchup history of all teams on any given day
l30_matchup_dict_2021 = {}
for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        df = season_data_2021[(season_data_2021['Season']==seas)&(season_data_2021['DayNum']<daynum)&\
                         (season_data_2021['DayNum']>=daynum-30)]
        d_team = {}
        for t in range(1101,1500):
            d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
        d_day[daynum] = d_team
    l30_matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_matchup_dict_2021[2021][100][1101])

2021 season completed.
[1368, 1146, 1372, 1230, 1249, 1394, 1358]


In [430]:
# Get matchup history of all teams' opponents on any given day
l30_opp_matchup_dict_2021 = {}

for seas in range(2021,2022):
    d_day = {}
    for daynum in range(133):
        d_team = {}
        for t in range(1101,1500):
            opp_matchups = []
            for o in l30_matchup_dict_2021[seas][daynum][t]:
                opp_matchups += l30_matchup_dict_2021[seas][daynum][o]
            d_team[t] = opp_matchups
        d_day[daynum] = d_team
    l30_opp_matchup_dict_2021[seas] = d_day
    print('{} season completed.'.format(seas))

print(l30_opp_matchup_dict_2021[2021][100][1101])

2021 season completed.
[1394, 1270, 1146, 1309, 1101, 1322, 1372, 1311, 1101, 1368, 1322, 1270, 1146, 1230, 1322, 1249, 1358, 1394, 1101, 1270, 1394, 1249, 1394, 1372, 1223, 1101, 1270, 1358, 1372, 1230, 1101, 1368, 1309, 1230, 1358, 1372, 1230, 1101, 1249, 1223, 1101, 1394, 1311, 1372]


In [431]:
l30_wp_dict_2021 = {h: {g: f.groupby('TeamID')['LWP'].apply(float).to_dict()
     for g, f in g.groupby('DayNum')}
     for h, g in l30_data_2021_continuous.groupby('Season')}

In [432]:
print('START:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_2021_continuous['LOWP'] = l30_data_2021_continuous.apply(lambda row: 
    np.mean([l30_wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in l30_matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_2021_continuous['LOOWP'] = l30_data_2021_continuous.apply(lambda row: 
    np.mean([l30_wp_dict_2021[row['Season']][row['DayNum']][x] \
             for x in l30_opp_matchup_dict_2021[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)

l30_data_2021_continuous['LSOS'] = (2*l30_data_2021_continuous['LOWP']+l30_data_2021_continuous['LOOWP'])/3

print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))

l30_data_2021_continuous

START: 2021-03-19 03:15:49.264447-07:00
FINISH: 2021-03-19 03:16:00.037853-07:00


Unnamed: 0,Season,TeamID,LNumOT,LTScore,LTFGM,LTFGA,LTFGM3,LTFGA3,LTFTM,LTFTA,...,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,DayNum,LOWP,LOOWP,LSOS
0,2021,1101,0,70,20,49,13,23,17,27,...,0.333333,0.087209,0.076923,0.319767,1.017442,0.683140,24,0.000000,1.000000,0.333333
1,2021,1104,0,81,30,77,7,31,14,20,...,0.526316,0.065789,0.043478,0.257069,1.065789,0.732648,24,0.000000,1.000000,0.333333
2,2021,1108,0,50,19,63,3,16,9,12,...,0.354839,0.171504,0.212766,0.138191,0.659631,1.243719,24,1.000000,0.000000,0.666667
3,2021,1111,0,81,32,62,7,27,10,13,...,0.500000,0.168394,0.000000,0.110565,1.049223,0.749386,24,0.000000,1.000000,0.333333
4,2021,1113,0,94,27,61,6,20,34,46,...,0.482759,0.102041,0.048780,0.202532,1.198980,1.113924,24,0.000000,1.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,2021,1467,0.0,66.0,23.142857,51.857143,7.285714,22.0,12.428571,18.428571,...,0.514451,0.125106,0.047847,0.194236,0.979644,0.996241,132,0.653741,0.386447,0.564643
340,2021,1468,0.0,75.25,28.0,52.25,6.5,17.0,12.75,17.25,...,0.634615,0.053929,0.063830,0.271493,1.159476,1.108597,132,0.562500,0.515365,0.546788
341,2021,1469,0.0,68.333333,22.333333,55.0,7.0,21.166667,16.666667,20.333333,...,0.583851,0.125933,0.137931,0.179426,0.956157,1.064593,132,0.629630,0.415123,0.558128
342,2021,1470,0.0,62.333333,22.5,50.5,4.166667,13.0,13.166667,17.5,...,0.683761,0.058355,0.093333,0.243644,0.992042,0.956038,132,0.333333,0.535301,0.400656


In [433]:
season_data_2021_final = pd.merge(season_data_2021_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], season_data_2021_continuous).drop(columns='DayNum')
l30_data_2021_final = pd.merge(l30_data_2021_continuous.groupby(['Season','TeamID'], \
        as_index=False).max()[['Season','TeamID','DayNum']], l30_data_2021_continuous).drop(columns='DayNum')

season_data_std_score_2021 = season_data_2021[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

tourney_data_2021_sc = pd.DataFrame({0:[],1:[],2:[]})

tourney_teams = pd.read_csv('MData/MDataFiles_Stage2/MNCAATourneySeeds.csv')
tourney_teams_2021 = tourney_teams[tourney_teams['Season']==2021][['Season','TeamID']]
tourney_teams_2021 = tourney_teams_2021.groupby('Season')['TeamID'].apply(set).apply(list).to_dict()

for s in sorted(tourney_teams_2021.keys()):
    teams = sorted(tourney_teams_2021[s])
    for i in range(len(teams)):
        for j in range(i+1, len(teams)):
            tourney_data_2021_sc = tourney_data_2021_sc.append(pd.DataFrame([s, teams[i], teams[j]]).T)

tourney_data_2021_sc = tourney_data_2021_sc.rename(columns={0:'Season',1:'TeamID1',2:'TeamID2'})
tourney_data_2021_sc = tourney_data_2021_sc[tourney_data_2021_sc['Season']==2021]

predict_data_2 = pd.merge(tourney_data_2021_sc, season_data_2021_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_2 = pd.merge(predict_data_2, l30_data_2021_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_2 = pd.merge(predict_data_2, season_data_2021_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_data_2 = pd.merge(predict_data_2, l30_data_2021_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

In [434]:
predict_data_2['PScore1'] = lm.intercept_

for i in range(len(fs)):
    predict_data_2['PScore1'] = predict_data_2.apply(lambda row: row['PScore1'] + row[fs[i]]*lm.coef_[i], axis=1)

predict_data_2['PScore2'] = lm.intercept_

fs2 = [x[:-1]+'y' if x[-1] == 'x' else x[:-1]+'x' for x in fs]
for i in range(len(fs2)):
    predict_data_2['PScore2'] = predict_data_2.apply(lambda row: row['PScore2'] + row[fs2[i]]*lm.coef_[i], axis=1)
    
predictions_2 = pd.merge(predict_data_2[['Season','TeamID1','TeamID2','PScore1','PScore2']], season_data_std_score_2021, \
        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
predictions_2 = pd.merge(predictions_2, season_data_std_score_2021, \
        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

predictions_2['m'] = predictions_2['PScore1']-predictions_2['PScore2']
predictions_2['sd'] = predictions_2.apply(lambda row: math.sqrt(row['StdScore_x']*row['StdScore_y']), axis=1)

predictions_2['ID'] = predictions_2.apply(lambda row: '{}_{}_{}'.format(int(row['Season']), int(row['TeamID1']), \
                                                                        int(row['TeamID2'])), axis=1)
predictions_2['Pred'] = norm.cdf(predictions_2['m'], 0, predictions_2['sd'])

predictions_2[['ID','Pred']].sort_values(by='ID').to_csv('MPredictions/SubmissionStage2.csv', index=None)

predictions_2[['ID','Pred']]

Unnamed: 0,ID,Pred
0,2021_1101_1104,0.108871
1,2021_1101_1111,0.843407
2,2021_1104_1111,0.982718
3,2021_1101_1116,0.098623
4,2021_1104_1116,0.399783
...,...,...
2273,2021_1438_1458,0.604725
2274,2021_1439_1458,0.369269
2275,2021_1452_1458,0.599776
2276,2021_1455_1458,0.266133


In [435]:
team_index = pd.read_csv('MData/MDataFiles_Stage2/MTeams.csv')
results_2 = pd.merge(team_index[['TeamID','TeamName']], predictions_2, left_on='TeamID', right_on='TeamID2')
results_2 = pd.merge(team_index[['TeamID','TeamName']], results_2, left_on='TeamID', right_on='TeamID1')

results_2.drop(columns=['Season','TeamID1','TeamID2','m','sd','ID'])[['TeamName_x','TeamName_y','Pred']].to_csv('2021_game_predictions.csv')

In [436]:
tourney_seeds = pd.read_csv('MData/MDataFiles_Stage2/MNCAATourneySeeds.csv')
bracket = pd.read_csv('MData/MDataFiles_Stage2/MNCAATourneySeedRoundSlots.csv')

seeds_2021 = tourney_seeds[tourney_seeds['Season']==2021]

bracket_lookup = pd.merge(bracket[['Seed','GameSlot','GameRound']], bracket[['Seed','GameSlot','GameRound']],\
              on=['GameSlot','GameRound'])
bracket_lookup['Seed_xy'] = bracket_lookup['Seed_x']+bracket_lookup['Seed_y']
bracket_lookup = bracket_lookup.groupby('Seed_xy', as_index=False).min()

bracket_results = pd.merge(results_2, tourney_seeds, left_on=['Season','TeamID_x'], \
                           right_on=['Season','TeamID']).drop(columns='TeamID')
bracket_results = pd.merge(bracket_results, tourney_seeds, left_on=['Season','TeamID_y'], \
                           right_on=['Season','TeamID']).drop(columns='TeamID')
bracket_results['Seed_xy'] = bracket_results['Seed_x']+bracket_results['Seed_y']

bracket_results = pd.merge(bracket_results, bracket_lookup[['GameSlot','Seed_xy','GameRound']], on=['Seed_xy'])

bracket_results

Unnamed: 0,TeamID_x,TeamName_x,TeamID_y,TeamName_y,Season,TeamID1,TeamID2,PScore1,PScore2,StdScore_x,StdScore_y,m,sd,ID,Pred,Seed_x,Seed_y,Seed_xy,GameSlot,GameRound
0,1101,Abilene Chr,1104,Alabama,2021.0,1101.0,1104.0,60.504423,72.446966,8.579521,10.942538,-11.942542,9.689259,2021_1101_1104,0.108871,W14,W02,W14W02,R3W2,3
1,1101,Abilene Chr,1111,Appalachian St,2021.0,1101.0,1111.0,60.656853,51.917571,8.579521,8.751557,8.739283,8.665112,2021_1101_1111,0.843407,W14,X16a,W14X16a,R5WX,5
2,1104,Alabama,1111,Appalachian St,2021.0,1104.0,1111.0,77.080465,56.398640,10.942538,8.751557,20.681825,9.785920,2021_1104_1111,0.982718,W02,X16a,W02X16a,R5WX,5
3,1101,Abilene Chr,1116,Arkansas,2021.0,1101.0,1116.0,55.949615,71.307541,8.579521,16.534890,-15.357925,11.910560,2021_1101_1116,0.098623,W14,Z03,W14Z03,R6CH,6
4,1104,Alabama,1116,Arkansas,2021.0,1104.0,1116.0,72.373227,75.788610,10.942538,16.534890,-3.415383,13.451158,2021_1104_1116,0.399783,W02,Z03,W02Z03,R6CH,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,1438,Virginia,1458,Wisconsin,2021.0,1438.0,1458.0,59.892275,57.471864,9.030992,9.195987,2.420411,9.113116,2021_1438_1458,0.604725,X04,Z09,X04Z09,R6CH,6
2274,1439,Virginia Tech,1458,Wisconsin,2021.0,1439.0,1458.0,55.088590,58.249652,9.752655,9.195987,-3.161062,9.470232,2021_1439_1458,0.369269,Z10,Z09,Z10Z09,R4Z1,4
2275,1452,West Virginia,1458,Wisconsin,2021.0,1452.0,1458.0,65.244858,63.041063,8.266200,9.195987,2.203795,8.718708,2021_1452_1458,0.599776,Y03,Z09,Y03Z09,R5YZ,5
2276,1455,Wichita St,1458,Wisconsin,2021.0,1455.0,1458.0,54.290003,59.850234,8.618916,9.195987,-5.560231,8.902777,2021_1455_1458,0.266133,X11b,Z09,X11bZ09,R6CH,6


In [437]:
team_list = list(set(bracket_results['TeamID_x'].append(bracket_results['TeamID_y'])))
losers = [1277, 1111, 1455, 1291]
team_list = [x for x in team_list if x not in losers]

winner_dict = {1:dict(zip(team_list, len(team_list)*[0])), 2:dict(zip(team_list, len(team_list)*[0])), \
               3:dict(zip(team_list, len(team_list)*[0])), 4:dict(zip(team_list, len(team_list)*[0])), \
               5:dict(zip(team_list, len(team_list)*[0])), 6:dict(zip(team_list, len(team_list)*[0]))}

N = 1000

for i in range(N):
    
    winners = team_list

    for r in range(1,7):
        sim = bracket_results.copy()
        sim = sim = sim[(sim['GameRound']==r)&(sim['TeamID_x'].isin(winners))&(sim['TeamID_y'].isin(winners))]

        sim['Sim'] = sim.apply(lambda row: np.random.uniform(0,1), axis=1)
        sim['Winner'] = sim.apply(lambda row: row['TeamID_x'] if row['Sim']<row['Pred'] else row['TeamID_y'], axis=1)

        winners = list(sim['Winner'].values)

        for w in winners:
            winner_dict[r][w] += 1
    
win_rate = {}

for d in winner_dict:
    win_rate[d] = {k:(v/N) for k, v in winner_dict[d].items()}
    
print(win_rate[1])

{1281: 0.228, 1155: 0.692, 1156: 0.001, 1411: 0.19, 1159: 0.063, 1160: 0.633, 1287: 0.642, 1417: 0.26, 1163: 0.476, 1166: 0.65, 1422: 0.207, 1425: 0.48, 1429: 0.195, 1433: 0.592, 1179: 0.52, 1180: 0.017, 1437: 0.818, 1438: 0.771, 1439: 0.239, 1313: 0.0, 1186: 0.026, 1314: 0.641, 1317: 0.228, 1196: 0.761, 1325: 0.229, 1326: 0.927, 1199: 0.793, 1328: 0.772, 1329: 0.931, 1452: 0.358, 1331: 0.073, 1332: 0.408, 1333: 0.284, 1207: 0.367, 1457: 0.182, 1458: 0.359, 1210: 0.802, 1211: 1.0, 1213: 0.006, 1216: 0.166, 1345: 0.772, 1222: 0.999, 1353: 0.308, 1228: 0.983, 1101: 0.116, 1104: 0.989, 1233: 0.011, 1234: 0.994, 1361: 0.658, 1364: 0.35, 1242: 0.974, 1116: 0.937, 1251: 0.069, 1124: 0.834, 1382: 0.151, 1260: 0.198, 1261: 0.849, 1393: 0.342, 1140: 0.74, 1268: 0.524, 1397: 0.716, 1400: 0.884, 1403: 0.805, 1276: 0.81}


In [438]:
team_odds = tourney_seeds[tourney_seeds['Season']==2021]
team_odds = pd.merge(team_odds, team_index[~team_index['TeamID'].isin(losers)], on='TeamID')
team_odds['R1'] = team_odds.apply(lambda row: win_rate[1][row['TeamID']], axis=1)
team_odds['R2'] = team_odds.apply(lambda row: win_rate[2][row['TeamID']], axis=1)
team_odds['R3'] = team_odds.apply(lambda row: win_rate[3][row['TeamID']], axis=1)
team_odds['R4'] = team_odds.apply(lambda row: win_rate[4][row['TeamID']], axis=1)
team_odds['R5'] = team_odds.apply(lambda row: win_rate[5][row['TeamID']], axis=1)
team_odds['R6'] = team_odds.apply(lambda row: win_rate[6][row['TeamID']], axis=1)

team_odds.to_csv('MPredictions/team_odds_2021.csv', index=False)

team_odds

Unnamed: 0,Season,Seed,TeamID,TeamName,FirstD1Season,LastD1Season,R1,R2,R3,R4,R5,R6
0,2021,W01,1276,Michigan,1985,2021,0.810,0.207,0.055,0.007,0.001,0.000
1,2021,W02,1104,Alabama,1985,2021,0.989,0.800,0.482,0.312,0.152,0.036
2,2021,W03,1400,Texas,1985,2021,0.884,0.497,0.224,0.126,0.043,0.007
3,2021,W04,1199,Florida St,1985,2021,0.793,0.424,0.228,0.093,0.032,0.002
4,2021,W05,1160,Colorado,1985,2021,0.633,0.368,0.189,0.075,0.027,0.004
...,...,...,...,...,...,...,...,...,...,...,...,...
59,2021,Z12,1457,Winthrop,1987,2021,0.182,0.025,0.005,0.000,0.000,0.000
60,2021,Z13,1317,North Texas,1985,2021,0.228,0.100,0.034,0.009,0.000,0.000
61,2021,Z14,1159,Colgate,1985,2021,0.063,0.009,0.003,0.000,0.000,0.000
62,2021,Z15,1331,Oral Roberts,1985,2021,0.073,0.019,0.000,0.000,0.000,0.000


In [563]:
results_2[results_2['TeamName_x']=='Ohio St']

Unnamed: 0,TeamID_x,TeamName_x,TeamID_y,TeamName_y,Season,TeamID1,TeamID2,PScore1,PScore2,StdScore_x,StdScore_y,m,sd,ID,Pred
1900,1326,Ohio St,1328,Oklahoma,2021.0,1326.0,1328.0,64.586491,60.314629,7.65693,9.46774,4.271862,8.51433,2021_1326_1328,0.69207
1901,1326,Ohio St,1329,Oklahoma St,2021.0,1326.0,1329.0,64.407261,68.461091,7.65693,6.98796,-4.05383,7.314801,2021_1326_1329,0.289723
1902,1326,Ohio St,1331,Oral Roberts,2021.0,1326.0,1331.0,75.867236,62.86355,7.65693,11.340307,13.003686,9.318365,2021_1326_1331,0.918566
1903,1326,Ohio St,1332,Oregon,2021.0,1326.0,1332.0,60.771279,58.931195,7.65693,8.01643,1.840084,7.834618,2021_1326_1332,0.592844
1904,1326,Ohio St,1333,Oregon St,2021.0,1326.0,1333.0,66.219534,57.940296,7.65693,8.084708,8.279238,7.867912,2021_1326_1333,0.853664
1905,1326,Ohio St,1345,Purdue,2021.0,1326.0,1345.0,61.954652,65.242277,7.65693,8.649304,-3.287625,8.138004,2021_1326_1345,0.343112
1906,1326,Ohio St,1353,Rutgers,2021.0,1326.0,1353.0,67.073579,59.99362,7.65693,9.998571,7.079959,8.749763,2021_1326_1353,0.790788
1907,1326,Ohio St,1361,San Diego St,2021.0,1326.0,1361.0,58.480704,60.618199,7.65693,10.084062,-2.137495,8.78709,2021_1326_1361,0.403904
1908,1326,Ohio St,1364,UC Santa Barbara,2021.0,1326.0,1364.0,61.250524,53.430096,7.65693,11.076172,7.820428,9.209206,2021_1326_1364,0.802114
1909,1326,Ohio St,1382,St Bonaventure,2021.0,1326.0,1382.0,65.2994,58.669684,7.65693,9.083868,6.629715,8.339937,2021_1326_1382,0.786675


## Predict with PCA

In [452]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [499]:
features = pd.read_csv('MFeatures/featuresG10000.txt')
fs = features['feature'].values
fs = [x for x in fs if x not in ['Min_x','Min_y','LMin_x','LMin_y','G_x','G_y','LG_x','LG_y',\
                                 'Wins_x','Wins_y','LWins_x','LWins_y','TPF_x','OPF_x','LTPF_x',\
                                 'LOPF_x','TPF_y','OPF_y','LTPF_y','LOPF_y']]

X = train_data.iloc[:,3:][fs].values
y = train_data['Score'].values

scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

pca = PCA(n_components=32, svd_solver='full')
pca.fit(X_scaled)
print('EXPLAINED VARIANCE RATIO:\n', np.cumsum(pca.explained_variance_ratio_), '\n')

X_comp = pca.transform(X_scaled)
train_pca = pd.DataFrame(X_comp)

lm = LinearRegression().fit(X_comp, y)

train_pca['PScore'] = lm.intercept_

for i in range(32):
    train_pca['PScore'] = train_pca.apply(lambda row: row['PScore'] + row[i]*lm.coef_[i], axis=1)
    
sse = sum_sq_err(y, lm.predict(X_comp))
rmse = np.sqrt(sse/len(y))

print('TRAIN RMSE: {}'.format(rmse))

X = test_data.iloc[:,3:][fs].values
y = test_data['Score'].values

X_scaled = scaler.transform(X)

X_comp = pca.transform(X_scaled)
test_pca = pd.DataFrame(X_comp)

test_pca['PScore'] = lm.intercept_

for i in range(32):
    test_pca['PScore'] = test_pca.apply(lambda row: row['PScore'] + row[i]*lm.coef_[i], axis=1)

sse = sum_sq_err(y, lm.predict(X_comp))
rmse = np.sqrt(sse/len(y))

print('TEST RMSE: {}'.format(rmse))

pd.concat([test_data.drop(columns=['PScore']).reset_index(drop=True), test_pca.reset_index(drop=True)], \
          axis=1)[['Season','TeamID','Score','PScore']]

EXPLAINED VARIANCE RATIO:
 [0.13994378 0.22355561 0.29858386 0.36397167 0.42786569 0.47754209
 0.52407812 0.55937152 0.59299542 0.62261456 0.65079166 0.67572614
 0.69977152 0.72215302 0.74288282 0.76270703 0.78047458 0.79765559
 0.81400077 0.82983847 0.8441618  0.85759635 0.87074441 0.88329312
 0.89434931 0.90439501 0.91360897 0.92237445 0.93058762 0.93784839
 0.94498238 0.951668  ] 

TRAIN RMSE: 9.542197824868513
TEST RMSE: 10.97670758315237


Unnamed: 0,Season,TeamID,Score,PScore
0,2017,1376,77,62.901265
1,2010,1397,83,72.753307
2,2014,1235,85,78.851075
3,2013,1393,55,66.337632
4,2011,1433,72,64.922666
...,...,...,...,...
129,2014,1163,77,62.573024
130,2018,1222,63,68.995532
131,2017,1124,91,71.098888
132,2012,1393,64,60.491788


In [538]:
predict_pca_1 = pd.merge(tourney_data_sc, season_data_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_pca_1 = pd.merge(predict_pca_1, l30_data_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_pca_1 = pd.merge(predict_pca_1, season_data_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_pca_1 = pd.merge(predict_pca_1, l30_data_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

X1 = predict_pca_1.iloc[:,3:][fs].values
X1_scaled = scaler.transform(X1)
X1_comp = pca.transform(X1_scaled)

X2 = predict_pca_1.iloc[:,3:][fs2].rename(columns=dict(zip(fs2,fs)))
X2_scaled = scaler.transform(X2)
X2_comp = pca.transform(X2_scaled)

predictions_pca_1_1 = pd.DataFrame(X1_comp)
predictions_pca_2_1 = pd.DataFrame(X2_comp)

predictions_pca_1_1['PScore1'] = lm.intercept_
for i in range(32):
    predictions_pca_1_1['PScore1'] = predictions_pca_1_1.apply(lambda row: row['PScore1'] + row[i]*lm.coef_[i], axis=1)

predictions_pca_2_1['PScore2'] = lm.intercept_
for i in range(32):
    predictions_pca_2_1['PScore2'] = predictions_pca_2_1.apply(lambda row: row['PScore2'] + row[i]*lm.coef_[i], axis=1)
    

predictions_pca_1 = pd.concat([predict_pca_1, predictions_pca_1_1, predictions_pca_2_1], axis=1)

predictions_pca_1 = pd.merge(predictions_pca_1[['Season','TeamID1','TeamID2','PScore1','PScore2']], season_data_std_score, \
        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
predictions_pca_1 = pd.merge(predictions_pca_1, season_data_std_score, \
        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

predictions_pca_1['m'] = predictions_pca_1['PScore1']-predictions_pca_1['PScore2']
predictions_pca_1['sd'] = predictions_pca_1.apply(lambda row: math.sqrt(row['StdScore_x']*row['StdScore_y']), axis=1)

predictions_pca_1['ID'] = predictions_pca_1.apply(lambda row: '{}_{}_{}'.format(int(row['Season']), int(row['TeamID1']), \
                                                                        int(row['TeamID2'])), axis=1)
predictions_pca_1['Pred'] = norm.cdf(predictions_pca_1['m'], 0, predictions_pca_1['sd'])

predictions_pca_1[['ID','Pred']].to_csv('MPredictions/SubmissionStage1.csv', index=None)

predictions_pca_1[['ID','Pred']]

Unnamed: 0,ID,Pred
0,2015_1107_1112,0.046759
1,2015_1107_1116,0.224481
2,2015_1112_1116,0.785761
3,2015_1107_1124,0.076527
4,2015_1112_1124,0.632491
...,...,...
11385,2019_1438_1463,0.926855
11386,2019_1439_1463,0.787694
11387,2019_1449_1463,0.725647
11388,2019_1458_1463,0.774736


In [539]:
predict_pca_2 = pd.merge(tourney_data_2021_sc, season_data_2021_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_pca_2 = pd.merge(predict_pca_2, l30_data_2021_final, left_on=['Season','TeamID1'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_pca_2 = pd.merge(predict_pca_2, season_data_2021_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])
predict_pca_2 = pd.merge(predict_pca_2, l30_data_2021_final, left_on=['Season','TeamID2'], \
        right_on=['Season','TeamID']).drop(columns=['TeamID'])

X1 = predict_pca_2.iloc[:,3:][fs].values
X1_scaled = scaler.transform(X1)
X1_comp = pca.transform(X1_scaled)

X2 = predict_pca_2.iloc[:,3:][fs2].rename(columns=dict(zip(fs2,fs)))
X2_scaled = scaler.transform(X2)
X2_comp = pca.transform(X2_scaled)

predictions_pca_1_2 = pd.DataFrame(X1_comp)
predictions_pca_2_2 = pd.DataFrame(X2_comp)

predictions_pca_1_2['PScore1'] = lm.intercept_
for i in range(32):
    predictions_pca_1_2['PScore1'] = predictions_pca_1_2.apply(lambda row: row['PScore1'] + row[i]*lm.coef_[i], axis=1)

predictions_pca_2_2['PScore2'] = lm.intercept_
for i in range(32):
    predictions_pca_2_2['PScore2'] = predictions_pca_2_2.apply(lambda row: row['PScore2'] + row[i]*lm.coef_[i], axis=1)
    

predictions_pca_2 = pd.concat([predict_pca_2, predictions_pca_1_2, predictions_pca_2_2], axis=1)

predictions_pca_2 = pd.merge(predictions_pca_2[['Season','TeamID1','TeamID2','PScore1','PScore2']], season_data_std_score_2021, \
        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
predictions_pca_2 = pd.merge(predictions_pca_2, season_data_std_score_2021, \
        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

predictions_pca_2['m'] = predictions_pca_2['PScore1']-predictions_pca_2['PScore2']
predictions_pca_2['sd'] = predictions_pca_2.apply(lambda row: math.sqrt(row['StdScore_x']*row['StdScore_y']), axis=1)

predictions_pca_2['ID'] = predictions_pca_2.apply(lambda row: '{}_{}_{}'.format(int(row['Season']), int(row['TeamID1']), \
                                                                        int(row['TeamID2'])), axis=1)
predictions_pca_2['Pred'] = norm.cdf(predictions_pca_2['m'], 0, predictions_pca_2['sd'])

predictions_pca_2[['ID','Pred']].to_csv('MPredictions/SubmissionStage2.csv', index=None)

predictions_pca_2[['ID','Pred']]

Unnamed: 0,ID,Pred
0,2021_1101_1104,0.180141
1,2021_1101_1111,0.669688
2,2021_1104_1111,0.902263
3,2021_1101_1116,0.229251
4,2021_1104_1116,0.501023
...,...,...
2273,2021_1438_1458,0.708798
2274,2021_1439_1458,0.485601
2275,2021_1452_1458,0.777707
2276,2021_1455_1458,0.516743


In [542]:
results_final = pd.merge(team_index[['TeamID','TeamName']], predictions_pca_2, left_on='TeamID', right_on='TeamID2')
results_final = pd.merge(team_index[['TeamID','TeamName']], results_final, left_on='TeamID', right_on='TeamID1')

results_final.drop(columns=['Season','TeamID1','TeamID2','m','sd','ID'])[['TeamName_x',\
                                                    'TeamName_y','Pred']].to_csv('2021_game_predictions.csv')

bracket_results_final = pd.merge(results_final, tourney_seeds, left_on=['Season','TeamID_x'], \
                           right_on=['Season','TeamID']).drop(columns='TeamID')
bracket_results_final = pd.merge(bracket_results_final, tourney_seeds, left_on=['Season','TeamID_y'], \
                           right_on=['Season','TeamID']).drop(columns='TeamID')
bracket_results_final['Seed_xy'] = bracket_results_final['Seed_x']+bracket_results_final['Seed_y']

bracket_results_final = pd.merge(bracket_results_final, bracket_lookup[['GameSlot','Seed_xy','GameRound']], \
                                 on=['Seed_xy'])

bracket_results_final

Unnamed: 0,TeamID_x,TeamName_x,TeamID_y,TeamName_y,Season,TeamID1,TeamID2,PScore1,PScore2,StdScore_x,StdScore_y,m,sd,ID,Pred,Seed_x,Seed_y,Seed_xy,GameSlot,GameRound
0,1101,Abilene Chr,1104,Alabama,2021.0,1101.0,1104.0,69.240011,78.103998,8.579521,10.942538,-8.863987,9.689259,2021_1101_1104,0.180141,W14,W02,W14W02,R3W2,3
1,1101,Abilene Chr,1111,Appalachian St,2021.0,1101.0,1111.0,70.718336,66.913896,8.579521,8.751557,3.804440,8.665112,2021_1101_1111,0.669688,W14,X16a,W14X16a,R5WX,5
2,1104,Alabama,1111,Appalachian St,2021.0,1104.0,1111.0,81.673287,69.004860,10.942538,8.751557,12.668427,9.785920,2021_1104_1111,0.902263,W02,X16a,W02X16a,R5WX,5
3,1101,Abilene Chr,1116,Arkansas,2021.0,1101.0,1116.0,68.906734,77.736214,8.579521,16.534890,-8.829480,11.910560,2021_1101_1116,0.229251,W14,Z03,W14Z03,R6CH,6
4,1104,Alabama,1116,Arkansas,2021.0,1104.0,1116.0,79.861685,79.827178,10.942538,16.534890,0.034507,13.451158,2021_1104_1116,0.501023,W02,Z03,W02Z03,R6CH,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,1438,Virginia,1458,Wisconsin,2021.0,1438.0,1458.0,69.783584,64.772490,9.030992,9.195987,5.011094,9.113116,2021_1438_1458,0.708798,X04,Z09,X04Z09,R6CH,6
2274,1439,Virginia Tech,1458,Wisconsin,2021.0,1439.0,1458.0,68.735786,69.077678,9.752655,9.195987,-0.341892,9.470232,2021_1439_1458,0.485601,Z10,Z09,Z10Z09,R4Z1,4
2275,1452,West Virginia,1458,Wisconsin,2021.0,1452.0,1458.0,78.293200,71.627981,8.266200,9.195987,6.665218,8.718708,2021_1452_1458,0.777707,Y03,Z09,Y03Z09,R5YZ,5
2276,1455,Wichita St,1458,Wisconsin,2021.0,1455.0,1458.0,71.506547,71.132795,8.618916,9.195987,0.373751,8.902777,2021_1455_1458,0.516743,X11b,Z09,X11bZ09,R6CH,6


In [543]:
team_list = list(set(bracket_results_final['TeamID_x'].append(bracket_results_final['TeamID_y'])))
losers = [1277, 1111, 1455, 1291]
team_list = [x for x in team_list if x not in losers]

winner_dict = {1:dict(zip(team_list, len(team_list)*[0])), 2:dict(zip(team_list, len(team_list)*[0])), \
               3:dict(zip(team_list, len(team_list)*[0])), 4:dict(zip(team_list, len(team_list)*[0])), \
               5:dict(zip(team_list, len(team_list)*[0])), 6:dict(zip(team_list, len(team_list)*[0]))}

N = 1000

for i in range(N):
    
    winners = team_list

    for r in range(1,7):
        sim = bracket_results_final.copy()
        sim = sim = sim[(sim['GameRound']==r)&(sim['TeamID_x'].isin(winners))&(sim['TeamID_y'].isin(winners))]

        sim['Sim'] = sim.apply(lambda row: np.random.uniform(0,1), axis=1)
        sim['Winner'] = sim.apply(lambda row: row['TeamID_x'] if row['Sim']<row['Pred'] else row['TeamID_y'], axis=1)

        winners = list(sim['Winner'].values)

        for w in winners:
            winner_dict[r][w] += 1
    
win_rate = {}

for d in winner_dict:
    win_rate[d] = {k:(v/N) for k, v in winner_dict[d].items()}
    
print(win_rate[1])

{1281: 0.382, 1155: 0.597, 1156: 0.011, 1411: 0.158, 1159: 0.511, 1160: 0.68, 1287: 0.1, 1417: 0.376, 1163: 0.699, 1166: 0.495, 1422: 0.314, 1425: 0.785, 1429: 0.429, 1433: 0.533, 1179: 0.215, 1180: 0.1, 1437: 0.739, 1438: 0.908, 1439: 0.553, 1313: 0.029, 1186: 0.225, 1314: 0.688, 1317: 0.422, 1196: 0.447, 1325: 0.092, 1326: 0.783, 1199: 0.686, 1328: 0.618, 1329: 0.784, 1452: 0.9, 1331: 0.217, 1332: 0.467, 1333: 0.196, 1207: 0.32, 1457: 0.261, 1458: 0.312, 1210: 0.424, 1211: 0.971, 1213: 0.271, 1216: 0.053, 1345: 0.578, 1222: 0.989, 1353: 0.403, 1228: 0.9, 1101: 0.141, 1104: 0.811, 1233: 0.189, 1234: 0.729, 1361: 0.734, 1364: 0.505, 1242: 0.775, 1116: 0.489, 1251: 0.216, 1124: 0.947, 1382: 0.646, 1260: 0.576, 1261: 0.354, 1393: 0.266, 1140: 0.624, 1268: 0.301, 1397: 0.804, 1400: 0.859, 1403: 0.571, 1276: 0.842}


In [544]:
team_odds = tourney_seeds[tourney_seeds['Season']==2021]
team_odds = pd.merge(team_odds, team_index[~team_index['TeamID'].isin(losers)], on='TeamID')
team_odds['R1'] = team_odds.apply(lambda row: win_rate[1][row['TeamID']], axis=1)
team_odds['R2'] = team_odds.apply(lambda row: win_rate[2][row['TeamID']], axis=1)
team_odds['R3'] = team_odds.apply(lambda row: win_rate[3][row['TeamID']], axis=1)
team_odds['R4'] = team_odds.apply(lambda row: win_rate[4][row['TeamID']], axis=1)
team_odds['R5'] = team_odds.apply(lambda row: win_rate[5][row['TeamID']], axis=1)
team_odds['R6'] = team_odds.apply(lambda row: win_rate[6][row['TeamID']], axis=1)

team_odds.to_csv('MPredictions/team_odds_pca_2021.csv', index=False)

team_odds

Unnamed: 0,Season,Seed,TeamID,TeamName,FirstD1Season,LastD1Season,R1,R2,R3,R4,R5,R6
0,2021,W01,1276,Michigan,1985,2021,0.842,0.365,0.207,0.091,0.023,0.001
1,2021,W02,1104,Alabama,1985,2021,0.811,0.461,0.242,0.135,0.064,0.022
2,2021,W03,1400,Texas,1985,2021,0.859,0.559,0.319,0.212,0.103,0.025
3,2021,W04,1199,Florida St,1985,2021,0.686,0.403,0.185,0.087,0.025,0.009
4,2021,W05,1160,Colorado,1985,2021,0.680,0.366,0.152,0.044,0.015,0.002
...,...,...,...,...,...,...,...,...,...,...,...,...
59,2021,Z12,1457,Winthrop,1987,2021,0.261,0.122,0.027,0.011,0.001,0.000
60,2021,Z13,1317,North Texas,1985,2021,0.422,0.147,0.039,0.015,0.005,0.000
61,2021,Z14,1159,Colgate,1985,2021,0.511,0.323,0.245,0.118,0.050,0.025
62,2021,Z15,1331,Oral Roberts,1985,2021,0.217,0.073,0.019,0.001,0.000,0.000


In [547]:
results_final[results_final['TeamName_x']=='Gonzaga']

Unnamed: 0,TeamID_x,TeamName_x,TeamID_y,TeamName_y,Season,TeamID1,TeamID2,PScore1,PScore2,StdScore_x,StdScore_y,m,sd,ID,Pred
1102,1211,Gonzaga,1213,Grand Canyon,2021.0,1211.0,1213.0,81.113778,64.972922,10.226737,8.13215,16.140856,9.119505,2021_1211_1213,0.96163
1103,1211,Gonzaga,1216,Hartford,2021.0,1211.0,1216.0,81.447787,65.54821,10.226737,7.012234,15.899577,8.46831,2021_1211_1216,0.969778
1104,1211,Gonzaga,1222,Houston,2021.0,1211.0,1222.0,72.480303,75.14345,10.226737,9.529028,-2.663147,9.87172,2021_1211_1222,0.393667
1105,1211,Gonzaga,1228,Illinois,2021.0,1211.0,1228.0,83.442512,76.26279,10.226737,12.267764,7.179723,11.200857,2021_1211_1228,0.739238
1106,1211,Gonzaga,1233,Iona,2021.0,1211.0,1233.0,80.341533,65.929413,10.226737,11.249242,14.41212,10.725812,2021_1211_1233,0.910475
1107,1211,Gonzaga,1234,Iowa,2021.0,1211.0,1234.0,86.259389,76.422159,10.226737,12.61858,9.83723,11.359881,2021_1211_1234,0.806745
1108,1211,Gonzaga,1242,Kansas,2021.0,1211.0,1242.0,80.319948,71.161043,10.226737,12.867976,9.158905,11.471591,2021_1211_1242,0.78768
1109,1211,Gonzaga,1251,Liberty,2021.0,1211.0,1251.0,80.08755,65.443351,10.226737,11.031852,14.6442,10.621669,2021_1211_1251,0.916008
1110,1211,Gonzaga,1260,Loyola-Chicago,2021.0,1211.0,1260.0,75.374927,69.273964,10.226737,10.30204,6.100963,10.26432,2021_1211_1260,0.723873
1111,1211,Gonzaga,1261,LSU,2021.0,1211.0,1261.0,87.418387,77.903553,10.226737,8.259674,9.514835,9.19073,2021_1211_1261,0.849727
