# Predicting March Madness Winners

In [72]:
import numpy as np
import pandas as pd

import random
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.stats import norm
import xgboost as xgb
import graphviz

from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
from datetime import datetime
import pytz
import swifter
import itertools
import os

In [73]:
api = KaggleApi()
api.authenticate()

In [74]:
def write_to_csv(df, outname, outdir, index=None):
    '''
    inputs:
    output:
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    fullname = os.path.join(outdir, outname)    
    df.to_csv(fullname, index=index) 
    return

def save_model_to_dir(model, outname, outdir):
    '''
    inputs:
    output:
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    fullname = os.path.join(outdir, outname)    
    model.save_model(fullname)
    return

## <u>Data Processing</u>

### Pull Data from Kaggle
Use the Kaggle API to list and download relevant files for 2022 March Madness Tournament (men's or women's).

In [75]:
bracket_type = 'womens'
bi = bracket_type[0].upper()

In [76]:
# api.competitions_list(search='march')
api.competition_list_files('{}-march-mania-2022'.format(bracket_type))



[WDataFiles_Stage1/WSampleSubmissionStage1.csv,
 WDataFiles_Stage1/WNCAATourneySeeds.csv,
 WDataFiles_Stage1/WTeamSpellings.csv,
 WDataFiles_Stage1/WGameCities.csv,
 WDataFiles_Stage1/WRegularSeasonDetailedResults.csv,
 WDataFiles_Stage1/WNCAATourneyDetailedResults.csv,
 WDataFiles_Stage1/WSeasons.csv,
 WDataFiles_Stage1/WNCAATourneyCompactResults.csv,
 WDataFiles_Stage1/WTeamConferences.csv,
 WDataFiles_Stage1/Conferences.csv,
 WDataFiles_Stage1/WTeams.csv,
 WDataFiles_Stage1/WNCAATourneySlots.csv,
 WDataFiles_Stage1/Cities.csv,
 WDataFiles_Stage1/WRegularSeasonCompactResults.csv,
 WDataFiles_Stage2/WNCAATourneyCompactResults.csv,
 WDataFiles_Stage2/WNCAATourneySeeds.csv,
 WDataFiles_Stage2/Conferences.csv,
 WDataFiles_Stage2/WNCAATourneySlots1998thru2021.csv,
 WDataFiles_Stage2/WGameCities.csv,
 WDataFiles_Stage2/WRegularSeasonCompactResults.csv,
 WDataFiles_Stage2/WNCAATourneySlots2022.csv,
 WDataFiles_Stage2/WNCAATourneyDetailedResults.csv,
 WDataFiles_Stage2/WSampleSubmissionStage

<b>Note:</b> Only run this section once to download files

In [77]:
# api.competition_download_files('{}-march-mania-2022'.format(bracket_type))

# zf = ZipFile('{}-march-mania-2022.zip'.format(bracket_type))
# zf.extractall('Data/') #save files in selected folder
# zf.close()

### Extract Regular Season Matchup Data

In [78]:
season_data = pd.read_csv('Data/{}DataFiles_Stage1/{}RegularSeasonDetailedResults.csv'.format(bi, bi))
season_data.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

### Functions to Transform Regular Season Matchup Data

In [13]:
def get_continuous_data(data, min_season=2010, max_season=2022, last_n=None, vocal=True):
    '''
    inputs: data (dataframe), matchup data with each teams' basic stats as features
            min_season (int), minimum season to consider in data (default 2010)
            max_season (int), maximum season to consider in data (default 2022)
            last_n (int), number of days to consider in look-back window (default None -- all days considered)
            vocal (bool), prints updates if True (default True)
    output: rolling averages of "flattened" version of data (each team in matchup given its own row) 
            with advanced stats added (Win%, Away-Win%, Away/Neutral-Win%, Min, G, FG%, FT%, 3P%, 3Pr, FTr, 
            eFG%, TS%, Possession, Pace, Reb%, O-Reb%, Ast%, Stl%, Blk%, TO%, Off/Def Rating for Team/Opponent)
    '''
    if vocal:
        print('---- PROCESS CONTINOUS SEASON DATA ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
        
    cols1 = ['Season','DayNum','WTeamID','WLoc','NumOT','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA',\
             'WOR','WDR','WAst','WTO','WStl','WBlk','WPF','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA',\
             'LOR','LDR','LAst','LTO','LStl','LBlk','LPF']
    cols2 = ['Season','DayNum','LTeamID','WLoc','NumOT','LScore','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA',\
             'LOR','LDR','LAst','LTO','LStl','LBlk','LPF','WScore','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA',\
             'WOR','WDR','WAst','WTO','WStl','WBlk','WPF']

    cols = ['Season','DayNum','TeamID','Loc','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA',
            'TOR','TDR','TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA',\
            'OOR','ODR','OAst','OTO','OStl','OBlk','OPF']

    all_cols = ['Season','TeamID','NumOT','TScore','TFGM','TFGA','TFGM3','TFGA3','TFTM','TFTA','TOR','TDR',\
                'TAst','TTO','TStl','TBlk','TPF','OScore','OFGM','OFGA','OFGM3','OFGA3','OFTM','OFTA','OOR','ODR',\
                'OAst','OTO','OStl','OBlk','OPF','Wins','G','WP','AWP','ANWP','Min','TFGP','TFGP3','TFTP',\
                'TFG3R','TFTR','TEFG','TTFG','OFGP','OFGP3','OFTP','OFG3R','OFTR','OEFG','OTFG','TPoss','TPace',\
                'OPoss','OPace','TTRP','TORP','TAstP','TStlP','TBlkP','TTOP','OTRP','OORP','OAstP','OStlP',\
                'OBlkP','OTOP','OffRtg','DefRtg','DayNum']

    dfc = pd.DataFrame(columns=all_cols)
    
    # Iterate overe each season of data
    for seas in range(min_season, max_season+1):
        max_daynum = max(data[(data['Season']==seas)]['DayNum'])
        for daynum in range(1,max_daynum+1):
            days_back = last_n
            if days_back is None:
                days_back = daynum

            df = data[(data['Season']==seas)&(data['DayNum']<daynum)&(season_data['DayNum']>=daynum-days_back)]

            if len(df) > 0:
                tm1 = df[cols1].rename(columns=dict(zip(cols1, cols)))
                tm2 = df[cols2].rename(columns=dict(zip(cols2, cols)))

                # Calculate total wins
                tm1['Wins'] = 1
                tm2['Wins'] = 0

                # Calculate total away wins and losses (Loc will be H for the losing team)
                tm1['AWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'A' else 0, axis=1)
                tm2['AWins'] = 0
                tm1['ALosses'] = 0
                tm2['ALosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'H' else 0, axis=1)

                # Calculate total neutral wins and losses
                tm1['NWins'] = tm1.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)
                tm2['NWins'] = 0
                tm1['NLosses'] = 0
                tm2['NLosses'] = tm2.apply(lambda row: 1 if row['Loc'] == 'N' else 0, axis=1)

                tm = tm1.append(tm2)

                tm['G'] = 1

                agg_funcs = 29*['mean']+6*['sum']
                tm = tm.groupby(['Season','TeamID'], as_index=False).agg(dict(zip(cols[4:]+['G','Wins',\
                                                        'AWins','ALosses','NWins','NLosses'], agg_funcs)))

                # Game statistics
                tm['WP'] = tm['Wins']/tm['G'] # Win%
                tm['AWP'] = tm['AWins']/(tm['AWins']+tm['ALosses']) # Away Win%
                tm['ANWP'] = (tm['AWins']+tm['NWins'])/(tm['AWins']+tm['ALosses']+tm['NWins']+tm['NLosses']) # Away/Neutral Win%
                tm['Min'] = 40*tm['G']+5*tm['NumOT'] # Min

                # Team shooting percentages
                tm['TFGP'] = tm['TFGM']/tm['TFGA'] # Team FG%
                tm['TFGP3'] = tm['TFGM3']/tm['TFGA3'] # Team 3P%
                tm['TFTP'] = tm['TFTM']/tm['TFTA'] # Team FT%
                tm['TFG3R'] = tm['TFGA3']/tm['TFGA'] # Team 3P Rate
                tm['TFTR'] = tm['TFTA']/tm['TFGA'] # Team FT Rate
                tm['TEFG'] = (0.5*tm['TFGM3']+tm['TFGM'])/tm['TFGA'] # Team Effective FG%
                tm['TTFG'] = tm['TScore']/(2*(0.44*tm['TFTA']+tm['TFGA'])) # Team TS%

                # Opponent shooting percentages
                tm['OFGP'] = tm['OFGM']/tm['OFGA'] # Opponent FG%
                tm['OFGP3'] = tm['OFGM3']/tm['OFGA3'] # Opponent 3P%
                tm['OFTP'] = tm['OFTM']/tm['OFTA'] # Opponent FT%
                tm['OFG3R'] = tm['OFGA3']/tm['OFGA'] # Opponent 3P Rate
                tm['OFTR'] = tm['OFTA']/tm['OFGA'] # Opponent FT Rate
                tm['OEFG'] = (0.5*tm['OFGM3']+tm['OFGM'])/tm['OFGA'] # Opponent Effective FG%
                tm['OTFG'] = tm['OScore']/(2*(0.44*tm['OFTA']+tm['OFGA'])) # Opponent TS%

                # Team possession stats
                tm['TPoss'] = tm['TFGA']-tm['TOR']+tm['TTO']+0.4*tm['TFTA'] # Team Possession
                tm['TPace'] = 40*tm['TPoss']/tm['Min'] # Team Pace

                # Opponent possession stats
                tm['OPoss'] = tm['OFGA']-tm['OOR']+tm['OTO']+0.4*tm['OFTA'] # Opponent Possession
                tm['OPace'] = 40*tm['OPoss']/tm['Min'] # Opponent Pace

                # Team stat percentages
                tm['TTRP'] = (tm['TOR']+tm['TDR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR']) # Team Rebound%
                tm['TORP'] = tm['TOR']/(tm['TOR']+tm['ODR']) # Team Offensive-Rebound%
                tm['TAstP'] = tm['TAst']/tm['TFGM'] # Team Ast%
                tm['TStlP'] = tm['TStl']/tm['OPoss'] # Team Stl%
                tm['TBlkP'] = tm['TBlk']/(tm['OFGA']-tm['OFGA3']) # Team Blk%
                tm['TTOP'] = tm['TTO']/tm['TPoss'] # Team TO%

                # Opponent stat percentages
                tm['OTRP'] = (tm['OOR']+tm['ODR'])/(tm['TOR']+tm['TDR']+tm['OOR']+tm['ODR']) # Opponent Rebound%
                tm['OORP'] = tm['OOR']/(tm['OOR']+tm['TDR']) # Opponent Offensive-Rebound%
                tm['OAstP'] = tm['OAst']/tm['OFGM'] # Opponent Ast%
                tm['OStlP'] = tm['OStl']/tm['TPoss'] # Opponent Stl%
                tm['OBlkP'] = tm['OBlk']/(tm['TFGA']-tm['TFGA3']) # Opponent Blk%
                tm['OTOP'] = tm['OTO']/tm['OPoss'] # Opponent TO%

                # Ratings
                tm['OffRtg'] = tm['TScore']/tm['TPoss'] # Offensive Rating
                tm['DefRtg'] = tm['OScore']/tm['OPoss'] # Defensive Rating

                tm['DayNum'] = daynum

                tm = tm.drop(columns=['AWins','ALosses','NWins','NLosses'])

                dfc = dfc.append(tm)
                
        if vocal:
            print('{} season completed.'.format(seas))
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
            
    return dfc

def get_matchup_dict(data, min_season=2010, max_season=2022, max_daynum=132, 
                     min_team=1101, max_team=1499, last_n=30, vocal=True):
    '''
    '''
    if vocal:
        print('---- COLLECT TEAM MATCHUP HISTORY ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
    # Get matchup history of all teams on any given day
    matchup_d = {}
    for seas in range(min_season, max_season+1):
        d_day = {}
        for daynum in range(max_daynum+1):
            days_back = last_n
            if days_back is None:
                days_back = daynum
            df = data[(data['Season']==seas)&(data['DayNum']<daynum)&(season_data['DayNum']>=daynum-days_back)]
            d_team = {}
            for t in range(min_team, max_team+1):
                d_team[t] = list(df[df['WTeamID']==t]['LTeamID'].append(df[df['LTeamID']==t]['WTeamID']))
            d_day[daynum] = d_team
        matchup_d[seas] = d_day
        if vocal:
            print('{} season completed.'.format(seas))
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
    return matchup_d

def get_opp_matchup_dict(matchup_d, min_season=2010, max_season=2022, max_daynum=132, 
                         min_team=1101, max_team=1499, vocal=True):
    '''
    '''
    if vocal:
        print('---- COLLECT OPPONENT MATCHUP HISTORY ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
    # Get matchup history of all teams' opponents on any given day
    opp_matchup_d = {}
    for seas in range(min_season, max_season+1):
        d_day = {}
        for daynum in range(max_daynum+1):
            d_team = {}
            for t in range(min_team, max_team+1):
                opp_matchups = []
                for o in matchup_d[seas][daynum][t]:
                    opp_matchups += matchup_d[seas][daynum][o]
                d_team[t] = opp_matchups
            d_day[daynum] = d_team
        opp_matchup_d[seas] = d_day
        if vocal:
            print('{} season completed.'.format(seas))
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
    return opp_matchup_d

def get_win_pct_dict(data_cont):
    '''
    '''
    return {h: {g: f.groupby('TeamID')['WP'].apply(float).to_dict() \
                for g, f in g.groupby('DayNum')} for h, g in data_cont.groupby('Season')}

def get_elo_dict(data, min_season=2010, max_season=2022, max_daynum=132, min_team=1101, max_team=1499, vocal=True):
    '''
    '''
    if vocal:
        print('---- CALCULATE SIMPLE ELO SCORE ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
    # Get matchup history of all teams on any given day
    elo_d = {}
    for seas in range(min_season, max_season+1):
        d_day = {}
        for daynum in range(max_daynum+1):
            df = data[(data['Season']==seas)&(data['DayNum']<daynum)]
            d_team = {}
            for t in range(min_team, max_team+1):
                # Initialize Elo if first game of season
                if daynum == 0:
                    if seas == min_season:
                        d_team[t] = [1500,0]
                    else:
                        d_team[t] = [(2*elo_d[seas-1][max_daynum-1][t][0] + 1500) / 3, 0]
                # Calculate Elo based on previous Elo of both teams involved
                else:
                    # Create temporary datafame to store past opponents, day, and result (win or loss)
                    temp_df1 = df[df['WTeamID']==t][['LTeamID','DayNum']].rename(columns={'LTeamID':'TeamID'})
                    temp_df1['W'] = 1
                    temp_df2 = df[df['LTeamID']==t][['WTeamID','DayNum']].rename(columns={'WTeamID':'TeamID'})
                    temp_df2['W'] = 0
                    temp_df = temp_df1.append(temp_df2)
                    # Get number of games in season
                    num_games = len(temp_df)
                    # If no new game on this day, keep Elo the same
                    if d_day[daynum-1][t][1] == num_games:
                        d_team[t] = [d_day[daynum-1][t][0], num_games]
                    # Otherwise, calculate Elo based on previous Elo and previous opponent Elo
                    else:
                        # Locate the most recent opponent and result (win or loss)
                        last_opp = temp_df.loc[temp_df['DayNum'].idxmax()]['TeamID']
                        result = temp_df.loc[temp_df['DayNum'].idxmax()]['W']
                        score = d_day[daynum-1][t][0]*(num_games-1) + d_day[daynum-1][last_opp][0]
                        score = (score + (2*result-1) * 400) / num_games
                        d_team[t] = [score, num_games]
            d_day[daynum] = d_team
        elo_d[seas] = d_day
        if vocal:
            print('{} season completed.'.format(seas))
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
    return {s: {d: {t: v[0] for t, v in dict1.items()} for d, dict1 in dict0.items()} for s, dict0 in elo_d.items()}

def get_carmelo_dict(data, min_season=2010, max_season=2022, max_daynum=132, min_team=1101, max_team=1499, vocal=True):
    '''
    '''
    if vocal:
        print('---- CALCULATE 538 ELO SCORE ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
    # Get matchup history of all teams on any given day
    elo_d = {}
    for seas in range(min_season, max_season+1):
        d_day = {}
        for daynum in range(max_daynum+1):
            df = data[(data['Season']==seas)&(data['DayNum']<daynum)]
            d_team = {}
            for t in range(min_team, max_team+1):
                # Initialize Elo if first game of season
                if daynum == 0:
                    if seas == min_season:
                        d_team[t] = [1500,0]
                    else:
                        d_team[t] = [(2*elo_d[seas-1][max_daynum-1][t][0] + 1500) / 3, 0]
                # Calculate CarmElo based on previous CarmElo of both teams involved
                else:
                    # Create temporary datafame to store past opponents, day, scores, and result (win or loss)
                    temp_df1 = df[df['WTeamID']==t][['LTeamID','DayNum','WScore','LScore','WLoc']]
                    temp_df1 = temp_df1.rename(columns={'LTeamID':'TeamID','WScore':'Score1','LScore':'Score2'})
                    temp_df1['WLoc'] = temp_df1.apply(lambda x: 1 if x['WLoc'] == 'H' else 0, axis=1)
                    temp_df2 = df[df['LTeamID']==t][['WTeamID','DayNum','LScore','WScore','WLoc']]
                    temp_df2 = temp_df2.rename(columns={'WTeamID':'TeamID','LScore':'Score1','WScore':'Score2'})
                    temp_df2['WLoc'] = temp_df2.apply(lambda x: 1 if x['WLoc'] == 'A' else 0, axis=1)
                    temp_df = temp_df1.append(temp_df2)
                    temp_df['W'] = temp_df.apply(lambda x: 1 if len(temp_df) > 0 and x['Score1'] > x['Score2'] else 0, axis=1)
                    # Get number of games in season
                    num_games = len(temp_df)
                    # If no new game on this day, keep CarmElo the same
                    if d_day[daynum-1][t][1] == num_games:
                        d_team[t] = [d_day[daynum-1][t][0], num_games]
                    # Otherwise, calculate CarmElo based on previous Elo and previous opponent CarmElo
                    else:
                        # Locate the most recent opponent and result (win or loss)
                        last_opp = temp_df.loc[temp_df['DayNum'].idxmax()]['TeamID']
                        last_result = temp_df.loc[temp_df['DayNum'].idxmax()]['W']
                        last_score1 = temp_df.loc[temp_df['DayNum'].idxmax()]['Score1']
                        last_score2 = temp_df.loc[temp_df['DayNum'].idxmax()]['Score2']
                        was_home = temp_df.loc[temp_df['DayNum'].idxmax()]['WLoc']
                        # Calculate CarmElo in steps
                        team_elo = d_day[daynum-1][t][0]
                        opp_elo = d_day[daynum-1][last_opp][0]
                        # Add 100 to CarmElo if team was at home
                        elo_diff = (opp_elo + (1-was_home)*100 - team_elo + was_home*100)
                        e_team = 1 / (1 + 10**(elo_diff/400))
                        k = 20 * (abs(last_score1 - last_score2) + 3)**0.8 / (7.5 + 0.006*abs(elo_diff))
                        score = k*(last_result - e_team) + team_elo
                        d_team[t] = [score, num_games]
            d_day[daynum] = d_team
        elo_d[seas] = d_day
        if vocal:
            print('{} season completed.'.format(seas))
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
    return {s: {d: {t: v[0] for t, v in dict1.items()} for d, dict1 in dict0.items()} for s, dict0 in elo_d.items()}

def add_sos_var(data, wp_d, matchup_d, opp_matchup_d, vocal=True):
    '''
    '''
    if vocal:
        print('---- ADD SOS FEATURES TO DATA ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
    # Calculate opponnent win percentage
    data['OWP'] = data.apply(lambda row: np.mean([wp_d[row['Season']][row['DayNum']][x] \
                 for x in matchup_d[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)
    # Calculate opponent's opponent win percentage
    data['OOWP'] = data.apply(lambda row: np.mean([wp_d[row['Season']][row['DayNum']][x] \
                 for x in opp_matchup_d[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)
    # Calculate strength of schedule from OWP and OOWP
    data['SOS'] = (2*data['OWP']+data['OOWP'])/3
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
    return data

def add_elo_var(data, elo_d, matchup_d, elo_var_name='Elo', vocal=True):
    '''
    '''
    if vocal:
        print('---- ADD ELO FEATURES TO DATA ----\nSTART:', datetime.now(pytz.timezone('US/Pacific')))
    data['T'+elo_var_name] = data.apply(lambda row: elo_d[row['Season']][row['DayNum']][row['TeamID']], axis=1)
    data['O'+elo_var_name] = data.apply(lambda row: np.mean([elo_d[row['Season']][row['DayNum']][x] \
                 for x in matchup_d[row['Season']][row['DayNum']][row['TeamID']]]), axis=1)
    if vocal:
        print('FINISH:', datetime.now(pytz.timezone('US/Pacific')))
    return data

### Create Continuous Season Dataset

In [82]:
MIN_TEAM = 1101
MAX_TEAM = 1499

if bracket_type == 'womens':
    MIN_TEAM += 2000
    MAX_TEAM += 2000

In [15]:
elo_dict = get_elo_dict(season_data, min_team=MIN_TEAM, max_team=MAX_TEAM)

---- CALCULATE SIMPLE ELO SCORE ----
START: 2022-03-15 09:59:05.431325-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 10:35:36.427263-07:00


In [16]:
carmelo_dict = get_carmelo_dict(season_data, min_team=MIN_TEAM, max_team=MAX_TEAM)

---- CALCULATE 538 ELO SCORE ----
START: 2022-03-15 10:35:36.545406-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 11:29:27.616888-07:00


In [17]:
continuous_data_all = get_continuous_data(season_data)
continuous_data_all.head()

---- PROCESS CONTINOUS SEASON DATA ----
START: 2022-03-15 11:29:27.810668-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 11:48:30.008283-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2010,1107,0,43,15,55,5,28,8,14,...,0.397022,0.541176,0.341463,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8
1,2010,1108,0,60,21,61,7,17,11,20,...,0.183099,0.573171,0.351351,0.74359,0.112676,0.136364,0.084746,0.84507,1.412429,8
2,2010,1143,0,75,24,52,5,12,22,32,...,0.188088,0.492063,0.366667,0.423077,0.109718,0.075,0.265625,1.175549,1.09375,8
3,2010,1198,0,72,25,68,8,23,14,17,...,0.244499,0.60274,0.428571,0.676471,0.122249,0.177778,0.31477,0.880196,1.065375,8
4,2010,1293,0,70,26,52,8,21,10,15,...,0.265625,0.507937,0.393939,0.625,0.125,0.032258,0.188088,1.09375,1.175549,8


In [18]:
matchup_dict_all = get_matchup_dict(season_data, min_team=MIN_TEAM, max_team=MAX_TEAM)

---- COLLECT TEAM MATCHUP HISTORY ----
START: 2022-03-15 11:48:30.041062-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 12:00:06.660522-07:00


In [19]:
opp_matchup_dict_all = get_opp_matchup_dict(matchup_dict_all, min_team=MIN_TEAM, max_team=MAX_TEAM)

---- COLLECT OPPONENT MATCHUP HISTORY ----
START: 2022-03-15 12:00:06.664172-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 12:00:09.062966-07:00


In [20]:
# Add columns to dataframe
wp_dict_all = get_win_pct_dict(continuous_data_all)
continuous_data_all = add_sos_var(continuous_data_all, wp_dict_all, matchup_dict_all, opp_matchup_dict_all)
continuous_data_all = add_elo_var(continuous_data_all, elo_dict, matchup_dict_all, elo_var_name='Elo')
continuous_data_all = add_elo_var(continuous_data_all, carmelo_dict, matchup_dict_all, elo_var_name='CarmElo')

# Modify dataframe
continuous_data_all = continuous_data_all.apply(pd.to_numeric, errors='coerce', axis=1)
continuous_data_all = continuous_data_all.reset_index(drop=True).drop(columns=['Wins'])

# Write dataframe
write_to_csv(continuous_data_all, 'regular_season_continuous.csv', 'Data/{}Transformed/'.format(bi))
continuous_data_all.head()

---- ADD SOS FEATURES TO DATA ----
START: 2022-03-15 12:00:26.572888-07:00


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


FINISH: 2022-03-15 12:03:24.951878-07:00
---- ADD ELO FEATURES TO DATA ----
START: 2022-03-15 12:03:24.952169-07:00
FINISH: 2022-03-15 12:04:06.244137-07:00
---- ADD ELO FEATURES TO DATA ----
START: 2022-03-15 12:04:06.244379-07:00
FINISH: 2022-03-15 12:04:48.009934-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS,TElo,OElo,TCarmElo,OCarmElo
0,2010.0,1107.0,0.0,43.0,15.0,55.0,5.0,28.0,8.0,14.0,...,0.533499,0.944584,8.0,1.0,0.0,0.666667,1100.0,1900.0,1484.723513,1527.165862
1,2010.0,1108.0,0.0,60.0,21.0,61.0,7.0,17.0,11.0,20.0,...,0.84507,1.412429,8.0,1.0,0.0,0.666667,1100.0,1900.0,1481.988751,1532.029033
2,2010.0,1143.0,0.0,75.0,24.0,52.0,5.0,12.0,22.0,32.0,...,1.175549,1.09375,8.0,0.0,1.0,0.333333,1900.0,1100.0,1508.34144,1495.309263
3,2010.0,1198.0,0.0,72.0,25.0,68.0,8.0,23.0,14.0,17.0,...,0.880196,1.065375,8.0,1.0,0.0,0.666667,1100.0,1900.0,1490.629302,1516.663719
4,2010.0,1293.0,0.0,70.0,26.0,52.0,8.0,21.0,10.0,15.0,...,1.09375,1.175549,8.0,1.0,0.0,0.666667,1100.0,1900.0,1495.309263,1508.34144


In [21]:
season_end_data_all = continuous_data_all.loc[continuous_data_all.groupby(['Season','TeamID'])['DayNum'].idxmax()]
write_to_csv(season_end_data_all, 'regular_season_end.csv', 'Data/{}Transformed/'.format(bi))
season_end_data_all.head()

Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS,TElo,OElo,TCarmElo,OCarmElo
41270,2010.0,1102.0,0.0,55.62069,20.0,45.586207,5.62069,18.103448,10.0,15.689655,...,0.960691,1.096433,132.0,0.637667,0.534936,0.603423,1423.172878,1698.40035,1498.109119,1679.627025
41271,2010.0,1103.0,0.181818,71.030303,25.090909,57.636364,6.606061,19.515152,14.242424,21.393939,...,1.069636,1.011251,132.0,0.566562,0.530536,0.554553,1642.817671,1561.48762,1684.472375,1634.322822
41272,2010.0,1104.0,0.0,68.5,24.71875,56.0625,5.46875,15.625,13.59375,19.21875,...,1.062016,1.011614,132.0,0.577781,0.605935,0.587166,1660.594856,1710.008678,1653.347529,1666.560692
41273,2010.0,1105.0,0.043478,63.826087,21.173913,57.173913,3.826087,13.565217,17.652174,27.521739,...,0.909993,1.000247,132.0,0.366642,0.377327,0.370203,1160.698632,1226.706863,1480.160287,1480.395455
41274,2010.0,1106.0,0.035714,64.035714,21.214286,53.5,5.928571,19.071429,15.678571,24.25,...,0.969399,0.998711,132.0,0.359052,0.375425,0.364509,1300.596901,1214.645142,1540.765852,1479.881428


### Create Last-30-Day Continuous Dataset

In [22]:
continuous_data_l30 = get_continuous_data(season_data, last_n=30)
continuous_data_l30.head()

---- PROCESS CONTINOUS SEASON DATA ----
START: 2022-03-15 12:07:00.990794-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 12:25:50.660994-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,TTOP,OTRP,OORP,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum
0,2010,1107,0,43,15,55,5,28,8,14,...,0.397022,0.541176,0.341463,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8
1,2010,1108,0,60,21,61,7,17,11,20,...,0.183099,0.573171,0.351351,0.74359,0.112676,0.136364,0.084746,0.84507,1.412429,8
2,2010,1143,0,75,24,52,5,12,22,32,...,0.188088,0.492063,0.366667,0.423077,0.109718,0.075,0.265625,1.175549,1.09375,8
3,2010,1198,0,72,25,68,8,23,14,17,...,0.244499,0.60274,0.428571,0.676471,0.122249,0.177778,0.31477,0.880196,1.065375,8
4,2010,1293,0,70,26,52,8,21,10,15,...,0.265625,0.507937,0.393939,0.625,0.125,0.032258,0.188088,1.09375,1.175549,8


In [23]:
matchup_dict_l30 = get_matchup_dict(season_data, last_n=30, min_team=MIN_TEAM, max_team=MAX_TEAM)

---- COLLECT TEAM MATCHUP HISTORY ----
START: 2022-03-15 12:25:50.685938-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 12:38:36.749532-07:00


In [24]:
opp_matchup_dict_l30 = get_opp_matchup_dict(matchup_dict_l30, min_team=MIN_TEAM, max_team=MAX_TEAM)

---- COLLECT OPPONENT MATCHUP HISTORY ----
START: 2022-03-15 12:38:36.754849-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-15 12:38:38.654932-07:00


In [25]:
# Add columns to dataframe
wp_dict_l30 = get_win_pct_dict(continuous_data_l30)
continuous_data_l30 = add_sos_var(continuous_data_l30, wp_dict_l30, matchup_dict_l30, opp_matchup_dict_l30)

# Modify dataframe
continuous_data_l30 = continuous_data_l30.apply(pd.to_numeric, errors='coerce', axis=1)
continuous_data_l30 = continuous_data_l30.reset_index(drop=True).drop(columns=['Wins'])

# Write dataframe
write_to_csv(continuous_data_l30, 'regular_season_continuous_last30.csv', 'Data/{}Transformed/'.format(bi))
continuous_data_l30.head()

---- ADD SOS FEATURES TO DATA ----
START: 2022-03-15 12:38:58.210433-07:00
FINISH: 2022-03-15 12:42:17.227487-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS
0,2010.0,1107.0,0.0,43.0,15.0,55.0,5.0,28.0,8.0,14.0,...,0.724138,0.260546,0.222222,0.264484,0.533499,0.944584,8.0,1.0,0.0,0.666667
1,2010.0,1108.0,0.0,60.0,21.0,61.0,7.0,17.0,11.0,20.0,...,0.74359,0.112676,0.136364,0.084746,0.84507,1.412429,8.0,1.0,0.0,0.666667
2,2010.0,1143.0,0.0,75.0,24.0,52.0,5.0,12.0,22.0,32.0,...,0.423077,0.109718,0.075,0.265625,1.175549,1.09375,8.0,0.0,1.0,0.333333
3,2010.0,1198.0,0.0,72.0,25.0,68.0,8.0,23.0,14.0,17.0,...,0.676471,0.122249,0.177778,0.31477,0.880196,1.065375,8.0,1.0,0.0,0.666667
4,2010.0,1293.0,0.0,70.0,26.0,52.0,8.0,21.0,10.0,15.0,...,0.625,0.125,0.032258,0.188088,1.09375,1.175549,8.0,1.0,0.0,0.666667


In [26]:
season_end_data_l30 = continuous_data_l30.loc[continuous_data_l30.groupby(['Season','TeamID'])['DayNum'].idxmax()]
write_to_csv(season_end_data_l30, 'regular_season_end_l30.csv', 'Data/{}Transformed/'.format(bi))
season_end_data_l30.head()

Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,OAstP,OStlP,OBlkP,OTOP,OffRtg,DefRtg,DayNum,OWP,OOWP,SOS
41270,2010.0,1102.0,0.0,51.625,20.125,45.0,5.625,18.5,5.75,9.75,...,0.622093,0.094128,0.084906,0.198614,0.925594,1.189376,132.0,0.593006,0.435684,0.540565
41271,2010.0,1103.0,0.555556,72.777778,25.555556,61.222222,5.777778,19.444444,15.888889,23.777778,...,0.42735,0.092743,0.071809,0.197412,1.065734,1.089914,132.0,0.578395,0.503767,0.553519
41272,2010.0,1104.0,0.0,71.125,24.5,57.625,5.375,15.125,16.75,22.75,...,0.490385,0.099288,0.141176,0.190157,1.065942,1.045861,132.0,0.415179,0.502471,0.444276
41273,2010.0,1105.0,0.0,64.125,20.5,57.5,3.75,14.125,19.375,28.5,...,0.615789,0.123799,0.115274,0.285505,0.947894,0.988287,132.0,0.495017,0.498725,0.496253
41274,2010.0,1106.0,0.0,67.222222,22.222222,51.0,5.222222,16.666667,17.555556,27.555556,...,0.480447,0.119048,0.090615,0.240798,1.059174,0.920193,132.0,0.450509,0.50261,0.467876


## <u>Predict Winnners with Elo</u>

In [47]:
# Get basic results from all regular season games
season_data_compact = pd.read_csv('Data/{}DataFiles_Stage1/{}RegularSeasonCompactResults.csv'.format(bi, bi))

In [48]:
# Merge Elo with compact results
season_elo_continuous = season_data_compact.merge(continuous_data_all[['Season','TeamID','TElo','TCarmElo','DayNum']], \
    left_on=['Season','WTeamID','DayNum'], right_on=['Season','TeamID','DayNum']).drop(columns=['TeamID'])
season_elo_continuous = season_elo_continuous.rename(columns={'TElo':'WElo','TCarmElo':'WCarmElo'})
season_elo_continuous = season_elo_continuous.merge(continuous_data_all[['Season','TeamID','TElo','TCarmElo','DayNum']], \
    left_on=['Season','LTeamID','DayNum'], right_on=['Season','TeamID','DayNum']).drop(columns=['TeamID'])
season_elo_continuous = season_elo_continuous.rename(columns={'TElo':'LElo','TCarmElo':'LCarmElo'})

# Look back at Elo from previous day (to introduce no training bias)
season_elo_continuous = pd.concat([season_elo_continuous[['Season','DayNum','WTeamID','LTeamID','WLoc']], \
           season_elo_continuous[['WElo','LElo','WCarmElo','LCarmElo']].shift(1)], axis=1).dropna()

# Add 100 to CarmElo of home team
season_elo_continuous['WCarmElo'] = season_elo_continuous.apply(lambda row: \
                                100 + row['WCarmElo'] if row['WLoc'] == 'H'else row['WCarmElo'], axis=1)
season_elo_continuous['LCarmElo'] = season_elo_continuous.apply(lambda row: \
                                100 + row['LCarmElo'] if row['WLoc'] == 'A' else row['LCarmElo'], axis=1)

# Calculate probability of WTeamID winning based on difference between Elos
season_elo_continuous['PElo'] = season_elo_continuous.apply(lambda row: \
                                            1 / (1 + 10**((row['LElo']-row['WElo'])/400)), axis=1)
season_elo_continuous['PCarmElo'] = season_elo_continuous.apply(lambda row: \
                                            1 / (1 + 10**((row['LCarmElo']-row['WCarmElo'])/400)), axis=1)

# Calculate success rate (accuracy, logloss) of each version of Elo
print('---- PREDICTING REGULAR SEASON OUTCOMES ----')
print('ELO ACCURACY:\t\t', len(season_elo_continuous[season_elo_continuous['PElo']>0.5])/len(season_elo_continuous))
print('CARMELO ACCURACY:\t', len(season_elo_continuous[season_elo_continuous['PCarmElo']>0.5])/len(season_elo_continuous))
print('ELO LOGLOSS:\t\t', -np.mean([math.log(x) for x in season_elo_continuous['PElo'].values]))
print('CARMELO LOGLOSS:\t', -np.mean([math.log(x) for x in season_elo_continuous['PCarmElo'].values]))

season_elo_continuous[['Season','DayNum','WTeamID','LTeamID','PElo','PCarmElo']].head()

---- PREDICTING REGULAR SEASON OUTCOMES ----
ELO ACCURACY:		 0.6766718580395488
CARMELO ACCURACY:	 0.7231425110513316
ELO LOGLOSS:		 0.618881765347084
CARMELO LOGLOSS:	 0.5483747966984185


Unnamed: 0,Season,DayNum,WTeamID,LTeamID,PElo,PCarmElo
1,2010,12,1170,1223,0.990099,0.414032
2,2010,12,1245,1412,0.5,0.630974
3,2010,12,1258,1315,0.5,0.489689
4,2010,12,1273,1136,0.5,0.512498
5,2010,12,1285,1129,0.990099,0.679503


In [49]:
# Get basic results from all tournament games
tourney_data_compact = pd.read_csv('Data/{}DataFiles_Stage1/{}NCAATourneyCompactResults.csv'.format(bi, bi))

# Merge Elo with compact results
tourney_elo_continuous = tourney_data_compact.merge(season_end_data_all[['Season','TeamID','TElo','TCarmElo']], \
    left_on=['Season','WTeamID'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_elo_continuous = tourney_elo_continuous.rename(columns={'TElo':'WElo','TCarmElo':'WCarmElo'})
tourney_elo_continuous = tourney_elo_continuous.merge(season_end_data_all[['Season','TeamID','TElo','TCarmElo']], \
    left_on=['Season','LTeamID'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_elo_continuous = tourney_elo_continuous.rename(columns={'TElo':'LElo','TCarmElo':'LCarmElo'})

# Calculate probability of WTeamID winning based on difference between Elos
tourney_elo_continuous['PElo'] = tourney_elo_continuous.apply(lambda row: \
                                            1 / (1 + 10**((row['LElo']-row['WElo'])/400)), axis=1)
tourney_elo_continuous['PCarmElo'] = tourney_elo_continuous.apply(lambda row: \
                                            1 / (1 + 10**((row['LCarmElo']-row['WCarmElo'])/400)), axis=1)

# Calculate success rate (accuracy, logloss) of each version of Elo
print('---- PREDICTING TOURNAMENT OUTCOMES ----')
print('ELO ACCURACY:\t\t', len(tourney_elo_continuous[tourney_elo_continuous['PElo']>0.5])/len(tourney_elo_continuous))
print('CARMELO ACCURACY:\t', len(tourney_elo_continuous[tourney_elo_continuous['PCarmElo']>0.5])/len(tourney_elo_continuous))
print('ELO LOGLOSS:\t\t', -np.mean([math.log(x) for x in tourney_elo_continuous['PElo'].values]))
print('CARMELO LOGLOSS:\t', -np.mean([math.log(x) for x in tourney_elo_continuous['PCarmElo'].values]))

tourney_elo_continuous[['Season','DayNum','WTeamID','LTeamID','PElo','PCarmElo']]

---- PREDICTING TOURNAMENT OUTCOMES ----
ELO ACCURACY:		 0.6807639836289222
CARMELO ACCURACY:	 0.6930422919508867
ELO LOGLOSS:		 0.5754813816423914
CARMELO LOGLOSS:	 0.5740800817635081


Unnamed: 0,Season,DayNum,WTeamID,LTeamID,PElo,PCarmElo
0,2010,134,1115,1457,0.334593,0.478008
1,2010,136,1124,1358,0.856505,0.611877
2,2010,138,1124,1330,0.669757,0.528405
3,2010,144,1124,1388,0.664881,0.535462
4,2010,136,1139,1431,0.619491,0.512442
...,...,...,...,...,...,...
728,2021,138,1328,1281,0.384369,0.565292
729,2021,138,1425,1179,0.638519,0.628558
730,2021,140,1425,1242,0.508466,0.479606
731,2021,146,1425,1332,0.539355,0.535510


In [352]:
# Get sample submission file (ID encodes season and both teams)
tourney_predictions_historic = pd.read_csv('Data/{}DataFiles_Stage1/{}SampleSubmissionStage1.csv'.format(bi, bi))

# Parse ID from submission so extract season and both teams, and then merge with CarmElo
tourney_predictions_historic['CarmElo1'] = tourney_predictions_historic.apply(lambda row: \
        season_end_data_all[(season_end_data_all['Season'] == int(row['ID'][:4])) &\
                            (season_end_data_all['TeamID'] == int(row['ID'][5:9]))]['TCarmElo'].values[0], axis=1)
tourney_predictions_historic['CarmElo2'] = tourney_predictions_historic.apply(lambda row: \
        season_end_data_all[(season_end_data_all['Season'] == int(row['ID'][:4])) &\
                            (season_end_data_all['TeamID'] == int(row['ID'][10:14]))]['TCarmElo'].values[0], axis=1)
# Calculate probability of first team winning based on difference between Elos
tourney_predictions_historic['Pred'] = tourney_predictions_historic.apply(lambda row: \
                                            1 / (1 + 10**((row['CarmElo2']-row['CarmElo1'])/400)), axis=1)

# Drop extra columns and write results to CSV
tourney_predictions_historic = tourney_predictions_historic.drop(columns=['CarmElo1','CarmElo2'])
write_to_csv(tourney_predictions_historic, 'SubmissionStage1.csv', 'Output')

tourney_predictions_historic.head()

Unnamed: 0,ID,Pred
0,2016_3106_3107,0.191061
1,2016_3106_3113,0.143504
2,2016_3106_3119,0.172838
3,2016_3106_3120,0.284624
4,2016_3106_3124,0.048881


In [353]:
!kaggle competitions submit -c 'womens-march-mania-2022' -f 'Output/SubmissionStage1.csv' -m "Predictions using Elo only."

100%|█████████████████████████████████████████| 337k/337k [00:00<00:00, 402kB/s]
Successfully submitted to March Machine Learning Mania 2022 - Women's

## <u>XGBoost Regression Model</u>

In [50]:
def rmse(y_obs, y_act):
    return np.sqrt(sum((y_act-y_obs)**2)/len(y_act))

def logloss(y_obs, y_act):
    return -np.mean([y*math.log(x) + (1-y)*math.log(1-x) for x,y in list(zip(y_obs, y_act)) if x < 1 and x > 0])

In [51]:
params = {'Score1': {'max_depth': 3, 'min_child_weight': 1, 'colsample_bytree': 1.0, \
                     'subsample': 0.9, 'gamma': 0.7, 'reg_alpha': 100.0}, \
          'Score2': {'max_depth': 3, 'min_child_weight': 3, 'colsample_bytree': 1.0, \
                     'subsample': 0.8, 'gamma': 0.1, 'reg_alpha': 100.0}, \
          'Outcome': {'max_depth': 3, 'min_child_weight': 1, 'colsample_bytree': 1.0, \
                     'subsample': 0.9, 'gamma': 0.7, 'reg_alpha': 100.0}, \
          'Score': {'max_depth': 3, 'min_child_weight': 3, 'colsample_bytree': 1.0, \
                     'subsample': 0.8, 'gamma': 0.1, 'reg_alpha': 100.0}}

In [52]:
cols = continuous_data_l30.columns[2:-4].append(continuous_data_l30.columns[-3:])

continuous_data_ext = continuous_data_all.merge(continuous_data_l30.rename(columns=\
                    dict(zip(cols, ['L'+x for x in cols]))), on=['Season','TeamID','DayNum'])
continuous_data_ext['DayNum'] = continuous_data_ext['DayNum'] + 1

matchup_data = season_data_compact[season_data_compact['Season']>=2010].drop(columns=['NumOT'])
matchup_data_W = matchup_data.merge(continuous_data_ext, left_on=['Season','WTeamID','DayNum'], \
                   right_on=['Season','TeamID','DayNum']).drop(columns=['TeamID'])
matchup_data_L = matchup_data.merge(continuous_data_ext, left_on=['Season','LTeamID','DayNum'], \
                   right_on=['Season','TeamID','DayNum']).drop(columns=['TeamID'])

cols = ['Season','DayNum','WTeamID','WScore','LTeamID','LScore','WLoc']
matchup_data1 = matchup_data_W.merge(matchup_data_L, on=cols).rename(columns=\
                                    {'WTeamID':'TeamID1','LTeamID':'TeamID2','WScore':'Score1','LScore':'Score2'})
matchup_data2 = matchup_data_L.merge(matchup_data_W, on=cols).rename(columns=\
                                    {'LTeamID':'TeamID1','WTeamID':'TeamID2','LScore':'Score1','WScore':'Score2'})
matchup_data = matchup_data1.append(matchup_data2[matchup_data1.columns]).reset_index(drop=True)

matchup_data['Win_Prob'] = matchup_data.apply(lambda row: 1/(1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), \
                                              axis=1)

matchup_data.head()

Unnamed: 0,Season,DayNum,TeamID1,Score1,TeamID2,Score2,WLoc,NumOT_x,TScore_x,TFGM_x,...,LOAstP_y,LOStlP_y,LOBlkP_y,LOTOP_y,LOffRtg_y,LDefRtg_y,LOWP_y,LOOWP_y,LSOS_y,Win_Prob
0,2010,13,1124,71,1216,69,H,0.0,86.0,31.0,...,0.413793,0.085227,0.105263,0.206612,1.051136,1.170799,1.0,0.0,0.666667,0.54591
1,2010,13,1129,75,1315,60,N,0.0,90.0,32.0,...,0.52381,0.13986,0.12,0.206186,0.646853,1.04811,1.0,0.0,0.666667,0.527372
2,2010,13,1133,74,1226,69,H,0.0,60.0,18.0,...,0.657143,0.102041,0.09434,0.239437,0.991254,1.239437,1.0,0.0,0.666667,0.505758
3,2010,13,1137,59,1136,56,N,0.0,80.0,27.0,...,0.527778,0.171958,0.292683,0.125698,0.701058,1.340782,1.0,0.0,0.666667,0.518012
4,2010,13,1176,75,1115,56,H,0.0,65.0,23.0,...,0.625,0.119681,0.065217,0.263158,0.957447,1.157895,1.0,0.0,0.666667,0.506068


In [53]:
# NJOBS = -1
# response = ['Score1','Score2']
# params = {}

r = 'Score'

In [54]:
# for r in response:
#     print('-- {}: {} --\n'.format(r.upper(), datetime.now()))
#     x_mat = np.matrix(matchup_data.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc']))
#     y_mat = np.matrix(matchup_data[r]).T

#     x_train, x_test, y_train, y_test = train_test_split(x_mat, y_mat, test_size=0.3, random_state=47)

#     # Tune max_depth of tree and min_child_weight (min number of residuals in tree split)
#     print('TUNING: STEP 1 - {}'.format(datetime.now()))
#     params1 = {
#      'max_depth':list(range(3,24,5)),
#      'min_child_weight':list(range(1,6,2))
#     }

#     gsearch1 = GridSearchCV(estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=800), \
#                             param_grid=params1, scoring='neg_root_mean_squared_error', cv=5, n_jobs=NJOBS)
#     gsearch1.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=20, \
#                             eval_metric='rmse', verbose=False)

#     print('Best Parameters: {}'.format(gsearch1.best_params_))
#     print('Best Score: {}\n'.format(gsearch1.best_score_))

#     # Tune max_depth of tree and min_child_weight (min number of residuals in tree split) with more granular weights
#     print('TUNING: STEP 2 - {}'.format(datetime.now()))
#     params2 = {
#      'max_depth':range(max(3, gsearch1.best_params_['max_depth']-4), min(gsearch1.best_params_['max_depth']+4, 28)),
#      'min_child_weight':range(max(1, gsearch1.best_params_['min_child_weight']-2), \
#                               min(gsearch1.best_params_['min_child_weight']+2, 6))
#     }

#     gsearch2 = GridSearchCV(estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=800), \
#                             param_grid=params2, scoring='neg_root_mean_squared_error', cv=5, n_jobs=NJOBS)
#     gsearch2.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=20, \
#                             eval_metric='rmse', verbose=False)

#     print('Best Parameters: {}'.format(gsearch2.best_params_))
#     print('Best Score: {}\n'.format(gsearch2.best_score_))

#     # Tune subsample (ratio of training observations used in tree) and colsample_bytree (ratio of columns used in tree)
#     print('TUNING: STEP 3 - {}'.format(datetime.now()))
#     params3 = {
#      'subsample':[i/10.0 for i in range(6,11)],
#      'colsample_bytree':[i/10.0 for i in range(6,11)]
#     }

#     gsearch3 = GridSearchCV(estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=800, \
#                     max_depth=gsearch2.best_params_['max_depth'], \
#                     min_child_weight=gsearch2.best_params_['min_child_weight']), \
#                     param_grid=params3, scoring='neg_root_mean_squared_error', cv=5, n_jobs=NJOBS)
#     gsearch3.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=20, 
#                     eval_metric='rmse', verbose=False)

#     print('Best Parameters: {}'.format(gsearch3.best_params_))
#     print('Best Score: {}\n'.format(gsearch3.best_score_))

#     # Tune gamma (min loss reduction required for split) and reg_alpha (regularization parameter equivalent to LASSO)
#     print('TUNING: STEP 4 - {}'.format(datetime.now()))
#     params4 = {
#      'gamma':[i/10.0 for i in range(0,8)],
#      'reg_alpha':[1/10.0**i for i in range(-2,6)]
#     }

#     gsearch4 = GridSearchCV(estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=800, \
#                     max_depth=gsearch2.best_params_['max_depth'], \
#                     min_child_weight=gsearch2.best_params_['min_child_weight'], \
#                     subsample=gsearch3.best_params_['subsample'], \
#                     colsample_bytree=gsearch3.best_params_['colsample_bytree']), \
#                     param_grid=params4, scoring='neg_root_mean_squared_error', cv=5, n_jobs=NJOBS)
#     gsearch4.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=20, \
#                     eval_metric='rmse', verbose=False)

#     print('Best Parameters: {}'.format(gsearch4.best_params_))
#     print('Best Score: {}\n'.format(gsearch4.best_score_))
    
#     # Update parameters for each response variable
#     d = gsearch1.best_params_
#     d.update(gsearch2.best_params_)
#     d.update(gsearch3.best_params_)
#     d.update(gsearch4.best_params_)
#     params[r] = d
    
# print(params)

In [359]:
cols = matchup_data.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc']).columns
x_mat = np.matrix(matchup_data[cols])
y_mat = np.matrix(matchup_data['{}1'.format(r)]).T

x_train, x_test, y_train, y_test = train_test_split(x_mat, y_mat, test_size=0.3, random_state=4747)

model = xgb.XGBRegressor(learning_rate=0.01, n_estimators=10000, \
                max_depth=params[r]['max_depth'], \
                min_child_weight=params[r]['min_child_weight'], \
                subsample=params[r]['subsample'], \
                colsample_bytree=params[r]['colsample_bytree'],\
                gamma=params[r]['gamma'],\
                reg_alpha=params[r]['reg_alpha'])

model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=250, \
          eval_metric='rmse', verbose=False)

save_model_to_dir(model, 'xgbr_{}_{}_all.txt'.format(bracket_type, r.lower()), 'Models')

y_pred = model.predict(x_test)

score = rmse(np.array([x[0,0] for x in y_test]), y_pred)
print("RMSE: {}".format(score))

fi = pd.DataFrame(cols)
fi = fi.rename(columns={0:'col'}).join(pd.DataFrame(model.feature_importances_).rename(columns={0:'imp'}))

fi['imp'] = fi[['imp']].sort_values(by='imp', ascending=False).cumsum()

write_to_csv(fi, 'xgbr{}_{}_feature_importance.csv'.format(bracket_type, r.lower()), 'Features', index=None)

print("TOP FEATURES: {}\n".format(list(fi.sort_values(by='imp').col)[:5]))

RMSE: 10.3794820791187
TOP FEATURES: ['TScore_x', 'Win_Prob', 'OScore_y', 'LTScore_x', 'LOScore_y']



In [55]:
tourney_data_compact = pd.read_csv('Data/{}DataFiles_Stage1/{}NCAATourneyCompactResults.csv'.format(bi, bi))

cols = season_end_data_l30.columns[2:-4].append(season_end_data_l30.columns[-3:])

season_end_data_ext = season_end_data_all.merge(season_end_data_l30.rename(columns=\
                    dict(zip(cols, ['L'+x for x in cols]))), on=['Season','TeamID','DayNum'])

tourney_xgbr_continuous = tourney_data_compact.merge(season_end_data_ext.drop(columns=['DayNum']), \
    left_on=['Season','WTeamID'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_xgbr_continuous = tourney_xgbr_continuous.merge(season_end_data_ext.drop(columns=['DayNum']), \
    left_on=['Season','LTeamID'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

tourney_xgbr_continuous = tourney_xgbr_continuous.rename(columns=\
                                {'WTeamID':'TeamID1','WScore':'Score1','LTeamID':'TeamID2','LScore':'Score2'})
tourney_xgbr_continuous['Win_Prob'] = tourney_xgbr_continuous.apply(lambda row: \
                                            1 / (1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), axis=1)
tourney_xgbr_continuous = tourney_xgbr_continuous[matchup_data.columns]

model = xgb.Booster()
model.load_model('Models/xgbr_{}_{}_all.txt'.format(bracket_type, r.lower()))

cols = tourney_xgbr_continuous.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc']).columns

x_mat1 = xgb.DMatrix(tourney_xgbr_continuous[cols])
rpred1 = tourney_xgbr_continuous[['Season','TeamID1','TeamID2','Score1','Score2']]
rpred1 = rpred1.join(pd.DataFrame(model.predict(x_mat1))).rename(columns={0:'P{}1'.format(r)})

x_mat2 = xgb.DMatrix(tourney_xgbr_continuous.rename(columns=\
                                                dict([(x, x[:-1] + chr(1-(ord(x[-1])-ord('x')) + ord('x'))) \
                                                      for x in cols if x[-2:] == '_x' or x[-2:] == '_y']))[cols]\
                    .apply(lambda x: 1-x if x.name == 'Win_Prob' else x))
rpred2 = tourney_xgbr_continuous[['Season','TeamID1','TeamID2','Score1','Score2']]
rpred2 = rpred2.join(pd.DataFrame(model.predict(x_mat2))).rename(columns={0:'P{}2'.format(r)})

rpred = rpred1.merge(rpred2, on=['Season','TeamID1','TeamID2','Score1','Score2'])
    
season_data_std_score = season_data[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

rpred = rpred.merge(season_data_std_score, left_on=['Season','TeamID1'], right_on=['Season','TeamID'])
rpred = rpred.drop(columns=['TeamID']).rename(columns={'StdScore':'StdScore1'})
rpred = rpred.merge(season_data_std_score, left_on=['Season','TeamID2'], right_on=['Season','TeamID'])
rpred = rpred.drop(columns=['TeamID']).rename(columns={'StdScore':'StdScore2'})

rpred['Pred'] = rpred.apply(lambda row: norm.cdf(0, row['PScore2']-row['PScore1'], \
                                                 math.sqrt(row['StdScore1']*row['StdScore2'])), axis=1)
rpred['Outcome'] = rpred.apply(lambda row: 1 if row['Score1']>row['Score2'] else 0, axis=1)

print('---- PREDICTING TOURNAMENT OUTCOMES ----')
print('ACCURACY:\t', len(rpred[(rpred['Pred']>=0.5)])/len(rpred))
print('LOGLOSS:\t', logloss(rpred['Pred'].values, rpred['Outcome'].values))

rpred.head()

---- PREDICTING TOURNAMENT OUTCOMES ----
ACCURACY:	 0.6930422919508867
LOGLOSS:	 0.5628169893763711


Unnamed: 0,Season,TeamID1,TeamID2,Score1,Score2,PScore1,PScore2,StdScore1,StdScore2,Pred,Outcome
0,2010,1115,1457,61,44,57.244411,59.501667,11.943035,11.678697,0.424212,1
1,2010,1124,1358,68,59,77.601822,70.479836,9.802019,14.195303,0.727001,1
2,2010,1124,1330,76,68,66.430771,63.488659,9.802019,10.45694,0.614322,1
3,2010,1124,1388,72,49,72.293549,70.11161,9.802019,11.170775,0.582588,1
4,2010,1139,1431,77,59,65.073097,64.422852,8.298307,13.027847,0.524933,1


In [61]:
tourney_predictions_historic = pd.read_csv('Data/{}DataFiles_Stage1/{}SampleSubmissionStage1.csv'.format(bi, bi))

tourney_predictions_historic['Season'] = tourney_predictions_historic.apply(lambda row: int(row['ID'][:4]), axis=1)
tourney_predictions_historic['TeamID1'] = tourney_predictions_historic.apply(lambda row: int(row['ID'][5:9]), axis=1)
tourney_predictions_historic['TeamID2'] = tourney_predictions_historic.apply(lambda row: int(row['ID'][10:14]), axis=1)

tourney_predictions_historic = tourney_predictions_historic.merge(season_end_data_ext, \
                        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID','DayNum'])
tourney_predictions_historic = tourney_predictions_historic.merge(season_end_data_ext, \
                        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

tourney_predictions_historic['Win_Prob'] = tourney_predictions_historic.apply(lambda row: \
                                            1 / (1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), axis=1)

model = xgb.Booster()
model.load_model('Models/xgbr_{}_{}_all.txt'.format(bracket_type, r.lower()))

cols = tourney_xgbr_continuous.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc']).columns

x_mat1 = xgb.DMatrix(tourney_predictions_historic[cols])
tourney_predictions_historic['P{}1'.format(r)] = model.predict(x_mat1)

x_mat2 = xgb.DMatrix(tourney_predictions_historic.rename(columns=\
                                                dict([(x, x[:-1] + chr(1-(ord(x[-1])-ord('x')) + ord('x'))) \
                                                 for x in cols if x[-2:] == '_x' or x[-2:] == '_y']))[cols]\
                    .apply(lambda x: 1-x if x.name == 'Win_Prob' else x))
tourney_predictions_historic['P{}2'.format(r)] = model.predict(x_mat2)

season_data_std_score = season_data[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

tourney_predictions_historic = tourney_predictions_historic.merge(season_data_std_score, \
                                                left_on=['Season','TeamID1'], right_on=['Season','TeamID'])
tourney_predictions_historic = tourney_predictions_historic.drop(columns=['TeamID']).rename(columns=\
                                                                                    {'StdScore':'StdScore1'})
tourney_predictions_historic = tourney_predictions_historic.merge(season_data_std_score, \
                                                left_on=['Season','TeamID2'], right_on=['Season','TeamID'])
tourney_predictions_historic = tourney_predictions_historic.drop(columns=['TeamID']).rename(columns=\
                                                                                    {'StdScore':'StdScore2'})

tourney_predictions_historic['Pred'] = tourney_predictions_historic.apply(lambda row: \
                norm.cdf(0, row['PScore2']-row['PScore1'], math.sqrt(row['StdScore1']*row['StdScore2'])), axis=1)

write_to_csv(tourney_predictions_historic[['ID','Pred']], 'SubmissionStage1.csv', 'Output')

tourney_predictions_historic.head()

Unnamed: 0,ID,Pred,Season,TeamID1,TeamID2,NumOT_x,TScore_x,TFGM_x,TFGA_x,TFGM3_x,...,LOffRtg_y,LDefRtg_y,LOWP_y,LOOWP_y,LSOS_y,Win_Prob,PScore1,PScore2,StdScore1,StdScore2
0,2016_1112_1114,0.911634,2016,1112,1114,0.181818,81.212121,28.060606,58.242424,6.515152,...,1.142112,1.037394,0.387599,0.524929,0.433376,0.785321,75.501762,61.725624,10.913746,9.528903
1,2016_1112_1122,0.968534,2016,1112,1122,0.181818,81.212121,28.060606,58.242424,6.515152,...,1.178275,1.113888,0.486905,0.545292,0.506367,0.929056,84.97525,62.721855,10.913746,13.120291
2,2016_1114_1122,0.812246,2016,1114,1122,0.033333,70.0,25.0,54.866667,7.4,...,1.178275,1.113888,0.486905,0.545292,0.506367,0.781655,70.43792,60.528984,9.528903,13.120291
3,2016_1112_1124,0.709595,2016,1112,1124,0.181818,81.212121,28.060606,58.242424,6.515152,...,1.110363,1.102242,0.610185,0.481597,0.567323,0.635662,75.424751,69.152267,10.913746,11.822443
4,2016_1114_1124,0.246021,2016,1114,1124,0.033333,70.0,25.0,54.866667,7.4,...,1.110363,1.102242,0.610185,0.481597,0.567323,0.322925,62.585712,69.878136,9.528903,11.822443


In [377]:
cols = ['Season','TeamID1','TeamID2','Pred','Win_Prob','PScore1','PScore2','StdScore1','StdScore2']
write_to_csv(tourney_predictions_historic[cols], \
             'predictions_all.csv', 'Output/{}/Regression/'.format(bi+bracket_type[1:]))

In [363]:
!kaggle competitions submit -c womens-march-mania-2022 -f Output/SubmissionStage1.csv -m "Predictions using XGBoost Regressor."

100%|█████████████████████████████████████████| 337k/337k [00:01<00:00, 298kB/s]
Successfully submitted to March Machine Learning Mania 2022 - Women's

## XGBoost Classification Model

In [364]:
r = 'Outcome'

In [365]:
matchup_data_bin = matchup_data.copy()
matchup_data_bin['Win_Prob'] = matchup_data_bin.apply(lambda row: \
                                            1 / (1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), axis=1)
matchup_data_bin[r] = matchup_data_bin.apply(lambda row: 1 if row['Score1']>row['Score2'] else 0, axis=1)

print('-- {}: {} --'.format(r.upper(), datetime.now()))
x_mat = np.matrix(matchup_data_bin.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc','Outcome']))
y_mat = np.matrix(matchup_data_bin[r]).T

x_train, x_test, y_train, y_test = train_test_split(x_mat, y_mat, test_size=0.3, random_state=4747)

model = xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.01, n_estimators=10000, \
                max_depth=params['Score1']['max_depth'], \
                min_child_weight=params['Score1']['min_child_weight'], \
                subsample=params['Score1']['subsample'], \
                colsample_bytree=params['Score1']['colsample_bytree'],\
                gamma=params['Score1']['gamma'],\
                reg_alpha=params['Score1']['reg_alpha'])

model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=250, \
          eval_metric='logloss', verbose=False)

save_model_to_dir(model, 'xgbc_{}_{}_all.txt'.format(bracket_type, r.lower()), 'Models')

y_pred = model.predict_proba(x_test)

score = logloss([x[1] for x in y_pred], np.array([x[0,0] for x in y_test]))
print("LOGLOSS: {}".format(score))

fi = pd.DataFrame(matchup_data_bin.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc','Outcome']).columns)
fi = fi.rename(columns={0:'col'}).join(pd.DataFrame(model.feature_importances_).rename(columns={0:'imp'}))

fi['imp'] = fi[['imp']].sort_values(by='imp', ascending=False).cumsum()

write_to_csv(fi, 'xgbc{}_{}_feature_importance.csv'.format(bracket_type, r.lower()), 'Features', index=None)

print("TOP FEATURES: {}\n".format(list(fi.sort_values(by='imp').col)[:5]))

-- OUTCOME: 2022-03-13 17:18:57.015756 --


  return f(**kwargs)


LOGLOSS: 0.5019809175472547
TOP FEATURES: ['Win_Prob', 'LOffRtg_y', 'LWP_y', 'TElo_y', 'LWP_x']



In [368]:
tourney_xgbc_continuous = tourney_xgbr_continuous.copy()
tourney_xgbc_continuous['Win_Prob'] = tourney_xgbc_continuous.apply(lambda row: \
                                            1 / (1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), axis=1)
tourney_xgbc_continuous[r] = tourney_xgbc_continuous.apply(lambda row: 1 if row['Score1']>row['Score2'] else 0, axis=1)
x_mat = xgb.DMatrix(tourney_xgbc_continuous.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc','Outcome']))
cpred = tourney_xgbc_continuous[['Season','TeamID1','TeamID2','Score1','Score2','Outcome']]

model = xgb.Booster()
model.load_model('Models/xgbc_{}_{}_all.txt'.format(bracket_type, r.lower()))
cpred = cpred.join(pd.DataFrame(model.predict(x_mat))).rename(columns={0:'Pred'})


print('---- PREDICTING TOURNAMENT OUTCOMES ----')
print('ACCURACY:\t', (len(cpred[(cpred['Pred']>=0.5)]))/len(cpred))
print('LOGLOSS:\t', logloss(cpred['Pred'].values, cpred['Outcome'].values))

cpred.head()

---- PREDICTING TOURNAMENT OUTCOMES ----
ACCURACY:	 0.7792207792207793
LOGLOSS:	 0.4515049598707352


Unnamed: 0,Season,TeamID1,TeamID2,Score1,Score2,Outcome,Pred
0,2010,3124,3201,69,55,1,0.628778
1,2010,3124,3207,49,33,1,0.563659
2,2010,3124,3397,77,62,1,0.253349
3,2010,3124,3181,51,48,1,0.324616
4,2010,3173,3395,67,66,1,0.647844


In [369]:
r = 'Outcome'

tourney_predictions_historic = pd.read_csv('Data/{}DataFiles_Stage1/{}SampleSubmissionStage1.csv'.format(bi, bi))

tourney_predictions_historic['Season'] = tourney_predictions_historic.apply(lambda row: int(row['ID'][:4]), axis=1)
tourney_predictions_historic['TeamID1'] = tourney_predictions_historic.apply(lambda row: int(row['ID'][5:9]), axis=1)
tourney_predictions_historic['TeamID2'] = tourney_predictions_historic.apply(lambda row: int(row['ID'][10:14]), axis=1)

tourney_predictions_historic = tourney_predictions_historic.merge(season_end_data_ext, \
                        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID','DayNum'])
tourney_predictions_historic = tourney_predictions_historic.merge(season_end_data_ext, \
                        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])
tourney_predictions_historic['Win_Prob'] = tourney_predictions_historic.apply(lambda row: \
                                            1 / (1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), axis=1)

x_mat = xgb.DMatrix(tourney_predictions_historic[tourney_xgbc_continuous.drop(columns=\
                                            ['TeamID1','TeamID2','Score1','Score2','WLoc','Outcome']).columns])

model = xgb.Booster()
model.load_model('Models/xgbc_{}_{}_all.txt'.format(bracket_type, r.lower()))
tourney_predictions_historic['Pred'] = model.predict(x_mat)

write_to_csv(tourney_predictions_historic[['ID','Pred']], 'SubmissionStage1.csv', 'Output')

tourney_predictions_historic.head()

Unnamed: 0,ID,Pred,Season,TeamID1,TeamID2,NumOT_x,TScore_x,TFGM_x,TFGA_x,TFGM3_x,...,LOAstP_y,LOStlP_y,LOBlkP_y,LOTOP_y,LOffRtg_y,LDefRtg_y,LOWP_y,LOOWP_y,LSOS_y,Win_Prob
0,2016_3106_3107,0.055746,2016,3106,3107,0.068966,63.275862,21.62069,55.172414,4.37931,...,0.726563,0.07874,0.048433,0.275711,1.137358,0.774617,0.435374,0.547052,0.4726,0.191061
1,2016_3106_3113,0.056325,2016,3106,3113,0.068966,63.275862,21.62069,55.172414,4.37931,...,0.581699,0.136204,0.08156,0.263955,0.966608,0.865426,0.511905,0.489943,0.504584,0.143504
2,2016_3107_3113,0.430005,2016,3107,3113,0.0,71.032258,27.774194,56.806452,2.354839,...,0.581699,0.136204,0.08156,0.263955,0.966608,0.865426,0.511905,0.489943,0.504584,0.414994
3,2016_3106_3119,0.071792,2016,3106,3119,0.068966,63.275862,21.62069,55.172414,4.37931,...,0.397516,0.08947,0.074586,0.26532,1.152787,0.725779,0.366182,0.550696,0.427686,0.172838
4,2016_3107_3119,0.53178,2016,3107,3119,0.0,71.032258,27.774194,56.806452,2.354839,...,0.397516,0.08947,0.074586,0.26532,1.152787,0.725779,0.366182,0.550696,0.427686,0.469409


In [370]:
!kaggle competitions submit -c womens-march-mania-2022 -f Output/SubmissionStage1.csv -m "Predictions using XGBoost Classifier."

100%|█████████████████████████████████████████| 252k/252k [00:01<00:00, 212kB/s]
Successfully submitted to March Machine Learning Mania 2022 - Women's

## <u>Predict Current Tournament Results</u>

In [83]:
# Regular season stats
season_data = pd.read_csv('Data/{}DataFiles_Stage2/{}RegularSeasonDetailedResults.csv'.format(bi, bi))
season_data_latest = season_data[season_data['Season']==2022].reset_index(drop=True)

# Elo stats
elo_dict_latest = get_elo_dict(season_data, min_season=2010, max_season=2022, min_team=MIN_TEAM, max_team=MAX_TEAM)
carmelo_dict_latest = get_carmelo_dict(season_data, min_season=2010, max_season=2022, \
                                       min_team=MIN_TEAM, max_team=MAX_TEAM)

# Stats from whole season
continuous_data_all_latest = get_continuous_data(season_data_latest, min_season=2022, max_season=2022)
matchup_dict_all_latest = get_matchup_dict(season_data_latest, min_season=2022, max_season=2022, \
                                           min_team=MIN_TEAM, max_team=MAX_TEAM)
opp_matchup_dict_all_latest = get_opp_matchup_dict(matchup_dict_all_latest, min_season=2022, max_season=2022, \
                                                   min_team=MIN_TEAM, max_team=MAX_TEAM)
wp_dict_all_latest = get_win_pct_dict(continuous_data_all_latest)
continuous_data_all_latest = add_sos_var(continuous_data_all_latest, wp_dict_all_latest, \
                                  matchup_dict_all_latest, opp_matchup_dict_all_latest)
continuous_data_all_latest = add_elo_var(continuous_data_all_latest, elo_dict_latest, \
                                         matchup_dict_all_latest, elo_var_name='Elo')
continuous_data_all_latest = add_elo_var(continuous_data_all_latest, carmelo_dict_latest, \
                                         matchup_dict_all_latest, elo_var_name='CarmElo')
continuous_data_all_latest = continuous_data_all_latest.apply(pd.to_numeric, errors='coerce', axis=1)
continuous_data_all_latest = continuous_data_all_latest.reset_index(drop=True).drop(columns=['Wins'])
season_end_data_all_latest = continuous_data_all_latest.loc[continuous_data_all_latest\
                                                            .groupby(['Season','TeamID'])['DayNum'].idxmax()]

# Stats from last 30 days
continuous_data_l30_latest = get_continuous_data(season_data_latest, min_season=2022, max_season=2022, last_n=30)
matchup_dict_l30_latest = get_matchup_dict(season_data_latest, min_season=2022, max_season=2022, last_n=30, \
                                           min_team=MIN_TEAM, max_team=MAX_TEAM)
opp_matchup_dict_l30_latest = get_opp_matchup_dict(matchup_dict_l30_latest, min_season=2022, max_season=2022, \
                                                   min_team=MIN_TEAM, max_team=MAX_TEAM)
wp_dict_l30_latest = get_win_pct_dict(continuous_data_l30_latest)
continuous_data_l30_latest = add_sos_var(continuous_data_l30_latest, wp_dict_l30_latest, \
                                         matchup_dict_l30_latest, opp_matchup_dict_l30_latest)
continuous_data_l30_latest = continuous_data_l30_latest.apply(pd.to_numeric, errors='coerce', axis=1)
continuous_data_l30_latest = continuous_data_l30_latest.reset_index(drop=True).drop(columns=['Wins'])
season_end_data_l30_latest = continuous_data_l30_latest.loc[continuous_data_l30_latest\
                                                            .groupby(['Season','TeamID'])['DayNum'].idxmax()]

cols = season_end_data_l30_latest.columns[2:-4].append(season_end_data_l30_latest.columns[-3:])
season_end_data_ext_latest = season_end_data_all_latest.merge(season_end_data_l30_latest.rename(columns=\
                    dict(zip(cols, ['L'+x for x in cols]))), on=['Season','TeamID','DayNum'])

season_end_data_ext_latest.head()

---- CALCULATE SIMPLE ELO SCORE ----
START: 2022-03-16 20:02:54.813496-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-16 20:41:32.598266-07:00
---- CALCULATE 538 ELO SCORE ----
START: 2022-03-16 20:41:32.768658-07:00
2010 season completed.
2011 season completed.
2012 season completed.
2013 season completed.
2014 season completed.
2015 season completed.
2016 season completed.
2017 season completed.
2018 season completed.
2019 season completed.
2020 season completed.
2021 season completed.
2022 season completed.
FINISH: 2022-03-16 21:36:48.106610-07:00
---- PROCESS CONTINOUS SEASON DATA ----
START: 2022-03-16 21:36:48.284542-07:00








2022 season completed.
FINISH: 2022-03-16 21:37:19.295048-07:00
---- COLLECT TEAM MATCHUP HISTORY ----
START: 2022-03-16 21:37:19.296479-07:00










2022 season completed.
FINISH: 2022-03-16 21:40:10.932349-07:00
---- COLLECT OPPONENT MATCHUP HISTORY ----
START: 2022-03-16 21:40:10.940860-07:00
2022 season completed.
FINISH: 2022-03-16 21:40:11.038967-07:00
---- ADD SOS FEATURES TO DATA ----
START: 2022-03-16 21:40:12.630030-07:00
FINISH: 2022-03-16 21:40:25.620340-07:00


Unnamed: 0,Season,TeamID,NumOT,TScore,TFGM,TFGA,TFGM3,TFGA3,TFTM,TFTA,...,LOORP,LOAstP,LOStlP,LOBlkP,LOTOP,LOffRtg,LDefRtg,LOWP,LOOWP,LSOS
0,2022.0,3101.0,0.0,70.923077,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,...,0.335106,0.538071,0.09434,0.08658,0.230676,1.019671,1.056253,0.548186,0.452081,0.516151
1,2022.0,3102.0,0.068966,61.862069,23.137931,61.37931,4.896552,16.862069,10.689655,14.482759,...,0.273684,0.455128,0.053556,0.083832,0.260664,1.015424,0.904782,0.387755,0.518322,0.431277
2,2022.0,3103.0,0.0,66.333333,24.444444,57.333333,5.740741,18.185185,11.703704,16.740741,...,0.268293,0.472803,0.136688,0.0563,0.177792,0.967692,0.999688,0.467372,0.508543,0.481096
3,2022.0,3104.0,0.1,70.533333,24.533333,59.166667,7.766667,23.2,13.7,19.366667,...,0.340351,0.593137,0.090941,0.166667,0.199313,0.967121,0.93299,0.455357,0.533009,0.481241
4,2022.0,3105.0,0.068966,59.655172,21.758621,57.62069,3.172414,12.37931,12.965517,18.551724,...,0.27668,0.450704,0.110486,0.031153,0.194676,0.903978,0.812475,0.430272,0.543651,0.468065


In [84]:
r = 'Score'

tourney_predictions = pd.read_csv('Data/{}DataFiles_Stage2/{}SampleSubmissionStage2.csv'.format(bi, bi))

tourney_predictions['Season'] = tourney_predictions.apply(lambda row: int(row['ID'][:4]), axis=1)
tourney_predictions['TeamID1'] = tourney_predictions.apply(lambda row: int(row['ID'][5:9]), axis=1)
tourney_predictions['TeamID2'] = tourney_predictions.apply(lambda row: int(row['ID'][10:14]), axis=1)

tourney_predictions = tourney_predictions.merge(season_end_data_ext_latest, \
                        left_on=['Season','TeamID1'], right_on=['Season','TeamID']).drop(columns=['TeamID','DayNum'])
tourney_predictions = tourney_predictions.merge(season_end_data_ext_latest, \
                        left_on=['Season','TeamID2'], right_on=['Season','TeamID']).drop(columns=['TeamID'])

tourney_predictions['Win_Prob'] = tourney_predictions.apply(lambda row: \
                                            1 / (1 + 10**((row['TCarmElo_y']-row['TCarmElo_x'])/400)), axis=1)

model = xgb.Booster()
model.load_model('Models/xgbr_{}_{}_all.txt'.format(bracket_type, r.lower()))

cols = tourney_xgbr_continuous.drop(columns=['TeamID1','TeamID2','Score1','Score2','WLoc']).columns

x_mat1 = xgb.DMatrix(tourney_predictions[cols])
tourney_predictions['P{}1'.format(r)] = model.predict(x_mat1)

x_mat2 = xgb.DMatrix(tourney_predictions.rename(columns=\
                                                dict([(x, x[:-1] + chr(1-(ord(x[-1])-ord('x')) + ord('x'))) \
                                                 for x in cols if x[-2:] == '_x' or x[-2:] == '_y']))[cols]\
                    .apply(lambda x: 1-x if x.name == 'Win_Prob' else x))
tourney_predictions['P{}2'.format(r)] = model.predict(x_mat2)

season_data_std_score = season_data_latest[['Season','WTeamID','WScore']]\
.rename(columns={'WTeamID':'TeamID','WScore':'Score'})\
.append(season_data_latest[['Season','LTeamID','LScore']]\
.rename(columns={'LTeamID':'TeamID','LScore':'Score'}))\
.groupby(['Season','TeamID'], as_index=False).std()\
.rename(columns={'Score':'StdScore'})

tourney_predictions = tourney_predictions.merge(season_data_std_score, \
                                                left_on=['Season','TeamID1'], right_on=['Season','TeamID'])
tourney_predictions = tourney_predictions.drop(columns=['TeamID']).rename(columns=\
                                                                                    {'StdScore':'StdScore1'})
tourney_predictions = tourney_predictions.merge(season_data_std_score, \
                                                left_on=['Season','TeamID2'], right_on=['Season','TeamID'])
tourney_predictions = tourney_predictions.drop(columns=['TeamID']).rename(columns=\
                                                                                    {'StdScore':'StdScore2'})

tourney_predictions['Pred'] = tourney_predictions.apply(lambda row: \
                norm.cdf(0, row['PScore2']-row['PScore1'], math.sqrt(row['StdScore1']*row['StdScore2'])), axis=1)

write_to_csv(tourney_predictions[['ID','Pred']], 'SubmissionStage2.csv', 'Output')

tourney_predictions.head()

Unnamed: 0,ID,Pred,Season,TeamID1,TeamID2,NumOT_x,TScore_x,TFGM_x,TFGA_x,TFGM3_x,...,LOffRtg_y,LDefRtg_y,LOWP_y,LOOWP_y,LSOS_y,Win_Prob,PScore1,PScore2,StdScore1,StdScore2
0,2022_3107_3110,0.404655,2022,3107,3110,0.0,56.322581,21.16129,51.709677,5.806452,...,0.92571,0.777172,0.398065,0.562879,0.453003,0.431239,50.12571,52.022041,7.354759,8.396364
1,2022_3107_3112,0.059233,2022,3107,3112,0.0,56.322581,21.16129,51.709677,5.806452,...,0.906401,0.913487,0.430556,0.481845,0.447652,0.207364,44.095428,59.043705,7.354759,12.464422
2,2022_3110_3112,0.081944,2022,3110,3112,0.066667,59.866667,22.533333,53.566667,4.166667,...,0.906401,0.913487,0.430556,0.481845,0.447652,0.256529,47.677383,61.918877,8.396364,12.464422
3,2022_3107_3116,0.077825,2022,3107,3116,0.0,56.322581,21.16129,51.709677,5.806452,...,0.889479,0.963506,0.529762,0.458539,0.506021,0.258228,51.171074,65.789566,7.354759,14.412883
4,2022_3110_3116,0.108892,2022,3110,3116,0.066667,59.866667,22.533333,53.566667,4.166667,...,0.889479,0.963506,0.529762,0.458539,0.506021,0.314665,55.169472,68.727219,8.396364,14.412883


In [85]:
cols = ['Season','TeamID1','TeamID2','Pred','Win_Prob','PScore1','PScore2','StdScore1','StdScore2']
write_to_csv(tourney_predictions[cols], \
             'predictions_2022.csv', 'Output/{}/Regression/'.format(bi+bracket_type[1:]))

In [86]:
!kaggle competitions submit -c womens-march-mania-2022 -f Output/SubmissionStage2.csv -m "Predictions using XGBoost Regressor."

100%|██████████████████████████████████████| 76.4k/76.4k [00:02<00:00, 30.9kB/s]
Successfully submitted to March Machine Learning Mania 2022 - Women's