In [1]:
import pandas as pd
import pickle
import datetime as dt
import numpy as np

In [35]:
# Run configuration shared by the cells below.
this_year = dt.datetime.now().year
branch = 'women'
luck_adj = False
# Suffix appended to the output filename when luck-adjusted picks are enabled.
luck_ext = '_luck_adj' if luck_adj else ''

# Two fitted models: one that uses the head-to-head win rate as a feature and
# one that does not.
model_filepath_win = f'../model/logit_model_with_winrate_{branch}.pickle'
model_filepath = f'../model/logit_model_sans_winrate_{branch}.pickle'

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files produced by a trusted training run.
# The context managers guarantee the file handles are closed after loading.
with open(model_filepath_win, 'rb') as model_file:
    loaded_model_win = pickle.load(model_file)
with open(model_filepath, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [36]:
# Bracket definition: one row per team with its seed and region.
regions_path = f'../data/tournament_regions_{branch}23.csv'
teams = pd.read_csv(regions_path)
teams.head()

Unnamed: 0,team,seed,region
0,Stanford,1,East
1,Iowa,2,East
2,Duke,3,East
3,Texas,4,East
4,Louisville,5,East


In [37]:
# Teams to drop from the bracket before the first-round matchups are built
# (presumably the losers of the First Four play-in games -- confirm).
# Alternate list kept for reference (looks like it belongs to the other
# bracket -- confirm):
#   ['Texas Southern', 'Mississippi State', 'Southeast Missouri State', 'Nevada']
first_four_losers = [
    'Illinois',
    'Southern',
    'Purdue',
    'Monmouth',
]

In [38]:
# Keep only the teams that are not listed in `first_four_losers`.
is_play_in_loser = teams['team'].isin(first_four_losers)
teams = teams.loc[~is_play_in_loser]

In [39]:
# Regions are processed in this order when the first-round table is built.
conference_regions = ['South', 'East', 'Midwest', 'West']

# (seed, opposing seed) pairs for the first round. The order matters: winners
# of consecutive entries are paired against each other in the next round.
round_one_matchups = [
    (1, 16), (8, 9),
    (5, 12), (4, 13),
    (6, 11), (3, 14),
    (7, 10), (2, 15),
]

In [40]:
def _team_with_seed(region, seed):
    """Return the name of the single team holding `seed` in `region`.

    `.item()` raises ValueError unless exactly one row matches.
    """
    in_slot = (teams.region == region) & (teams.seed == seed)
    return teams.loc[in_slot, 'team'].item()


# One row per first-round game: [team, opponent, seed, seed_opp], ordered by
# region and then by the bracket order of `round_one_matchups`.
matchups = [
    [_team_with_seed(region, seed), _team_with_seed(region, seed_opp), seed, seed_opp]
    for region in conference_regions
    for seed, seed_opp in round_one_matchups
]

init_teams_df = pd.DataFrame(
    matchups,
    columns=['team', 'opponent', 'seed', 'seed_opp'],
)

In [41]:
# Game-level data for the season; the file name ends in the two-digit year.
season_suffix = str(this_year)[-2:]
data = pd.read_csv(f'../data/transformed_data_{branch}{season_suffix}.csv')
data.head()

Unnamed: 0,team,opponent,won,team_score,opponent_score,team_rank,opponent_rank,game_round,season_type,date,...,score_diff,luck,luck_opp,choke_rate,choke_rate_opp,upset_rate,upset_rate_opp,win_streak,win_streak_opp,matchup_win_rate
0,Abilene Christian,Northern Colorado,1,81.0,61.0,,,,regular_season,2020/12/04,...,20.0,0.5,0.6,,,0.0,,0.0,0.354424,0.0
1,Abilene Christian,Texas A&M,0,59.0,77.0,,10.0,,regular_season,2020/12/13,...,-18.0,0.5,0.875,,0.107143,0.0,,0.315176,3.661989,
2,Abilene Christian,UAB,0,73.0,81.0,,,,regular_season,2020/12/19,...,-8.0,0.5,0.166667,,,0.0,,0.0,0.801142,
3,Abilene Christian,Stetson,1,70.0,55.0,,,,regular_season,2020/12/20,...,15.0,0.5,0.444444,,,0.0,,0.0,0.0,
4,Abilene Christian,Texas A&M Corpus Christi,1,73.0,51.0,,,,regular_season,2021/01/09,...,22.0,0.5,0.0,,,0.0,,0.315176,0.0,


In [42]:
# Columns excluded from the team-level lookup table even though they are on
# the "team" side of a game row. Names absent from `data` are simply ignored,
# so no exception handling is needed.
non_metric_cols = {
    'team_score', 'underdog', 'fan_favorite', 'seed', 'score_diff',
    'matchup_win_rate', 'home_game', 'season_year', 'season_type', 'date',
}

# Keep team-side columns only: drop every column that describes the opponent
# (name contains '_opp' or 'opponent') and the excluded columns above.
team_cols = [
    col for col in data.columns.tolist()
    if '_opp' not in col
    and 'opponent' not in col
    and col not in non_metric_cols
]

team_cols

['team',
 'won',
 'team_rank',
 'game_round',
 'de',
 'oe',
 'te',
 'pace',
 'physicality_score',
 'sos_norm',
 'srs_norm',
 'luck',
 'choke_rate',
 'upset_rate',
 'win_streak']

In [43]:
# Map each team-side column to the name it takes when it describes the
# opponent. 'won' and 'game_round' have no opponent counterpart and are
# skipped; 'team' and 'team_rank' have irregular opponent names.
_irregular_opp_names = {'team': 'opponent', 'team_rank': 'opponent_rank'}
opp_cols = {
    col: _irregular_opp_names.get(col, f'{col}_opp')
    for col in team_cols
    if col not in ('won', 'game_round')
}

In [44]:
# One row per team for the current season: groupby().last() keeps, for every
# column, the last non-null value seen within each team's rows.
current_season_rows = data.loc[data.season_year == this_year, team_cols]
team_metrics = current_season_rows.groupby(['team']).last().reset_index()

In [45]:
def get_matchup_win_rate(matchups,data):
    """Compute each team's historical win rate against a specific opponent.

    Parameters
    ----------
    matchups : iterable of [team, opponent]
        Pairs to look up; each entry must hold exactly two names.
    data : pandas.DataFrame
        Game rows with `team`, `opponent` and `won` columns.

    Returns
    -------
    pandas.DataFrame
        Columns `team`, `opponent`, `matchup_win_rate`, `previous_matchup`,
        one row per input pair in input order. `matchup_win_rate` is
        wins / games over the rows where `team` faced `opponent`, or NaN when
        there is no such game. `previous_matchup` is 1 when at least one such
        game has a non-null `won` value, else 0.
    """
    print('Calculating matchup win rate')

    match_wl = []
    for team, opponent in matchups:
        # Filter once and reuse the result for both the win total and the count.
        results = data.loc[
            (data.team == team) &
            (data.opponent == opponent),'won'
        ]
        games = results.count()
        if games >= 1:
            win_rate = results.sum() / games
            played_previously = 1
        else:
            # No head-to-head history: report NaN explicitly rather than
            # dividing 0 by 0, which yields the same NaN plus a RuntimeWarning.
            win_rate = np.nan
            played_previously = 0

        match_wl.append([team, opponent, win_rate, played_previously])

    return pd.DataFrame(match_wl,columns=['team','opponent','matchup_win_rate','previous_matchup'])

In [46]:
def get_probability(coefs,vals,intercept):
    """Evaluate a logistic model: sigmoid(vals . coefs + intercept).

    `vals` may be a single feature vector (returns a scalar) or a 2-D stack of
    feature vectors (returns one probability per row).
    """
    # Linear predictor of the model, i.e. the log-odds.
    linear_score = np.matmul(vals, coefs) + intercept
    return 1 / (1 + np.exp(-linear_score))

In [47]:
def get_next_round_matchups(data):
    """Pair up the winners of one round to form the next round's games.

    `data` needs `team`, `opponent`, `seed`, `seed_opp` and `win` columns,
    where `win == 1` means `team` advanced and anything else means `opponent`
    advanced. Winners are paired in row order (rows 0 and 1, rows 2 and 3, ...);
    a trailing unpaired winner is dropped.

    Returns a DataFrame with columns `team`, `seed`, `opponent`, `seed_opp`.
    """
    advancing = []
    for idx in data.index:
        game = data.loc[idx]
        if game.win == 1:
            advancing.append([game.team, game.seed])
        else:
            advancing.append([game.opponent, game.seed_opp])

    # Step through the winners two at a time; stopping one short of the end
    # leaves out a final winner that has nobody to play.
    paired = [
        advancing[k] + advancing[k + 1]
        for k in range(0, len(advancing) - 1, 2)
    ]

    return pd.DataFrame(paired, columns=['team','seed','opponent','seed_opp'])

In [48]:
def _predict_win_probability(model, features):
    """Return the model's win probability for every row of `features`.

    `model.params` is assumed to be a pandas Series whose first entry is the
    intercept and whose remaining index labels name feature columns of
    `features` (TODO confirm against the training notebook). Missing feature
    values are treated as 0.
    """
    params = model.params
    feature_cols = params.index[1:]
    # .iloc makes the positional lookup explicit; plain params[0] on a
    # label-indexed Series is deprecated in recent pandas.
    intercept = params.iloc[0]
    coefs = params.iloc[1:].to_numpy()
    vals = features[feature_cols].fillna(0).to_numpy(dtype=float)
    return get_probability(coefs, vals, intercept)


# Simulate the bracket round by round: score every game, pick a winner, then
# pair the winners to form the next round.
df_rnds = []
teams_df = init_teams_df.copy()
for rnd in ['first','second','sweet16','elite_eight','final_four','championship']:

    # Attach the season metrics of both sides of each game.
    round_df = teams_df.merge(team_metrics,on=['team'],how='left')
    round_df = round_df.merge(
        team_metrics[list(opp_cols.keys())].rename(columns=opp_cols),on=['opponent'],how='left')
    round_df['game_round'] = rnd
    round_df = round_df.drop(columns='won')
    round_df.insert(2,'win',0)

    # Get previous matchups; pairs that never met come back as NaN and are dropped.
    matchups = round_df[['team','opponent']].drop_duplicates().values.tolist()
    df_match = get_matchup_win_rate(matchups,data).dropna()
    matchup_bool = len(df_match) != 0
    if matchup_bool:
        round_df = round_df.merge(df_match,on=['team','opponent'],how='left')

    # Add fan_favorite - home team in regular season or underdog in tournaments
    round_df['fan_favorite'] = 0
    round_df['fan_favorite_opp'] = 0
    round_df.loc[
        ((round_df.seed - round_df.seed_opp) >= 8) |
        (round_df.team_rank.isna() & round_df.opponent_rank.notna()),
        'fan_favorite'] = 1
    round_df.loc[
        ((round_df.seed_opp - round_df.seed) >= 8) |
        (round_df.opponent_rank.isna() & round_df.team_rank.notna()),
        'fan_favorite_opp'] = 1

    if matchup_bool:
        # NaNs are deliberately left in round_df here: a missing
        # matchup_win_rate is what selects the model without the win-rate
        # feature for that game.
        no_history = round_df['matchup_win_rate'].isna()
        team_win_probability = np.where(
            no_history,
            _predict_win_probability(loaded_model, round_df),
            _predict_win_probability(loaded_model_win, round_df),
        )
    else:
        # No game in this round has head-to-head history. Missing values are
        # zero-filled in the stored frame as well, as in earlier runs.
        round_df = round_df.fillna(0)
        team_win_probability = _predict_win_probability(loaded_model, round_df)
    round_df.insert(2,'team_win_probability',team_win_probability)

    if luck_adj:
        # Near coin-flips (within 0.02 of 0.5) go to the side with the higher
        # or equal luck value; otherwise the team needs more than 0.52.
        round_df.loc[
            ((abs(round_df.team_win_probability - 0.5) < .02) &
            (round_df.luck >= round_df.luck_opp)) |
            (round_df.team_win_probability > 0.52)
            ,'win'] = 1
    else:
        round_df.loc[
            round_df.team_win_probability > 0.5
            ,'win'] = 1

    df_rnds.append(round_df)
    # Calculate next round matchups
    teams_df = get_next_round_matchups(round_df)

df_tourney = pd.concat(df_rnds).reset_index(drop=True)
# Keep the six leading columns in place and sort the remaining ones by name.
df_tourney = df_tourney[df_tourney.columns[:6].tolist() + sorted(df_tourney.columns[6:])]
df_tourney.to_csv(f'tournament_outcomes{luck_ext}_{branch}{str(this_year)[-2:]}.csv',index=False)

Calculating matchup win rate
Calculating matchup win rate


  match_wl.append(match+[wins/games,played_previously])
  match_wl.append(match+[wins/games,played_previously])


Calculating matchup win rate
Calculating matchup win rate
Calculating matchup win rate
Calculating matchup win rate


  match_wl.append(match+[wins/games,played_previously])
  match_wl.append(match+[wins/games,played_previously])
  match_wl.append(match+[wins/games,played_previously])


In [92]:
# Raise the display limits before showing the full tournament table.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 35)
df_tourney

Unnamed: 0,team,opponent,team_win_probability,win,seed,seed_opp,choke_rate,choke_rate_opp,de,de_opp,fan_favorite,fan_favorite_opp,game_round,luck,luck_opp,matchup_win_rate,oe,oe_opp,opponent_rank,pace,pace_opp,physicality_score,physicality_score_opp,previous_matchup,sos_norm,sos_norm_opp,srs_norm,srs_norm_opp,te,te_opp,team_rank,upset_rate,upset_rate_opp,win_streak,win_streak_opp
0,Alabama,Texas A&M Corpus Christi,0.993754,1,1,16,0.172414,0.0,93.085532,101.408451,0,1,first,1.0,0.25,,111.670663,112.37954,0.0,72.0,71.0,66.458617,53.973626,,89.151713,16.476346,100.0,46.470227,1.199657,1.108187,4.0,1.0,0.0,0.0,0.49429
1,Maryland,West Virginia,0.385608,0,8,9,0.6,1.0,98.301887,102.554006,0,0,first,0.0,0.4,,109.528302,110.166803,24.0,64.0,69.0,35.0273,50.311513,,79.97553,93.67863,82.156742,85.06241,1.114203,1.074232,21.0,0.625,0.333333,0.0,2.810359
2,San Diego State,College of Charleston,0.810005,1,5,12,0.210526,0.142857,95.648211,95.516163,0,0,first,0.833333,0.666667,,108.361814,114.577685,18.0,66.0,70.0,45.369207,56.677468,,82.177814,29.404568,83.916513,68.569675,1.13292,1.199563,20.0,0.0,0.0,0.821778,2.05832
3,Virginia,Furman,0.764344,1,4,13,0.214286,0.0,96.799297,102.071146,0,1,first,0.6,0.666667,,108.949416,117.670619,0.0,62.0,69.0,26.753712,44.014946,,72.675367,31.892333,79.210149,61.694291,1.125519,1.152829,13.0,1.0,0.0,1.453507,1.594617
4,Creighton,North Carolina State,0.557805,1,6,11,0.416667,0.333333,98.507137,102.203992,0,0,first,0.285714,0.714286,,110.232102,112.879409,23.0,69.0,69.0,30.631491,49.462318,,86.663948,69.045677,87.436055,77.061592,1.119027,1.104452,24.0,0.5,0.5,1.733279,0.690457
5,Baylor,UC Santa Barbara,0.884482,1,3,14,0.291667,0.0,104.442767,100.580445,0,1,first,0.666667,0.8,,114.710494,110.633981,0.0,67.0,65.0,41.90452,35.047112,,93.311582,38.907015,88.213628,59.197872,1.09831,1.099955,10.0,0.4,0.0,0.0,1.556281
6,Missouri,Utah State,0.489833,1,7,10,0.666667,0.0,105.735768,100.65297,0,1,first,1.0,0.5,,112.696026,113.436322,0.0,70.0,69.0,49.154839,39.154733,,77.487765,77.446982,76.202169,82.443217,1.065827,1.127004,20.0,0.625,0.0,2.324633,3.872349
7,Arizona,Princeton,0.936087,1,2,15,0.185185,0.0,99.305556,98.082023,0,1,first,0.666667,0.666667,,114.828431,108.738822,0.0,72.0,69.0,48.905704,25.408704,,83.686786,35.766721,92.142419,57.663188,1.156314,1.108652,8.0,0.75,0.0,0.0,0.357667
8,Purdue,Fairleigh Dickinson,0.999637,1,1,16,0.192308,0.0,97.664835,0.0,0,1,first,0.833333,0.0,,113.782051,0.0,0.0,64.0,0.0,39.125547,0.0,,84.706362,0.0,90.976059,0.0,1.165026,0.0,5.0,1.0,0.0,1.694127,0.0
9,Memphis,Florida Atlantic,0.571263,1,8,9,0.0,0.25,98.828283,93.472972,1,0,first,0.444444,0.875,,109.616162,113.360838,25.0,72.0,69.0,63.882897,42.532613,,76.794454,49.836868,81.870268,78.637201,1.109158,1.212766,0.0,0.25,0.0,2.303834,2.491843
