# 02 Create Predictions
Create predictions for each potential game

In [1]:
import pandas as pd
import numpy as np

## Read in data

In [2]:
# Load the combined game results (strong/weak team columns, scores, winner).
comb_results = pd.read_csv(filepath_or_buffer='../data/comb_results.csv')
comb_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207
1,1985,R1W2,W02,W15,1,1210,Georgia Tech,1273,Mercer,65,0,58,0,1210
2,1985,R1W3,W03,W14,1,1228,Illinois,1318,Northeastern,76,0,57,0,1228
3,1985,R1W4,W04,W13,1,1260,Loyola-Chicago,1233,Iona,59,0,58,0,1260
4,1985,R1W5,W05,W12,1,1374,SMU,1330,Old Dominion,85,0,68,0,1374


In [3]:
# Load tournament seeds and keep only the numeric part of the seed code.
# Characters 1-2 of `Seed` are parsed as an integer (presumably the code is a
# region letter followed by a two-digit seed -- confirm against the raw file).
seeds = (
    pd.read_csv('../data/Stage2DataFiles//NCAATourneySeeds.csv')
    .assign(Seed_v2=lambda frame: frame['Seed'].str[1:3].astype(int))
    .drop(columns='Seed')
)
seeds.head()

Unnamed: 0,Season,TeamID,Seed_v2
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


## Get performance by seed matchup

In [4]:
# Attach each team's numeric seed: first for the strong-side team, then for the
# weak-side team. validate='m:1' makes pandas raise if `seeds` holds more than
# one row for a (Season, TeamID) pair. The joins use the default inner mode, so
# games whose team has no seed row are dropped.
comb_results_seeds = (
    pd.merge(comb_results, seeds,
             left_on=['Season', 'TeamID_Strong'],
             right_on=['Season', 'TeamID'],
             validate='m:1')
    .rename(columns={'Seed_v2': 'Orig_Seed_Strong'})
    .drop('TeamID', axis=1)
    .merge(seeds,
           left_on=['Season', 'TeamID_Weak'],
           right_on=['Season', 'TeamID'],
           validate='m:1')
    .rename(columns={'Seed_v2': 'Orig_Seed_Weak'})
    .drop('TeamID', axis=1)
)

comb_results_seeds.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner,Orig_Seed_Strong,Orig_Seed_Weak
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207,1,16
1,1985,R2W1,R1W1,R1W8,2,1207,Georgetown,1396,Temple,63,0,46,0,1207,1,8
2,1985,R3W1,R2W1,R2W4,3,1207,Georgetown,1260,Loyola-Chicago,65,0,53,0,1207,1,4
3,1985,R4W1,R3W1,R3W2,4,1207,Georgetown,1210,Georgia Tech,60,0,54,0,1207,1,2
4,1985,R5WX,R4W1,R4X1,5,1207,Georgetown,1385,St John's,77,0,59,0,1207,1,1


In [5]:
# Add a 0/1 indicator that is 1 when the team listed as "strong" won the game.
# An explicit astype(int) replaces the former `[1] * <boolean Series>` trick,
# which relied on list-vs-Series broadcasting to turn True/False into 1/0.
strong_team_won = (comb_results_seeds['TeamID_Winner']
                   == comb_results_seeds['TeamID_Strong'])
# assign() returns a new frame, so comb_results_seeds itself is left untouched.
comb_results_seeds_v2 = comb_results_seeds.assign(Strong_Win=strong_team_won.astype(int))
comb_results_seeds_v2.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner,Orig_Seed_Strong,Orig_Seed_Weak,Strong_Win
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207,1,16,1
1,1985,R2W1,R1W1,R1W8,2,1207,Georgetown,1396,Temple,63,0,46,0,1207,1,8,1
2,1985,R3W1,R2W1,R2W4,3,1207,Georgetown,1260,Loyola-Chicago,65,0,53,0,1207,1,4,1
3,1985,R4W1,R3W1,R3W2,4,1207,Georgetown,1210,Georgia Tech,60,0,54,0,1207,1,2,1
4,1985,R5WX,R4W1,R4X1,5,1207,Georgetown,1385,St John's,77,0,59,0,1207,1,1,1


In [6]:
# Re-orient the win indicator so that it refers to the better-seeded team
# (lower seed number) instead of the team that happens to be listed as "strong".
# comp is 1 when the strong-side team holds the strictly lower seed number and
# -1 otherwise (equal seeds fall into the -1 branch, as before).
comb_results_seeds_v2['comp'] = np.where(
    comb_results_seeds_v2['Orig_Seed_Strong'] < comb_results_seeds_v2['Orig_Seed_Weak'],
    1, -1)

# Derive the indicator from the game result rather than from the current
# Strong_Win column, so re-running this cell cannot flip already-flipped rows back.
strong_team_won = (comb_results_seeds_v2['TeamID_Winner']
                   == comb_results_seeds_v2['TeamID_Strong'])
comb_results_seeds_v2['Strong_Win'] = np.where(
    comb_results_seeds_v2['comp'] == 1, strong_team_won, ~strong_team_won).astype(int)
# NOTE(review): Orig_Seed_Strong / Orig_Seed_Weak are not swapped for the flipped
# rows, so any grouping on those two columns mixes both orientations.


In [7]:
# Aggregate by round and seed pairing: total of the Strong_Win indicator and
# number of games. The aggregator is given by name ('sum') because passing the
# builtin `sum` to agg() is deprecated in newer pandas releases.
results_agg = (
    comb_results_seeds_v2
    .groupby(['Round', 'Orig_Seed_Strong', 'Orig_Seed_Weak'])
    .agg({'Strong_Win': 'sum', 'Season': 'count'})
    .reset_index()
    .rename(columns={'Season': 'Count'})
)
results_agg.head()

Unnamed: 0,Round,Orig_Seed_Strong,Orig_Seed_Weak,Strong_Win,Count
0,1,1,16,135,136
1,1,2,15,128,136
2,1,3,14,115,136
3,1,4,13,108,136
4,1,5,12,89,136


In [17]:
# Write the round / seed-pairing aggregate to disk without the index column.
# NOTE(review): this cell's execution count (17) is later than that of the cell
# adding the Matchup column (9); on a top-to-bottom run the CSV will not contain
# Matchup -- confirm which version of the file is intended.
results_agg.to_csv(path_or_buf='../data/results_agg.csv', index=False)

In [9]:
# @np.vectorize
# group(rnd, s, w):
#     x = 1
#     if rnd==1: 
#         x+=1
#         return (x-1)
#     elif rnd==2:
        
        
def agg_matchup(w, s):
    """Return the absolute seed gap between two teams, element-wise.

    Parameters
    ----------
    w, s : int or array-like of int
        Seed numbers of the two teams. The result is symmetric, so the order
        of the arguments does not matter. Inputs are broadcast against each other.

    Returns
    -------
    numpy.ndarray or NumPy scalar
        ``max(w, s) - min(w, s)`` for every pair; a NumPy scalar when both
        inputs are scalars.
    """
    w = np.asarray(w)
    s = np.asarray(s)
    # Native ufuncs replace the per-element Python loop of np.vectorize and also
    # accept empty input. max - min (rather than abs of a difference) stays
    # correct for unsigned integer dtypes.
    return np.maximum(w, s) - np.minimum(w, s)

# Collapse every seed pairing to its seed gap so that mirrored pairings
# (e.g. 1-vs-16 and 16-vs-1) share one Matchup value.
seed_gap = agg_matchup(results_agg['Orig_Seed_Strong'], results_agg['Orig_Seed_Weak'])
results_agg['Matchup'] = seed_gap
results_agg.head()
# TODO: think of ways to get more data points for later rounds -- maybe group
# rounds together and seeds together, and find a way to count combinations
# instead of permutations.

Unnamed: 0,Round,Orig_Seed_Strong,Orig_Seed_Weak,Strong_Win,Count,Matchup
0,1,1,16,135,136,15
1,1,2,15,128,136,13
2,1,3,14,115,136,11
3,1,4,13,108,136,9
4,1,5,12,89,136,7


In [10]:
# Total wins and games for each (Round, Matchup) pair. The aggregator is given
# by name because passing the builtin `sum` to agg() is deprecated in newer pandas.
results_agg_v2 = (
    results_agg
    .groupby(['Round', 'Matchup'])
    .agg({'Strong_Win': 'sum', 'Count': 'sum'})
    .reset_index()
)
results_agg_v2.head()

Unnamed: 0,Round,Matchup,Strong_Win,Count
0,1,1,68,136
1,1,3,84,136
2,1,5,85,136
3,1,7,89,136
4,1,9,108,136


In [11]:
def agg_round(r):
    """Bucket round numbers so that every round from 3 onward maps to 3.

    Parameters
    ----------
    r : int or array-like of int
        Round number(s).

    Returns
    -------
    numpy.ndarray or NumPy scalar
        ``r`` where ``r < 3``, otherwise 3, element-wise.
    """
    # Capping at 3 is the same as the former `r if r < 3 else 3` branch, but it
    # runs as a native ufunc and accepts empty input, which np.vectorize rejected.
    return np.minimum(np.asarray(r), 3)
    
# Bucket the round numbers (rounds 3 and later share one bucket) into a new column.
round_bucket = agg_round(results_agg_v2['Round'])
results_agg_v2['Round_v2'] = round_bucket
results_agg_v2.head()

Unnamed: 0,Round,Matchup,Strong_Win,Count,Round_v2
0,1,1,68,136,1
1,1,3,84,136,1
2,1,5,85,136,1
3,1,7,89,136,1
4,1,9,108,136,1


In [12]:
# Re-aggregate wins and games on the bucketed round. The aggregator is given by
# name because passing the builtin `sum` to agg() is deprecated in newer pandas.
results_agg_v3 = (
    results_agg_v2
    .groupby(['Round_v2', 'Matchup'])
    .agg({'Strong_Win': 'sum', 'Count': 'sum'})
    .reset_index()
)
results_agg_v3.head()

Unnamed: 0,Round_v2,Matchup,Strong_Win,Count
0,1,1,68,136
1,1,3,84,136
2,1,5,85,136
3,1,7,89,136
4,1,9,108,136


In [13]:
# Key packs the bucketed round and the seed gap into one integer:
# Round_v2 * 100 + Matchup (computed column-wise instead of a Python-level zip loop).
results_agg_v3['Key'] = results_agg_v3['Round_v2'] * 100 + results_agg_v3['Matchup']

# One row per Key with total wins and games; 'sum' is passed by name because
# the builtin `sum` is deprecated as an agg() argument in newer pandas.
results_agg_v4 = (
    results_agg_v3
    .groupby('Key')
    .agg({'Strong_Win': 'sum', 'Count': 'sum'})
    .reset_index()
)

# Empirical win rate of the Strong_Win side for each Key.
results_agg_v4['Win_Prob'] = results_agg_v4['Strong_Win'] / results_agg_v4['Count']
results_agg_v4.head()

Unnamed: 0,Key,Strong_Win,Count,Win_Prob
0,101,68,136,0.5
1,103,84,136,0.617647
2,105,85,136,0.625
3,107,89,136,0.654412
4,109,108,136,0.794118


In [14]:
# Keys observed in fewer than 10 games fall back to a 0.5 win probability;
# all other keys keep their empirical rate.
too_few_games = results_agg_v4['Count'] < 10
results_agg_v4['Win_Prob'] = results_agg_v4['Win_Prob'].mask(too_few_games, 0.5)

In [18]:
# Write the per-Key win-probability table to disk without the index column.
results_agg_v4.to_csv(path_or_buf='../data/model_v1.csv', index=False)

In [16]:
# Display the full win-probability table (one row per Key).
results_agg_v4

Unnamed: 0,Key,Strong_Win,Count,Win_Prob
0,101,68,136,0.5
1,103,84,136,0.617647
2,105,85,136,0.625
3,107,89,136,0.654412
4,109,108,136,0.794118
5,111,115,136,0.845588
6,113,128,136,0.941176
7,115,135,136,0.992647
8,201,48,83,0.578313
9,203,46,74,0.621622
