In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../src')
from data_prep import simulate_round_results, create_round_results

## Import data

In [2]:
results_compact = pd.read_csv('../data/Stage2DataFiles/NCAATourneyCompactResults.csv')
results_compact.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [3]:
teams = pd.read_csv('../data/Stage2DataFiles/teams.csv')
teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2019
1,1102,Air Force,1985,2019
2,1103,Akron,1985,2019
3,1104,Alabama,1985,2019
4,1105,Alabama A&M,2000,2019


In [4]:
tourney_slots = pd.read_csv('../data/Stage2DataFiles/NCAATourneySlots.csv')
tourney_slots.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12


In [5]:
tourney_seeds = pd.read_csv('../data/Stage2DataFiles/NCAATourneySeeds.csv')
tourney_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [6]:
seeds = pd.read_csv('../data/Stage2DataFiles//NCAATourneySeeds.csv')
seeds['Seed_v2'] = [int(x[1:3]) for x in seeds['Seed']]
seeds.drop('Seed', axis=1, inplace=True)
seeds.head()

Unnamed: 0,Season,TeamID,Seed_v2
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [7]:
tourney_seed_round = pd.read_csv('../data/Stage2DataFiles/NCAATourneySeedRoundSlots.csv')
tourney_seed_round.head()

Unnamed: 0,Seed,GameRound,GameSlot,EarlyDayNum,LateDayNum
0,W01,1,R1W1,136,137
1,W01,2,R2W1,138,139
2,W01,3,R3W1,143,144
3,W01,4,R4W1,145,146
4,W01,5,R5WX,152,152


## Data Prep

In [8]:
# Eliminate 2019 season and add round variable
tourney_slots_v2 = tourney_slots.copy()

print (len(tourney_slots_v2))
tourney_slots_v2 = tourney_slots_v2.loc[tourney_slots_v2['Season']!=2019]
print (len(tourney_slots_v2))

@np.vectorize
def add_round(slot):
    if slot[0:1]!='R':
        return 0
    else:
        return int(slot.split('R')[1][0])

tourney_slots_v2['Round'] = add_round(tourney_slots_v2['Slot'])
display(tourney_slots_v2['Round'].value_counts())
tourney_slots_v2.head()

2251
2184


1    1088
2     544
3     272
4     136
5      68
0      42
6      34
Name: Round, dtype: int64

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round
0,1985,R1W1,W01,W16,1
1,1985,R1W2,W02,W15,1
2,1985,R1W3,W03,W14,1
3,1985,R1W4,W04,W13,1
4,1985,R1W5,W05,W12,1


In [9]:
# convert results to season, round, team format
tourney_seed_early_day = tourney_seed_round.groupby('GameRound').agg({'EarlyDayNum':'mean'}).reset_index()
tourney_seed_late_day = tourney_seed_round.groupby('GameRound').agg({'LateDayNum':'mean'}).reset_index()


results_round = pd.merge(results_compact, tourney_seed_early_day, 
                         left_on=['DayNum'], right_on=['EarlyDayNum'], 
                         how='left', 
                         validate='m:1')

results_round = pd.merge(results_round, tourney_seed_late_day, 
                         left_on=['DayNum'], right_on=['LateDayNum'], 
                         how='left', 
                         validate='m:1')

assert len(results_round.loc[
    (pd.isnull(results_round['GameRound_x'])) & 
    (pd.isnull(results_round['GameRound_y']))]
          )==0

valids = results_round.loc[
    (~pd.isnull(results_round['GameRound_x'])) & 
    (~pd.isnull(results_round['GameRound_y']))]
assert len(valids.loc[valids['GameRound_x']!=valids['GameRound_y']])==0

results_round['Round'] = results_round['GameRound_x'].combine_first(results_round['GameRound_y'])

results_w = results_round[['Season', 'Round', 'WTeamID', 'WScore', 'NumOT']].rename(columns={'WTeamID':'TeamID', 'WScore':'Score'})
results_l = results_round[['Season', 'Round', 'LTeamID', 'LScore', 'NumOT']].rename(columns={'LTeamID':'TeamID', 'LScore':'Score'})

results = results_w.append(results_l).reset_index(drop=True)
results.head()

Unnamed: 0,Season,Round,TeamID,Score,NumOT
0,1985,1.0,1116,63,0
1,1985,1.0,1120,59,0
2,1985,1.0,1207,68,0
3,1985,1.0,1229,58,0
4,1985,1.0,1242,49,0


## Simulate Rounds

In [10]:
r0_results = create_round_results(tourney_seeds, 0, teams, tourney_slots_v2, results)
r0_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner
0,2001,Y16,Y16a,Y16b,0,1322,Northwestern LA,1457,Winthrop,71,0,67,0,1322
1,2002,W16,W16a,W16b,0,1108,Alcorn St,1373,Siena,77,0,81,0,1373
2,2003,X16,X16a,X16b,0,1411,TX Southern,1421,UNC Asheville,84,1,92,1,1421
3,2004,Z16,Z16a,Z16b,0,1197,Florida A&M,1250,Lehigh,72,0,57,0,1197
4,2005,Z16,Z16a,Z16b,0,1105,Alabama A&M,1324,Oakland,69,0,79,0,1324


In [11]:
r0_results = r0_results.loc[r0_results['Season']==2018]
r0_results

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner
38,2018,W11,W11a,W11b,0,1382,St Bonaventure,1417,UCLA,65,0,58,0,1382
39,2018,W16,W16a,W16b,0,1254,Long Island,1347,Radford,61,0,71,0,1347
40,2018,X11,X11a,X11b,0,1113,Arizona St,1393,Syracuse,56,0,60,0,1393
41,2018,Z16,Z16a,Z16b,0,1300,NC Central,1411,TX Southern,46,0,64,0,1411


In [12]:
r1_seeds = r0_results[['Season', 'TeamID_Winner', 'Slot']].copy()
r1_seeds.rename(columns={'TeamID_Winner':'TeamID', 'Slot':'Seed'}, inplace=True)

play_ins = list(r1_seeds['Seed'].values)
# play_ins

tourney_seeds_no_play_in = tourney_seeds.copy()
for s in play_ins:
    tourney_seeds_no_play_in = tourney_seeds_no_play_in.loc[tourney_seeds_no_play_in['Seed']!=s+'a']
    tourney_seeds_no_play_in = tourney_seeds_no_play_in.loc[tourney_seeds_no_play_in['Seed']!=s+'b']

r1_seeds = r1_seeds.append(tourney_seeds_no_play_in).reset_index(drop=True)

r1_seeds = r1_seeds.loc[r1_seeds['Season']==2018]

print (r1_seeds.shape)
r1_seeds.head()

(64, 3)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Season,Seed,TeamID
0,2018,W11,1382
1,2018,W16,1347
2,2018,X11,1393
3,2018,Z16,1411
2128,2018,W01,1437


In [13]:
r1_results = simulate_round_results(r1_seeds, 1, teams, tourney_slots_v2)
r1_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R1W1,W01,W16,1,1437,Villanova,1347,Radford,1437,1
1,2018,R1W2,W02,W15,1,1345,Purdue,1168,CS Fullerton,1345,1
2,2018,R1W3,W03,W14,1,1403,Texas Tech,1372,SF Austin,1403,1
3,2018,R1W4,W04,W13,1,1455,Wichita St,1267,Marshall,1455,1
4,2018,R1W5,W05,W12,1,1452,West Virginia,1293,Murray St,1452,1


In [14]:
r2_seeds = r1_results[['Season', 'TeamID_Winner', 'Slot']].copy()
r2_seeds.rename(columns={'TeamID_Winner':'TeamID', 'Slot':'Seed'}, inplace=True)
r2_seeds.head()

Unnamed: 0,Season,TeamID,Seed
0,2018,1437,R1W1
1,2018,1345,R1W2
2,2018,1403,R1W3
3,2018,1455,R1W4
4,2018,1452,R1W5


In [15]:
r2_seeds.shape

(64, 3)

In [16]:
r2_results = simulate_round_results(r2_seeds, 2, teams, tourney_slots_v2)
r2_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R2W1,R1W1,R1W8,2,1437,Villanova,1439,Virginia Tech,1437,1
1,2018,R2W1,R1W1,R1W8,2,1437,Villanova,1104,Alabama,1437,1
2,2018,R2W1,R1W1,R1W8,2,1347,Radford,1439,Virginia Tech,1347,1
3,2018,R2W1,R1W1,R1W8,2,1347,Radford,1104,Alabama,1347,1
4,2018,R2W2,R1W2,R1W7,2,1345,Purdue,1116,Arkansas,1345,1


In [17]:
r2_results.shape

(128, 11)

In [18]:
r3_seeds = r2_results[['Season', 'TeamID_Winner', 'Slot']].copy()
r3_seeds.rename(columns={'TeamID_Winner':'TeamID', 'Slot':'Seed'}, inplace=True)
r3_seeds.head()

Unnamed: 0,Season,TeamID,Seed
0,2018,1437,R2W1
1,2018,1437,R2W1
2,2018,1347,R2W1
3,2018,1347,R2W1
4,2018,1345,R2W2


In [19]:
r3_seeds.shape
r3_seeds.drop_duplicates(inplace=True)
r3_seeds.shape

(64, 3)

In [20]:
r3_results = simulate_round_results(r3_seeds, 3, teams, tourney_slots_v2)
r3_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R3W1,R2W1,R2W4,3,1437,Villanova,1455,Wichita St,1437,1
1,2018,R3W1,R2W1,R2W4,3,1437,Villanova,1267,Marshall,1437,1
2,2018,R3W1,R2W1,R2W4,3,1437,Villanova,1452,West Virginia,1437,1
3,2018,R3W1,R2W1,R2W4,3,1437,Villanova,1293,Murray St,1437,1
4,2018,R3W1,R2W1,R2W4,3,1347,Radford,1455,Wichita St,1347,1


In [21]:
r3_results.shape

(256, 11)

In [22]:
r4_seeds = r3_results[['Season', 'TeamID_Winner', 'Slot']].copy()
r4_seeds.rename(columns={'TeamID_Winner':'TeamID', 'Slot':'Seed'}, inplace=True)
r4_seeds.head()

Unnamed: 0,Season,TeamID,Seed
0,2018,1437,R3W1
1,2018,1437,R3W1
2,2018,1437,R3W1
3,2018,1437,R3W1
4,2018,1347,R3W1


In [23]:
r4_seeds.shape
r4_seeds.drop_duplicates(inplace=True)
r4_seeds.shape

(64, 3)

In [24]:
r4_results = simulate_round_results(r4_seeds, 4, teams, tourney_slots_v2)
r4_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1345,Purdue,1437,1
1,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1168,CS Fullerton,1437,1
2,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1116,Arkansas,1437,1
3,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1139,Butler,1437,1
4,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1403,Texas Tech,1437,1


In [25]:
r4_results.shape

(512, 11)

In [26]:
r5_seeds = r4_results[['Season', 'TeamID_Winner', 'Slot']].copy()
r5_seeds.rename(columns={'TeamID_Winner':'TeamID', 'Slot':'Seed'}, inplace=True)
r5_seeds.head()

Unnamed: 0,Season,TeamID,Seed
0,2018,1437,R4W1
1,2018,1437,R4W1
2,2018,1437,R4W1
3,2018,1437,R4W1
4,2018,1437,R4W1


In [27]:
r5_seeds.shape
r5_seeds.drop_duplicates(inplace=True)
r5_seeds.shape

(64, 3)

In [28]:
r5_results = simulate_round_results(r5_seeds, 5, teams, tourney_slots_v2)
r5_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R5WX,R4W1,R4X1,5,1437,Villanova,1242,Kansas,1437,1
1,2018,R5WX,R4W1,R4X1,5,1437,Villanova,1335,Penn,1437,1
2,2018,R5WX,R4W1,R4X1,5,1437,Villanova,1371,Seton Hall,1437,1
3,2018,R5WX,R4W1,R4X1,5,1437,Villanova,1301,NC State,1437,1
4,2018,R5WX,R4W1,R4X1,5,1437,Villanova,1120,Auburn,1437,1


In [29]:
r6_seeds = r5_results[['Season', 'TeamID_Winner', 'Slot']].copy()
r6_seeds.rename(columns={'TeamID_Winner':'TeamID', 'Slot':'Seed'}, inplace=True)
r6_seeds.head()

Unnamed: 0,Season,TeamID,Seed
0,2018,1437,R5WX
1,2018,1437,R5WX
2,2018,1437,R5WX
3,2018,1437,R5WX
4,2018,1437,R5WX


In [30]:
r6_seeds.shape
r6_seeds.drop_duplicates(inplace=True)
r6_seeds.shape

(64, 3)

In [31]:
r6_results = simulate_round_results(r6_seeds, 6, teams, tourney_slots_v2)
r6_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R6CH,R5WX,R5YZ,6,1437,Villanova,1438,Virginia,1437,1
1,2018,R6CH,R5WX,R5YZ,6,1437,Villanova,1420,UMBC,1437,1
2,2018,R6CH,R5WX,R5YZ,6,1437,Villanova,1166,Creighton,1437,1
3,2018,R6CH,R5WX,R5YZ,6,1437,Villanova,1243,Kansas St,1437,1
4,2018,R6CH,R5WX,R5YZ,6,1437,Villanova,1112,Arizona,1437,1


## Combine results

In [32]:
comb_results = r1_results
for d in [r2_results, r3_results, r4_results, r5_results, r6_results]:
    comb_results = comb_results.append(d)
comb_results = comb_results.reset_index(drop=True)
print (comb_results.shape)
comb_results.head()

(4032, 11)


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win
0,2018,R1W1,W01,W16,1,1437,Villanova,1347,Radford,1437,1
1,2018,R1W2,W02,W15,1,1345,Purdue,1168,CS Fullerton,1345,1
2,2018,R1W3,W03,W14,1,1403,Texas Tech,1372,SF Austin,1403,1
3,2018,R1W4,W04,W13,1,1455,Wichita St,1267,Marshall,1455,1
4,2018,R1W5,W05,W12,1,1452,West Virginia,1293,Murray St,1452,1


## Join with original seed

In [33]:
# join to get original seed for strong and weak teams
comb_results_seeds = pd.merge(comb_results, seeds, 
                              left_on=['Season', 'TeamID_Strong'], 
                              right_on=['Season', 'TeamID'], 
                              validate='m:1')
comb_results_seeds.rename(columns={'Seed_v2':'Orig_Seed_Strong'}, inplace=True)
comb_results_seeds.drop('TeamID', axis=1, inplace=True)

comb_results_seeds = pd.merge(comb_results_seeds, seeds, 
                              left_on=['Season', 'TeamID_Weak'], 
                              right_on=['Season', 'TeamID'], 
                              validate='m:1')
comb_results_seeds.rename(columns={'Seed_v2':'Orig_Seed_Weak'}, inplace=True)
comb_results_seeds.drop('TeamID', axis=1, inplace=True)

comb_results_seeds.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win,Orig_Seed_Strong,Orig_Seed_Weak
0,2018,R1W1,W01,W16,1,1437,Villanova,1347,Radford,1437,1,1,16
1,2018,R1W1,W01,W16,1,1437,Villanova,1347,Radford,1347,0,1,16
2,2018,R2W1,R1W1,R1W8,2,1437,Villanova,1439,Virginia Tech,1437,1,1,8
3,2018,R2W1,R1W1,R1W8,2,1437,Villanova,1439,Virginia Tech,1439,0,1,8
4,2018,R2W1,R1W1,R1W8,2,1347,Radford,1439,Virginia Tech,1347,1,16,8


## Get key info

In [34]:
@np.vectorize
def agg_matchup(w, s):
    min_s = min(w,s)
    max_s = max(w,s)
    diff = max_s - min_s
    return diff

comb_results_seeds['Matchup'] = agg_matchup(comb_results_seeds['Orig_Seed_Weak'], 
                                            comb_results_seeds['Orig_Seed_Strong'])

In [35]:
@np.vectorize
def agg_round(r):
    if r<3:
        return r
    elif r>=3:
        return 3
    
comb_results_seeds['Round_v2'] = agg_round(comb_results_seeds['Round'])
comb_results_seeds.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win,Orig_Seed_Strong,Orig_Seed_Weak,Matchup,Round_v2
0,2018,R1W1,W01,W16,1,1437,Villanova,1347,Radford,1437,1,1,16,15,1
1,2018,R1W1,W01,W16,1,1437,Villanova,1347,Radford,1347,0,1,16,15,1
2,2018,R2W1,R1W1,R1W8,2,1437,Villanova,1439,Virginia Tech,1437,1,1,8,7,2
3,2018,R2W1,R1W1,R1W8,2,1437,Villanova,1439,Virginia Tech,1439,0,1,8,7,2
4,2018,R2W1,R1W1,R1W8,2,1347,Radford,1439,Virginia Tech,1347,1,16,8,8,2


In [36]:
comb_results_seeds['comp'] = 1
comb_results_seeds['comp'] = [1 if s<w else -1 for w,s in zip(comb_results_seeds['Orig_Seed_Weak'], 
                                            comb_results_seeds['Orig_Seed_Strong'])]
comb_results_seeds['comp'] = comb_results_seeds['comp']*[1 if w==1 else -1 for w in comb_results_seeds['Strong_Win']]


In [37]:
comb_results_seeds['Key'] = [r*100 + m for r,m in zip(comb_results_seeds['Round_v2'], 
                                                       comb_results_seeds['Matchup'])]



In [38]:
model_v1 = pd.read_csv('../data/model_v1.csv')
model_v1 = model_v1[['Key', 'Win_Prob']]
model_v1.head()

Unnamed: 0,Key,Win_Prob
0,101,0.5
1,103,0.617647
2,105,0.625
3,107,0.654412
4,109,0.794118


In [39]:
pred = pd.merge(comb_results_seeds, model_v1, on='Key', how='left', validate='m:1')
pred['Win_Prob'] = [p if c==1 else (1-p) for c,p in zip(pred['comp'], pred['Win_Prob'])]

In [40]:
# have lots of new scenarios.  maybe should have differnce in seeds as key instead

In [41]:
print (pred.loc[pd.isnull(pred['Win_Prob'])].shape)
pred.loc[pd.isnull(pred['Win_Prob'])]

(176, 18)


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,TeamID_Winner,Strong_Win,Orig_Seed_Strong,Orig_Seed_Weak,Matchup,Round_v2,comp,Key,Win_Prob
64,2018,R4W1,R3W1,R3W2,4,1347,Radford,1345,Purdue,1347,1,16,2,14,3,-1,314,
65,2018,R4W1,R3W1,R3W2,4,1347,Radford,1345,Purdue,1345,0,16,2,14,3,1,314,
72,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1168,CS Fullerton,1437,1,1,15,14,3,1,314,
73,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1168,CS Fullerton,1168,0,1,15,14,3,-1,314,
144,2018,R4W1,R3W1,R3W2,4,1347,Radford,1403,Texas Tech,1347,1,16,3,13,3,-1,313,
145,2018,R4W1,R3W1,R3W2,4,1347,Radford,1403,Texas Tech,1403,0,16,3,13,3,1,313,
156,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1372,SF Austin,1437,1,1,14,13,3,1,313,
157,2018,R4W1,R3W1,R3W2,4,1437,Villanova,1372,SF Austin,1372,0,1,14,13,3,-1,313,
256,2018,R5WX,R4W1,R4X1,5,1347,Radford,1242,Kansas,1347,1,16,1,15,3,-1,315,
257,2018,R5WX,R4W1,R4X1,5,1347,Radford,1242,Kansas,1242,0,16,1,15,3,1,315,


In [42]:
pred['Win_Prob'] = pred['Win_Prob'].fillna(0.5)

In [43]:
pred.to_csv('../data/opt_input.csv', index=False)