In [1]:
import pandas as pd

In [2]:
# Read the detailed regular-season results, then echo (rows, columns).
season_dat = pd.read_csv(
    'data/DataFiles/RegularSeasonDetailedResults.csv'
)
season_dat.shape

(82041, 34)

### Advanced Metrics

Calculate:     
1) Offensive Efficiency - Possession based   
2) Defensive Efficiency - Possession based   
3) Net Rating - Possession based   
4) Assist Ratios      
5) eFG    

Maybe also look at Dean Oliver's Four Factors formula.

## Regular Season Data 

In [3]:
# pos = possessions, estimated per game from the box score as
#   0.96 * (FGA + TO + 0.44 * FTA - OR)
# Written as whole-column arithmetic instead of DataFrame.apply(axis=1):
# the operations run in the same order, so the values are unchanged, but
# there is no Python-level call per game row.
season_dat['wPos'] = 0.96 * (
    season_dat['WFGA'] + season_dat['WTO'] + 0.44 * season_dat['WFTA'] - season_dat['WOR']
)
season_dat['lPos'] = 0.96 * (
    season_dat['LFGA'] + season_dat['LTO'] + 0.44 * season_dat['LFTA'] - season_dat['LOR']
)

In [4]:
# Offensive efficiency (OffRtg) = 100 x (points / possessions).
# Column arithmetic replaces the row-wise apply(axis=1); results are the same.
season_dat['WOffRtg'] = 100 * (season_dat['WScore'] / season_dat['wPos'])
season_dat['LOffRtg'] = 100 * (season_dat['LScore'] / season_dat['lPos'])
# Defensive efficiency (DefRtg) = 100 x (opponent points / opponent possessions),
# so each side's DefRtg is simply the other side's OffRtg for the same game.
season_dat['WDefRtg'] = season_dat['LOffRtg']
season_dat['LDefRtg'] = season_dat['WOffRtg']
# Net Rating = OffRtg - DefRtg (LNetRtg is therefore the negative of WNetRtg).
season_dat['WNetRtg'] = season_dat['WOffRtg'] - season_dat['WDefRtg']
season_dat['LNetRtg'] = season_dat['LOffRtg'] - season_dat['LDefRtg']

In [5]:
# Denominator shared by the assist and turnover ratios, one per side:
#   FGA + 0.44 * FTA + AST + TO
# Computed once with column arithmetic instead of inside four row-wise applies.
w_plays_used = (
    season_dat['WFGA'] + 0.44 * season_dat['WFTA'] + season_dat['WAst'] + season_dat['WTO']
)
l_plays_used = (
    season_dat['LFGA'] + 0.44 * season_dat['LFTA'] + season_dat['LAst'] + season_dat['LTO']
)

# Assist Ratio: assists per 100 of the plays counted above (0-100 scale).
season_dat['WAstR'] = 100 * season_dat['WAst'] / w_plays_used
season_dat['LAstR'] = 100 * season_dat['LAst'] / l_plays_used
# Turnover Ratio: turnovers per 100 of the plays counted above (0-100 scale).
#   (TO * 100) / (FGA + (FTA * 0.44) + AST + TO)
season_dat['WTOR'] = 100 * season_dat['WTO'] / w_plays_used
season_dat['LTOR'] = 100 * season_dat['LTO'] / l_plays_used

# eFG%: effective field goal percentage, crediting a made three as 1.5 makes.
# Unlike the two ratios above this is stored as a 0-1 fraction, not x100.
season_dat['WeFGP'] = (season_dat['WFGM'] + 0.5 * season_dat['WFGM3']) / season_dat['WFGA']
season_dat['LeFGP'] = (season_dat['LFGM'] + 0.5 * season_dat['LFGM3']) / season_dat['LFGA']

In [6]:
# All metrics in this cell are 0-1 fractions, computed with column arithmetic
# rather than row-wise apply(axis=1); the values are unchanged.

# FTA Rate: free-throw attempts per field-goal attempt.
season_dat['WFTAR'] = season_dat['WFTA'] / season_dat['WFGA']
season_dat['LFTAR'] = season_dat['LFTA'] / season_dat['LFGA']

# OREB%: share of the rebounds at a team's offensive end that it collected
# (own offensive rebounds vs. the opponent's defensive rebounds).
season_dat['WORP'] = season_dat['WOR'] / (season_dat['WOR'] + season_dat['LDR'])
season_dat['LORP'] = season_dat['LOR'] / (season_dat['LOR'] + season_dat['WDR'])
# DREB%: share of the rebounds at a team's defensive end that it collected
# (own defensive rebounds vs. the opponent's offensive rebounds).
season_dat['WDRP'] = season_dat['WDR'] / (season_dat['WDR'] + season_dat['LOR'])
season_dat['LDRP'] = season_dat['LDR'] / (season_dat['LDR'] + season_dat['WOR'])
# REB%: share of all rebounds in the game collected by each team.
game_rebounds = (
    season_dat['WDR'] + season_dat['WOR'] + season_dat['LDR'] + season_dat['LOR']
)
season_dat['WRP'] = (season_dat['WDR'] + season_dat['WOR']) / game_rebounds
season_dat['LRP'] = (season_dat['LDR'] + season_dat['LOR']) / game_rebounds

In [7]:
# Weighted four-factor composite (shooting 40%, turnovers 25%, offensive
# rebounding 20%, free throws 15%), where a higher score is better.
# Two corrections relative to a plain weighted sum of the stored columns:
#   * WTOR / LTOR are stored on a 0-100 scale while eFGP, ORP and FTAR are
#     0-1 fractions, so the turnover ratio is divided by 100 to stop it from
#     dominating the composite.
#   * turnovers hurt an offence, so that term is subtracted instead of added.
season_dat['W4Factor'] = (
    0.40 * season_dat['WeFGP']
    - 0.25 * (season_dat['WTOR'] / 100)
    + 0.20 * season_dat['WORP']
    + 0.15 * season_dat['WFTAR']
)
season_dat['L4Factor'] = (
    0.40 * season_dat['LeFGP']
    - 0.25 * (season_dat['LTOR'] / 100)
    + 0.20 * season_dat['LORP']
    + 0.15 * season_dat['LFTAR']
)

## Ken Pom Ranks 

In [8]:
# Read the Massey ordinal rankings, then echo (rows, columns).
ranks_dat = pd.read_csv(
    'data/MasseyOrdinals.csv'
)
ranks_dat.shape

(3492320, 5)

In [9]:
# Keep only the 'POM' system's rankings from ranking day 133, then discard the
# two filter columns, which are constant once the rows have been selected.
# NOTE(review): day 133 is presumably the final pre-tournament snapshot - confirm.
ken_pom_ranks = (
    ranks_dat
    .loc[lambda d: (d['SystemName'] == 'POM') & (d['RankingDayNum'] == 133)]
    .drop(columns=['RankingDayNum', 'SystemName'])
)

In [10]:
# Persist the filtered rankings; the row index is not written.
ken_pom_ranks.to_csv(
    "data/Final/final_raw_kp_dat.csv",
    index=False,
)

## Tourney Data

In [11]:
# Tournament results (compact form) and the seed assigned to each team.
df_tourney = pd.read_csv(
    'data/DataFiles/NCAATourneyCompactResults.csv'
)
df_seed = pd.read_csv(
    'data/DataFiles/NCAATourneySeeds.csv'
)
# One shape per line: results first, seeds second.
print(df_tourney.shape, df_seed.shape, sep='\n')

(2184, 8)
(2218, 3)


In [12]:
# Strip every ASCII letter from the seed label and keep what remains as an
# integer seed number.
# regex=True is passed explicitly: from pandas 2.0 the default is a literal
# match, which would leave the labels untouched and make the int cast fail.
# astype(str) lets the cell be re-run after the column is already integer.
df_seed['Seed'] = (
    df_seed['Seed']
    .astype(str)
    .str.replace(r'[a-zA-Z]', '', regex=True)
    .astype('int64')
)

In [13]:
# Persist the numeric seeds; the row index is not written.
df_seed.to_csv(
    'data/Final/final_raw_seeds.csv',
    index=False,
)

In [14]:
# Attach the winner's seed, then the loser's seed, to every tournament game.
# Each join also brings along df_seed's TeamID column; after the second join
# pandas' default suffixes turn the pair into TeamID_x / TeamID_y.
df_seedv2 = (
    df_tourney
    .merge(df_seed, how='inner',
           left_on=['Season', 'WTeamID'],
           right_on=['Season', 'TeamID'])
    .rename(columns={"Seed": "W_SEED"})
    .merge(df_seed, how='inner',
           left_on=['Season', 'LTeamID'],
           right_on=['Season', 'TeamID'])
    .rename(columns={"Seed": "L_SEED"})
)

In [15]:
# Attach the winner's rank (WKP) and the loser's rank (LKP) to every game.
# The rank table's TeamID / OrdinalRank columns are renamed *before* each join
# so the join key matches the game column directly and no extra TeamID column
# is carried over. Joining on TeamID as-is would add a second pair of
# TeamID_x / TeamID_y labels on top of the ones already present from the seed
# merges; recent pandas versions reject such duplicate labels with a MergeError.
df_seedv2 = (
    df_seedv2
    .merge(
        ken_pom_ranks.rename(columns={'TeamID': 'WTeamID', 'OrdinalRank': 'WKP'}),
        how='inner',
        on=['Season', 'WTeamID'],
    )
    .merge(
        ken_pom_ranks.rename(columns={'TeamID': 'LTeamID', 'OrdinalRank': 'LKP'}),
        how='inner',
        on=['Season', 'LTeamID'],
    )
)

In [16]:
# Winner-minus-loser differences in seed and in rank.
df_seedv2['Seed_Diff'] = df_seedv2['W_SEED'].sub(df_seedv2['L_SEED'])
df_seedv2['KP_Diff'] = df_seedv2['WKP'].sub(df_seedv2['LKP'])

In [17]:
# Remove the helper TeamID columns left behind by the merges.
df_seedv2 = df_seedv2.drop(['TeamID_x', 'TeamID_y'], axis=1)

In [18]:
# Keep only games from seasons before 2014.
# NOTE(review): presumably 2014 onwards is held out for evaluation - confirm.
df_seedv2 = df_seedv2.loc[df_seedv2['Season'].lt(2014)]

In [19]:
# Build a symmetric training set: every game appears twice, once from the
# winner's point of view (result = 1) and once from the loser's (result = 0).
# Seed_Diff and KP_Diff are winner-minus-loser values, so the loser's view is
# the negation of BOTH features. Leaving KP_Diff un-negated would give the
# loss rows the winner's rank difference with the opposite label.
feature_cols = ['Seed_Diff', 'KP_Diff']

df_wins = df_seedv2[feature_cols].assign(result=1)
df_losses = (-df_seedv2[feature_cols]).assign(result=0)

# Wins first, then losses; the original row index is kept on both halves.
df_seed_preds = pd.concat((df_wins, df_losses))

In [20]:
# Persist the training table; the row index is not written.
df_seed_preds.to_csv(
    'data/Final/data_kp_seed.csv',
    index=False,
)