# Data start to finish

The purpose of this notebook is to go from start to finish with data loading, and have all steps collected in one notebook.

In [794]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt

from utils.data_loader import Dataset
from utils.data_cleaning import optimize, uniform_name

## Preprocessing data

This link contains every mlb game ever played with their elo scores as well as some other information we will find useful. This will play the role of a starter dataframe to then add the rest of our data to.

In [795]:
mlb_elo = pd.read_csv('https://projects.fivethirtyeight.com/mlb-api/mlb_elo.csv')

We only want dates from 2000-2019 (Not 2020 because of the COVID shortened season).

In [796]:
mlb_elo = mlb_elo[(mlb_elo['date'] >= '2001-03-29') & (mlb_elo['date'] <= '2019-9-29')]

We are going to drop any row that was a playoff row just giving us regular season games as well.

In [797]:
mlb_elo = mlb_elo[(mlb_elo['playoff'] != 'w') &
                  (mlb_elo['playoff'] != 'l') &
                 (mlb_elo['playoff'] != 'd') &
                 (mlb_elo['playoff'] != 'c')]

In [798]:
mlb_elo['home_win'] = np.where(mlb_elo['score1'] > mlb_elo['score2'], 1, 0)

Now, we can rename the columns to be more user friendly and start loading in the rest of our data.

In [799]:
mlb_elo = mlb_elo[['date', 'team1', 'team2', 'home_win', 'elo1_pre', 'elo2_pre', 'pitcher1', 'pitcher2']]

In [800]:
mlb_elo = mlb_elo.rename(columns={'date':'date', 
                'team1': 'home_team',
                'team2': 'away_team', 
                'elo1_pre': 'home_elo', 
                'elo2_pre': 'away_elo',
                'pitcher1': 'home_pitcher',
                'pitcher2': 'away_pitcher'})

In [801]:
mlb_elo.head()

Unnamed: 0,date,home_team,away_team,home_win,home_elo,away_elo,home_pitcher,away_pitcher
988,2019-09-29,STL,CHC,1,1545.306947,1531.092895,Jack Flaherty,Derek Holland
989,2019-09-29,KCR,MIN,1,1429.995452,1549.426829,Jorge Lopez,Martin Perez
990,2019-09-29,SEA,OAK,1,1467.7325,1577.14715,Justin Dunn,Tanner Roark
991,2019-09-29,NYM,ATL,1,1528.343228,1548.88195,Noah Syndergaard,Mike Soroka
992,2019-09-29,COL,MIL,1,1469.463417,1534.015807,Jeff Hoffman,Adrian Houser


# Loading hitter data

In [802]:
avg_df = pd.read_csv('../data/COVARIATE_AVG.csv', index_col = 'date')
obp_df = pd.read_csv('../data/COVARIATE_OBP.csv', index_col = 'date')
slg_df = pd.read_csv('../data/COVARIATE_SLG.csv', index_col = 'date')

In [803]:
avg_df.index = pd.to_datetime(avg_df.index, format='%Y%m%d')
obp_df.index = pd.to_datetime(obp_df.index, format='%Y%m%d')
slg_df.index = pd.to_datetime(slg_df.index, format='%Y%m%d')

In [804]:
avg_df.head()

Unnamed: 0_level_0,ANA,ARI,ATL,BAL,BOS,CHA,CHN,CIN,CLE,COL,...,PHI,PIT,SDN,SEA,SFN,SLN,TBA,TEX,TOR,WAS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-04-01,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
2001-04-02,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
2001-04-03,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
2001-04-04,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
2001-04-05,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347


Below, we are changing the team name abbreviations to match each other for all of the dataframes.

In [805]:
mlb_elo['home_team'] = mlb_elo['home_team'].apply(uniform_name)
mlb_elo['away_team'] = mlb_elo['away_team'].apply(uniform_name)

In [806]:
assert set(mlb_elo['home_team'].unique()) ==  set(mlb_elo['away_team'].unique())

We are going to make these columns below but first we want to set them to zero just in case a null value occurs.

In [807]:
mlb_elo['home_avg'] = 0
mlb_elo['away_avg'] = 0
mlb_elo['home_obp'] = 0
mlb_elo['away_obp'] = 0
mlb_elo['home_slg'] = 0
mlb_elo['away_slg'] = 0

In [808]:
mlb_elo['date'] = pd.to_datetime(mlb_elo['date'])

The function below will go through the avg_df, obp_df, and slg_df and select the row based on the date and the column based on the specific team and then match that with its corresponding avg, obp, or slg respectively.

In [809]:
def get_covariates(x):
    x['home_avg'] = avg_df.loc[x['date']][x['home_team']]
    x['away_avg'] = avg_df.loc[x['date']][x['away_team']]
    x['home_obp'] = obp_df.loc[x['date']][x['home_team']]
    x['away_obp'] = obp_df.loc[x['date']][x['away_team']]
    x['home_slg'] = slg_df.loc[x['date']][x['home_team']]
    x['away_slg'] = slg_df.loc[x['date']][x['away_team']]
    return x

mlb_elo = mlb_elo.apply(get_covariates, axis = 1)

In [810]:
mlb_elo.head()

Unnamed: 0,date,home_team,away_team,home_win,home_elo,away_elo,home_pitcher,away_pitcher,home_avg,away_avg,home_obp,away_obp,home_slg,away_slg
988,2019-09-29,SLN,CHN,1,1545.306947,1531.092895,Jack Flaherty,Derek Holland,0.222222,0.243802,0.321875,0.31203,0.415771,0.392562
989,2019-09-29,KCA,MIN,1,1429.995452,1549.426829,Jorge Lopez,Martin Perez,0.289256,0.279352,0.350943,0.340741,0.475207,0.510121
990,2019-09-29,SEA,OAK,1,1467.7325,1577.14715,Justin Dunn,Tanner Roark,0.174312,0.217021,0.227468,0.29771,0.243119,0.357447
991,2019-09-29,NYN,ATL,1,1528.343228,1548.88195,Noah Syndergaard,Mike Soroka,0.256098,0.237903,0.336957,0.325,0.443089,0.395161
992,2019-09-29,COL,MIL,1,1469.463417,1534.015807,Jeff Hoffman,Adrian Houser,0.230496,0.233202,0.306709,0.331034,0.421986,0.43083


In [811]:
mlb_elo['Y'] = mlb_elo['date'].dt.year
mlb_elo['M'] = mlb_elo['date'].dt.month
mlb_elo['D'] = mlb_elo['date'].dt.day

In [812]:
mlb = mlb_elo[['date', 'Y', 'M', 'D',
               'home_team', 'away_team', 'home_win',
               'home_pitcher', 'away_pitcher',
               'home_elo', 'away_elo',
               'home_avg', 'away_avg',
               'home_obp', 'away_obp',
               'home_slg', 'away_slg']]

In [813]:
mlb = mlb.sort_values(by='date')
mlb = mlb.reset_index(drop=True)

In [814]:
mlb.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,away_elo,home_avg,away_avg,home_obp,away_obp,home_slg,away_slg
0,2001-04-01,2001,4,1,TOR,TEX,1,loaie001,hellr001,1499.563,1479.163,0.273459,0.281519,0.339283,0.349386,0.466387,0.443116
1,2001-04-02,2001,4,2,SFN,SDN,1,hernl003,willw001,1540.841,1492.8,0.276079,0.251828,0.35938,0.327088,0.468525,0.398252
2,2001-04-02,2001,4,2,SEA,OAK,1,Freddy Garcia,Tim Hudson,1519.464,1534.696,0.26728,0.268144,0.358599,0.357409,0.438008,0.454237
3,2001-04-02,2001,4,2,NYA,KCA,1,clemr001,suppj001,1529.511,1493.152,0.27508,0.285268,0.351633,0.344703,0.446269,0.421482
4,2001-04-02,2001,4,2,LAN,MIL,1,parkc002,Jamey Wright,1515.925,1490.396,0.254198,0.243537,0.336984,0.322708,0.426431,0.400071


# Making Features and Difference Columns

An important feature is ISO which is just slg minus avg

In [815]:
mlb['home_iso'] = mlb['home_slg'] - mlb['home_avg']
mlb['away_iso'] = mlb['away_slg'] - mlb['away_avg']

We can now make a few difference columns as well as percent difference. I am not sure if these will be useful but it will give the model a different way to look at the data.

In [816]:
mlb['elo_diff'] = mlb['home_elo'] - mlb['away_elo']
mlb['elo_pct_diff'] = (mlb['elo_diff'])/(mlb['home_elo'])*100

In [817]:
# > 0 = home is better
mlb['avg_diff'] = (mlb['home_avg'] - mlb['away_avg'])
mlb['obp_diff'] = (mlb['home_obp'] - mlb['away_obp'])
mlb['slg_diff'] = (mlb['home_slg'] - mlb['away_slg'])

# scaling doesn't matter for many cases
mlb['team_avg_pct_diff'] = (mlb['avg_diff'])/(mlb['home_avg'])*100
mlb['team_obp_pct_diff'] = (mlb['obp_diff'])/(mlb['home_obp'])*100
mlb['team_slg_pct_diff'] = (mlb['slg_diff'])/(mlb['home_slg'])*100

In [818]:
mlb.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,home_iso,away_iso,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,team_avg_pct_diff,team_obp_pct_diff,team_slg_pct_diff
0,2001-04-01,2001,4,1,TOR,TEX,1,loaie001,hellr001,1499.563,...,0.192927,0.161597,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568
1,2001-04-02,2001,4,2,SFN,SDN,1,hernl003,willw001,1540.841,...,0.192446,0.146424,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766
2,2001-04-02,2001,4,2,SEA,OAK,1,Freddy Garcia,Tim Hudson,1519.464,...,0.170727,0.186092,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521
3,2001-04-02,2001,4,2,NYA,KCA,1,clemr001,suppj001,1529.511,...,0.171189,0.136214,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343
4,2001-04-02,2001,4,2,LAN,MIL,1,parkc002,Jamey Wright,1515.925,...,0.172233,0.156534,25.529,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414


Figure out how many days of rest each team had in each game.

In [819]:
mlb['home_rest'] = 0
mlb['away_rest'] = 0

In [820]:
last_game = dict(zip(mlb['home_team'].unique(), [-1]*30))

In [821]:
def home_rest(i):
    home_index = mlb.loc[i]['home_team']

    try:
        days_rest = (mlb.loc[i, 'date'] - last_game[home_index]).days
        last_game[home_index] = mlb.loc[i, 'date']
        return days_rest
    except:
        last_game[home_index] = mlb.loc[i, 'date']
        return 5 # >= 5

def away_rest(i):
    away_index = mlb.loc[i]['away_team']

    try:
        days_rest = (mlb.loc[i, 'date'] - last_game[away_index]).days
        last_game[away_index] = mlb.loc[i, 'date']
        return days_rest
    except:
        last_game[away_index] = mlb.loc[i, 'date']
        return 5 # >= 5

In [822]:
for i in range(len(mlb)):
    mlb.loc[i, 'home_rest'] = home_rest(i)
    mlb.loc[i, 'away_rest'] = away_rest(i)

In [823]:
mlb.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,team_avg_pct_diff,team_obp_pct_diff,team_slg_pct_diff,home_rest,away_rest
0,2001-04-01,2001,4,1,TOR,TEX,1,loaie001,hellr001,1499.563,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5,5
1,2001-04-02,2001,4,2,SFN,SDN,1,hernl003,willw001,1540.841,...,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766,5,5
2,2001-04-02,2001,4,2,SEA,OAK,1,Freddy Garcia,Tim Hudson,1519.464,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5,5
3,2001-04-02,2001,4,2,NYA,KCA,1,clemr001,suppj001,1529.511,...,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5,5
4,2001-04-02,2001,4,2,LAN,MIL,1,parkc002,Jamey Wright,1515.925,...,25.529,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414,5,5


## Save results

In [824]:
mlb.to_csv('../data/mlb_games_df.csv', index=False)

## Fix pitcher names

Pitchers have a mixture of names and ids. The code below takes the `pitchers_summary.csv` file (which is created in `data_prep/pitcher_summary.ipynb`) which contains the foreign keys and names of all pitchers, and uses is to replace all pitcher names with their IDs.

In [825]:
pitchers_df = pd.read_csv('../data/pitchers_summary.csv')

Clean up pitcher names (remove spaces, periods, hyphens) to make joining easier (spellings are not identical across the two datasets).

In [826]:
mlb['home_pitcher_cleaned'] = mlb['home_pitcher'].str.replace(r'[\s\.\-]+', '').str.lower()
mlb['away_pitcher_cleaned'] = mlb['away_pitcher'].str.replace(r'[\s\.\-]+', '').str.lower()

pitchers_df['pitcher_cleaned'] = pitchers_df['Name'].str.replace(r'[\s\.\-]+', '').str.lower()

Join by retrokey for all that we can. We'll start with just the home pitchers.

In [827]:
def fill_missing_pitchers(left, right, left_on, right_on):
    # The only columns we need from the pitchers
    right_keys = right[['key_retro', 'key_bbref', 'pitcher_cleaned']]
    # Doing this whole "reset_index"..."set_index" preserves the indices (otherwise they get reset)
    left = left.reset_index()
    left = left.merge(right_keys, how='left', left_on=left_on, right_on=right_on)
    left = left.set_index('index')
    left = left.drop(['key_retro', 'pitcher_cleaned'], axis='columns')
    return left

In [828]:
mlb = fill_missing_pitchers(mlb, pitchers_df, 'home_pitcher', 'key_retro')

In [829]:
def add_key_bbref(df, prefix):
    # Rename the last column to include home/away
    cols = list(df.columns)
    cols[-1] = f'{prefix}_key_bbref'
    df.columns = cols
    return df

In [830]:
mlb = add_key_bbref(mlb, 'home')

Now we'll grab just those who didn't join on retrokey and try joining by cleaned name.

In [831]:
games_missing_df = mlb[mlb['home_key_bbref'].isna()]
games_missing_df = games_missing_df.drop('home_key_bbref', axis='columns')

games_missing_df = fill_missing_pitchers(games_missing_df, pitchers_df, 'home_pitcher_cleaned', 'pitcher_cleaned')
games_missing_df = add_key_bbref(games_missing_df, 'home')

In [832]:
mlb.update(games_missing_df)

Still 188 games with missing pitchers (far less actual pitchers, as they pitched multiple games)

In [833]:
mlb['home_key_bbref'].isna().sum()

188

Repeat for the away team pitcher.

In [834]:
mlb = fill_missing_pitchers(mlb, pitchers_df, 'away_pitcher', 'key_retro')
mlb = add_key_bbref(mlb, 'away')

pitchers_df = pitchers_df.drop_duplicates('pitcher_cleaned', keep=False)

games_missing_df = mlb[mlb['away_key_bbref'].isna()]
games_missing_df = games_missing_df.drop('away_key_bbref', axis='columns')
games_missing_df = fill_missing_pitchers(games_missing_df, pitchers_df, 'away_pitcher_cleaned', 'pitcher_cleaned')
games_missing_df = add_key_bbref(games_missing_df, 'away')

mlb.update(games_missing_df)

170 games with missing away pitchers

In [835]:
mlb['away_key_bbref'].isna().sum()

170

The following pitchers are still missing. We'll deal with these cases manually.

In [836]:
mlb[mlb['home_key_bbref'].isna()]['home_pitcher_cleaned'].unique()

array(['faustocarmona', 'zachbritton', 'natekarns', 'robbieross',
       'joshsmith', 'markleiterjr', 'joepalumbo'], dtype=object)

In [837]:
mlb[mlb['away_key_bbref'].isna()]['away_pitcher_cleaned'].unique()

array(['faustocarmona', 'zachbritton', 'natekarns', 'robbieross',
       'joshsmith', 'markleiterjr', 'mattfesta'], dtype=object)

In [838]:
mlb.head()

Unnamed: 0_level_0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,slg_diff,team_avg_pct_diff,team_obp_pct_diff,team_slg_pct_diff,home_rest,away_rest,home_pitcher_cleaned,away_pitcher_cleaned,home_key_bbref,away_key_bbref
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2001-04-01,2001.0,4.0,1.0,TOR,TEX,1.0,loaie001,hellr001,1499.563,...,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0,loaie001,hellr001,loaizes01,helliri01
1,2001-04-02,2001.0,4.0,2.0,SFN,SDN,1.0,hernl003,willw001,1540.841,...,0.070273,8.784099,8.985458,14.998766,5.0,5.0,hernl003,willw001,hernali01,williwo02
2,2001-04-02,2001.0,4.0,2.0,SEA,OAK,1.0,Freddy Garcia,Tim Hudson,1519.464,...,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0,freddygarcia,timhudson,garcifr03,hudsoti01
3,2001-04-02,2001.0,4.0,2.0,NYA,KCA,1.0,clemr001,suppj001,1529.511,...,0.024787,-3.703559,1.970596,5.554343,5.0,5.0,clemr001,suppj001,clemero02,suppaje01
4,2001-04-02,2001.0,4.0,2.0,LAN,MIL,1.0,parkc002,Jamey Wright,1515.925,...,0.026359,4.193722,4.236467,6.181414,5.0,5.0,parkc002,jameywright,parkch01,wrighja01


Just went on BR and found these people. Some are funky spellings, some are where multiple pitchers have the same name.

In [839]:
pitcher_map = {'faustocarmona': 'carmofa01',
              'zachbritton': 'brittza01',
              'natekarns': 'karnsna01',
              'robbieross': 'rossro01',
              'joshsmith': 'smithjo07',
              'markleiterjr': 'leitema02',
              'joepalumbo': 'palumjo01',
              'mattfesta': 'festama01'}

Replace missing pitchers. Due to how Pandas handles `.replace()`, we would need an exhaustive dictionary with all pitchers. The code below replaces with those values in the dictionary above, and fills the rest with `None`. That's why we then `.fillna` with the remaining data.

In [840]:
mlb['home_pitcher'] = mlb['home_pitcher_cleaned'].map(pitcher_map).fillna(mlb['home_key_bbref'])
mlb['away_pitcher'] = mlb['away_pitcher_cleaned'].map(pitcher_map).fillna(mlb['away_key_bbref'])

In [841]:
assert mlb['home_pitcher'].isna().sum() == mlb['away_pitcher'].isna().sum() == 0, 'There are still some pitchers with missing data!'
assert 'freddygarcia' not in mlb['home_pitcher'].unique(), 'There are pitcher names which are invalid keys!'

Rename the columns

In [842]:
mlb['home_pitcher'] = mlb['home_key_bbref']
mlb['away_pitcher'] = mlb['away_key_bbref']

Drop unnecessary columns

In [843]:
mlb = mlb.drop(['home_pitcher_cleaned', 'away_pitcher_cleaned', 'home_key_bbref', 'away_key_bbref'], axis='columns')

## Save the results

In [844]:
mlb.to_csv('../data/mlb_games_df.csv', index=False)

## Load using data loader

Now use the data loader (which uses `mlb_games_df.csv`) to add in other stats.

In [845]:
ds = Dataset('all')
ds.load_games(start_date='2000-01-01', end_date='2020-01-01');

In [846]:
ds.data.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,team_avg_pct_diff,team_obp_pct_diff,team_slg_pct_diff,home_rest,away_rest
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.562988,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,2001-04-02,2001,4.0,2.0,SFN,SDN,1.0,hernali01,williwo02,1540.840942,...,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766,5.0,5.0
2,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.463989,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
3,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.510986,...,36.359001,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
4,2001-04-02,2001,4.0,2.0,LAN,MIL,1.0,parkch01,wrighja01,1515.925049,...,25.528999,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414,5.0,5.0


## Add team stats, team pitching stats, and pitcher stats

Add the team columns shown below for the previous season.

In [847]:
team_stat_cols = ['W-L-pct', 'Avg_Attendance', 'Rank', 'R', 'RA', 'FP']
ds.add_team_stats(cols=team_stat_cols, year_offset=1);

Add the team pitching columns shown below for the previous season.

In [848]:
team_pitching_stat_cols = ['WHIP', 'ERA']
ds.add_team_pitching_stats(cols=team_pitching_stat_cols, year_offset=1);

In [849]:
pitcher_stat_cols = ['WHIP', 'ERA', 'IP', 'WPA']
ds.add_pitcher_stats(cols=pitcher_stat_cols, game_offset=162);

## Adding total runs and runs allowed

All of the code below just duplicates Morgan's work. However, the assert statement below shows that the calculations give exactly the same results he got in `data/data_bayes.csv`. So commenting out this code now to save time.

scores_df = pd.read_csv('https://projects.fivethirtyeight.com/mlb-api/mlb_elo.csv')

scores_df = scores_df[(scores_df['date'] >= '2000-02-29') & (scores_df['date'] <= '2019-9-29')]
scores_df = scores_df.sort_values(by='date')

scores_df = scores_df[(scores_df['playoff'] != 'w') &
                  (scores_df['playoff'] != 'l') &
                 (scores_df['playoff'] != 'd') &
                 (scores_df['playoff'] != 'c')]

scores_df['date'] = pd.to_datetime(scores_df['date'])

scores_df['Y'] = scores_df['date'].dt.year
scores_df['M'] = scores_df['date'].dt.month
scores_df['D'] = scores_df['date'].dt.day

scores_df = scores_df[['date', 'season', 'team1', 'team2', 'score1', 'score2']]

scores_df = scores_df.rename({'season':'Y',
                'team1': 'home_team',
                'team2': 'away_team',
                'score1': 'home_score',
                'score2': 'away_score'},
              axis=1)

scores_df = scores_df.reset_index(drop=True)

scores_df['home_total_R'] = 0
scores_df['home_total_RA'] = 0
scores_df['away_total_R'] = 0
scores_df['away_total_RA'] = 0

# Used to suppress warnings
pd.set_option('mode.chained_assignment', None)

for i in range(scores_df.shape[0]):
    # Go team-by-team
    for team in scores_df['home_team'].unique():
        # Find games (rows) where that team played (either home or away)
        if (scores_df.iloc[i]['home_team'] == team) | (scores_df.iloc[i]['away_team'] == team):
            # Find all games prior to the current game (and in the current season) where they played. 
            # Separate it by where they were home and away team.
            home_team_df = scores_df[scores_df['home_team'] == team]
            home_team_df = home_team_df[home_team_df['Y'] == (scores_df.iloc[i]['Y'])]
            home_team_df = home_team_df[home_team_df['date'] < scores_df.iloc[i]['date']]
            
            away_team_df = scores_df[scores_df['away_team'] == team]
            away_team_df = away_team_df[away_team_df['Y'] == (scores_df.iloc[i]['Y'])]
            away_team_df = away_team_df[away_team_df['date'] < scores_df.iloc[i]['date']]
            
            if scores_df.iloc[i]['home_team'] == team:
                scores_df['home_total_R'][i] = home_team_df['home_score'].sum() + away_team_df['away_score'].sum()
                scores_df['home_total_RA'][i] = home_team_df['away_score'].sum() + away_team_df['home_score'].sum()
            if scores_df.iloc[i]['away_team'] == team:
                scores_df['away_total_R'][i] = away_team_df['away_score'].sum() + home_team_df['home_score'].sum()
                scores_df['away_total_RA'][i] = away_team_df['home_score'].sum() + home_team_df['away_score'].sum()

scores_df.head()

scores_df['date'] = pd.to_datetime(scores_df['date'])

scores_df = scores_df[scores_df['date'] > '2001-01-01']

In [850]:
bayes_df = pd.read_csv('../data/data_bayes.csv')

In [851]:
bayes_df['date'] = pd.to_datetime(bayes_df['date'])

In [852]:
bayes_df['home_team'] = bayes_df['home_team'].apply(uniform_name)
bayes_df['away_team'] = bayes_df['away_team'].apply(uniform_name)

scores_df['home_team'] = scores_df['home_team'].apply(uniform_name)
scores_df['away_team'] = scores_df['away_team'].apply(uniform_name)

bayes_df.head()

bayes_df.columns

In [853]:
cols = ['date', 'home_team', 'away_team', 'home_total_R', 'home_total_RA',
       'away_total_R', 'away_total_RA']

We can see that the data in `data/data_bayes.csv` matches exactly what we just calculated for R, RA, etc. So we'll just use that data for simplicity.

assert np.all(scores_df[cols].values == bayes_df[cols].values)

Merge back with the dataset

In [854]:
bayes_df = bayes_df.drop(['Y', 'home_win', 'home_score', 'away_score', 'M', 'D'], axis='columns')
ds.data = ds.data.merge(bayes_df, on=['date', 'home_team', 'away_team'])

## Adding custom percent difference columns

Add in Morgan's percent difference columns.

In [855]:
def make_pct_diff_col(df, home_col, away_col):
    return (df[home_col] - df[away_col]) / df[home_col]

In [856]:
ds.data.columns

Index(['date', 'Y', 'M', 'D', 'home_team', 'away_team', 'home_win',
       'home_pitcher', 'away_pitcher', 'home_elo', 'away_elo', 'home_avg',
       'away_avg', 'home_obp', 'away_obp', 'home_slg', 'away_slg', 'home_iso',
       'away_iso', 'elo_diff', 'elo_pct_diff', 'avg_diff', 'obp_diff',
       'slg_diff', 'team_avg_pct_diff', 'team_obp_pct_diff',
       'team_slg_pct_diff', 'home_rest', 'away_rest',
       'home_W-L-pct_offset1year', 'home_Avg_Attendance_offset1year',
       'home_Rank_offset1year', 'home_R_offset1year', 'home_RA_offset1year',
       'home_FP_offset1year', 'away_W-L-pct_offset1year',
       'away_Avg_Attendance_offset1year', 'away_Rank_offset1year',
       'away_R_offset1year', 'away_RA_offset1year', 'away_FP_offset1year',
       'home_WHIP_offset1year', 'home_ERA_offset1year',
       'away_WHIP_offset1year', 'away_ERA_offset1year',
       'home_pitcher_season_game', 'home_pitcher_WHIP_avg_162games',
       'home_pitcher_ERA_avg_162games', 'home_pitcher_IP_avg_162

In [867]:
ds.data['home_ops'] = ds.data['home_slg'] + ds.data['home_obp']
ds.data['away_ops'] = ds.data['away_slg'] + ds.data['away_obp']

ds.data['home_RD'] = ds.data['home_R_offset1year'] - ds.data['home_RA_offset1year']
ds.data['away_RD'] = ds.data['away_R_offset1year'] - ds.data['away_RA_offset1year']

ds.data['pitcher_WHIP_pct_diff'] = make_pct_diff_col(ds.data, 'home_pitcher_WHIP_avg_162games', 'away_pitcher_WHIP_avg_162games')
ds.data['pitcher_ERA_pct_diff'] = make_pct_diff_col(ds.data, 'home_pitcher_ERA_avg_162games', 'away_pitcher_ERA_avg_162games')
ds.data['pitcher_IP_pct_diff'] = make_pct_diff_col(ds.data, 'home_pitcher_IP_avg_162games', 'away_pitcher_IP_avg_162games')

ds.data['team_W-L_pct_diff'] = make_pct_diff_col(ds.data, 'home_W-L-pct_offset1year', 'away_W-L-pct_offset1year')
ds.data['team_ops_pct_diff'] = make_pct_diff_col(ds.data, 'home_ops', 'away_ops')
ds.data['team_obp_pct_diff'] = make_pct_diff_col(ds.data, 'home_obp', 'away_obp')
ds.data['team_avg_pct_diff'] = make_pct_diff_col(ds.data, 'home_avg', 'away_avg')
ds.data['team_RD_pct_diff'] = make_pct_diff_col(ds.data, 'home_RD', 'away_RD')
# ds.data['team_WPA_pct_diff'] = make_pct_diff_col(ds.data, 'home_WPA', 'away_WPA')
ds.data['team_WHIP_pct_diff'] = make_pct_diff_col(ds.data, 'home_WHIP_offset1year', 'away_WHIP_offset1year')
ds.data['team_FP_pct_diff'] = make_pct_diff_col(ds.data, 'home_FP_offset1year', 'away_FP_offset1year')
ds.data['team_Rank_pct_diff'] = make_pct_diff_col(ds.data, 'home_Rank_offset1year', 'away_Rank_offset1year')
ds.data['team_ERA_pct_diff'] = make_pct_diff_col(ds.data, 'home_ERA_offset1year', 'away_ERA_offset1year')
ds.data['team_RA_pct_diff'] = make_pct_diff_col(ds.data, 'home_RA_offset1year', 'away_RA_offset1year')
ds.data['team_R_pct_diff'] = make_pct_diff_col(ds.data, 'home_R_offset1year', 'away_R_offset1year')
ds.data['team_bayes_pct_diff'] = make_pct_diff_col(ds.data, 'home_bayes_win', 'away_bayes_win')
ds.data['team_pytha_pct_diff'] = make_pct_diff_col(ds.data, 'home_pythag_expect', 'away_pythag_expect')

In [881]:
ds.data.columns

Index(['date', 'Y', 'M', 'D', 'home_team', 'away_team', 'home_win',
       'home_pitcher', 'away_pitcher', 'home_elo', 'away_elo', 'home_avg',
       'away_avg', 'home_obp', 'away_obp', 'home_slg', 'away_slg', 'home_iso',
       'away_iso', 'elo_diff', 'elo_pct_diff', 'avg_diff', 'obp_diff',
       'slg_diff', 'team_avg_pct_diff', 'team_obp_pct_diff',
       'team_slg_pct_diff', 'home_rest', 'away_rest',
       'home_W-L-pct_offset1year', 'home_Avg_Attendance_offset1year',
       'home_Rank_offset1year', 'home_R_offset1year', 'home_RA_offset1year',
       'home_FP_offset1year', 'away_W-L-pct_offset1year',
       'away_Avg_Attendance_offset1year', 'away_Rank_offset1year',
       'away_R_offset1year', 'away_RA_offset1year', 'away_FP_offset1year',
       'home_WHIP_offset1year', 'home_ERA_offset1year',
       'away_WHIP_offset1year', 'away_ERA_offset1year',
       'home_pitcher_season_game', 'home_pitcher_WHIP_avg_162games',
       'home_pitcher_ERA_avg_162games', 'home_pitcher_IP_avg_162

In [872]:
numerator = (ds.data['home_win_pct'] - (ds.data['home_win_pct']*ds.data['away_win_pct']))
denominator = (ds.data['home_win_pct'] + ds.data['away_win_pct'] - (2*ds.data['home_win_pct']*ds.data['away_win_pct']))
ds.data['log_5'] = numerator / denominator

In [873]:
final_df = ds.data.drop(['date', 'D', 'home_pitcher', 'away_pitcher'], axis='columns')

In [887]:
final_df.to_csv('../data/start_to_finish.csv', index=False)

In [862]:
old_df = pd.read_csv('../data/Final Data/pct-diff-mlb-games.csv')

In [868]:
old_df.columns

Index(['home_win', 'home_team', 'away_team', 'Y', 'M', 'avg_pct_diff',
       'obp_pct_diff', 'slg_pct_diff', 'team_ERA_pct_diff',
       'team_WHIP_pct_diff', 'team_W-L_pct_diff', 'team_Rank_pct_diff',
       'team_FP_pct_diff', 'R_pct_diff', 'RA_pct_diff', 'pytha_pct_diff',
       'win_pct_diff', 'bayes_pct_diff', 'pitcher_WHIP_pct_diff',
       'pitcher_ERA_pct_diff', 'pitcher_IP_pct_diff', 'ops_pct_diff',
       'RD_pct_diff', 'FP_pct_diff', 'Rank_pct_diff', 'WPA_pct_diff', 'log_5'],
      dtype='object')

In [874]:
final_df.columns

Index(['Y', 'M', 'home_team', 'away_team', 'home_win', 'home_elo', 'away_elo',
       'home_avg', 'away_avg', 'home_obp', 'away_obp', 'home_slg', 'away_slg',
       'home_iso', 'away_iso', 'elo_diff', 'elo_pct_diff', 'avg_diff',
       'obp_diff', 'slg_diff', 'team_avg_pct_diff', 'team_obp_pct_diff',
       'team_slg_pct_diff', 'home_rest', 'away_rest',
       'home_W-L-pct_offset1year', 'home_Avg_Attendance_offset1year',
       'home_Rank_offset1year', 'home_R_offset1year', 'home_RA_offset1year',
       'home_FP_offset1year', 'away_W-L-pct_offset1year',
       'away_Avg_Attendance_offset1year', 'away_Rank_offset1year',
       'away_R_offset1year', 'away_RA_offset1year', 'away_FP_offset1year',
       'home_WHIP_offset1year', 'home_ERA_offset1year',
       'away_WHIP_offset1year', 'away_ERA_offset1year',
       'home_pitcher_season_game', 'home_pitcher_WHIP_avg_162games',
       'home_pitcher_ERA_avg_162games', 'home_pitcher_IP_avg_162games',
       'home_pitcher_WPA_avg_162games', 'awa

In [875]:
missing_cols = list(set(old_df.columns) - set(final_df.columns))

In [882]:
common_cols = list(set(old_df.columns).intersection(final_df.columns))
common_cols.extend([f'team_{c}' for c in missing_cols])
common_cols = list(set(common_cols)) + ['team_W-L_pct_diff']
common_cols

['M',
 'Y',
 'pitcher_IP_pct_diff',
 'team_ERA_pct_diff',
 'team_ops_pct_diff',
 'team_FP_pct_diff',
 'team_slg_pct_diff',
 'team_R_pct_diff',
 'team_bayes_pct_diff',
 'team_avg_pct_diff',
 'team_obp_pct_diff',
 'pitcher_ERA_pct_diff',
 'pitcher_WHIP_pct_diff',
 'log_5',
 'home_win',
 'team_win_pct_diff',
 'team_W-L_pct_diff',
 'team_Rank_pct_diff',
 'away_team',
 'home_team',
 'team_WPA_pct_diff',
 'team_pytha_pct_diff',
 'team_RD_pct_diff',
 'team_WHIP_pct_diff',
 'team_RA_pct_diff',
 'team_W-L_pct_diff']