# Loading and Preprocessing our Data

In [370]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt

This link contains every MLB game ever played, along with each team's Elo rating and some other information we will find useful. It will serve as our starter dataframe, to which we will add the rest of our data.

In [371]:
# FiveThirtyEight's MLB Elo table; this is fetched over the network on every run.
MLB_ELO_URL = 'https://projects.fivethirtyeight.com/mlb-api/mlb_elo.csv'
mlb_elo = pd.read_csv(MLB_ELO_URL)

We only want games from 2000–2019 (not 2020, because of the COVID-shortened season).

In [372]:
# NOTE(review): read_csv was not asked to parse dates, so 'date' holds strings
# and both comparisons below are lexicographic. Assuming the zero-padded
# YYYY-MM-DD format seen in the preview, '2000-04-03' sorts before '2000-3-29'
# (no 2000 game passes the lower bound) and every 2019 date sorts before
# '2019-9-29'. Confirm the intended window before switching to zero-padded
# bounds: the later covariate lookups are keyed by these dates.
on_or_after_start = mlb_elo['date'] >= '2000-3-29'
on_or_before_end = mlb_elo['date'] <= '2019-9-29'
mlb_elo = mlb_elo[on_or_after_start & on_or_before_end]

We are also going to drop every playoff row, leaving us with regular-season games only.

In [373]:
# Rows whose 'playoff' value is one of these codes are removed; rows with any
# other value (including a missing one) are kept.
postseason_codes = ['w', 'l', 'd', 'c']
mlb_elo = mlb_elo[~mlb_elo['playoff'].isin(postseason_codes)]

In [374]:
# Preview the first rows left after the date and playoff filters.
mlb_elo.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,rating1_post,rating2_post,score1,score2
988,2019-09-29,2019,0,,STL,CHC,1545.306947,1531.092895,0.554774,0.445226,...,62.082428,46.537439,45.053652,-22.977772,0.644813,0.355187,1545.776596,1530.204782,9,0
989,2019-09-29,2019,0,,KCR,MIN,1429.995452,1549.426829,0.366016,0.633984,...,43.237569,44.422197,-18.509968,-29.389175,0.384372,0.615628,1430.50614,1544.557193,5,4
990,2019-09-29,2019,0,,SEA,OAK,1467.7325,1577.14715,0.379496,0.620504,...,47.343036,48.967613,5.136325,-10.11552,0.418133,0.581867,1465.818655,1559.092801,3,1
991,2019-09-29,2019,0,,NYM,ATL,1528.343228,1548.88195,0.504981,0.495019,...,52.198149,55.1697,-6.560683,23.550643,0.47483,0.52517,1533.426584,1542.584611,7,6
992,2019-09-29,2019,0,,COL,MIL,1469.463417,1534.015807,0.441904,0.558096,...,45.894572,50.547683,-12.012266,-1.101518,0.448627,0.551373,1482.362549,1528.964194,4,3


Now we can keep just the columns we need, rename them to be more user-friendly, and start loading in the rest of our data.

In [375]:
# Keep only the date, the two teams, their pre-game Elo ratings and the two
# pitcher columns.
kept_columns = ['date', 'team1', 'team2', 'elo1_pre', 'elo2_pre', 'pitcher1', 'pitcher2']
mlb_elo = mlb_elo[kept_columns]

In [376]:
# The "1" columns become the home side and the "2" columns the away side;
# 'date' keeps its name.
friendly_names = {
    'team1': 'home_team',
    'team2': 'away_team',
    'elo1_pre': 'home_elo',
    'elo2_pre': 'away_elo',
    'pitcher1': 'home_pitcher',
    'pitcher2': 'away_pitcher',
}
mlb_elo = mlb_elo.rename(columns=friendly_names)

In [377]:
# Check that the trimmed table now carries the home_/away_ column names.
mlb_elo.head()

Unnamed: 0,date,home_team,away_team,home_elo,away_elo,home_pitcher,away_pitcher
988,2019-09-29,STL,CHC,1545.306947,1531.092895,Jack Flaherty,Derek Holland
989,2019-09-29,KCR,MIN,1429.995452,1549.426829,Jorge Lopez,Martin Perez
990,2019-09-29,SEA,OAK,1467.7325,1577.14715,Justin Dunn,Tanner Roark
991,2019-09-29,NYM,ATL,1528.343228,1548.88195,Noah Syndergaard,Mike Soroka
992,2019-09-29,COL,MIL,1469.463417,1534.015807,Jeff Hoffman,Adrian Houser


# Loading hitter data

In [379]:
def _read_covariate(csv_path):
    """Read one covariate CSV, using its 'date' column as the row index."""
    return pd.read_csv(csv_path, index_col='date')


# One table per hitting statistic: rows are dates, columns are team codes.
avg_df = _read_covariate('../output-data/covariates/COVARIATE_AVG.csv')
obp_df = _read_covariate('../output-data/covariates/COVARIATE_OBP.csv')
slg_df = _read_covariate('../output-data/covariates/COVARIATE_SLG.csv')

In [399]:
# Preview the batting-average table (date index, one column per team code).
avg_df.head()

Unnamed: 0_level_0,ANA,ARI,ATL,BAL,BOS,CHA,CHN,CIN,CLE,COL,...,PHI,PIT,SDN,SEA,SFN,SLN,TBA,TEX,TOR,WAS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20010401,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
20010402,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
20010403,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
20010404,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347
20010405,0.277406,0.263385,0.268227,0.269526,0.265407,0.283532,0.254189,0.272199,0.285689,0.290176,...,0.249414,0.264954,0.251828,0.26728,0.276079,0.267825,0.255143,0.281519,0.273459,0.264347


Below, we change the team name abbreviations so that they match each other across all of the dataframes.

In [378]:
# Abbreviations used in the Elo table (keys) and the spelling they are
# rewritten to so they line up with the covariate tables (values).
_TEAM_ID_ALIASES = {
    'LAA': 'ANA',
    'FLA': 'MIA',
    'WSN': 'WAS',
    'SFG': 'SFN',
    'NYY': 'NYA',
    'NYM': 'NYN',
    'CHW': 'CHA',
    'CHC': 'CHN',
    'KCR': 'KCA',
    'SDP': 'SDN',
    'LAD': 'LAN',
    'STL': 'SLN',
    'TBD': 'TBA',
}


def convert_team_ids(team):
    """Translate a team abbreviation to its alias, if it has one.

    Parameters
    ----------
    team : str
        Team abbreviation as it appears in the Elo table.

    Returns
    -------
    str
        The mapped abbreviation, or ``team`` unchanged when no alias is listed.
    """
    return _TEAM_ID_ALIASES.get(team, team)

# Rewrite the abbreviations on both sides of every game.
for team_col in ('home_team', 'away_team'):
    mlb_elo[team_col] = mlb_elo[team_col].apply(convert_team_ids)

We are going to create the columns below, but first we set them to zero as a default in case a null value occurs.

In [380]:
# Create the six hitting-stat columns up front with a placeholder of 0; the
# order here is the order they appear in the frame.
placeholder_columns = [
    'home_avg', 'away_avg',
    'home_obp', 'away_obp',
    'home_slg', 'away_slg',
]
for placeholder in placeholder_columns:
    mlb_elo[placeholder] = 0

In [381]:
def _date_to_int(date_text):
    """Turn a 'YYYY-MM-DD' string into the integer YYYYMMDD."""
    year, month, day = date_text.split('-')[:3]
    return int(year + month + day)


# The covariate tables are indexed by integer dates such as 20010401, so the
# game dates are converted to the same form before the lookup.
mlb_elo['date'] = mlb_elo['date'].apply(_date_to_int)

The function below will go through the avg_df, obp_df, and slg_df and select the row based on the date and the column based on the specific team and then match that with its corresponding avg, obp, or slg respectively.

In [382]:
def get_covariates(x):
    """Fill one game's hitting-stat fields from the covariate tables.

    Parameters
    ----------
    x : pandas.Series
        One row of the games table. It must provide 'date', 'home_team' and
        'away_team'; 'date' has to be a label of the covariate tables' index
        and each team a column of those tables.

    Returns
    -------
    pandas.Series
        The same row object with 'home_avg', 'away_avg', 'home_obp',
        'away_obp', 'home_slg' and 'away_slg' set.

    Raises
    ------
    KeyError
        If the date or a team code is not present in a covariate table.
    """
    game_date = x['date']
    # (field to write, table to read, field holding the team code)
    lookups = (
        ('home_avg', avg_df, 'home_team'),
        ('away_avg', avg_df, 'away_team'),
        ('home_obp', obp_df, 'home_team'),
        ('away_obp', obp_df, 'away_team'),
        ('home_slg', slg_df, 'home_team'),
        ('away_slg', slg_df, 'away_team'),
    )
    for target, table, team_field in lookups:
        x[target] = table.loc[game_date, x[team_field]]
    return x
# Row-wise pass: get_covariates is called once per game, so this is the slow
# step of the notebook.
mlb_elo = mlb_elo.apply(get_covariates, axis='columns')

In [383]:
# Check that the avg/obp/slg columns were filled in by the lookup.
mlb_elo.head()

Unnamed: 0,date,home_team,away_team,home_elo,away_elo,home_pitcher,away_pitcher,home_avg,away_avg,home_obp,away_obp,home_slg,away_slg
988,20190929,SLN,CHN,1545.306947,1531.092895,Jack Flaherty,Derek Holland,0.222222,0.243802,0.321875,0.31203,0.415771,0.392562
989,20190929,KCA,MIN,1429.995452,1549.426829,Jorge Lopez,Martin Perez,0.289256,0.279352,0.350943,0.340741,0.475207,0.510121
990,20190929,SEA,OAK,1467.7325,1577.14715,Justin Dunn,Tanner Roark,0.174312,0.217021,0.227468,0.29771,0.243119,0.357447
991,20190929,NYN,ATL,1528.343228,1548.88195,Noah Syndergaard,Mike Soroka,0.256098,0.237903,0.336957,0.325,0.443089,0.395161
992,20190929,COL,MIL,1469.463417,1534.015807,Jeff Hoffman,Adrian Houser,0.230496,0.233202,0.306709,0.331034,0.421986,0.43083


We now want to convert the date column to a datetime type so that we can create year, month, and day columns.

In [384]:
# 'date' currently holds integers of the form YYYYMMDD; parse them into
# proper timestamps.
parsed_dates = pd.to_datetime(mlb_elo['date'], format='%Y%m%d')
mlb_elo['date'] = parsed_dates

In [385]:
# 'date' was converted with pd.to_datetime just above, so the vectorized .dt
# accessor can pull out the calendar parts directly instead of calling a
# Python lambda once per row.
game_dates = mlb_elo['date'].dt
mlb_elo['Y'] = game_dates.year
mlb_elo['M'] = game_dates.month
mlb_elo['D'] = game_dates.day

In [386]:
# Final column layout: date parts first, then teams, pitchers, Elo ratings and
# the hitting stats as home/away pairs.
ordered_columns = [
    'date', 'Y', 'M', 'D',
    'home_team', 'away_team',
    'home_pitcher', 'away_pitcher',
    'home_elo', 'away_elo',
    'home_avg', 'away_avg',
    'home_obp', 'away_obp',
    'home_slg', 'away_slg',
]
mlb = mlb_elo[ordered_columns]

In [387]:
# Order games chronologically. reset_index() keeps the old row labels in a new
# 'index' column, which is dropped in a later cell.
mlb = mlb.sort_values(by='date').reset_index()

In [388]:
# Preview the chronologically sorted table (earliest games first).
mlb.head()

Unnamed: 0,index,date,Y,M,D,home_team,away_team,home_pitcher,away_pitcher,home_elo,away_elo,home_avg,away_avg,home_obp,away_obp,home_slg,away_slg
0,47756,2001-04-01,2001,4,1,TOR,TEX,loaie001,hellr001,1499.563,1479.163,0.273459,0.281519,0.339283,0.349386,0.466387,0.443116
1,47746,2001-04-02,2001,4,2,SFN,SDN,hernl003,willw001,1540.841,1492.8,0.276079,0.251828,0.35938,0.327088,0.468525,0.398252
2,47747,2001-04-02,2001,4,2,SEA,OAK,Freddy Garcia,Tim Hudson,1519.464,1534.696,0.26728,0.268144,0.358599,0.357409,0.438008,0.454237
3,47748,2001-04-02,2001,4,2,NYA,KCA,clemr001,suppj001,1529.511,1493.152,0.27508,0.285268,0.351633,0.344703,0.446269,0.421482
4,47749,2001-04-02,2001,4,2,LAN,MIL,parkc002,Jamey Wright,1515.925,1490.396,0.254198,0.243537,0.336984,0.322708,0.426431,0.400071


# Making Features and Difference Columns

An important feature is ISO (isolated power), which is simply SLG minus AVG.

In [389]:
# ISO = SLG - AVG, computed separately for each side.
mlb = mlb.assign(
    home_iso=mlb['home_slg'] - mlb['home_avg'],
    away_iso=mlb['away_slg'] - mlb['away_avg'],
)

We can now make a few difference columns as well as percent difference. I am not sure if these will be useful but it will give the model a different way to look at the data.

In [391]:
# Positive values mean the home team has the higher pre-game Elo; the percent
# version is expressed relative to the home rating.
elo_gap = mlb['home_elo'] - mlb['away_elo']
mlb['elo_diff'] = elo_gap
mlb['elo_pct_diff'] = elo_gap / mlb['home_elo'] * 100

In [392]:
# (difference column, percent-difference column, home column, away column)
stat_columns = (
    ('avg_diff', 'avg_pct_diff', 'home_avg', 'away_avg'),
    ('obp_diff', 'obp_pct_diff', 'home_obp', 'away_obp'),
    ('slg_diff', 'slg_pct_diff', 'home_slg', 'away_slg'),
)

# Raw differences: a value above 0 means the home team is better.
for diff_col, _, home_col, away_col in stat_columns:
    mlb[diff_col] = mlb[home_col] - mlb[away_col]

# Percent differences relative to the home value (scaling doesn't matter for
# many cases). Done in a second pass so the columns keep the same order.
for diff_col, pct_col, home_col, _ in stat_columns:
    mlb[pct_col] = mlb[diff_col] / mlb[home_col] * 100

In [394]:
# Discard the old row labels that reset_index() left behind as a column.
mlb = mlb.drop(columns='index')

In [395]:
# Preview the table with the engineered feature columns.
mlb.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_pitcher,away_pitcher,home_elo,away_elo,...,away_slg,home_iso,away_iso,elo_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff
0,2001-04-01,2001,4,1,TOR,TEX,loaie001,hellr001,1499.563,1479.163,...,0.443116,0.192927,0.161597,20.4,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568
1,2001-04-02,2001,4,2,SFN,SDN,hernl003,willw001,1540.841,1492.8,...,0.398252,0.192446,0.146424,48.041,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766
2,2001-04-02,2001,4,2,SEA,OAK,Freddy Garcia,Tim Hudson,1519.464,1534.696,...,0.454237,0.170727,0.186092,-15.232,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521
3,2001-04-02,2001,4,2,NYA,KCA,clemr001,suppj001,1529.511,1493.152,...,0.421482,0.171189,0.136214,36.359,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343
4,2001-04-02,2001,4,2,LAN,MIL,parkc002,Jamey Wright,1515.925,1490.396,...,0.400071,0.172233,0.156534,25.529,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414


In [396]:
# Save the feature table to the working directory. With to_csv's default
# index=True the row index is written too, as a first column with no header.
mlb.to_csv('mlb_games_df.csv')