In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt
import re

In [2]:
team_pitching_df = pd.read_csv('../data/team_pitching_stats.csv')

In [3]:
team_pitching_df.head()

Unnamed: 0,Team,W,L,ERA,G,GS,CG,ShO,SV,HLD,...,ER,HR,BB,IBB,HBP,WP,BK,SO,Year,WHIP
0,ATL,95,67,4.06,538,162,13,6,53,,...,649,165,484,52,37,23,6,1093,2000,1.327686
1,TEX,71,91,5.52,577,162,3,0,39,,...,876,202,661,40,63,40,6,918,2000,1.640308
2,KCA,77,85,5.48,491,162,10,3,29,,...,877,239,693,35,42,77,5,927,2000,1.582934
3,HOU,72,90,5.42,572,162,8,1,30,,...,865,234,598,25,60,55,3,1064,2000,1.526579
4,BAL,74,88,5.37,558,162,14,2,33,,...,855,202,665,32,36,51,1,1017,2000,1.543507


In [4]:
team_pitching_df.columns

Index(['Team', 'W', 'L', 'ERA', 'G', 'GS', 'CG', 'ShO', 'SV', 'HLD', 'BS',
       'IP', 'TBF', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'HBP', 'WP', 'BK', 'SO',
       'Year', 'WHIP'],
      dtype='object')

In [5]:
team_pitching_df['WHIP'] = (team_pitching_df['H'] + team_pitching_df['BB'])/team_pitching_df['IP']
team_pitching_df['HR/9'] = (team_pitching_df['HR']/team_pitching_df['IP'])*9
team_pitching_df['K/9'] = (team_pitching_df['SO']/team_pitching_df['IP'])*9
team_pitching_df['K/BB'] = (team_pitching_df['SO'])/team_pitching_df['BB']



team_pitching_df['FIP'] = (13*team_pitching_df['HR'] + 
                           3*(team_pitching_df['BB'] + team_pitching_df['HBP']) - 
                           2*team_pitching_df['SO'])/team_pitching_df['IP']

In [6]:
team_pitching = team_pitching_df[['Team', 'Year', 'FIP', 
                                  'K/9', 'WHIP', 'K/BB', 'HR/9']]

team_pitching = team_pitching.rename({'Year': 'Y','Team': 'team'}, axis = 1)

home_team_pitching = team_pitching.rename({'Year': 'Y','Team': 'team', 'FIP': 'home_FIP', 
                                           'K/9': 'home_K/9', 'WHIP': 'home_WHIP',
                                           'K/BB': 'home_K/BB', 'HR/9':'home_HR/9'}, axis = 1)
away_team_pitching = team_pitching.rename({'Year': 'Y','Team': 'team','FIP': 'away_FIP', 
                                           'K/9': 'away_K/9', 'WHIP': 'away_WHIP',
                                           'K/BB': 'away_K/BB', 'HR/9':'away_HR/9'}, axis = 1)

In [7]:
home_team_pitching.head()

Unnamed: 0,team,Y,home_FIP,home_K/9,home_WHIP,home_K/BB,home_HR/9
0,ATL,2000,1.056871,6.830776,1.327686,2.258264,1.031178
1,TEX,2000,2.072778,5.781666,1.640308,1.388805,1.272218
2,KCA,2000,2.402891,5.797373,1.582934,1.337662,1.494684
3,HOU,2000,2.009463,6.662956,1.526579,1.779264,1.465349
4,BAL,2000,1.880539,6.386854,1.543507,1.529323,1.268579


In [8]:
away_team_pitching.head()

Unnamed: 0,team,Y,away_FIP,away_K/9,away_WHIP,away_K/BB,away_HR/9
0,ATL,2000,1.056871,6.830776,1.327686,2.258264,1.031178
1,TEX,2000,2.072778,5.781666,1.640308,1.388805,1.272218
2,KCA,2000,2.402891,5.797373,1.582934,1.337662,1.494684
3,HOU,2000,2.009463,6.662956,1.526579,1.779264,1.465349
4,BAL,2000,1.880539,6.386854,1.543507,1.529323,1.268579


In [9]:
home_team_pitching['team'].unique()

array(['ATL', 'TEX', 'KCA', 'HOU', 'BAL', 'COL', 'CHN', 'TOR', 'MIN',
       'WAS', 'ANA', 'PIT', 'TBA', 'CLE', 'PHI', 'LAN', 'DET', 'NYN',
       'SFN', 'BOS', 'ARI', 'SLN', 'CIN', 'SEA', 'OAK', 'MIA', 'MIL',
       'CHA', 'SDN', 'NYA'], dtype=object)

In [10]:
team_list = ['TOR', 'SFN', 'SEA', 'NYA', 'LAN', 'BAL', 'COL', 'CHN', 'MIA',
       'CLE', 'CIN', 'TEX', 'TBA', 'DET', 'ATL', 'HOU', 'MIL', 'WAS',
       'PHI', 'OAK', 'ARI', 'CHA', 'BOS', 'KCA', 'SLN', 'PIT', 'NYN',
       'MIN', 'SDN', 'ANA']

In [11]:
def uniform_name(df, col_name):
    df[col_name] = df[col_name].str.replace(r'\W', '')
    df[col_name] = df[col_name].str.lower()
    
    team_list = ['TOR', 'SFN', 'SEA', 'NYA', 'LAN', 'BAL', 'COL', 'CHN', 'MIA',
       'CLE', 'CIN', 'TEX', 'TBA', 'DET', 'ATL', 'HOU', 'MIL', 'WAS',
       'PHI', 'OAK', 'ARI', 'CHA', 'BOS', 'KCA', 'SLN', 'PIT', 'NYN',
       'MIN', 'SDN', 'ANA']
    
    
    team_dict = {'jays':'TOR','bluejays':'TOR', 'torontobluejays':'TOR', 'tor':'TOR', 
                 'giants':'SFN', 'sfg': 'SFN', 'sanfranciscogiants':'SFN', 'sf':'SFN', 'sfn':'SFN', 
                 'mariners':'SEA', 'seattlemariners':'SEA', 'sea':'SEA', 
                 'yankees':'NYA', 'nyy': 'NYA', 'newyorkyankees':'NYA', 'nya':'NYA', 
                 'dodgers':'LAN', 'lad':'LAN', 'losangelesdodgers':'LAN', 'la':'LAN', 'lan':'LAN', 
                 'orioles':'BAL', 'baltimoreorioles':'BAL', 'bal':'BAL', 
                 'rockies':'COL', 'coloradorockies':'COL', 'col':'COL', 
                 'cubs':'CHN', 'chc':'CHN', 'chicagocubs':'CHN', 'chn':'CHN', 
                 'marlins':'MIA', 'fla':'MIA', 'floridamarlins':'MIA', 'miamimarlins':'MIA', 'mia':'MIA', 
                 'indians':'CLE', 'clevelandindians':'CLE', 'cle':'CLE', 
                 'reds':'CIN', 'cincinnatireds':'CIN', 'cin':'CIN', 
                 'rangers':'TEX', 'texasrangers':'TEX', 'tex':'TEX', 
                 'rays':'TBA', 'devilrays':'TBA', 'tbd':'TBA', 'tampabayrays':'TBA', 'tampabaydevilrays':'TBA', 'tba':'TBA', 
                 'tigers':'DET', 'detriottigers':'DET', 'det':'DET', 
                 'braves':'ATL', 'atlantabraves':'ATL', 'atl':'ATL', 
                 'astros':'HOU', 'houstonastros':'HOU', 'hou':'HOU', 
                 'brewers':'MIL', 'milwaukeebrewers':'MIL', 'mil':'MIL', 
                 'nationals':'WAS', 'wsh':' WAS', 'wsn':'WAS', 'washingtonnationals':'WAS', 'montrealexpos':'WAS', 'expos':'WAS', 'mtl':'WAS','was':'WAS', 
                 'phillies':'PHI', 'philadelphiaphillies':'PHI', 'phi':'PHI', 
                 'as':'OAK', 'athletics':'OAK', 'oaklandathletics':'OAK', 'oaklandas':'OAK', 'oaklanda':'OAK', 'oak':'OAK', 
                 'diamondbacks':'ARI', 'arizonadiamondbacks':'ARI', 'ari':'ARI', 
                 'whitesox':'CHA', 'cws':'CHA', 'chicagowhitesox':'CHA', 'cha':'CHA', 
                 'redsox':'BOS', 'bostonredsox':'BOS', 'bos':'BOS', 
                 'royals':'KCA', 'kcr':'KCA', 'kansascityroyals':'KCA', 'kc':'KCA', 'kca':'KCA', 
                 'cardinals':'SLN', 'slc':'SLN', 'stl':'SLN', 'saintlouiscardinals':'SLN', 'stlouiscardinals':'SLN', 'sln':'SLN', 
                 'pirates':'PIT', 'pittsburghpirates':'PIT', 'pit':'PIT', 
                 'mets':'NYN', 'nym':'NYN', 'newyorkmets':'NYN', 'nyn':'NYN', 
                 'twins':'MIN', 'minnesotatwins':'MIN', 'min':'MIN', 
                 'padres':'SDN', 'sdp':'SDN', 'sandiegopadres':'SDN', 'sd':'SDN', 'sdn':'SDN', 
                 'angels':'ANA', 'laa':'ANA', 'losangelesangels':'ANA', 'losangelesangelsofanaheim':'ANA', 'ana':'ANA', }
    
    df[col_name] = df[col_name].map(team_dict)
    
    return df

In [12]:
home_team_pitching = uniform_name(home_team_pitching, 'team')
away_team_pitching = uniform_name(away_team_pitching, 'team')

In [13]:
mlb_games = pd.read_csv('../data/mlb_games_df.csv')
#mlb_games = mlb_games.drop('Unnamed: 0', axis=1)

In [14]:
mlb_games.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,4/1/01,2001.0,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.563,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,4/2/01,2001.0,4.0,2.0,SFN,SDN,1.0,hernali01,williwo02,1540.841,...,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766,5.0,5.0
2,4/2/01,2001.0,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.464,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
3,4/2/01,2001.0,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.511,...,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
4,4/2/01,2001.0,4.0,2.0,LAN,MIL,1.0,parkch01,wrighja01,1515.925,...,25.529,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414,5.0,5.0


In [15]:
home_team_pitching.head()

Unnamed: 0,team,Y,home_FIP,home_K/9,home_WHIP,home_K/BB,home_HR/9
0,ATL,2000,1.056871,6.830776,1.327686,2.258264,1.031178
1,TEX,2000,2.072778,5.781666,1.640308,1.388805,1.272218
2,KCA,2000,2.402891,5.797373,1.582934,1.337662,1.494684
3,HOU,2000,2.009463,6.662956,1.526579,1.779264,1.465349
4,BAL,2000,1.880539,6.386854,1.543507,1.529323,1.268579


In [16]:
mlb_games['Y'] = mlb_games['Y'].astype(int)

In [17]:
for col in ['FIP', 'WHIP', 'K/9', 'HR/9', 'K/BB']:
    mlb_games['home_' + col] = float(0)
    mlb_games['away_' + col] = float(0)

In [18]:
team_list = ['TOR', 'SFN', 'SEA', 'NYA', 'LAN', 'BAL', 'COL', 'CHN', 'MIA',
       'CLE', 'CIN', 'TEX', 'TBA', 'DET', 'ATL', 'HOU', 'MIL', 'WAS',
       'PHI', 'OAK', 'ARI', 'CHA', 'BOS', 'KCA', 'SLN', 'PIT', 'NYN',
       'MIN', 'SDN', 'ANA']

In [20]:
pd.set_option('mode.chained_assignment', None)
c = 0
for i in range(mlb_games.shape[0]):
    if (i % 4000 == 0):
        print(i)
    for team in team_list:
        if mlb_games.iloc[i][4] == team:
            home_team_df = home_team_pitching[home_team_pitching['team'] == team]
            home_team_df = home_team_df[home_team_df['Y'] == (mlb_games.iloc[i][1]-1)]
            
            mlb_games['home_FIP'][i] = home_team_df.iloc[0][-5]
            mlb_games['home_WHIP'][i] = home_team_df.iloc[0][-3]
            mlb_games['home_K/9'][i] = home_team_df.iloc[0][-4]
            mlb_games['home_HR/9'][i] = home_team_df.iloc[0][-1]
            mlb_games['home_K/BB'][i] = home_team_df.iloc[0][-2]
            
        if mlb_games.iloc[i][5] == team:
            away_team_df = away_team_pitching[away_team_pitching['team'] == team]
            away_team_df = away_team_df[away_team_df['Y'] == (mlb_games.iloc[i][1]-1)]
            
            mlb_games['away_FIP'][i] = away_team_df.iloc[0][-5]
            mlb_games['away_WHIP'][i] = away_team_df.iloc[0][-3]
            mlb_games['away_K/9'][i] = away_team_df.iloc[0][-4]
            mlb_games['away_HR/9'][i] = away_team_df.iloc[0][-1]
            mlb_games['away_K/BB'][i] = away_team_df.iloc[0][-2]
    
    
    c+=1

0
4000
8000
12000
16000
20000
24000
28000
32000
36000
40000
44000


In [161]:
mlb_games.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,home_FIP,away_FIP,home_WHIP,away_WHIP,home_K/9,away_K/9,home_HR/9,away_HR/9,home_K/BB,away_K/BB
0,4/1/01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.563,...,1.705518,2.072778,1.513465,1.640308,6.124835,5.781666,1.221209,1.272218,1.746429,1.388805
1,4/2/01,2001,4.0,2.0,SFN,SDN,1.0,hernali01,williwo02,1540.841,...,1.248528,1.707902,1.436881,1.433761,6.705907,6.606127,0.941071,1.178124,1.727127,1.650231
2,4/2/01,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.464,...,1.520261,1.475159,1.440466,1.498153,6.232306,6.0393,1.042881,0.990872,1.574132,1.565854
3,4/2/01,2001,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.511,...,1.480233,2.402891,1.428973,1.582934,6.572572,5.797373,1.118601,1.494684,1.802426,1.337662
4,4/2/01,2001,4.0,2.0,LAN,MIL,1.0,parkch01,wrighja01,1515.925,...,1.387543,1.846395,1.36955,1.52036,7.187543,5.936157,1.096194,1.06814,1.923333,1.328297


In [163]:
# > 0 = home is better
mlb_games['fip_diff'] = (mlb_games['home_FIP'] - mlb_games['away_FIP'])
mlb_games['whip_diff'] = (mlb_games['home_WHIP'] - mlb_games['away_WHIP'])
mlb_games['k/9_diff'] = (mlb_games['home_K/9'] - mlb_games['away_K/9'])
mlb_games['hr/9_diff'] = (mlb_games['home_HR/9'] - mlb_games['away_HR/9'])
mlb_games['k/bb_diff'] = (mlb_games['home_K/BB'] - mlb_games['away_K/BB'])


# scaling doesn't matter for many cases
mlb_games['fip_pct_diff'] = (mlb_games['fip_diff'])/(mlb_games['home_FIP'])*100
mlb_games['whip_pct_diff'] = (mlb_games['whip_diff'])/(mlb_games['home_WHIP'])*100
mlb_games['k/9_pct_diff'] = (mlb_games['k/9_diff'])/(mlb_games['home_K/9'])*100
mlb_games['hr/9_pct_diff'] = (mlb_games['hr/9_diff'])/(mlb_games['home_HR/9'])*100
mlb_games['k/bb_pct_diff'] = (mlb_games['k/bb_diff'])/(mlb_games['home_K/BB'])*100

In [167]:
mlb_games.columns

Index(['date', 'Y', 'M', 'D', 'home_team', 'away_team', 'home_win',
       'home_pitcher', 'away_pitcher', 'home_elo', 'away_elo', 'home_avg',
       'away_avg', 'home_obp', 'away_obp', 'home_slg', 'away_slg', 'home_iso',
       'away_iso', 'elo_diff', 'elo_pct_diff', 'avg_diff', 'obp_diff',
       'slg_diff', 'avg_pct_diff', 'obp_pct_diff', 'slg_pct_diff', 'home_rest',
       'away_rest', 'home_FIP', 'away_FIP', 'home_WHIP', 'away_WHIP',
       'home_K/9', 'away_K/9', 'home_HR/9', 'away_HR/9', 'home_K/BB',
       'away_K/BB', 'fip_diff', 'whip_diff', 'k/9_diff', 'hr/9_diff',
       'k/bb_diff', 'fip_pct_diff', 'whip_pct_diff', 'k/9_pct_diff',
       'hr/9_pct_diff', 'k/bb_pct_diff', 'iso_diff', 'iso_pct_diff',
       'rest_diff', 'rest_pct_diff'],
      dtype='object')

In [165]:
mlb_games['iso_diff'] = (mlb_games['home_iso'] - mlb_games['away_iso'])
mlb_games['iso_pct_diff'] = (mlb_games['iso_diff'])/(mlb_games['home_iso'])*100

In [166]:
mlb_games['rest_diff'] = (mlb_games['home_rest'] - mlb_games['away_rest'])
mlb_games['rest_pct_diff'] = (mlb_games['rest_diff'])/(mlb_games['home_rest'])*100

In [172]:
mlb_games.to_csv('../data/final_mlb_games_df.csv', index=False)

In [173]:
model_mlb = mlb_games[['Y', 'home_win', 'elo_pct_diff', 'obp_pct_diff', 'fip_pct_diff', 
                       'whip_pct_diff', 'k/9_pct_diff', 'hr/9_pct_diff',
                       'k/bb_pct_diff', 'iso_pct_diff','rest_pct_diff']]

In [174]:
model_mlb.to_csv('../data/pct_diff_df.csv', index=False)