# Finding game pitchers

The purpose of this notebook is to take the `mlb_games_df.csv` file and substitute the pitcher IDs (for some site) instead of name/whatever ID is currently showing up.

In [4]:
import pandas as pd
import numpy as np
import re

import pybaseball as pyb

In [5]:
games_df = pd.read_csv('../../data/mlb_games_df.csv')

In [6]:
games_df.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,2001-04-01,2001,4,1,TOR,TEX,1,loaie001,hellr001,1499.563,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5,5
1,2001-04-02,2001,4,2,SFN,SDN,1,hernl003,willw001,1540.841,...,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766,5,5
2,2001-04-02,2001,4,2,SEA,OAK,1,Freddy Garcia,Tim Hudson,1519.464,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5,5
3,2001-04-02,2001,4,2,NYA,KCA,1,clemr001,suppj001,1529.511,...,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5,5
4,2001-04-02,2001,4,2,LAN,MIL,1,parkc002,Jamey Wright,1515.925,...,25.529,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414,5,5


In [7]:
pitchers_df = pd.read_csv('../../data/pitchers_summary.csv')

In [8]:
pitchers_df.head()

Unnamed: 0,Name,first_season,last_season,games_played,games_started,teams,num_teams,key_mlbam,key_retro,key_bbref,key_fangraphs
0,A.J. Burnett,2000,2015,428,423,"MIA,TOR,NYA,PIT,PHI",5,150359,burna001,burnea.01,512
1,A.J. Cole,2015,2019,79,19,"WAS,CLE",3,595918,colea002,coleaj01,11467
2,A.J. Griffin,2012,2017,88,85,"OAK,TEX",2,456167,grifa002,griffaj01,11132
3,A.J. Murray,2007,2008,16,4,TEX,1,451262,murra001,murraaj01,3422
4,Aaron Blair,2016,2017,16,16,ATL,1,594760,blaia001,blairaa01,14934


## Standardizing team names

In [9]:
def uniform_name(df, col_name):
    df[col_name] = df[col_name].str.replace(r'[\W\s]', '')
    df[col_name] = df[col_name].str.lower()
    
    team_list = ['TOR', 'SFN', 'SEA', 'NYA', 'LAN', 'BAL', 'COL', 'CHN', 'MIA',
       'CLE', 'CIN', 'TEX', 'TBA', 'DET', 'ATL', 'HOU', 'MIL', 'WAS',
       'PHI', 'OAK', 'ARI', 'CHA', 'BOS', 'KCA', 'SLN', 'PIT', 'NYN',
       'MIN', 'SDN', 'ANA']
    
    
    
    team_dict = {'jays':'TOR','bluejays':'TOR', 'torontobluejays':'TOR', 'tor':'TOR', 
                 'giants':'SFN', 'sfg': 'SFN', 'sanfranciscogiants':'SFN', 'sf':'SFN', 'sfn':'SFN', 
                 'mariners':'SEA', 'seattlemariners':'SEA', 'sea':'SEA', 
                 'yankees':'NYA', 'nyy': 'NYA', 'newyorkyankees':'NYA', 'nya':'NYA', 
                 'dodgers':'LAN', 'lad':'LAN', 'losangelesdodgers':'LAN', 'la':'LAN', 'lan':'LAN', 
                 'orioles':'BAL', 'baltimoreorioles':'BAL', 'bal':'BAL', 
                 'rockies':'COL', 'coloradorockies':'COL', 'col':'COL', 
                 'cubs':'CHN', 'chc':'CHN', 'chicagocubs':'CHN', 'chn':'CHN', 
                 'marlins':'MIA', 'fla':'MIA', 'floridamarlins':'MIA', 'miamimarlins':'MIA', 'mia':'MIA', 
                 'indians':'CLE', 'clevelandindians':'CLE', 'cle':'CLE', 
                 'reds':'CIN', 'cincinnatireds':'CIN', 'cin':'CIN', 
                 'rangers':'TEX', 'texasrangers':'TEX', 'tex':'TEX', 
                 'rays':'TBA', 'devilrays':'TBA', 'tbd':'TBA', 'tampabayrays':'TBA', 'tampabaydevilrays':'TBA', 'tba':'TBA', 
                 'tigers':'DET', 'detriottigers':'DET', 'det':'DET', 
                 'braves':'ATL', 'atlantabraves':'ATL', 'atl':'ATL', 
                 'astros':'HOU', 'houstonastros':'HOU', 'hou':'HOU', 
                 'brewers':'MIL', 'milwaukeebrewers':'MIL', 'mil':'MIL', 
                 'nationals':'WAS', 'wsh':' WAS', 'wsn':'WAS', 'washingtonnationals':'WAS', 'montrealexpos':'WAS', 'expos':'WAS', 'mtl':'WAS','was':'WAS', 
                 'phillies':'PHI', 'philadelphiaphillies':'PHI', 'phi':'PHI', 
                 'as':'OAK', 'athletics':'OAK', 'oaklandathletics':'OAK', 'oaklandas':'OAK', 'oaklanda':'OAK', 'oak':'OAK', 
                 'diamondbacks':'ARI', 'arizonadiamondbacks':'ARI', 'ari':'ARI', 
                 'whitesox':'CHA', 'cws':'CHA', 'chicagowhitesox':'CHA', 'cha':'CHA', 
                 'redsox':'BOS', 'bostonredsox':'BOS', 'bos':'BOS', 
                 'royals':'KCA', 'kcr':'KCA', 'kansascityroyals':'KCA', 'kc':'KCA', 'kca':'KCA', 
                 'cardinals':'SLN', 'slc':'SLN', 'stl':'SLN', 'saintlouiscardinals':'SLN', 'stlouiscardinals':'SLN', 'sln':'SLN', 
                 'pirates':'PIT', 'pittsburghpirates':'PIT', 'pit':'PIT', 
                 'mets':'NYN', 'nym':'NYN', 'newyorkmets':'NYN', 'nyn':'NYN', 
                 'twins':'MIN', 'minnesotatwins':'MIN', 'min':'MIN', 
                 'padres':'SDN', 'sdp':'SDN', 'sandiegopadres':'SDN', 'sd':'SDN', 'sdn':'SDN', 
                 'angels':'ANA', 'laa':'ANA', 'losangelesangels':'ANA', 'losangelesangelsofanaheim':'ANA', 'ana':'ANA', }
    
    df[col_name] = df[col_name].map(team_dict)
    
    return df

In [10]:
def uniform_name_list(l):
    l = [re.sub(r'[\W\s]', '', x).lower() for x in l]
    team_list = ['TOR', 'SFN', 'SEA', 'NYA', 'LAN', 'BAL', 'COL', 'CHN', 'MIA',
       'CLE', 'CIN', 'TEX', 'TBA', 'DET', 'ATL', 'HOU', 'MIL', 'WAS',
       'PHI', 'OAK', 'ARI', 'CHA', 'BOS', 'KCA', 'SLN', 'PIT', 'NYN',
       'MIN', 'SDN', 'ANA']
    
    
    
    team_dict = {'jays':'TOR','bluejays':'TOR', 'torontobluejays':'TOR', 'tor':'TOR', 
                 'giants':'SFN', 'sfg': 'SFN', 'sanfranciscogiants':'SFN', 'sf':'SFN', 'sfn':'SFN', 
                 'mariners':'SEA', 'seattlemariners':'SEA', 'sea':'SEA', 
                 'yankees':'NYA', 'nyy': 'NYA', 'newyorkyankees':'NYA', 'nya':'NYA', 
                 'dodgers':'LAN', 'lad':'LAN', 'losangelesdodgers':'LAN', 'la':'LAN', 'lan':'LAN', 
                 'orioles':'BAL', 'baltimoreorioles':'BAL', 'bal':'BAL', 
                 'rockies':'COL', 'coloradorockies':'COL', 'col':'COL', 
                 'cubs':'CHN', 'chc':'CHN', 'chicagocubs':'CHN', 'chn':'CHN', 
                 'marlins':'MIA', 'fla':'MIA', 'floridamarlins':'MIA', 'miamimarlins':'MIA', 'mia':'MIA', 
                 'indians':'CLE', 'clevelandindians':'CLE', 'cle':'CLE', 
                 'reds':'CIN', 'cincinnatireds':'CIN', 'cin':'CIN', 
                 'rangers':'TEX', 'texasrangers':'TEX', 'tex':'TEX', 
                 'rays':'TBA', 'devilrays':'TBA', 'tbd':'TBA', 'tampabayrays':'TBA', 'tampabaydevilrays':'TBA', 'tba':'TBA', 
                 'tigers':'DET', 'detriottigers':'DET', 'det':'DET', 
                 'braves':'ATL', 'atlantabraves':'ATL', 'atl':'ATL', 
                 'astros':'HOU', 'houstonastros':'HOU', 'hou':'HOU', 
                 'brewers':'MIL', 'milwaukeebrewers':'MIL', 'mil':'MIL', 
                 'nationals':'WAS', 'wsh':' WAS', 'wsn':'WAS', 'washingtonnationals':'WAS', 'montrealexpos':'WAS', 'expos':'WAS', 'mtl':'WAS','was':'WAS', 
                 'phillies':'PHI', 'philadelphiaphillies':'PHI', 'phi':'PHI', 
                 'as':'OAK', 'athletics':'OAK', 'oaklandathletics':'OAK', 'oaklandas':'OAK', 'oaklanda':'OAK', 'oak':'OAK', 
                 'diamondbacks':'ARI', 'arizonadiamondbacks':'ARI', 'ari':'ARI', 
                 'whitesox':'CHA', 'cws':'CHA', 'chicagowhitesox':'CHA', 'cha':'CHA', 
                 'redsox':'BOS', 'bostonredsox':'BOS', 'bos':'BOS', 
                 'royals':'KCA', 'kcr':'KCA', 'kansascityroyals':'KCA', 'kc':'KCA', 'kca':'KCA', 
                 'cardinals':'SLN', 'slc':'SLN', 'stl':'SLN', 'saintlouiscardinals':'SLN', 'stlouiscardinals':'SLN', 'sln':'SLN', 
                 'pirates':'PIT', 'pittsburghpirates':'PIT', 'pit':'PIT', 
                 'mets':'NYN', 'nym':'NYN', 'newyorkmets':'NYN', 'nyn':'NYN', 
                 'twins':'MIN', 'minnesotatwins':'MIN', 'min':'MIN', 
                 'padres':'SDN', 'sdp':'SDN', 'sandiegopadres':'SDN', 'sd':'SDN', 'sdn':'SDN', 
                 'angels':'ANA', 'laa':'ANA', 'losangelesangels':'ANA', 'losangelesangelsofanaheim':'ANA', 'ana':'ANA', 
                '---': 'UNK'}
    l = [team_dict.get(x, x) for x in l]
    return l

In [11]:
test_teams = pitchers_df['teams'].iloc[0]

In [12]:
test_teams

'MIA,TOR,NYA,PIT,PHI'

In [13]:
def split_teams(r):
    teams = r.split("' '")
    teams = [re.sub(r"[\[\]']", '', x) for x in teams]
    teams_str = ','.join(uniform_name_list(teams))
    return re.sub(r',,', ',', teams_str)

In [14]:
split_teams(test_teams)

'miatornyapitphi'

In [15]:
pitchers_df['teams'] = pitchers_df['teams'].apply(split_teams)

AttributeError: 'float' object has no attribute 'split'

In [None]:
pitchers_df.head(10)

In [None]:
pitchers_df.to_csv('../data/pitchers_summary.csv', index=False)

In [None]:
import os

num_f = len(os.listdir('../data/pitchers_games'))

for i, f in enumerate(os.listdir('../data/pitchers_games')):
    df = pd.read_csv(f'../data/pitchers_games/{f}')
    df = uniform_name(df, 'Tm')
    df.to_csv(f'../data/pitchers_games/{f}')
    if i % int(num_f / 10) == 0 and i > 0:
        print(f'{10*i / int(num_f / 10)}% complete')

## Joining by retro key

How much of the games data has a foreign key for retrosheets?

In [None]:
games_df['home_pitcher'].nunique(), games_df['away_pitcher'].nunique()

In [None]:
pd.merge(games_df, pitchers_df, left_on='home_pitcher', right_on='key_retro')['home_pitcher'].nunique()

In [None]:
pd.merge(games_df, pitchers_df, left_on='away_pitcher', right_on='key_retro')['away_pitcher'].nunique()

So roughly half of pitchers have a useable retrosheet foreign key. Let's try names.

In [None]:
games_df['home_pitcher_cleaned'] = games_df['home_pitcher'].str.replace(r'[\s\.\-]+', '').str.lower()
games_df['away_pitcher_cleaned'] = games_df['away_pitcher'].str.replace(r'[\s\.\-]+', '').str.lower()

In [None]:
games_df.head()

In [None]:
pitchers_df['pitcher_cleaned'] = pitchers_df['Name'].str.replace(r'[\s\.\-]+', '').str.lower()

In [None]:
pitchers_df.head()

In [None]:
pd.merge(games_df, pitchers_df, left_on='home_pitcher_cleaned', right_on='pitcher_cleaned')['home_pitcher'].nunique()

## Join by retrokey

It seems like almost all of them can join by retrokey or (cleaned) name. Let's first join by retrokey.

In [None]:
def fill_missing_pitchers(left, right, left_on, right_on):
    # The only columns we need from the pitchers
    right_keys = right[['key_retro', 'key_bbref', 'pitcher_cleaned']]
    # Doing this whole "reset_index"..."set_index" preserves the indices (otherwise they get reset)
    left = left.reset_index()
    left = left.merge(right_keys, how='left', left_on=left_on, right_on=right_on)
    left = left.set_index('index')
    left = left.drop(['key_retro', 'pitcher_cleaned'], axis='columns')
    return left

In [None]:
games_df = fill_missing_pitchers(games_df, pitchers_df, 'home_pitcher', 'key_retro')

In [None]:
def add_key_bbref(df, prefix):
    # Rename the last column to include home/away
    cols = list(df.columns)
    cols[-1] = f'{prefix}_key_bbref'
    df.columns = cols
    return df

In [None]:
games_df = add_key_bbref(games_df, 'home')

In [None]:
games_df.head()

Note that there are some pitchers who have the same name but are different people.

In [None]:
pitchers_df['key_retro'].nunique(), pitchers_df['pitcher_cleaned'].nunique()

For now I'll just drop those pitchers to make things easier. Then I'll manually add them.

In [None]:
pitchers_df = pitchers_df.drop_duplicates('pitcher_cleaned', keep=False)

Now we'll grab just those who didnt' join on retrokey and try joining by cleaned name.

In [None]:
games_missing_df = games_df[games_df['home_key_bbref'].isna()]
games_missing_df = games_missing_df.drop('home_key_bbref', axis='columns')

In [None]:
games_missing_df.head()

In [None]:
games_missing_df = fill_missing_pitchers(games_missing_df, pitchers_df, 'home_pitcher_cleaned', 'pitcher_cleaned')

In [None]:
games_missing_df = add_key_bbref(games_missing_df, 'home')

In [None]:
games_missing_df.head()

In [None]:
games_df.update(games_missing_df)

In [None]:
games_df.head()

## Again for away team

In [None]:
pitchers_df = pd.read_csv('../data/pitchers_summary.csv')
pitchers_df['pitcher_cleaned'] = pitchers_df['Name'].str.replace(r'[\s\.\-]+', '').str.lower()

games_df = fill_missing_pitchers(games_df, pitchers_df, 'away_pitcher', 'key_retro')
games_df = add_key_bbref(games_df, 'away')

pitchers_df = pitchers_df.drop_duplicates('pitcher_cleaned', keep=False)

games_missing_df = games_df[games_df['away_key_bbref'].isna()]
games_missing_df = games_missing_df.drop('away_key_bbref', axis='columns')
games_missing_df = fill_missing_pitchers(games_missing_df, pitchers_df, 'away_pitcher_cleaned', 'pitcher_cleaned')
games_missing_df = add_key_bbref(games_missing_df, 'away')

games_df.update(games_missing_df)

In [None]:
games_df.head()

## That's everyone

In [None]:
games_df['home_pitcher'] = games_df['home_key_bbref']
games_df['away_pitcher'] = games_df['away_key_bbref']

In [None]:
games_df['home_pitcher'].isna().sum(), games_df['away_pitcher'].isna().sum()

In [None]:
games_df = games_df.drop(['home_pitcher_cleaned', 'away_pitcher_cleaned', 'home_key_bbref', 'away_key_bbref'], axis='columns')

In [None]:
games_df.head()

In [1188]:
games_df.to_csv('../data/mlb_games_df.csv', index=False)