In [2]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed
import pybaseball
pd.options.mode.chained_assignment = None 
#https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

This block will download all of the raw data from 2017 through 2021


Some thoughts:
    I have two options:
        1) Download the data and cut it down/clean it to what it I want,
        2) Download the data, save that, then clean it and save that
        
        Option 1 is more memory efficient but option 1 is more time but 
        option 2 is more time efficient if/when I make mistakes
        
        So for now I'm going with option two and if that is super fast then I will
        delete that result and combine them into one script. Okay here goes

In [9]:
def get_data_from_mlb(start_year, end_year = None):
    
    if end_year == None:
        end_year = start_year
    
    dfs = [] #https://github.com/jldbc/pybaseball/blob/master/EXAMPLES/imputed_derivation.ipynb
    for year in range(start_year, end_year + 1):
        print(f"Starting year {year}")
        dfs.append(pybaseball.statcast(start_dt=f'{year}-03-28', end_dt=f'{year}-10-03',verbose=False))
        #I'm overshooting the start and end here to make sure I don't miss anything
        #The statcast function is written in parallel, so that helps speed up this process
        
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
    print("Info succesfully downloaded from statcast")
    return pd.concat(dfs) 
    

In [10]:
if __name__ == '__main__':

    result = get_data_from_mlb(2017, 2021)
    result = result[result['game_type'] == 'R'] #Regular Season
    folder_name = "data_files/"
    
    print("Writing raw data to csv")
    result.to_csv(folder_name + "raw_data_from_statcast.csv", index = False)
    print("Raw data succesfully written to csv")
    
    
    
   
    

That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.


Starting year 2017


100%|████████████████████████████████████████████████████████████████████████████████| 185/185 [00:59<00:00,  3.12it/s]


Starting year 2018


100%|████████████████████████████████████████████████████████████████████████████████| 189/189 [01:00<00:00,  3.14it/s]


Starting year 2019


100%|████████████████████████████████████████████████████████████████████████████████| 190/190 [01:01<00:00,  3.07it/s]


Starting year 2020


100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [00:28<00:00,  2.57it/s]


Starting year 2021


100%|████████████████████████████████████████████████████████████████████████████████| 190/190 [00:59<00:00,  3.21it/s]


Info succesfully downloaded from statcast
Writing raw data to csv


In [6]:
folder_name = "data_files/"
print("Loading raw data")
raw_data = pd.read_csv("data_files/raw_data_from_statcast.csv")
print("Raw data loaded")

Loading raw data
Raw data loaded


In [30]:
key_points = raw_data[[ 
                        "game_date",
                        "home_team",
                        "game_year",
                        "game_pk",
                        "description",
                        "plate_x",
                        "plate_z",
                        "sz_top",
                        "sz_bot",
]]


key_points = key_points[
            np.where(  
            (key_points['description'] == 'ball') | \
            (key_points['description'] == 'blocked_ball') |  \
            (key_points['description'] == 'called_strike'),
             True, False   
            )] 
        #Only looking at calls the umpire made

folder_name = "data_files/"    
print("Writing key_points data to csv")
key_points.to_csv(folder_name + "key_points.csv", index = False)
print("key_points written")

Writing key_points data to csv
key_points written


key_points.csv has all of the pitch information I need to work with. 

Now I need to group them by game, and rate the ump's performance for that game

In [86]:
class Rate_Game:
    """
    I verified that these methods work using game_pk:490098
    I had it print the results here and then wrote it as a 
    CSV and compared in excel
    """
    
    """
    Inputs:
        game_df: 
            pandas df of all of the pitches 
            In most cases this will be one game, but I guess I could use this to get overall averages too
        game_pk:
            ID of the game
    Output:
        A pandas series that will have the following information:
        game_pk, game_date, ump accuracy, ump consistency
    """
    def grade_game(game_df, game_pk):
        
        
        return pd.Series({
            'game_pk' : game_pk,
            'game_date' : game_df['game_date'].iloc[0], #Date pulled from first row
            'game_year' : game_df['game_year'].iloc[0], #Date pulled from first row
            'umpire_accuracy' : Rate_Game.grade_accuracy(game_df),
            'umpire_consistency' : Rate_Game.grade_consistency(game_df),
        })
        
   
    """ #TODO
    Inputs:
        game_df: 
            pandas df of all of the pitches 
    Output:
        Score of how consistent the ump was
    """   
    def grade_consistency(game_df):
        #Need to normalize game_df first
        
        game_df['normalized_pz'] = (2*(game_df['plate_z'] - game_df['sz_top'])/
            (game_df['sz_top'] - game_df['sz_bot'])) + 3.5
        
        
        strikes_df = game_df[game_df['description'].str.fullmatch('called_strike')]
        strikes_df = strikes_df[['plate_x', 'normalized_pz']]
        balls_df = game_df[~game_df['description'].str.fullmatch('called_strike')]
        balls_df = balls_df[['plate_x', 'normalized_pz']]
        
        #Lets get the bounds strike zone first
        print(strikes_df)
        
        #print(np.min(strikes_df['plate_z']))
        #print(np.max(strikes_df['plate_z']))
        #print(np.unique(strikes_df['plate_z'], return_counts = True))
        """
        I think we should look at the strike df and say
        If there is a strike that is more 
            up & in
            lower & in
            up & out
            lower& out
            (basically if its surrounded)
        cut it
        
        use a (higher and left-er).any etc
        
        Does this work if there's a half-moon shaped gap around the mid section?
        """
        
        return None
    
        
    """
    Inputs:
        game_df: 
            pandas df of all of the pitches 
    Output:
        How many pitches were called correctly in the zone divided by total number of called pitches
    """   
    def grade_accuracy(game_df):
        half_width_ball = 2.9/12
        sz_L = (-8.5/12) - half_width_ball
        sz_R = (8.5/12) + half_width_ball
        
        #https://stackoverflow.com/questions/30631841/pandas-how-do-i-assign-values-based-on-multiple-conditions-for-existing-columns
        game_df["in_zone"] = np.where(
            (game_df["plate_x"] > sz_L) & \
            (game_df["plate_x"] < sz_R) & \
            (game_df["plate_z"] > (game_df["sz_bot"] - half_width_ball)) & \
            (game_df["plate_z"] < (game_df["sz_top"] + half_width_ball)),
            True, False
        )
        
        game_df["correct_call"] = \
            ((game_df["in_zone"] == True) & (game_df['description'] == 'called_strike')) |\
              ((game_df["in_zone"] == False) & (game_df['description'] == 'blocked_ball')) |\
              ((game_df["in_zone"] == False) & (game_df['description'] == 'ball'))

        return game_df[game_df["correct_call"] == True].shape[0] /game_df.shape[0]
    
    


In [2]:
key_points = pd.read_csv("data_files/key_points.csv")

In [87]:
#Parallelized in next block
#Good to have in series for testing the Rate_Game function
start = time.time()
game_groups = key_points.groupby(['game_pk'])
for game_pk, game_df in game_groups.__iter__(): #iters through every game
    print(Rate_Game.grade_game(game_df, game_pk), "\n")
    break
time_elapsed =  time.time() - start #Takes ~1/3 of time in parallel
print(f"Time elapsed: {time_elapsed:.2f} seconds.")

        plate_x  normalized_pz
720048     0.62       2.574890
720051    -0.97       1.528302
720064    -0.79       1.997354
720068    -1.01       2.919753
720086    -1.08       2.112903
720100    -0.32       1.202857
720106     1.04       1.048387
720110    -0.59       2.236842
720119     1.01       1.264706
720135     0.44       2.423858
720136     0.63       2.662996
720139     0.35       1.406977
720141     0.82       2.125000
720145     0.45       2.657360
720148    -0.11       1.917112
720158    -0.19       2.340426
720175     0.48       2.591892
720177     0.02       2.019481
720182    -1.01       2.837423
720185     0.10       1.239130
720196     0.83       2.016746
720200     0.83       1.968293
720207    -0.81       0.964481
720209    -0.92       1.944444
720221     0.00       1.180982
720226     0.86       2.676471
720228     0.16       1.477901
720239     0.86       1.460396
720244     0.48       1.824324
720257    -0.55       2.250000
720258    -0.76       1.250000
720259  

In [48]:
start = time.time()
series_list = Parallel(n_jobs = -1)(
    delayed(Rate_Game.grade_game)(game_df, game_pk) for game_pk, game_df in game_groups.__iter__())
game_summary = pd.concat(series_list, axis = 1).T
time_elapsed =  time.time() - start #Takes ~1/3 of time in parallel
print(f"Time elapsed: {time_elapsed:.2f} seconds.")

Time elapsed: 15.87 seconds.


In [8]:
game_summary

Unnamed: 0,game_pk,game_date,game_year,umpire_accuracy,umpire_consistency
0,490098,2017-04-03,2017,0.847134,
1,490099,2017-04-02,2017,0.865922,
2,490100,2017-04-03,2017,0.853933,
3,490101,2017-04-03,2017,0.865385,
4,490102,2017-04-04,2017,0.909639,
...,...,...,...,...,...
11027,660933,2021-10-08,2021,0.900901,
11028,660934,2021-10-08,2021,0.924138,
11029,660936,2021-10-07,2021,0.915152,
11030,660937,2021-10-06,2021,0.915663,


To this point, we have pandas df that gives us a list of every game. Info included in this file is game_pk, game_year, game_date, umpire accuracy and umpire consistency. 

Now we need to match the games with the umpire. Umpire excel sheet was found [here](https://billpetti.github.io/baseball_tools/) after some Googling. I've tried to figure out how the Umpire Scorecards guy automates that but I've been unsuccesful, and this does the job. 

In [4]:
umpires = pd.read_csv("data_files/umpires_ids_game_pk.csv")
umpires = umpires[umpires['position'] == "HP"]
umpires = umpires[
    ((umpires['game_date'].str.contains('2021')) | \
    (umpires['game_date'].str.contains('2020')) | \
    (umpires['game_date'].str.contains('2019')) | \
    (umpires['game_date'].str.contains('2018')) | \
    (umpires['game_date'].str.contains('2017')))]
#umpires['game_year'] = umpires['game_date'].str[-4:]

#game_summary.info(memory_usage = "deep")
#umpires.info(memory_usage = "deep")
#interesting function that tells us the memory of the df

In [168]:
pd.merge(game_summary, umpires, on = "game_pk").drop(['game_date_y', 'position'], axis=1)

Unnamed: 0,game_pk,game_date_x,game_year,umpire_accuracy,umpire_consistency,id,name
0,490098,2017-04-03,2017,0.847134,,427269,Jeff Kellogg
1,490099,2017-04-02,2017,0.865922,,427144,Paul Emmel
2,490100,2017-04-03,2017,0.853933,,427552,Mike Winters
3,490101,2017-04-03,2017,0.865385,,427538,Joe West
4,490102,2017-04-04,2017,0.909639,,427192,Brian Gorman
...,...,...,...,...,...,...,...
10402,642209,2021-03-21,2021,0.671642,,608093,Junior Valentine
10403,642216,2021-03-22,2021,0.885246,,427554,Jim Wolf
10404,642217,2021-03-28,2021,0.831461,,427315,Alfonso Marquez
10405,642219,2021-03-27,2021,0.77551,,607884,Paul Clemons
