In [2]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed
import pybaseball
pd.options.mode.chained_assignment = None 
#https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

This block will download all of the raw data from 2017 through 2021


Some thoughts:
    I have two options:
        1) Download the data and cut it down/clean it to what I want,
        2) Download the data, save that, then clean it and save that
        
        Option 1 is more memory efficient but takes more time, while
        option 2 is more time efficient if/when I make mistakes
        
        So for now I'm going with option two and if that is super fast then I will
        delete that result and combine them into one script. Okay here goes

In [9]:
def get_data_from_mlb(start_year, end_year = None):
    """
    Download Statcast pitch data for each season from start_year through end_year.

    Inputs:
        start_year:
            first season to download (int)
        end_year:
            last season to download, inclusive. Defaults to start_year,
            i.e. a single season.
    Output:
        One pandas df with every season's pitches stacked on top of each other.
        The per-season indexes are kept as returned by pybaseball (not reset).
    Raises:
        ValueError if end_year is earlier than start_year.
    """
    if end_year is None:
        end_year = start_year

    # Fail before any download starts; otherwise the empty range would only
    # surface later as pandas' "No objects to concatenate".
    if end_year < start_year:
        raise ValueError(
            f"end_year ({end_year}) must not be earlier than start_year ({start_year})"
        )

    dfs = [] #https://github.com/jldbc/pybaseball/blob/master/EXAMPLES/imputed_derivation.ipynb
    for year in range(start_year, end_year + 1):
        print(f"Starting year {year}")
        # The same Mar 28 - Oct 3 window is requested for every season; the
        # caller filters down to regular-season games afterwards.
        # NOTE(review): the window is hard-coded, so a regular-season game played
        # outside it (e.g. an early season opener) would not be downloaded -
        # confirm coverage for each season.
        dfs.append(pybaseball.statcast(start_dt=f'{year}-03-28', end_dt=f'{year}-10-03',verbose=False))
        #The statcast function is written in parallel, so that helps speed up this process

    # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
    print("Info succesfully downloaded from statcast")
    return pd.concat(dfs)
    

In [3]:
if __name__ == '__main__':

    # Pull every season once, keep only regular-season pitches, and cache the
    # result on disk so later cells can start from the csv instead of the network.
    result = get_data_from_mlb(2017, 2021)
    regular_season_mask = result['game_type'] == 'R' #Regular Season
    result = result[regular_season_mask]

    folder_name = "data_files/"
    raw_csv_path = folder_name + "raw_data_from_statcast.csv"

    print("Writing raw data to csv")
    result.to_csv(raw_csv_path, index = False)
    print("Raw data succesfully written to csv")
    

NameError: name 'get_data_from_mlb' is not defined

In [4]:
# Read back the cached regular-season dump written by the download cell.
folder_name = "data_files/"
print("Loading raw data")
raw_data = pd.read_csv(folder_name + "raw_data_from_statcast.csv")
print("Raw data loaded")

Loading raw data
Raw data loaded


In [97]:
# The only raw columns the grading code reads: game identifiers, the pitch
# description and location, the batter's strike-zone limits, and the running score.
key_point_columns = [
    "pitch_number",
    "game_date",
    "home_team",
    "game_year",
    "game_pk",
    "description",
    "plate_x",
    "plate_z",
    "sz_top",
    "sz_bot",
    "post_home_score",
    "post_away_score",
]
key_points = raw_data[key_point_columns]

# The pitches are deliberately NOT filtered down to umpire calls here:
# Rate_Game.grade_game reads the score from the game's first row and only then
# narrows the pitches to 'ball' / 'blocked_ball' / 'called_strike'.
folder_name = "data_files/"
print("Writing key_points data to csv")
key_points.to_csv(folder_name + "key_points.csv", index = False)
print("key_points written")

Writing key_points data to csv
key_points written


key_points.csv has all of the pitch information I need to work with. 

Now I need to group them by game, and rate the ump's performance for that game

In [261]:
class Rate_Game:
    """
    Grades the umpire's called pitches (balls and called strikes) in a set of pitches.

    Every method is a static method, so call them on the class:
    Rate_Game.grade_game(game_df, game_pk)

    I verified that these methods work using game_pk:490098
    I had it print the results here and then wrote it as a
    CSV and compared in excel
    """

    @staticmethod
    def grade_game(game_df, game_pk):
        """
        Build the one-row summary for a game.

        Inputs:
            game_df:
                pandas df of all of the pitches
                In most cases this will be one game, but I guess I could use this to get overall averages too
                Columns read: description, plate_x, plate_z, sz_top, sz_bot,
                game_date, game_year, home_team, post_home_score, post_away_score
            game_pk:
                ID of the game
        Output:
            A pandas series that will have the following information:
            game_pk, game_date, game_year, umpire_accuracy, umpire_consistency,
            home_team, home_score, away_score
            The two umpire grades are NaN when the game has no called pitches.
        Raises:
            IndexError if game_df has no rows at all.
        """
        game_df = game_df.sort_index()

        # Game-level values come from the first row (lowest index) of the whole
        # game, before any filtering, so they are available even if no pitch
        # was called.
        # assumes the lowest-index row of a game carries its final score - TODO
        # confirm against the row order of the source data
        game_date = game_df['game_date'].iloc[0]
        game_year = game_df['game_year'].iloc[0]
        home_team = game_df['home_team'].iloc[0]
        post_home_score = game_df['post_home_score'].iloc[0]
        post_away_score = game_df['post_away_score'].iloc[0]

        #Only looking at calls the umpire made
        umpire_calls = ['ball', 'blocked_ball', 'called_strike']
        called_df = game_df[game_df['description'].isin(umpire_calls)]

        return pd.Series({
            'game_pk' : game_pk,
            'game_date' : game_date,
            'game_year' : game_year,
            'umpire_accuracy' : Rate_Game.grade_accuracy(called_df),
            'umpire_consistency' : Rate_Game.grade_consistency(called_df),
            'home_team' : home_team,
            'home_score' : post_home_score,
            'away_score' : post_away_score
        })

    @staticmethod
    def grade_accuracy(game_df):
        """
        Fraction of pitches that were called correctly.

        Inputs:
            game_df:
                pandas df of all of the pitches (expected to hold called pitches
                only; any other description counts as an incorrect call).
                It is not modified.
        Output:
            How many pitches were called correctly divided by the total number
            of pitches in game_df. A call is correct when a 'called_strike' is
            inside the padded zone, or a 'ball' / 'blocked_ball' is outside it.
            NaN when game_df has no rows.
        """
        total_calls = len(game_df)
        if total_calls == 0:
            return np.nan

        # Padding added on every side of the zone, in feet (inches / 12).
        # NOTE(review): 2.9 in looks like a full ball width although the name
        # says half width - confirm whether 1.45/12 was intended.
        half_width_ball = 2.9/12
        sz_L = (-8.5/12) - half_width_ball
        sz_R = (8.5/12) + half_width_ball

        # A missing (NaN) coordinate makes every comparison False, so such a
        # pitch is treated as outside the zone.
        in_zone = (
            (game_df["plate_x"] > sz_L)
            & (game_df["plate_x"] < sz_R)
            & (game_df["plate_z"] > (game_df["sz_bot"] - half_width_ball))
            & (game_df["plate_z"] < (game_df["sz_top"] + half_width_ball))
        )

        called_strike = game_df['description'] == 'called_strike'
        called_ball = game_df['description'].isin(['ball', 'blocked_ball'])
        correct_call = (in_zone & called_strike) | (~in_zone & called_ball)

        return int(correct_call.sum()) / total_calls

    @staticmethod
    def grade_consistency(game_df):
        """
        Score of how consistent the ump was.

        A ball counts against the umpire when it is "boxed in" by called
        strikes: some called strike lies lower-left of it, some lower-right,
        some upper-left and some upper-right (strict comparisons on plate_x and
        on the normalized height). The score is

            called strikes / (called strikes + boxed-in balls)

        so 1.0 means no ball was boxed in by called strikes.

        Inputs:
            game_df:
                pandas df of all of the pitches; every row that is not a
                'called_strike' is treated as a ball. It is not modified.
        Output:
            The score as a float, or NaN when there are no called strikes.
        """
        # Rescale the height so that every batter's zone spans the same range:
        # sz_bot maps to 1.5 and sz_top maps to 3.5.
        normalized_pz = (2*(game_df['plate_z'] - game_df['sz_top'])/
            (game_df['sz_top'] - game_df['sz_bot'])) + 3.5

        is_strike = (game_df['description'] == 'called_strike').to_numpy(dtype=bool)
        num_strikes = int(is_strike.sum())
        if num_strikes == 0:
            return np.nan

        plate_x = game_df['plate_x'].to_numpy(dtype=float)
        plate_z = normalized_pz.to_numpy(dtype=float)

        # Balls as a column, strikes as a row: each comparison broadcasts to a
        # (balls x strikes) table holding one ball-vs-strike test per entry.
        ball_x = plate_x[~is_strike][:, np.newaxis]
        ball_z = plate_z[~is_strike][:, np.newaxis]
        strike_x = plate_x[is_strike][np.newaxis, :]
        strike_z = plate_z[is_strike][np.newaxis, :]

        ball_is_righter = ball_x > strike_x
        ball_is_lefter = ball_x < strike_x
        ball_is_higher = ball_z > strike_z
        ball_is_lower = ball_z < strike_z

        # Both conditions of a quadrant must hold for the SAME strike, hence the
        # element-wise & before any(axis=1). NaN coordinates compare False.
        boxed_in = (
            (ball_is_righter & ball_is_higher).any(axis=1)
            & (ball_is_lefter & ball_is_higher).any(axis=1)
            & (ball_is_righter & ball_is_lower).any(axis=1)
            & (ball_is_lefter & ball_is_lower).any(axis=1)
        )
        count = int(boxed_in.sum())

        return num_strikes/(num_strikes + count)


In [3]:
# Reload the trimmed pitch table from disk (the file written as key_points.csv),
# so the grading cells below do not need the raw Statcast dump in memory.
key_points = pd.read_csv(filepath_or_buffer = "data_files/key_points.csv")

In [255]:
#Parallelized in next block
#Good to have in series for testing the Rate_Game function
start = time.time()
# Group by the column name itself, not a one-element list: with a list key,
# pandas >= 2.0 yields each group key as a 1-tuple such as (490098,), which
# would end up in the summary instead of the plain game id.
game_groups = key_points.groupby('game_pk')
for game_pk, game_df in game_groups: #iters through every game
    print(Rate_Game.grade_game(game_df, game_pk), "\n")
    break #only the first game is graded here - this cell is a smoke test
time_elapsed =  time.time() - start
print(f"Time elapsed: {time_elapsed:.2f} seconds.")

Num strikes:  44
Num balls:  113
game_pk                   490098
game_date             2017-04-03
game_year                   2017
umpire_accuracy         0.847134
umpire_consistency       0.77193
home_team                    NYM
post_home_score                6
post_away_score                0
dtype: object 

Time elapsed: 0.85 seconds.


In [256]:
start = time.time()
# Group by the column name itself, not a one-element list: with a list key,
# pandas >= 2.0 yields each group key as a 1-tuple such as (490098,), and the
# later merge on game_pk would then find no matching umpire rows.
game_groups = key_points.groupby('game_pk')
# One joblib task per game, spread over all available cores (n_jobs = -1).
series_list = Parallel(n_jobs = -1, verbose = 4)(
    delayed(Rate_Game.grade_game)(game_df, game_pk) for game_pk, game_df in game_groups)
# Each task returns one Series; placed side by side and transposed they become
# one row per game.
game_summary = pd.concat(series_list, axis = 1).T
time_elapsed =  time.time() - start #Takes ~1/3 of time in parallel
print(f"Time elapsed: {time_elapsed:.2f} seconds.")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 1185 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2949 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4125 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 4786 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 5497 tasks      | el

Time elapsed: 443.73 seconds.


In [257]:
# Display-only preview in chronological order; game_summary itself is not reassigned.
game_summary.sort_values('game_date', ignore_index = True)

Unnamed: 0,game_pk,game_date,game_year,umpire_accuracy,umpire_consistency,home_team,post_home_score,post_away_score
0,490106,2017-04-02,2017,0.863946,0.867925,TB,7,3
1,490099,2017-04-02,2017,0.865922,0.941176,STL,4,3
2,490110,2017-04-02,2017,0.849673,0.96,ARI,6,5
3,490112,2017-04-03,2017,0.868132,1.0,BAL,3,2
4,490111,2017-04-03,2017,0.930435,0.974359,HOU,3,0
...,...,...,...,...,...,...,...,...
10610,632246,2021-10-03,2021,0.866071,0.945946,MIA,5,4
10611,632252,2021-10-03,2021,0.918605,0.983051,CWS,2,5
10612,632253,2021-10-03,2021,0.846154,1.0,ARI,5,4
10613,632239,2021-10-03,2021,0.953846,1.0,NYY,1,0


To this point, we have a pandas df with one row for every game. Info included in this df is game_pk, game_date, game_year, umpire accuracy, umpire consistency, the home team, and the home and away scores. 

Now we need to match the games with the umpire. The umpire spreadsheet was found [here](https://billpetti.github.io/baseball_tools/) after some Googling. I've tried to figure out how the Umpire Scorecards guy automates that but I've been unsuccessful, and this does the job. 

In [258]:
# Load the umpire assignments and keep only the home-plate ("HP") umpire of each game.
umpires = pd.read_csv("data_files/umpires_ids_game_pk.csv")
is_home_plate = umpires['position'] == "HP"
umpires = umpires[is_home_plate]

# Keep the rows whose game_date text mentions one of the five seasons. A single
# regex alternation replaces five OR-ed substring tests; na=False keeps rows
# with a missing date out of the result.
season_pattern = '2021|2020|2019|2018|2017'
in_studied_seasons = umpires['game_date'].str.contains(season_pattern, na = False)
umpires = umpires[in_studied_seasons]

# Tip: game_summary.info(memory_usage = "deep") and umpires.info(memory_usage = "deep")
# report how much memory each df takes.

In [17]:
# np.unique returns the distinct names in sorted order; they are written out
# as a single-column csv.
unique_umpire_names = np.unique(umpires[['name']])
pd.DataFrame(unique_umpire_names).to_csv('data_files/' + "umpires.csv", index = False)

In [260]:
# Inner join on game_pk: each graded game gets its umpire row, and games without
# a matching umpire row drop out. The umpire table's own date column
# (game_date_y) and its position column are then removed.
big_df = (
    game_summary
    .merge(umpires, on = "game_pk")
    .drop(columns = ['game_date_y', 'position'])
)

Unnamed: 0,game_pk,game_date_x,game_year,umpire_accuracy,umpire_consistency,home_team,post_home_score,post_away_score,id,name
0,490098,2017-04-03,2017,0.847134,0.77193,NYM,6,0,427269,Jeff Kellogg
1,490099,2017-04-02,2017,0.865922,0.941176,STL,4,3,427144,Paul Emmel
2,490100,2017-04-03,2017,0.853933,0.966102,TEX,5,8,427552,Mike Winters
3,490101,2017-04-03,2017,0.865385,0.862745,MIL,5,7,427538,Joe West
4,490102,2017-04-04,2017,0.909639,0.981132,CWS,3,6,427192,Brian Gorman
...,...,...,...,...,...,...,...,...,...,...
10211,634651,2021-04-04,2021,0.863354,0.910714,OAK,2,9,503493,Sean Barber
10212,634652,2021-04-01,2021,0.873134,1.0,MIA,0,1,427299,Jerry Layne
10213,634653,2021-04-02,2021,0.923077,0.927273,COL,6,11,482620,Chris Conroy
10214,634654,2021-04-03,2021,0.863388,0.918033,CIN,9,6,427413,Tony Randazzo


In [262]:
# Persist the merged game / umpire table without the row index.
big_df.to_csv(path_or_buf = "data_files/umpire_performance.csv", index = False)

The table above is the umpire performance data.