In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
from pybaseball import statcast

## Importing data 

### Pitch-by-pitch data

Pitch by pitch data is imported via pybaseball. This data comes from Statcast (via Baseballsavant.com). It includes metrics such as velocity, spin rate, pitch coordinates etc. for all pitches, as well as outcomes of those pitches (Ball, strike, in play, etc.)

A description of all variables is available here:  https://baseballsavant.mlb.com/csv-docs

This data is not included on the github repo.

In [2]:
# Pull every pitch in the 2024 date window from Statcast through pybaseball.
# This is a large query and takes a while to finish.
season_window = {"start_dt": '2024-03-28', "end_dt": '2024-10-31'}
df_24 = statcast(**season_window)
df = pd.DataFrame(df_24)
# Optional: keep a local copy of the raw pull so it can be reloaded later
# df_24.to_csv("pitchbypitch_2024.csv")


This is a large query, it may take a moment to complete


100%|██████████| 218/218 [01:41<00:00,  2.14it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


In [22]:
#df = pd.read_csv("pitchbypitch_2024.csv")

### Seasonal batting and pitching data 

Alongside the pitch-by-pitch data, seasonal batting and pitching data from the 2024 season is downloaded directly from Baseballsavant.com via their custom leaderboard (https://baseballsavant.mlb.com/leaderboard/statcast). 

The dimensionality of this data has been reduced using principal component analysis (PCA) - please review the separate "principal component analysis" notebook for details on how this was done. Both raw and PCA variables will be used depending on the model. 

**how this data will be used** 

The seasonal data will be included alongside the pitch-by-pitch data for all players to capture their general trends as batters and pitchers. For example, the batter Shohei Ohtani's batting average for 2024 is .314 - this will be included as a variable against all of the pitches he saw that year. 

**Please note:** this approach does introduce some bias into the data, as the outcome of each individual pitch contributes to the seasonal averages, which may influence the predictions. Ideally, a cumulative average prior to a given pitch or previous-season averages should be used; however, this presents a number of data challenges such as small sample sizes, players who didn't pitch/bat in the previous season, etc. For simplicity this approach was chosen; however, players with a low number of plate appearances / batters faced were removed so that an individual pitch would have a negligible influence on the seasonal average.

In [4]:
# Load the 2024 seasonal batting statistics (file name indicates PCA columns are included)
batstats_path = "data/batstats24_withPCA.csv"
batstats = pd.read_csv(batstats_path)

In [5]:
# Discard columns that are not used downstream
batstats = batstats.drop(columns=['Unnamed: 0', 'year'])

In [6]:
# Give both frames the same "batter_id" key so they can be joined;
# the pitcher id is kept as a separate column.
df = df.rename(columns={"batter": "batter_id"})
batstats = batstats.rename(columns={"player_id": "batter_id"})

In [7]:
# Left join: every pitch keeps its row and gains the batter's seasonal stats
df = df.merge(batstats, on='batter_id', how='left')

In [8]:
# Load the 2024 seasonal pitching statistics (file name indicates PCA columns are included)
pitchstats_path = "data/pitchstats24_withPCA.csv"
pitchstats = pd.read_csv(pitchstats_path)

In [9]:
# Discard columns that are not used downstream
pitchstats = pitchstats.drop(columns=['Name', 'year', 'ab'])

In [10]:
# Tag the pitching principal components with "_pitch" so they differ from
# the batting stats columns.
columns_rename = ['PC1', 'PC2', 'PC3', 'PC4']

# Only columns actually present in pitchstats are renamed
present_pcs = [pc for pc in columns_rename if pc in pitchstats.columns]
pitchstats.rename(columns={pc: f"{pc}_pitch" for pc in present_pcs}, inplace=True)

In [11]:
# Tag the raw seasonal pitching metrics with "_pitch" so they differ from
# the batting stats columns.
columns_rename = [
    'k_percent', 'bb_percent', 'xba', 'xslg', 'xwoba', 'xwobacon',
    'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent',
    'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent',
    'poorlyweak_percent', 'hard_hit_percent', 'avg_hyper_speed',
    'z_swing_percent', 'z_swing_miss_percent', 'oz_swing_percent',
    'oz_swing_miss_percent', 'team_abbrev', 'xhr', 'xhr_rate',
]

# Only columns actually present in pitchstats are renamed
present_metrics = [metric for metric in columns_rename if metric in pitchstats.columns]
suffixed_metrics = [f"{metric}_pitch" for metric in present_metrics]
pitchstats.rename(columns=dict(zip(present_metrics, suffixed_metrics)), inplace=True)

In [12]:
# Sanity check: list the column names to confirm the "_pitch" suffix was applied
pitchstats.columns

Index(['player_id', 'k_percent_pitch', 'bb_percent_pitch', 'xba_pitch',
       'xslg_pitch', 'xwoba_pitch', 'xwobacon_pitch',
       'sweet_spot_percent_pitch', 'barrel_batted_rate_pitch',
       'solidcontact_percent_pitch', 'flareburner_percent_pitch',
       'poorlyunder_percent_pitch', 'poorlytopped_percent_pitch',
       'poorlyweak_percent_pitch', 'hard_hit_percent_pitch',
       'avg_hyper_speed_pitch', 'z_swing_percent_pitch',
       'z_swing_miss_percent_pitch', 'oz_swing_percent_pitch',
       'oz_swing_miss_percent_pitch', 'team_abbrev_pitch', 'xhr_pitch',
       'xhr_rate_pitch', 'PC1_pitch', 'PC2_pitch', 'PC3_pitch', 'PC4_pitch'],
      dtype='object')

In [13]:
# Give both frames the same "pitcher_id" key so they can be joined
df = df.rename(columns={"pitcher": "pitcher_id"})
pitchstats = pitchstats.rename(columns={"player_id": "pitcher_id"})

In [14]:
# Left join: every pitch keeps its row and gains the pitcher's seasonal stats
df = df.merge(pitchstats, on='pitcher_id', how='left')

### Cleaning and processing data

### Deriving Variables and feature engineering 

The outcome variable that will be used for prediction is derived from three variables. Firstly, the variable 'type', which categorises the outcome of a pitch as either a ball, a strike or 'in play'.
Strikes are split into swinging strikes and called strikes using the variable 'description'. They are split out because they fundamentally differ in zone location and in how the batter reacts (i.e. whether the batter decides to swing or not).

Lastly, 'in play' is split into weak or solid contact using the variable 'launch_speed_angle'. This is a metric derived by Statcast that categorises the quality of contact based on launch angle and speed: 
1. Weak
2. Topped
3. Under
4. Flare/Burner
5. Solid contact
6. Barrel
  
The first three categories (1-3) are grouped into 'weak' and the last three (4-6) are grouped into 'solid'.

The purpose of this split is to consider solely the quality of a hit regardless of the fielding outcome (i.e. whether the ball was caught or not). Fundamentally, a successful outing for a pitcher means getting strikes and, failing that, poor-quality contact. 

In [15]:
# Build the five-class target "outcome" from type, description and launch_speed_angle.
# Rows matching none of the rules are left without an outcome.
is_in_play = df['type'] == "X"
is_strike = df['type'] == "S"
is_ball = df['type'] == "B"
swinging_descriptions = ["swinging_strike", "swinging_strike_blocked"]
called_descriptions = ["called_strike"]

# Balls in play: launch_speed_angle categories 1-3 are weak, 4-6 are solid
df.loc[is_in_play & (df['launch_speed_angle'] <= 3), 'outcome'] = 'Weak contact'
df.loc[is_in_play & (df['launch_speed_angle'] > 3), 'outcome'] = 'Solid contact'

# Strikes: separated by whether the batter swung
df.loc[is_strike & df['description'].isin(swinging_descriptions), 'outcome'] = "Swinging strike"
df.loc[is_strike & df['description'].isin(called_descriptions), 'outcome'] = "Called strike"

df.loc[is_ball, 'outcome'] = "Ball"

There are additional outcomes that, whilst falling under these categories, are fundamentally different. These include intentional walks and foul balls. These are out of scope for this project and have therefore been removed.

In [16]:
# Cross-tabulate the detailed pitch description against the coarse pitch type
pd.crosstab(df["description"], df["type"], dropna=False)

type,B,S,X
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ball,240615,0,0
blocked_ball,15380,0,0
bunt_foul_tip,0,16,0
called_strike,0,118469,0
foul,0,131208,0
foul_bunt,0,1265,0
foul_tip,0,7469,0
hit_by_pitch,2066,0,0
hit_into_play,0,7,126632
missed_bunt,0,202,0


In [17]:
# Pitch descriptions that are out of scope for the model (fouls, bunts, pitchouts)
outcomes_to_remove = [
    "foul",
    "foul_tip",
    "foul_bunt",
    "bunt_foul_tip",
    "pitchout",
    "missed_bunt",
]
is_out_of_scope = df['description'].isin(outcomes_to_remove)
df = df[~is_out_of_scope]

In [18]:
# Remove intentional walks: drop every row whose play description ("des")
# contains the phrase, matched case-insensitively; missing descriptions are kept.
ibb_pattern = r'\bintentionally walks\b'
mentions_ibb = df['des'].str.contains(ibb_pattern, case=False, na=False)
df = df[~mentions_ibb]


In [19]:
# Class balance of the derived target
df["outcome"].value_counts()

outcome
Ball               257785
Called strike      118453
Swinging strike     80353
Weak contact        77650
Solid contact       48600
nan                   390
Name: count, dtype: int64

In [20]:
# A few rows have no usable outcome (per the original note, missing
# launch_speed_angle values); they are removed here.
# The 'nan' placeholder string is first turned into a real missing value.
# The result is reassigned instead of calling replace(..., inplace=True) on
# df['outcome']: that form mutates a temporary Series, which pandas warns about
# (chained assignment) and which has no effect under Copy-on-Write.
df = (
    df.assign(outcome=df['outcome'].replace('nan', np.nan))
      .dropna(subset=['outcome'])
)

In [21]:
# Check that each remaining description maps onto the expected outcome class
pd.crosstab(df["description"], df["outcome"], dropna=False)

outcome,Ball,Called strike,Solid contact,Swinging strike,Weak contact
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ball,240373,0,0,0,0
blocked_ball,15346,0,0,0,0
called_strike,0,118453,0,0,0
hit_by_pitch,2066,0,0,0,0
hit_into_play,0,0,48600,0,77650
swinging_strike,0,0,0,76321,0
swinging_strike_blocked,0,0,0,4032,0


In [75]:
# Check that launch_speed_angle 1-3 maps to weak contact and 4-6 to solid contact
pd.crosstab(df["launch_speed_angle"], df["outcome"], dropna=False)

outcome,Ball,Called strike,Solid contact,Swinging strike,Weak contact
launch_speed_angle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0,0,0,0,5618
2.0,0,0,0,0,39080
3.0,0,0,0,0,32952
4.0,0,0,30714,0,0
5.0,0,0,7937,0,0
6.0,0,0,9949,0,0
,257785,118453,0,80353,0


Some variables are also recoded to make them simple for analysis. 

1. On_base - how many people are on base at the time of the pitch (ranging from 0 to 3)
2. run_diff - what is the run differential of the fielding and batting team (positive indicates the fielding team is winning)

In [66]:
# Add "on_base": how many bases are occupied at the time of the pitch (0-3).
# A non-missing value in on_1b / on_2b / on_3b marks an occupied base.
# The source columns are deliberately NOT overwritten with booleans: doing so
# made this cell non-idempotent (a second run saw no missing values and
# reported 3 runners on every row). They are dropped later anyway.
base_cols = ["on_3b", "on_2b", "on_1b"]
df = df.assign(on_base=df[base_cols].notna().sum(axis=1))



In [67]:
# Distribution of the number of runners on base
df.on_base.value_counts()

on_base
0    337191
1    167461
2     66145
3     12399
Name: count, dtype: int64

In [68]:
# Run differential from the fielding team's point of view (pitching may be
# affected by whether the team is winning or losing): fielding score minus
# batting score, so a positive value means the fielding team is ahead.
df["run_diff"] = df["fld_score"] - df["bat_score"]
df["run_diff"].value_counts()

run_diff
 0     153961
-1      68484
 1      68087
 2      48756
-2      48221
 3      34832
-3      33380
 4      23058
-4      21741
 5      15527
-5      14380
 6       9964
-6       9647
 7       6952
-7       6494
 8       4012
-8       3586
 9       2496
-9       2408
 10      1896
-10      1603
 11       816
-11       715
 13       418
 12       383
-12       376
-13       336
 15       146
-14       135
 14       124
-15       109
 16        50
 17        42
-17        36
-16        25
Name: count, dtype: int64

In [69]:
# Convert zone to a categorical variable.
# The conversion is done with astype on the frame rather than
# `df.loc[:, 'zone'] = ...`: on pandas 2.x that form writes the values into the
# existing column in place and keeps its numeric dtype, so the categorical
# conversion was silently lost.
df = df.astype({'zone': 'category'})


In [73]:
# Drop columns that are not relevant to the analysis, including the raw
# columns that the derived variables (outcome, on_base, run_diff) were built from.
cols_drop = ["Unnamed: 0","game_date","hit_location","spin_dir","spin_rate_deprecated","break_angle_deprecated",
             "break_length_deprecated","des","game_year","on_3b","on_2b","on_1b","tfs_deprecated","tfs_zulu_deprecated",
             "fielder_2","umpire","sv_id","fielder_2.1","fielder_3","fielder_4","fielder_5","fielder_6","fielder_7",
             "fielder_8","fielder_9","estimated_ba_using_speedangle","estimated_woba_using_speedangle","woba_value",
             "woba_denom","babip_value","iso_value","home_score","away_score","bat_score","fld_score","post_away_score",
             "post_home_score","post_bat_score","post_fld_score","if_fielding_alignment","of_fielding_alignment",
             "delta_home_win_exp","delta_run_exp","bat_speed","swing_length", "home_team", "away_team", "game_type",
             "inning_topbot","events","hc_x","hc_y","hit_distance_sc","launch_speed","launch_angle", "description",
             "launch_speed_angle","bb_type", "game_pk"]

# errors="ignore": some names only exist on one load path. "Unnamed: 0" is the
# name pd.read_csv gives a saved, unnamed index, so it is absent when df comes
# straight from statcast(). Without this the drop raises KeyError.
df = df.drop(columns=cols_drop, errors="ignore")

#### Missing data
As described above, to ensure that seasonal averages don't exert too much influence on the pitch outcome prediction, we remove batters with fewer than 50 plate appearances and pitchers that have faced fewer than 50 batters. The seasonal data only includes players with at least 50 appearances; those with fewer therefore have missing seasonal values after the merge and are removed. There are a few other variables with missing data where Statcast failed to record pitch metrics; however, these make up only a negligible percentage of rows and have also been removed.

In [75]:
# Count missing values per column and print only the affected columns, largest first
na_per_column = df.isna().sum()
missing = na_per_column.sort_values(ascending=False)
has_missing = missing > 0
print(missing[has_missing])


team_abbrev_pitch           24454
team_abbrev                 22777
k_percent_pitch             15646
poorlyweak_percent_pitch    15646
bb_percent_pitch            15646
                            ...  
release_pos_y                 179
az                            179
sz_top                        179
sz_bot                        179
ay                            179
Length: 80, dtype: int64


In [76]:
# Remove rows with a missing value in any column, except the team
# abbreviation columns, which are not used in the analysis.
unused_cols = ('team_abbrev', 'team_abbrev_pitch')
required_cols = [name for name in df.columns if name not in unused_cols]
df = df.dropna(subset=required_cols)


In [43]:
#save to csv
#df.to_csv("pitchbypitch24_cleaned.csv", index = False)
# df_sub = df.sample(frac=0.01, random_state=1) #sample for review 
# df_sub.to_csv("pitchbypitch24_cleaned_sample.csv", index = False)

In [80]:
# Preview the cleaned frame (the notebook display truncates it to the first and last rows)
df

Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,batter_id,pitcher_id,zone,stand,p_throws,...,team_abbrev_pitch,xhr_pitch,xhr_rate_pitch,PC1_pitch,PC2_pitch,PC3_pitch,PC4_pitch,outcome,on_base,run_diff
0,KC,77.5,-1.11,5.65,"Buehler, Walker",657077,621111,13.0,L,R,...,LAD,13.6,0.044156,1.139925,1.172247,-1.084624,0.292072,Swinging strike,0,1
1,KC,78.7,-1.01,5.73,"Buehler, Walker",657077,621111,14.0,L,R,...,LAD,13.6,0.044156,1.139925,1.172247,-1.084624,0.292072,Swinging strike,0,1
2,FC,93.1,-1.19,5.53,"Buehler, Walker",657077,621111,14.0,L,R,...,LAD,13.6,0.044156,1.139925,1.172247,-1.084624,0.292072,Swinging strike,0,1
3,KC,78.5,-1.19,5.70,"Buehler, Walker",657077,621111,13.0,L,R,...,LAD,13.6,0.044156,1.139925,1.172247,-1.084624,0.292072,Ball,0,1
4,KC,77.4,-1.23,5.78,"Buehler, Walker",669224,621111,7.0,L,R,...,LAD,13.6,0.044156,1.139925,1.172247,-1.084624,0.292072,Swinging strike,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723741,CU,78.0,-2.24,5.69,"Eovaldi, Nathan",641355,543135,5.0,L,R,...,TEX,22.3,0.034735,-0.061800,0.372794,1.226477,-1.156957,Called strike,0,0
723742,FF,96.7,-2.41,5.34,"Eovaldi, Nathan",673548,543135,2.0,R,R,...,TEX,22.3,0.034735,-0.061800,0.372794,1.226477,-1.156957,Weak contact,0,0
723743,CU,78.1,-2.27,5.62,"Eovaldi, Nathan",673548,543135,9.0,R,R,...,TEX,22.3,0.034735,-0.061800,0.372794,1.226477,-1.156957,Called strike,0,0
723744,FF,97.0,-2.40,5.36,"Eovaldi, Nathan",673548,543135,14.0,R,R,...,TEX,22.3,0.034735,-0.061800,0.372794,1.226477,-1.156957,Ball,0,0
