In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from baseball_id import Lookup

In [3]:
'''
We'll be looking at various aspects of the batter-pitch interaction that affect swing length before moving on to the batter's swing length and its effect on the likelihood of hitting a line drive or a big hit. The key will be to group the events by their characteristics, and the groups will have to be quite narrow. How narrow we make them will determine whether we get valuable insights without overfitting. We'd also like to see what kind of pitch gets certain batters not to swing; even for those pitches, we could count ball-four walks as a "success". In the end, what we want to deliver is a batter's percentage of hitting a line drive, popping out, getting a big hit, striking out, and walking (but not limited to these events) given a pitcher (or a pitch style, e.g. if a certain batter tends to strike out swinging predominantly against lefty curves and sliders).

An example insight from this analysis might look like this:
Shohei Ohtani tends to have longer swing lengths against two kinds of pitch: a nasty (high position variation) curveball from a left-handed pitcher that crosses within his strike zone, and a fastball from either a left- or right-handed pitcher that lands towards the top of the strike zone. However, because curveballs are slower, he tends to swing and miss on most of the nasty curveballs. In contrast, he tends to be really patient with 2-seam fastballs and sliders from right-handed pitchers, as he rarely swings at these; he tends to miss sliders that land in the strike zone, while not swinging at any of the 2-seam fastballs outside the strike zone. Therefore, with Ohtani, we suggest you make full use of his batting style against pitchers that tend to throw a lot of 4-seam and 2-seam fastballs.
'''

'\nwe\'ll be looking at various aspects of the batter-pitch interaction that effects swing length before moving onto looking at the batter\'s swing length and its effect on the possibility of hitting a line drive or a big hit. For this the key will be to group the events by their characteristics and the groups will have to be quite narrow. How narrow we get would be the key to coming up with very valuable insights without overfitting. We\'d also like to see what kind of pitch gets certain batters to not swing. Even with these though we could use ball 4 walkouts as "success". In the end, what we want to deliver is a batter\'s percentage of hitting a line drive, pop out, big hit, strike out, and walk out (but not limited to these events) given a pitcher (or pitch style so if a certain batter tends to have swing strikeouts predominantly lefty curves and sliders).\n\nso like an example insight from this analysis might be something like this:\nShohei Ohtani tends to have longer swing length

In [17]:
# Columns removed immediately after loading; they are not carried into df_raw.
drop_columns = [
    'pitch_type',
    'spin_dir',
    'spin_rate_deprecated',
    'break_angle_deprecated',
    'break_length_deprecated',
    'tfs_deprecated',
    'tfs_zulu_deprecated',
    'umpire',
    "game_date",
    "game_year",
    "des",
    "player_name",
]

# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# machine that has this iCloud folder; consider a configurable data directory.
statcast_csv_path = "/Users/ianchang/Library/Mobile Documents/com~apple~CloudDocs/1. Project/Baseball/baseball/raw-data-ignore/statcast_pitch_swing_data_20240402_20240630.csv"

df_raw = (
    pd.read_csv(statcast_csv_path, low_memory=False)
    .drop(columns=drop_columns)
)

# Rows whose description mentions "bunt" (case-insensitive) are excluded;
# rows with a missing description are kept (na=False -> not a bunt).
is_bunt = df_raw['description'].str.contains('bunt', case=False, na=False)
df_no_bunt = df_raw[~is_bunt].copy()

In [18]:
# Columns that must be numeric; values that cannot be parsed become NaN.
num_cols = [
    'release_speed',
    'release_pos_x',
    'release_pos_z',
    'plate_x',
    'plate_z',
    'hit_distance_sc',
    'launch_speed',
    'launch_angle',
    'effective_speed',
    'release_spin_rate',
]

# Convert every listed column first, then write them all back in one assignment.
coerced_columns = {
    column: pd.to_numeric(df_no_bunt[column], errors='coerce')
    for column in num_cols
}
df_no_bunt[num_cols] = pd.DataFrame(coerced_columns)

In [19]:
# Display the bunt-filtered frame as this cell's output to eyeball the columns and values.
df_no_bunt

Unnamed: 0,release_speed,release_pos_x,release_pos_z,batter,pitcher,events,description,zone,game_type,stand,...,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
0,95.0,-2.01,5.22,677587,622491,single,hit_into_play,3.0,R,L,...,0,4,0,Infield shade,Standard,239.0,-0.006,0.388,,
1,88.5,-2.09,4.95,677587,622491,,foul,4.0,R,L,...,0,4,0,Standard,Standard,253.0,0.000,-0.045,,
2,95.0,-2.02,5.12,677587,622491,,called_strike,2.0,R,L,...,0,4,0,Standard,Standard,238.0,0.000,-0.042,,
3,90.7,-1.26,5.13,660271,657277,walk,ball,11.0,R,L,...,5,5,2,Infield shade,Standard,234.0,0.009,0.082,,
4,95.4,-1.95,5.12,595978,622491,strikeout,foul_tip,5.0,R,R,...,0,4,0,Standard,Standard,238.0,0.008,-0.215,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346245,85.9,1.63,5.68,595909,694363,,called_strike,4.0,R,L,...,2,3,2,Standard,Standard,144.0,0.000,-0.067,,
346246,89.9,-1.12,6.41,650391,641755,,blocked_ball,14.0,R,R,...,3,3,3,Standard,Standard,200.0,0.000,0.057,,
346247,73.7,-2.44,5.80,700250,670102,,called_strike,6.0,R,L,...,1,8,1,Infield shade,Standard,47.0,0.000,-0.033,,
346248,94.1,2.97,5.97,593160,677053,,ball,12.0,R,R,...,5,5,6,Standard,Standard,139.0,0.000,0.032,,


In [52]:
# Spot-check the player-ID lookup on a single MLB id before using it at scale.
sample_lookup = Lookup.from_mlb_ids([593160])

# to_string() renders the index label followed by the value, so after
# splitting on whitespace the first token is the index, not part of the name.
name_variable = sample_lookup.bref_name.to_string().strip().split()

# Show only the name tokens.
name_variable[1:]

['Whit', 'Merrifield']

In [56]:
def _lookup_mlb_name(mlb_id):
    """Return the ``mlb_name`` for one MLB player id, or ``None`` when unavailable.

    The value is read directly from the lookup result rather than parsed out
    of ``Series.to_string()``, so an id with no match yields ``None`` instead
    of fragments of the empty-Series text, and only the first match is used
    if the lookup returns several rows.
    """
    names = Lookup.from_mlb_ids([mlb_id]).mlb_name
    if names.empty or pd.isna(names.iloc[0]):
        return None
    # Collapse runs of whitespace, matching the earlier split()/join() result.
    return " ".join(str(names.iloc[0]).split())


# Resolve every distinct player id exactly once. The same player shows up on
# many rows (and batters and pitchers share one id space), so this replaces
# two lookups per row with one lookup per unique id.
player_ids = (
    pd.concat([df_no_bunt["batter"], df_no_bunt["pitcher"]], ignore_index=True)
    .dropna()
    .unique()
    .tolist()  # plain Python scalars, as the row-wise version passed
)
player_names = {player_id: _lookup_mlb_name(player_id) for player_id in player_ids}

# Vectorised assignment; ids without a resolved name end up missing (NaN/None).
df_no_bunt["batter_name"] = df_no_bunt["batter"].map(player_names)
df_no_bunt["pitcher_name"] = df_no_bunt["pitcher"].map(player_names)
