In [146]:
import pandas as pd

In [147]:
# Z-score floors consulted by get_break, keyed by break level: a pitch is
# promoted to a level only when its z-score is strictly greater than that
# level's floor.
# NOTE(review): get_break stops climbing at level 6, so the entry for level 7
# is never read -- confirm whether that is intended.
break_conversion = dict([
    (2, 0),
    (3, 0.25),
    (4, 0.5),
    (5, 1.0),
    (6, 1.5),
    (7, 2.0),
])

In [148]:
def get_break(brk):
    """Convert a break z-score into a break level between 1 and 6.

    Starting from level 1, the level is raised one step at a time for as long
    as ``brk`` is strictly greater than the next level's floor in
    ``break_conversion``; the climb stops at the first floor that is not
    exceeded.

    Args:
        brk: z-score of a pitch's break. A value that does not compare greater
            than the first floor (NaN included, since NaN comparisons are
            False) yields level 1.

    Returns:
        int: the break level, from 1 to 6 inclusive.
    """
    level = 1
    # Only the floors for levels 2..6 are consulted; level 6 is the ceiling.
    for next_level in range(2, 7):
        if not brk > break_conversion[next_level]:
            break
        level = next_level
    return level

In [149]:
# Z-score distance from the season mean (in both break and speed) that a pitch
# must reach to be classed as its variant rather than its default type.
_VARIANT_Z = 0.5

# Pitches that come in a default and a variant flavour:
# (output column, CSV column prefix, default label, variant label,
#  True  -> variant is the faster / less-break one,
#  False -> variant is the slower / more-break one)
_TWO_TYPE_PITCHES = (
    ('Slider', 'sl', 'Slider', 'Hard Slider', True),
    ('Curve', 'cu', 'Curve', 'Slow Curve', False),
    ('Sinker', 'si', 'Sinker', 'Hard Sinker', True),
)

# Pitches with a single flavour:
# (output column, break column, companion column that must also be populated, label)
_ONE_TYPE_PITCHES = (
    ('Change', 'ch_avg_break', 'ch_avg_speed', 'Changeup'),
    # NOTE(review): the cutter is gated on spin, not speed like the other
    # pitches -- presumably the export carries no fc_avg_speed column; confirm.
    ('Cutter', 'fc_avg_break', 'fc_avg_spin', 'Cutter'),
    ('Splitter', 'fs_avg_break', 'fs_avg_speed', 'Splitter'),
)


def _zscore(values):
    """Standardise a Series against its own mean and sample standard deviation."""
    return (values - values.mean()) / values.std()


def _flag_pitch(has_pitch, default_label, is_variant=None, variant_label=None):
    """Return an object Series of pitch labels, with None where has_pitch is False."""
    flags = pd.Series([None] * len(has_pitch), index=has_pitch.index, dtype=object)
    flags.loc[has_pitch] = default_label
    if is_variant is not None:
        flags.loc[has_pitch & is_variant] = variant_label
    return flags


def _break_labels(flags, breaks):
    """Build one '<label> - <break level>' string per row ('None' where the row has no label)."""
    break_z = pd.Series(float('nan'), index=breaks.index, dtype=float)
    # Break is standardised separately inside each label group, so a variant
    # pitch is rated against other variants rather than against the defaults.
    for label in flags.dropna().unique():
        members = flags == label
        break_z.loc[members] = _zscore(breaks[members])
    return [
        f"{flag} - {get_break(z)}" if pd.notna(flag) else 'None'
        for flag, z in zip(flags, break_z)
    ]


# from Baseball Savant, get breaking ball data using TY data
def read_bb(year, player_map_path='../PLAYERIDMAP 2022.csv'):
    """Classify every pitcher's breaking pitches for one season and rate their break.

    Reads ``Breaking Balls {year - 1}-{year}.csv`` from the working directory,
    keeps the rows whose ``year`` column equals ``year`` and, for each of six
    pitches (slider, changeup, curveball, sinker, cutter, splitter):

    1. decides whether the pitcher throws the pitch (both source columns are
       populated) and, for slider / curveball / sinker, whether it is the
       default type or the variant (break and speed z-scores both at least
       0.5 away from the mean in the variant's direction);
    2. z-scores the pitch's break within its label group;
    3. converts that z-score to a break level with ``get_break``.

    Args:
        year: season to keep; also selects the input file name.
        player_map_path: CSV providing ``MLBID`` and ``PLAYERNAME`` columns,
            used to attach player names. Defaults to the 2022 map.

    Returns:
        pandas.DataFrame indexed by player id with columns ``PLAYERNAME``,
        ``Slider``, ``Change``, ``Curve``, ``Sinker``, ``Cutter`` and
        ``Splitter``. Each pitch column holds ``'<label> - <level>'`` or the
        string ``'None'`` when the pitcher does not throw that pitch.
        ``PLAYERNAME`` is NaN for ids missing from the player map (left join).

    Raises:
        FileNotFoundError: if either CSV file does not exist.
        KeyError: if an expected column is missing from either file.
    """
    raw = pd.read_csv(f'Breaking Balls {year - 1}-{year}.csv', index_col=False)
    # 'Unnamed: 16' is presumably an empty trailing column produced by the
    # export; tolerate files that do not have it instead of raising.
    raw = raw.drop(columns='Unnamed: 16', errors='ignore')
    season = raw[raw['year'] == year]

    # Labels are collected in a fresh frame so nothing is ever assigned onto
    # the filtered slice of ``raw`` (avoids SettingWithCopyWarning).
    labels = pd.DataFrame({'player_id': season['player_id']})

    for column, prefix, default_label, variant_label, variant_is_faster in _TWO_TYPE_PITCHES:
        breaks = season[f'{prefix}_avg_break']
        brk_z = _zscore(breaks)
        spd_z = _zscore(season[f'{prefix}_avg_speed'])
        if variant_is_faster:
            # e.g. Hard Slider / Hard Sinker: faster, less break
            is_variant = (brk_z <= -_VARIANT_Z) & (spd_z >= _VARIANT_Z)
        else:
            # e.g. Slow Curve: slower, more break
            is_variant = (brk_z >= _VARIANT_Z) & (spd_z <= -_VARIANT_Z)
        # A pitcher throws the pitch only when both z-scores are available.
        has_pitch = brk_z.notna() & spd_z.notna()
        flags = _flag_pitch(has_pitch, default_label, is_variant, variant_label)
        labels[column] = _break_labels(flags, breaks)

    for column, break_col, companion_col, label in _ONE_TYPE_PITCHES:
        breaks = season[break_col]
        has_pitch = breaks.notna() & season[companion_col].notna()
        labels[column] = _break_labels(_flag_pitch(has_pitch, label), breaks)

    player_map = pd.read_csv(player_map_path)[['MLBID', 'PLAYERNAME']].set_index('MLBID')
    merged = labels.set_index('player_id').merge(
        player_map, how='left', left_index=True, right_index=True
    )

    return merged[['PLAYERNAME', 'Slider', 'Change', 'Curve', 'Sinker', 'Cutter', 'Splitter']]
    
# Build the 2022 pitch-label table and show it as the cell's output.
bb_data = read_bb(year=2022)
bb_data

Unnamed: 0,PLAYERNAME,Slider,Change,Curve,Sinker,Cutter,Splitter
425794,Adam Wainwright,,Changeup - 2,Slow Curve - 6,Sinker - 1,Cutter - 1,
425844,Zack Greinke,Slider - 3,Changeup - 1,Slow Curve - 1,Sinker - 1,Cutter - 1,
434378,Justin Verlander,Hard Slider - 5,Changeup - 5,Curve - 2,,,
434671,Anibal Sanchez,Slider - 1,Changeup - 2,Curve - 1,Sinker - 1,Cutter - 5,
445276,Kenley Jansen,Slider - 1,,,Sinker - 4,Cutter - 6,
...,...,...,...,...,...,...,...
682051,Garrett Hill,Slider - 1,Changeup - 1,Curve - 4,Sinker - 4,,
682171,Penn Murfee,Slider - 6,,,Sinker - 1,,
686752,Ryan Pepiot,Slider - 1,Changeup - 4,,,,
689225,Beau Brieske,Slider - 1,Changeup - 6,Curve - 1,Sinker - 4,,


In [150]:
# Export the table built in the previous cell; reusing bb_data avoids
# re-reading both CSVs and recomputing every label just to write the file.
bb_data.to_csv('2022_BreakingBall.csv')