In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
pd.set_option('display.max_columns', None)
from IPython.display import display

from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup

In [7]:
states = {
    (0,0): 0,
    (1,0): 1,
    (2,0): 2,
    (3,0): 3,
    (0,1): 4,
    (0,2): 5,
    (1,1): 6,
    (1,2): 7,
    (2,1): 8,
    (2,2): 9,
    (3,1): 10,
    (3,2): 11,
    "Out": 12, 
    "Single": 13, 
    "Double": 14, 
    "Triple": 15, 
    "HR": 16, 
    "Walk": 17
}

In [2]:
id = playerid_lookup("Halladay","Roy").key_mlbam.item() 
data = statcast_pitcher("2009-04-06", "2009-09-30", player_id = id)

Gathering player lookup table. This may take a moment.
Gathering Player Data


In [22]:
data['year'] = pd.DatetimeIndex(data['game_date']).year 
data['month'] = pd.DatetimeIndex(data['game_date']).month
df_2009 = data[(data['year']==2009) & (data['month'] >= 4)]
df_season = df_2009.filter(items=['balls',
                                'strikes',
                                'events',
                                'description',
                                'game_date'])
df_season = df_season.iloc[::-1] # reverse order from earliest to latest
non_terminal_states = list(zip(df_season.balls, df_season.strikes))
df_season['non_terminal_states'] = non_terminal_states
df_season = df_season.reset_index()
df_season.head(20)

Unnamed: 0,index,balls,strikes,events,description,game_date,non_terminal_states
0,3393,0,0,,ball,2009-04-06,"(0, 0)"
1,3392,1,0,,called_strike,2009-04-06,"(1, 0)"
2,3391,1,1,,ball,2009-04-06,"(1, 1)"
3,3390,2,1,,ball,2009-04-06,"(2, 1)"
4,3389,3,1,walk,ball,2009-04-06,"(3, 1)"
5,3388,0,0,,called_strike,2009-04-06,"(0, 0)"
6,3387,0,1,grounded_into_double_play,hit_into_play,2009-04-06,"(0, 1)"
7,3386,0,0,,swinging_strike,2009-04-06,"(0, 0)"
8,3385,0,1,,swinging_strike,2009-04-06,"(0, 1)"
9,3384,0,2,,ball,2009-04-06,"(0, 2)"


In [12]:
df_season.events.unique(), df_season.description.unique()

(array([nan, 'walk', 'grounded_into_double_play', 'field_out', 'strikeout',
        'home_run', 'single', 'double', 'field_error', 'force_out',
        'hit_by_pitch', 'sac_bunt', 'double_play', 'sac_fly', 'triple'],
       dtype=object),
 array(['ball', 'called_strike', 'hit_into_play', 'swinging_strike',
        'foul', 'foul_bunt', 'foul_tip', 'blocked_ball', 'hit_by_pitch',
        'swinging_strike_blocked', 'pitchout', 'missed_bunt'], dtype=object))

In [23]:
# Retrieve pitch and action transitions sequences
pitch_seq = []
action_seq = []
walk_flag = False
for i, row in df_season.iterrows():
    event = row['events']
    # no outcome, add non-terminal state
    if str(event) == "nan" or str(event) == "field_error" or str(event) == "fielders_choice":
        pitch_seq.append(states[row['non_terminal_states']])
    # if Out, add terminal state
    elif (str(event) == "strikeout" or 
            str(event) == "field_out" or 
            str(event) == "grounded_into_double_play" or 
            str(event) == "fielders_choice_out" or 
            str(event) == "sac_fly" or 
            str(event) == "force_out" or 
            str(event) == "sac_bunt" or 
            str(event) == "caught_stealing_2b" or 
            str(event) == "double_play"):
        pitch_seq.append(states["Out"])
    elif (str(event) == "single"):
        pitch_seq.append(states["Single"])
    elif (str(event) == "double"):
        pitch_seq.append(states["Double"])
    elif (str(event) == "triple"):
        pitch_seq.append(states["Triple"])
    elif (str(event) == "home_run"):
        pitch_seq.append(states["HR"])
    elif (str(event) == "walk"):
        pitch_seq.append(states[row['non_terminal_states']])
        pitch_seq.append(states["Walk"])
        walk_flag = True
    if (row['description'] == 'foul' or 
            row['description'] == 'swinging_strike' or 
            row['description'] == 'foul_tip' or
            row['description'] == 'swinging_strike_blocked' or
            row['description'] == 'foul_bunt' or 
            row['description'] == 'missed_bunt' or 
            row['description'] == 'hit_into_play'):
        action_seq.append(1)
    elif (row['description'] == 'called_strike' or 
            row['description'] == 'ball' or 
            row['description'] == 'blocked_ball' or 
            row['description'] == 'hit_by_pitch' or
            row['description'] == 'pitchout'):
        action_seq.append(0)
        if walk_flag:
            action_seq.append(0)
            walk_flag = False
len(pitch_seq), len(action_seq)

(3424, 3429)

In [26]:
a = {'state': pitch_seq,
     'action': action_seq}
pitch_seq_df = pd.DataFrame.from_dict(a, orient='index')
df = pitch_seq_df.transpose()
df

Unnamed: 0,state,action
0,0.0,0.0
1,1.0,0.0
2,6.0,0.0
3,8.0,0.0
4,10.0,0.0
...,...,...
3424,,0.0
3425,,1.0
3426,,0.0
3427,,0.0


In [28]:
df['next_state'] = df['state'].shift(-1)
M = np.zeros((12, 18, 2))
for i, row in df.iterrows():
    if math.isnan(row['next_state']) or current_state >= 12 or math.isnan(row['state']):
        continue
    current_state = int(row['state'])
    action = int(row['action'])
    next_state = int(row['next_state'])
    M[current_state, next_state, action] += 1

for row in M[:,:,0]:
    n = sum(row)
    if n > 0:
        row[:] = [f/sum(row) for f in row]

for row in M[:,:,1]:
    n = sum(row)
    if n > 0:
        row[:] = [f/sum(row) for f in row]

In [30]:
M.shape

(12, 18, 2)