In [1]:
!pip install pybaseball -q

[K     |████████████████████████████████| 409kB 5.8MB/s 
[K     |████████████████████████████████| 296kB 29.5MB/s 
[K     |████████████████████████████████| 17.7MB 234kB/s 
[?25h

In [2]:
from pybaseball import statcast
import pandas as pd

In [3]:
# Fetch the Statcast rows for a single day and list that day's game ids
raw = statcast('2019-07-23')
pd.unique(raw['game_pk']).astype(int)

array([567604, 567310, 567218, 567012, 566913, 566716, 566522, 566424,
       566227, 566033, 565943, 565851, 565657, 565559, 564880])

In [4]:
# Columns of the raw frame that the rest of the notebook uses
target = [
    'game_pk',
    'game_date',
    'home_team',
    'away_team',
    'inning',
    'inning_topbot',
    'events',
    'des',
    'description',
    'on_1b',
    'on_2b',
    'on_3b',
    'outs_when_up',
    'home_score',
    'away_score',
]

In [5]:
# Keep only the `target` columns, then drop every row whose 'events' is null.
# To look at a single game instead, filter `raw` first, e.g.
#   raw[raw['game_pk'] == 567012]   or   raw[raw['game_pk'] == 567310]
# Caveat: a baserunner can advance (e.g. from 2nd to 3rd on a wild pitch),
# increasing leverage, without anything being recorded under 'events'; that
# information is lost by the dropna below. Example rows: raw.iloc[1228:1233]
df = raw.reindex(columns=target).dropna(subset=['events'])

In [6]:
# Load the lookup table (li_lookup.csv) that is later joined onto the game data
lookup = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/michael-rowland/stressful-baseball/main/li_lookup.csv'
)

In [7]:
# Split the space-separated 'bases' string into separate boolean columns
# (on_1b, on_2b, ...): a token counts as True when it is numeric.
# Background on expanding list cells into columns:
# https://chrisalbon.com/python/data_wrangling/pandas_expand_cells_containing_lists/
bases = pd.DataFrame(
    [[token.isnumeric() for token in state.split(' ')] for state in lookup['bases']],
    index=lookup.index,
).rename(columns=lambda pos: f'on_{pos+1}b')
# Append the new boolean columns next to the original lookup columns
lookup = pd.concat([lookup, bases], axis=1)

In [8]:
# Reshape to "tidy" (long) form: one row per game state and score difference.
# The nine score-difference columns become a single integer 'score_diff'
# column, and rows without a leverage value are discarded.
lookup = (
    pd.melt(
        lookup,
        id_vars=['inning', 'inning_topbot', 'outs', 'on_1b', 'on_2b', 'on_3b'],
        value_vars=['-4', '-3', '-2', '-1', '0', '1', '2', '3', '4'],
        var_name='score_diff',
        value_name='leverage',
    )
    .astype({'score_diff': int})
    .dropna(subset=['leverage'])
)

In [9]:
# Build the game-state columns that are used as join keys against `lookup`.

# Score difference (home minus away), capped to [-4, 4] because the lookup
# table only has score-difference columns from -4 to 4.
df['score_diff'] = (df['home_score'] - df['away_score']).clip(lower=-4, upper=4)

# Keep the real inning number in 'inning_pre'. The guard makes this cell safe to
# re-run: without it, a second run would copy the already-capped 'inning' and
# lose the true inning count for extra-inning games.
if 'inning_pre' not in df.columns:
    df['inning_pre'] = df['inning']
# Any inning from the 9th onward is treated as the 9th for the lookup
df['inning'] = df['inning_pre'].clip(upper=9)

# Base columns become booleans: True when the value is > 0, False otherwise
# (missing values compare as False).
df['on_1b'] = df['on_1b'].gt(0)
df['on_2b'] = df['on_2b'].gt(0)
df['on_3b'] = df['on_3b'].gt(0)

# The lookup table calls this column 'outs'
df['outs'] = df['outs_when_up']

In [10]:
# Join the leverage value onto each row by its game state, recording the frame
# shape before and after so the join can be checked afterwards.
pre_merge_size = df.shape
df = df.merge(
    lookup,
    how='inner',
    on=['inning', 'inning_topbot', 'outs', 'on_1b', 'on_2b', 'on_3b', 'score_diff'],
)
post_merge_size = df.shape

In [11]:
# Show both shapes; the row count must be the same before and after the merge
print(pre_merge_size, post_merge_size, sep='\n')
assert pre_merge_size[0] == post_merge_size[0]

(1238, 18)
(1238, 19)


In [12]:
# Optional, Google Colab only: write df to a CSV file and download it.
# from google.colab import files
# df.to_csv('test_output.csv')
# files.download('test_output.csv')

In [13]:
# Preview the first rows of the merged frame
df.head(n=5)

Unnamed: 0,game_pk,game_date,home_team,away_team,inning,inning_topbot,events,des,description,on_1b,on_2b,on_3b,outs_when_up,home_score,away_score,score_diff,inning_pre,outs,leverage
0,567604.0,2019-07-23,MIL,CIN,9.0,Bot,grounded_into_double_play,"Jesus Aguilar grounds into a double play, thir...",hit_into_play,True,False,False,1.0,6.0,14.0,-4.0,9.0,1.0,0.6
1,567604.0,2019-07-23,MIL,CIN,9.0,Bot,single,Orlando Arcia singles on a ground ball to cent...,hit_into_play_no_out,False,False,False,1.0,6.0,14.0,-4.0,9.0,1.0,0.2
2,567310.0,2019-07-23,CWS,MIA,9.0,Bot,field_out,Jose Abreu flies out sharply to center fielder...,hit_into_play,False,False,False,1.0,1.0,5.0,-4.0,9.0,1.0,0.2
3,566424.0,2019-07-23,SEA,TEX,9.0,Bot,field_out,"Austin Nola grounds out, shortstop Elvis Andru...",hit_into_play,False,False,False,1.0,2.0,7.0,-4.0,9.0,1.0,0.2
4,564880.0,2019-07-23,ARI,BAL,9.0,Bot,field_out,Christian Walker flies out to right fielder Tr...,hit_into_play,False,False,False,1.0,2.0,7.0,-4.0,9.0,1.0,0.2


In [14]:
# Per game: summed leverage divided by the highest (uncapped) inning reached
games = df.groupby(by='game_pk')
games['leverage'].sum().div(games['inning_pre'].max())

game_pk
564880.0     5.188889
565559.0    13.493333
565657.0    10.572727
565851.0    12.488889
565943.0     6.144444
566033.0     7.122222
566227.0     9.111111
566424.0     5.033333
566522.0    11.653846
566716.0    10.500000
566913.0    12.230000
567012.0    17.100000
567218.0    13.733333
567310.0     4.166667
567604.0     5.355556
dtype: float64

In [15]:
# Highest (uncapped) inning recorded for game 565559
df.loc[df['game_pk'] == 565559, 'inning_pre'].max()

15.0