# Feature Engineering
The purpose of this notebook is to engineer additional features, to improve on the model performance from the pipeline_architecture.ipynb file. The main goals are to reduce the bias in the pitch location predictions and to increase the predictive power of the pitch classification algorithm with new features.

Importing various packages:

In [180]:
import os
import pickle
from importlib import reload

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from sqlalchemy import create_engine
%config InlineBackend.figure_formats = ['retina']
%matplotlib inline

# Notebook-wide plotting defaults: 9x6 figures, then seaborn's whitegrid theme
# with the 'notebook' context and slightly enlarged fonts.
plt.rcParams.update({'figure.figsize': (9, 6)})
sns.set(font_scale=1.2, style='whitegrid', context='notebook')

Opening up a SQLAlchemy engine, to work with the data in SQL.

In [181]:
# Create the SQLAlchemy engine used by every query in this notebook.
# The connection URL can be supplied through the MLB_PITCHES_DB_URL environment
# variable so credentials do not have to live in the notebook. When the variable
# is unset, the original local development URL is used, so existing runs behave
# exactly as before.
DB_URL = os.environ.get(
    'MLB_PITCHES_DB_URL',
    'postgresql://patrickbovard:localhost@localhost:5432/mlb_pitches',
)
engine = create_engine(DB_URL)

Re-acquainting myself with all the data I have:

### At-bats:

In [182]:
# Preview query: pull five rows from the `atbats` table to see which columns it has.
query = '''
--first, selecting all the standard columns:
SELECT *
FROM atbats
LIMIT 5
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

# Display the preview rows.
df.head()

Unnamed: 0.1,Unnamed: 0,inning,top,ab_id,g_id,p_score,batter_id,pitcher_id,stand,p_throws,event,o
0,0,1.0,1.0,2019000000.0,201900001.0,0.0,594777,571666,L,R,Flyout,1
1,1,1.0,1.0,2019000000.0,201900001.0,0.0,545361,571666,R,R,Flyout,2
2,2,1.0,1.0,2019000000.0,201900001.0,0.0,571506,571666,L,R,Groundout,3
3,3,1.0,0.0,2019000000.0,201900001.0,0.0,543257,502239,L,R,Single,0
4,4,1.0,0.0,2019000000.0,201900001.0,0.0,656305,502239,R,R,Flyout,1


All of these (outside of IDs) are currently in use as features, with the exception of event - perhaps the previous at-bat's event could help predict pitch type?

### Games:

In [4]:
# Preview query: pull five rows from the `games` table to see which columns it has.
query = '''
--first, selecting all the standard columns:
SELECT *
FROM games
LIMIT 5
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

# Display the preview rows.
df.head()

Unnamed: 0.1,Unnamed: 0,attendance,away_final_score,away_team,date,elapsed_time,g_id,home_final_score,home_team,start_time,umpire_1B,umpire_2B,umpire_3B,umpire_HP,venue_name,weather,wind,delay
0,0,35055.0,3.0,sln,2015-04-05,184.0,201500001.0,0.0,chn,7:17 PM,Mark Wegner,Marty Foster,Mike Muchlinski,Mike Winters,Wrigley Field,"44 degrees, clear","7 mph, In from CF",0.0
1,1,45909.0,1.0,ana,2015-04-06,153.0,201500002.0,4.0,sea,1:12 PM,Ron Kulpa,Brian Knight,Vic Carapazza,Larry Vanover,Safeco Field,"54 degrees, cloudy","1 mph, Varies",0.0
2,2,36969.0,2.0,atl,2015-04-06,156.0,201500003.0,1.0,mia,4:22 PM,Laz Diaz,Chris Guccione,Cory Blaser,Jeff Nelson,Marlins Park,"80 degrees, partly cloudy","16 mph, In from CF",16.0
3,3,31042.0,6.0,bal,2015-04-06,181.0,201500004.0,2.0,tba,3:12 PM,Ed Hickox,Paul Nauert,Mike Estabrook,Dana DeMuth,Tropicana Field,"72 degrees, dome","0 mph, None",0.0
4,4,45549.0,8.0,bos,2015-04-06,181.0,201500005.0,0.0,phi,3:08 PM,Phil Cuzzi,Tony Randazzo,Will Little,Gerry Davis,Citizens Bank Park,"71 degrees, partly cloudy","11 mph, Out to RF",0.0


### Pitches:

In [5]:
# Preview query: pull 100 rows from the `pitches` table. The cells that follow
# inspect individual columns (type_confidence, nasty) on this sample.
# The statement is terminated by a single ';' (the original had a stray second
# one, which sent an extra empty statement to the database).
query = '''
--first, selecting all the standard columns:
SELECT *
FROM pitches
LIMIT 100
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

# Display the first rows of the sample.
df.head()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,...,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,92.8,84.1,2689.935,151.40200000000004,-40.7,3.4,23.7,12.043,...,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,...,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,91.0,84.0,1289.59,169.75099999999995,-1.3,5.0,23.8,2.104,...,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,...,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0


In [6]:
# Frequency of each distinct type_confidence value in the pitches sample.
df['type_confidence'].value_counts()

2.0      93
0.648     1
0.778     1
0.693     1
0.898     1
0.821     1
0.763     1
Name: type_confidence, dtype: int64

In [7]:
# Summary statistics (count, mean, spread, quartiles) for the `nasty` column.
df['nasty'].describe()

count     99.000000
mean      43.979798
std       16.744161
min       12.000000
25%       31.500000
50%       43.000000
75%       53.500000
max      100.000000
Name: nasty, dtype: float64

In [8]:
# List the column labels of the DataFrame currently held in `df`.
df.columns

Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'type_confidence', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0',
       'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone', 'code', 'type', 'pitch_type',
       'event_num', 'b_score', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b'],
      dtype='object')

## Other Pitch Rates:

In modeling_prep.ipynb, I used a query to create running pitch counts for each pitcher.  Utilizing a similar format for some new ones:

Repeating, but over last 100 pitches:

In [9]:
# Pitch-type codes to track, in the column order of the result.
PITCH_TYPES = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']
# Number of the pitcher's previous pitches included in each running count.
RATE_WINDOW = 100

# Per-pitcher window over pitches ordered by at-bat and pitch number: the
# RATE_WINDOW rows before the current pitch, with the current pitch excluded.
rate_window_sql = (
    'OVER (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC '
    f'ROWS {RATE_WINDOW} PRECEDING EXCLUDE CURRENT ROW)'
)
# One count column per pitch type (e.g. last_100_ff), generated from PITCH_TYPES
# instead of being copied by hand once per pitch type.
rate_columns_sql = ',\n'.join(
    f"(count(CASE WHEN pitch_type = '{pitch}' THEN pitch_type END) {rate_window_sql}) "
    f"AS last_{RATE_WINDOW}_{pitch.lower()}"
    for pitch in PITCH_TYPES
)

query = f'''
--first, selecting all the standard columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--selecting counts of each pitch type, over the last {RATE_WINDOW} pitches the pitcher has thrown:
{rate_columns_sql}

FROM full_pitch_data
LIMIT 1000
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,last_100_ff,last_100_sl,last_100_ft,last_100_ch,last_100_cu,last_100_si,last_100_fc,last_100_kc,last_100_fs,last_100_kn,last_100_ep,last_100_fo,last_100_sc
0,112526,Bartolo Colon,FF,0,0,0,0,0,0,0,0,0,0,0,0,0
1,112526,Bartolo Colon,FT,1,0,0,0,0,0,0,0,0,0,0,0,0
2,112526,Bartolo Colon,SL,1,0,1,0,0,0,0,0,0,0,0,0,0
3,112526,Bartolo Colon,FF,1,1,1,0,0,0,0,0,0,0,0,0,0
4,112526,Bartolo Colon,FT,2,1,1,0,0,0,0,0,0,0,0,0,0


What about the average px/pz (pitch location) for each pitch type, over the last x pitches a pitcher has thrown:

In [10]:
# Pitch-type codes to track, in the column order of the result.
PITCH_TYPES = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']
# Number of the pitcher's previous pitches each average looks back over.
PX_WINDOW = 10

# Per-pitcher window over pitches ordered by at-bat and pitch number: the
# PX_WINDOW rows before the current pitch, with the current pitch excluded.
px_window_sql = (
    'OVER (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC '
    f'ROWS {PX_WINDOW} PRECEDING EXCLUDE CURRENT ROW)'
)
# One avg-px column per pitch type. The CASE yields NULL for other pitch types,
# so avg() only sees px values of the matching type inside the window and
# returns NULL when the window holds none.
px_columns_sql = ',\n'.join(
    f"(avg(CASE WHEN pitch_type = '{pitch}' THEN px END) {px_window_sql}) "
    f"AS avg_px_{pitch.lower()}"
    for pitch in PITCH_TYPES
)

query = f'''
--first, selecting all the standard columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--selecting avg px, over the last {PX_WINDOW} pitches the pitcher has thrown:
{px_columns_sql}

FROM full_pitch_data
LIMIT 1000
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,avg_px_ff,avg_px_sl,avg_px_ft,avg_px_ch,avg_px_cu,avg_px_si,avg_px_fc,avg_px_kc,avg_px_fs,avg_px_kn,avg_px_ep,avg_px_fo,avg_px_sc
0,112526,Bartolo Colon,FF,,,,,,,,,,,,,
1,112526,Bartolo Colon,FT,0.445,,,,,,,,,,,,
2,112526,Bartolo Colon,SL,0.445,,-0.296,,,,,,,,,,
3,112526,Bartolo Colon,FF,0.445,0.748,-0.296,,,,,,,,,,
4,112526,Bartolo Colon,FT,0.751,0.748,-0.296,,,,,,,,,,


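First, an alternative version for px: partitioning by pitch type as well, so each average covers the previous 3 pitches of that same type. Then the same as above, but for pz: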

In [11]:
# Pitch types covered by this experiment, in the column order of the result.
SAME_TYPE_PITCH_TYPES = ['FF', 'FT', 'CU', 'CH', 'SI']
# Number of previous same-type pitches each average looks back over.
SAME_TYPE_WINDOW = 3

# The window is partitioned by pitcher AND pitch type, so the preceding rows are
# the pitcher's previous pitches of the same type as the current one
# (current pitch excluded).
same_type_window_sql = (
    'OVER (PARTITION BY pitcher_id, pitch_type ORDER BY ab_id, pitch_num ASC '
    f'ROWS {SAME_TYPE_WINDOW} PRECEDING EXCLUDE CURRENT ROW)'
)
# One avg-px column per pitch type. The FILTER keeps only rows of that type, so
# a column is NULL on rows whose own pitch type differs.
same_type_px_columns_sql = ',\n'.join(
    f"(avg(px) FILTER (WHERE pitch_type = '{pitch}') {same_type_window_sql}) "
    f"AS avg_px_{pitch.lower()}"
    for pitch in SAME_TYPE_PITCH_TYPES
)

# The SQL comment now describes what the query computes; the original was a
# copy-paste of the "counts ... last 100 pitches" comment.
query = f'''
--first, selecting all the standard columns:
SELECT pitcher_id, pitcher_full_name, pitch_type, px,
--selecting avg px over the previous {SAME_TYPE_WINDOW} pitches of the same type (window is partitioned by pitcher and pitch type):
{same_type_px_columns_sql}

FROM full_pitch_data
ORDER BY ab_id, pitch_num ASC
LIMIT 1000
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

df.head(15)

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,px,avg_px_ff,avg_px_ft,avg_px_cu,avg_px_ch,avg_px_si
0,452657,Jon Lester,FF,0.416,,,,,
1,452657,Jon Lester,FF,-0.191,0.416,,,,
2,452657,Jon Lester,FF,-0.518,0.1125,,,,
3,452657,Jon Lester,FF,-0.641,-0.097667,,,,
4,452657,Jon Lester,CU,-1.821,,,,,
5,452657,Jon Lester,FF,0.627,-0.45,,,,
6,452657,Jon Lester,FF,-1.088,-0.177333,,,,
7,452657,Jon Lester,FC,-0.257,,,,,
8,452657,Jon Lester,FF,1.47,-0.367333,,,,
9,452657,Jon Lester,FF,-1.337,0.336333,,,,


In [12]:
# Pitch-type codes to track, in the column order of the result.
PITCH_TYPES = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']
# Number of the pitcher's previous pitches each average looks back over.
PZ_WINDOW = 10

# Per-pitcher window over pitches ordered by at-bat and pitch number: the
# PZ_WINDOW rows before the current pitch, with the current pitch excluded.
pz_window_sql = (
    'OVER (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC '
    f'ROWS {PZ_WINDOW} PRECEDING EXCLUDE CURRENT ROW)'
)
# One avg-pz column per pitch type. The CASE yields NULL for other pitch types,
# so avg() only sees pz values of the matching type inside the window and
# returns NULL when the window holds none.
pz_columns_sql = ',\n'.join(
    f"(avg(CASE WHEN pitch_type = '{pitch}' THEN pz END) {pz_window_sql}) "
    f"AS avg_pz_{pitch.lower()}"
    for pitch in PITCH_TYPES
)

# The SQL comment now describes what the query computes; the original was a
# copy-paste of the "counts ... last 100 pitches" comment.
query = f'''
--first, selecting all the standard columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--selecting avg pz, over the last {PZ_WINDOW} pitches the pitcher has thrown:
{pz_columns_sql}

FROM full_pitch_data
LIMIT 1000
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,avg_pz_ff,avg_pz_sl,avg_pz_ft,avg_pz_ch,avg_pz_cu,avg_pz_si,avg_pz_fc,avg_pz_kc,avg_pz_fs,avg_pz_kn,avg_pz_ep,avg_pz_fo,avg_pz_sc
0,112526,Bartolo Colon,FF,,,,,,,,,,,,,
1,112526,Bartolo Colon,FT,2.705,,,,,,,,,,,,
2,112526,Bartolo Colon,SL,2.705,,1.189,,,,,,,,,,
3,112526,Bartolo Colon,FF,2.705,1.26,1.189,,,,,,,,,,
4,112526,Bartolo Colon,FT,3.1155,1.26,1.189,,,,,,,,,,


These work and likely can paint a good picture of where the pitcher is locating the ball, but will have to handle the NaN - a fair way could be middle of the strikezone (0 for px, ~1.85 for pz), since I don't want to lose those rows.  

Columns with None value can be removed from that pitcher's modeling, or changed to 0 - ultimately, it won't matter since they don't throw that pitch.

## Merging the Above in one query:

In [13]:
# Preview query: the first ten rows of `full_pitch_data`, ordered by at-bat id
# and pitch number, to check which columns are available before merging.
query = '''
--first, selecting all the standard columns:
SELECT *
FROM full_pitch_data
ORDER BY ab_id, pitch_num ASC
LIMIT 10
;
'''
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql(query, engine)

# Display all ten preview rows.
df.head(10)

Unnamed: 0,inning,batter_id,pitcher_id,top,ab_id,p_score,stand,p_throws,event,home_team,...,pitch_num,last_pitch_type,last_pitch_px,last_pitch_pz,last_pitch_speed,pitcher_full_name,pitcher_run_diff,hitter_full_name,Date_Time_Date,Season
0,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,1.0,,,,,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
1,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,2.0,FF,0.416,2.963,92.9,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
2,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,3.0,FF,-0.191,2.347,92.8,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
3,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,4.0,FF,-0.518,3.284,94.1,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
4,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,5.0,FF,-0.641,1.221,91.0,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
5,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,6.0,CU,-1.821,2.083,75.4,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
6,1.0,518792,452657,1.0,2015000000.0,0.0,L,L,Double,chn,...,1.0,,,,,Jon Lester,0.0,Jason Heyward,2015-04-05,2015
7,1.0,518792,452657,1.0,2015000000.0,0.0,L,L,Double,chn,...,2.0,FF,-1.088,1.61,93.3,Jon Lester,0.0,Jason Heyward,2015-04-05,2015
8,1.0,407812,452657,1.0,2015000000.0,0.0,R,L,Single,chn,...,1.0,,,,,Jon Lester,0.0,Matt Holliday,2015-04-05,2015
9,1.0,407812,452657,1.0,2015000000.0,0.0,R,L,Single,chn,...,2.0,FF,1.47,2.35,92.1,Jon Lester,0.0,Matt Holliday,2015-04-05,2015


In [14]:
# Pitch-type codes to track; each one gets a running count, an avg-px and an
# avg-pz column, in this order.
PITCH_TYPES = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']
# Look-back (in pitches by the same pitcher) for the pitch-type counts.
RATE_WINDOW = 100
# Look-back (in pitches by the same pitcher) for the px/pz location averages.
LOCATION_WINDOW = 3

# Per-pitcher window over pitches ordered by at-bat and pitch number: the
# n_rows rows before the current pitch, with the current pitch excluded.
window_template = (
    'OVER (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC '
    'ROWS {n_rows} PRECEDING EXCLUDE CURRENT ROW)'
)
rate_window_sql = window_template.format(n_rows=RATE_WINDOW)
location_window_sql = window_template.format(n_rows=LOCATION_WINDOW)

# Column lists are generated from PITCH_TYPES instead of being copied by hand
# once per pitch type; the SQL sent to the database is the same as before.
rate_columns_sql = ',\n'.join(
    f"(count(CASE WHEN pitch_type = '{pitch}' THEN pitch_type END) {rate_window_sql}) "
    f"AS last_{RATE_WINDOW}_{pitch.lower()}"
    for pitch in PITCH_TYPES
)
px_columns_sql = ',\n'.join(
    f"(avg(CASE WHEN pitch_type = '{pitch}' THEN px END) {location_window_sql}) "
    f"AS avg_px_{pitch.lower()}"
    for pitch in PITCH_TYPES
)
pz_columns_sql = ',\n'.join(
    f"(avg(CASE WHEN pitch_type = '{pitch}' THEN pz END) {location_window_sql}) "
    f"AS avg_pz_{pitch.lower()}"
    for pitch in PITCH_TYPES
)

query = f'''
--first, selecting all the standard columns:
SELECT pitcher_id, batter_id, event, pitcher_full_name, pitch_type, "Season",
--selecting counts of each pitch type, over the last {RATE_WINDOW} pitches the pitcher has thrown:
{rate_columns_sql},

--selecting avg px, over the last {LOCATION_WINDOW} pitches the pitcher has thrown:
{px_columns_sql},

--selecting avg pz, over the last {LOCATION_WINDOW} pitches the pitcher has thrown:
{pz_columns_sql}

FROM full_pitch_data
ORDER BY ab_id, pitch_num ASC
;
'''
# No LIMIT here: this loads the full result set into memory.
df = pd.read_sql(query, engine)

df.head(10)

Unnamed: 0,pitcher_id,batter_id,event,pitcher_full_name,pitch_type,Season,last_100_ff,last_100_sl,last_100_ft,last_100_ch,...,avg_pz_ch,avg_pz_cu,avg_pz_si,avg_pz_fc,avg_pz_kc,avg_pz_fs,avg_pz_kn,avg_pz_ep,avg_pz_fo,avg_pz_sc
0,452657,572761,Groundout,Jon Lester,FF,2015,0,0,0,0,...,,,,,,,,,,
1,452657,572761,Groundout,Jon Lester,FF,2015,1,0,0,0,...,,,,,,,,,,
2,452657,572761,Groundout,Jon Lester,FF,2015,2,0,0,0,...,,,,,,,,,,
3,452657,572761,Groundout,Jon Lester,FF,2015,3,0,0,0,...,,,,,,,,,,
4,452657,572761,Groundout,Jon Lester,CU,2015,4,0,0,0,...,,,,,,,,,,
5,452657,572761,Groundout,Jon Lester,FF,2015,4,0,0,0,...,,2.083,,,,,,,,
6,452657,518792,Double,Jon Lester,FF,2015,5,0,0,0,...,,2.083,,,,,,,,
7,452657,518792,Double,Jon Lester,FC,2015,6,0,0,0,...,,2.083,,,,,,,,
8,452657,407812,Single,Jon Lester,FF,2015,6,0,0,0,...,,,,2.047,,,,,,
9,452657,407812,Single,Jon Lester,FF,2015,7,0,0,0,...,,,,2.047,,,,,,


In [15]:
# Show the final ten rows of the merged feature frame.
df.iloc[-10:]

Unnamed: 0,pitcher_id,batter_id,event,pitcher_full_name,pitch_type,Season,last_100_ff,last_100_sl,last_100_ft,last_100_ch,...,avg_pz_ch,avg_pz_cu,avg_pz_si,avg_pz_fc,avg_pz_kc,avg_pz_fs,avg_pz_kn,avg_pz_ep,avg_pz_fo,avg_pz_sc
3555824,571704,663993,Groundout,Ken Giles,SL,2019,43,54,3,0,...,,,,,,,,,,
3555825,571704,663993,Groundout,Ken Giles,SL,2019,43,54,3,0,...,,,,,,,,,,
3555826,571704,622110,Groundout,Ken Giles,FF,2019,43,54,3,0,...,,,,,,,,,,
3555827,571704,622110,Groundout,Ken Giles,SL,2019,43,54,3,0,...,,,,,,,,,,
3555828,571704,622110,Groundout,Ken Giles,SL,2019,43,54,3,0,...,,,,,,,,,,
3555829,571704,622110,Groundout,Ken Giles,FF,2019,42,55,3,0,...,,,,,,,,,,
3555830,571704,605421,Strikeout,Ken Giles,SL,2019,42,55,3,0,...,,,,,,,,,,
3555831,571704,605421,Strikeout,Ken Giles,FF,2019,41,56,3,0,...,,,,,,,,,,
3555832,571704,605421,Strikeout,Ken Giles,SL,2019,41,56,3,0,...,,,,,,,,,,
3555833,571704,605421,Strikeout,Ken Giles,SL,2019,40,57,3,0,...,,,,,,,,,,


In [16]:
# Last rows among the seasons other than 2019.
df.loc[df['Season'] != 2019].tail(15)

Unnamed: 0,pitcher_id,batter_id,event,pitcher_full_name,pitch_type,Season,last_100_ff,last_100_sl,last_100_ft,last_100_ch,...,avg_pz_ch,avg_pz_cu,avg_pz_si,avg_pz_fc,avg_pz_kc,avg_pz_fs,avg_pz_kn,avg_pz_ep,avg_pz_fo,avg_pz_sc
2848356,623352,450314,Flyout,Josh Hader,SL,2018,82,17,1,0,...,,,,,,,,,,
2848357,623352,450314,Flyout,Josh Hader,FF,2018,81,18,1,0,...,,,,,,,,,,
2848358,623352,595879,Single,Josh Hader,SL,2018,81,18,1,0,...,,,,,,,,,,
2848359,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,
2848360,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,
2848361,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,
2848362,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,
2848363,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,
2848364,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,
2848365,623352,595879,Single,Josh Hader,FF,2018,80,19,1,0,...,,,,,,,,,,


Saving this data as a new pickled file: (commenting out after initial run)

In [17]:
# One-off export of `df` to ../Data/new_pitch_rates.pickle. It is left
# disabled after the initial run; uncomment both lines to regenerate the file.
#with open('../Data/new_pitch_rates.pickle', 'wb') as to_write:
#    pickle.dump(df, to_write)

## Next Round:
Based on the results in Pipeline_Part_2.ipynb, the features engineered above did not improve model performance much.  The cells below engineer additional features: counts of each pitch type over a pitcher's previous 10 and previous 5 pitches.

In [9]:
# Pull a ten-row sample of full_pitch_data, ordered by at-bat and pitch number,
# to review which columns are available before building new features.
query = '''

SELECT *
FROM full_pitch_data
ORDER BY ab_id, pitch_num ASC
LIMIT 10
;
'''
df = pd.read_sql(sql=query, con=engine)

df.head(n=10)

Unnamed: 0,inning,batter_id,pitcher_id,top,ab_id,p_score,stand,p_throws,event,home_team,...,pitch_num,last_pitch_type,last_pitch_px,last_pitch_pz,last_pitch_speed,pitcher_full_name,pitcher_run_diff,hitter_full_name,Date_Time_Date,Season
0,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,1.0,,,,,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
1,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,2.0,FF,0.416,2.963,92.9,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
2,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,3.0,FF,-0.191,2.347,92.8,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
3,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,4.0,FF,-0.518,3.284,94.1,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
4,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,5.0,FF,-0.641,1.221,91.0,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
5,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,chn,...,6.0,CU,-1.821,2.083,75.4,Jon Lester,0.0,Matt Carpenter,2015-04-05,2015
6,1.0,518792,452657,1.0,2015000000.0,0.0,L,L,Double,chn,...,1.0,,,,,Jon Lester,0.0,Jason Heyward,2015-04-05,2015
7,1.0,518792,452657,1.0,2015000000.0,0.0,L,L,Double,chn,...,2.0,FF,-1.088,1.61,93.3,Jon Lester,0.0,Jason Heyward,2015-04-05,2015
8,1.0,407812,452657,1.0,2015000000.0,0.0,R,L,Single,chn,...,1.0,,,,,Jon Lester,0.0,Matt Holliday,2015-04-05,2015
9,1.0,407812,452657,1.0,2015000000.0,0.0,R,L,Single,chn,...,2.0,FF,1.47,2.35,92.1,Jon Lester,0.0,Matt Holliday,2015-04-05,2015


In [10]:
# List every column of the full_pitch_data sample loaded into `df`.
df.columns

Index(['inning', 'batter_id', 'pitcher_id', 'top', 'ab_id', 'p_score', 'stand',
       'p_throws', 'event', 'home_team', 'away_team', 'b_score', 'on_1b',
       'on_2b', 'on_3b', 'px', 'pz', 'zone', 'pitch_type', 'start_speed',
       'type', 'b_count', 's_count', 'outs', 'pitch_num', 'last_pitch_type',
       'last_pitch_px', 'last_pitch_pz', 'last_pitch_speed',
       'pitcher_full_name', 'pitcher_run_diff', 'hitter_full_name',
       'Date_Time_Date', 'Season'],
      dtype='object')

In [190]:
def rolling_pitch_count_columns(pitch_types, window_sizes):
    """Build the SELECT-list SQL for rolling per-pitcher pitch-type counts.

    For every (window size, pitch type) pair this emits one window-function
    expression that counts how many of the pitcher's previous `size` pitches
    (ordered by ab_id, pitch_num; the current pitch is excluded) were of that
    type, aliased as ``last_<size>_<pitch type in lower case>``.

    Parameters
    ----------
    pitch_types : iterable of str
        Pitch type codes as stored in the pitch_type column (e.g. 'FF').
    window_sizes : iterable of int
        Number of preceding pitches in each window (e.g. 10 and 5).

    Returns
    -------
    str
        Comma-separated column expressions, one per line, grouped by window
        size and, within each group, in the order of `pitch_types`.
    """
    template = (
        "(count(CASE WHEN pitch_type = '{code}' THEN pitch_type END) "
        "OVER (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC "
        "ROWS {size} PRECEDING EXCLUDE CURRENT ROW)) AS last_{size}_{alias}"
    )
    return ',\n'.join(
        # int() keeps anything but a whole number out of the frame clause.
        template.format(code=code, size=int(size), alias=code.lower())
        for size in window_sizes
        for code in pitch_types
    )


# Pitch types and window sizes, listed in the order the output columns appear:
# every last_10_* column first, then every last_5_* column.
PITCH_TYPES = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']
WINDOW_SIZES = [10, 5]

query = f'''
--first, selecting all the standard columns:
SELECT pitcher_id, batter_id, event, pitcher_full_name, pitch_type, "Season",
--selecting counts of each pitch type, over the last 10 and last 5 pitches the pitcher has thrown:
{rolling_pitch_count_columns(PITCH_TYPES, WINDOW_SIZES)}

FROM full_pitch_data
ORDER BY ab_id, pitch_num ASC
;
'''
last10_df = pd.read_sql(query, engine)

last10_df.head(10)

Unnamed: 0,pitcher_id,batter_id,event,pitcher_full_name,pitch_type,Season,last_10_ff,last_10_sl,last_10_ft,last_10_ch,...,last_5_ch,last_5_cu,last_5_si,last_5_fc,last_5_kc,last_5_fs,last_5_kn,last_5_ep,last_5_fo,last_5_sc
0,452657,572761,Groundout,Jon Lester,FF,2015,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,452657,572761,Groundout,Jon Lester,FF,2015,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,452657,572761,Groundout,Jon Lester,FF,2015,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,452657,572761,Groundout,Jon Lester,FF,2015,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,452657,572761,Groundout,Jon Lester,CU,2015,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,452657,572761,Groundout,Jon Lester,FF,2015,4,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,452657,518792,Double,Jon Lester,FF,2015,5,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,452657,518792,Double,Jon Lester,FC,2015,6,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,452657,407812,Single,Jon Lester,FF,2015,6,0,0,0,...,0,1,0,1,0,0,0,0,0,0
9,452657,407812,Single,Jon Lester,FF,2015,7,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [191]:
# Check the result's columns: the six selected pitch/at-bat columns followed by
# the last_10_* and last_5_* count columns.
last10_df.columns

Index(['pitcher_id', 'batter_id', 'event', 'pitcher_full_name', 'pitch_type',
       'Season', 'last_10_ff', 'last_10_sl', 'last_10_ft', 'last_10_ch',
       'last_10_cu', 'last_10_si', 'last_10_fc', 'last_10_kc', 'last_10_fs',
       'last_10_kn', 'last_10_ep', 'last_10_fo', 'last_10_sc', 'last_5_ff',
       'last_5_sl', 'last_5_ft', 'last_5_ch', 'last_5_cu', 'last_5_si',
       'last_5_fc', 'last_5_kc', 'last_5_fs', 'last_5_kn', 'last_5_ep',
       'last_5_fo', 'last_5_sc'],
      dtype='object')

In [192]:
# Show the final 10 rows whose Season is anything other than 2019.
last10_df.loc[last10_df['Season'] != 2019].tail(10)

Unnamed: 0,pitcher_id,batter_id,event,pitcher_full_name,pitch_type,Season,last_10_ff,last_10_sl,last_10_ft,last_10_ch,...,last_5_ch,last_5_cu,last_5_si,last_5_fc,last_5_kc,last_5_fs,last_5_kn,last_5_ep,last_5_fo,last_5_sc
2848361,623352,595879,Single,Josh Hader,FF,2018,7,3,0,0,...,0,0,0,0,0,0,0,0,0,0
2848362,623352,595879,Single,Josh Hader,FF,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848363,623352,595879,Single,Josh Hader,FF,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848364,623352,595879,Single,Josh Hader,FF,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848365,623352,595879,Single,Josh Hader,FF,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848366,623352,595879,Single,Josh Hader,SL,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848367,623352,519203,Flyout,Josh Hader,FF,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848368,623352,519203,Flyout,Josh Hader,FF,2018,8,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2848369,623352,519203,Flyout,Josh Hader,FF,2018,9,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2848370,623352,519203,Flyout,Josh Hader,FF,2018,9,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
# (rows, columns) of the rolling-count result; the query has no WHERE or LIMIT.
last10_df.shape

(3555834, 32)

Pickling out the last 10 and last 5 data to use in my modeling:

In [194]:
# Persist the rolling-count frame to ../Data/last_10_data.pickle.
with open('../Data/last_10_data.pickle', 'wb') as pickle_file:
    pickle.dump(last10_df, pickle_file)

# Next: Pipeline_Part_2.ipynb