# Feature Engineering
The purpose of this notebook is to work on engineering more features, to improve model performance from the pipeline_architecture.ipynb file.  The pitch location was underfitting, and pitch type was not as accurate as would be ideal.

Importing various packages:

In [2]:
import os
import pickle
from importlib import reload

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from sqlalchemy import create_engine
%config InlineBackend.figure_formats = ['retina']
%matplotlib inline

# Default size (width, height) for every matplotlib figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (9, 6)
# Seaborn styling: notebook context, white grid background, fonts scaled by 1.2.
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

Opening up a SQLAlchemy engine, so the data can be queried with SQL.

In [3]:
# Create the SQLAlchemy engine that every query in this notebook runs through.
# The connection URL can be supplied through the MLB_PITCHES_DB_URL environment
# variable so that credentials do not have to live in the notebook; when the
# variable is not set, the original local development database URL is used.
engine = create_engine(
    os.environ.get(
        'MLB_PITCHES_DB_URL',
        'postgresql://patrickbovard:localhost@localhost:5432/mlb_pitches',
    )
)

Re-acquainting myself with all the data I have:

### At-bats:

In [5]:
# Preview query: every column of the atbats table, limited to five rows.
query = '''
--first, selecting all the standard columns:
SELECT *
FROM atbats
LIMIT 5
;
'''
# Run the query through the shared engine and load the result into a DataFrame.
# Note: `df` is a scratch name that later preview cells overwrite.
df = pd.read_sql(query, engine)

# Display the preview rows.
df.head()

Unnamed: 0.1,Unnamed: 0,inning,top,ab_id,g_id,p_score,batter_id,pitcher_id,stand,p_throws,event,o
0,0,1.0,1.0,2019000000.0,201900001.0,0.0,594777,571666,L,R,Flyout,1
1,1,1.0,1.0,2019000000.0,201900001.0,0.0,545361,571666,R,R,Flyout,2
2,2,1.0,1.0,2019000000.0,201900001.0,0.0,571506,571666,L,R,Groundout,3
3,3,1.0,0.0,2019000000.0,201900001.0,0.0,543257,502239,L,R,Single,0
4,4,1.0,0.0,2019000000.0,201900001.0,0.0,656305,502239,R,R,Flyout,1


All of these (outside of the IDs) are currently in use as features, with the exception of event - perhaps the previous at-bat's event could help predict pitch type?

### Games:

In [6]:
# Preview query: every column of the games table, limited to five rows.
query = '''
--first, selecting all the standard columns:
SELECT *
FROM games
LIMIT 5
;
'''
# Run the query through the shared engine; this overwrites the previous `df`.
df = pd.read_sql(query, engine)

# Display the preview rows.
df.head()

Unnamed: 0.1,Unnamed: 0,attendance,away_final_score,away_team,date,elapsed_time,g_id,home_final_score,home_team,start_time,umpire_1B,umpire_2B,umpire_3B,umpire_HP,venue_name,weather,wind,delay
0,0,35055.0,3.0,sln,2015-04-05,184.0,201500001.0,0.0,chn,7:17 PM,Mark Wegner,Marty Foster,Mike Muchlinski,Mike Winters,Wrigley Field,"44 degrees, clear","7 mph, In from CF",0.0
1,1,45909.0,1.0,ana,2015-04-06,153.0,201500002.0,4.0,sea,1:12 PM,Ron Kulpa,Brian Knight,Vic Carapazza,Larry Vanover,Safeco Field,"54 degrees, cloudy","1 mph, Varies",0.0
2,2,36969.0,2.0,atl,2015-04-06,156.0,201500003.0,1.0,mia,4:22 PM,Laz Diaz,Chris Guccione,Cory Blaser,Jeff Nelson,Marlins Park,"80 degrees, partly cloudy","16 mph, In from CF",16.0
3,3,31042.0,6.0,bal,2015-04-06,181.0,201500004.0,2.0,tba,3:12 PM,Ed Hickox,Paul Nauert,Mike Estabrook,Dana DeMuth,Tropicana Field,"72 degrees, dome","0 mph, None",0.0
4,4,45549.0,8.0,bos,2015-04-06,181.0,201500005.0,0.0,phi,3:08 PM,Phil Cuzzi,Tony Randazzo,Will Little,Gerry Davis,Citizens Bank Park,"71 degrees, partly cloudy","11 mph, Out to RF",0.0


Perhaps here wind and temp/weather could help.

### Pitches:

In [10]:
# Preview query: every column of the pitches table, limited to 100 rows.
# There is no ORDER BY, so which 100 rows come back is up to the database;
# that is fine for a quick look at the available columns.
query = '''
--selecting every column, for a 100-row preview:
SELECT *
FROM pitches
LIMIT 100
;
'''
# Run the query through the shared engine; the cells below inspect this `df`.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,...,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,92.8,84.1,2689.935,151.40200000000004,-40.7,3.4,23.7,12.043,...,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,...,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,91.0,84.0,1289.59,169.75099999999995,-1.3,5.0,23.8,2.104,...,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,...,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0


In [12]:
# Tally how often each type_confidence value occurs in the previewed pitches.
df['type_confidence'].value_counts()

2.0      93
0.821     1
0.898     1
0.778     1
0.648     1
0.693     1
0.763     1
Name: type_confidence, dtype: int64

In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) of the nasty column.
df['nasty'].describe()

count     99.000000
mean      43.979798
std       16.744161
min       12.000000
25%       31.500000
50%       43.000000
75%       53.500000
max      100.000000
Name: nasty, dtype: float64

In [8]:
# List the column labels of the current `df` to see which fields are available.
df.columns

Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'type_confidence', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0',
       'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone', 'code', 'type', 'pitch_type',
       'event_num', 'b_score', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b'],
      dtype='object')

There are a few I haven't used here that could be helpful: break_angle, break_length, code (i.e. use the preceding pitch's code), type_confidence (i.e. setting a minimum threshold to drop pitches that can't be confidently classified), and sz_top and sz_bot (to somewhat regulate where the strike zone is on a per-pitch basis).

# Re-Running Joins

To get the data I want, I'll have to re-run some joins on the tables:

In [29]:
# Join games -> at-bats -> player names -> pitches, and attach the previous pitch
# of the same at-bat (type, location, speed) to every pitch row.
#
# The previous-pitch columns use lag() over a named window, which returns the value
# from the preceding row of the at-bat and NULL for the first pitch. The CTE is not
# ordered internally, because only the ORDER BY of the outermost query determines
# the order of the returned rows.
query = '''
--Queuing up the game/at-bat info, with pitcher and hitter names attached:
WITH game_player_ab AS (

WITH game_ab AS (
SELECT a.inning, a.batter_id, a.pitcher_id, a.top, a.ab_id, a.p_score, a.stand, a.p_throws, a.event, g.date, g.home_team, g.away_team
FROM atbats as a
RIGHT JOIN games as g
    ON a.g_id = g.g_id
)

SELECT g.*,
p.first_name as Pitcher_First_Name, p.last_name as Pitcher_Last_Name,
h.first_name as Hitter_First_Name, h.last_name as Hitter_Last_Name

FROM game_ab as g

--first, joining up the pitcher's name:
LEFT JOIN players as p
    ON g.pitcher_id = p.id

--now, joining the hitter's name:
LEFT JOIN players as h
    ON g.batter_id = h.id
)

SELECT
--First, taking all the data above, along with the batting team score:
gpa.*, pi.b_score,

--Adding the runners on base:
pi.on_1b, pi.on_2b, pi.on_3b,

--Now, selecting data from the pitch table:
pi.px, pi.pz, pi.zone, pi.pitch_type, pi.start_speed, pi.type, pi.b_count, pi.s_count, pi.outs, pi.pitch_num,

--Finally, adding in previous pitch info (for that at bat).  This will lead to some NaN values for the first pitch of an at-bat, which is expected:
lag(pi.pitch_type) OVER prev_pitch as last_pitch_type,
lag(pi.px) OVER prev_pitch as last_pitch_px,
lag(pi.pz) OVER prev_pitch as last_pitch_pz,
lag(pi.start_speed) OVER prev_pitch as last_pitch_speed

FROM pitches as pi
RIGHT JOIN game_player_ab as gpa
    ON pi.ab_id = gpa.ab_id

--Pitches of one at-bat, in the order they were thrown:
WINDOW prev_pitch AS (PARTITION BY gpa.ab_id ORDER BY pi.pitch_num ASC)

--Ordering:
ORDER BY gpa.ab_id ASC, pi.pitch_num ASC
LIMIT 10
;
'''
# Run the combined query through the shared engine.
combined_data_df = pd.read_sql(query, engine)

combined_data_df.head(10)

Unnamed: 0,inning,batter_id,pitcher_id,top,ab_id,p_score,stand,p_throws,event,date,...,start_speed,type,b_count,s_count,outs,pitch_num,last_pitch_type,last_pitch_px,last_pitch_pz,last_pitch_speed
0,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,2015-04-05,...,92.9,S,0.0,0.0,0.0,1.0,,,,
1,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,2015-04-05,...,92.8,S,0.0,1.0,0.0,2.0,FF,0.416,2.963,92.9
2,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,2015-04-05,...,94.1,S,0.0,2.0,0.0,3.0,FF,-0.191,2.347,92.8
3,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,2015-04-05,...,91.0,B,0.0,2.0,0.0,4.0,FF,-0.518,3.284,94.1
4,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,2015-04-05,...,75.4,B,1.0,2.0,0.0,5.0,FF,-0.641,1.221,91.0
5,1.0,572761,452657,1.0,2015000000.0,0.0,L,L,Groundout,2015-04-05,...,92.9,X,2.0,2.0,0.0,6.0,CU,-1.821,2.083,75.4
6,1.0,518792,452657,1.0,2015000000.0,0.0,L,L,Double,2015-04-05,...,93.3,B,0.0,0.0,1.0,1.0,,,,
7,1.0,518792,452657,1.0,2015000000.0,0.0,L,L,Double,2015-04-05,...,89.3,X,1.0,0.0,1.0,2.0,FF,-1.088,1.61,93.3
8,1.0,407812,452657,1.0,2015000000.0,0.0,R,L,Single,2015-04-05,...,92.1,B,0.0,0.0,1.0,1.0,,,,
9,1.0,407812,452657,1.0,2015000000.0,0.0,R,L,Single,2015-04-05,...,89.3,B,1.0,0.0,1.0,2.0,FF,1.47,2.35,92.1


In [31]:
# Load the 2019 pitch-level CSV (path is relative to this notebook) into its own DataFrame.
pitches_2019 = pd.read_csv('../Data/Kaggle_Files/2019_pitches.csv')

In [48]:
# Frequency of each pitch result code in the 2019 pitch data.
pitches_2019['code'].value_counts()

B     243901
F     128929
C     119334
X      79562
S      75750
D      28164
*B     17505
E      16946
T       6447
W       5552
V       2611
H       1963
L       1684
M        353
P         60
O         29
Name: code, dtype: int64

Ones that exist in 2019: break_length, break_angle, break_y, ax, ay, az, vx0/vy0/vz0, pfx_x/pfx_z

## Other Pitch Rates:

In modeling_prep.ipynb, I used a query to create running pitch counts for each pitcher.  Utilizing a similar format for some new ones:

Repeating, but over last 100 pitches:

In [22]:
# Pitch-type codes that each get their own rolling-count column.
pitch_types = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']

# One window-function column per pitch type. The CASE yields NULL for every other
# pitch type and count() ignores NULLs, so each column counts only its own type.
last_100_columns = ',\n'.join(
    f"count(CASE WHEN pitch_type = '{pt}' THEN pitch_type END) OVER last_100 AS last_100_{pt.lower()}"
    for pt in pitch_types
)

query = f'''
--first, selecting the identifying columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--selecting counts of each pitch type, over the last 100 pitches the pitcher has thrown:
{last_100_columns}

FROM full_pitch_data

--the 100 pitches preceding the current one (current pitch excluded), per pitcher:
WINDOW last_100 AS (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC ROWS 100 PRECEDING EXCLUDE CURRENT ROW)

--explicit ordering, so that the LIMIT returns the same rows on every run:
ORDER BY pitcher_id, ab_id, pitch_num
LIMIT 1000
;
'''
# Run the query through the shared engine; overwrites the scratch `df`.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,last_100_ff,last_100_sl,last_100_ft,last_100_ch,last_100_cu,last_100_si,last_100_fc,last_100_kc,last_100_fs,last_100_kn,last_100_ep,last_100_fo,last_100_sc
0,112526,Bartolo Colon,FF,0,0,0,0,0,0,0,0,0,0,0,0,0
1,112526,Bartolo Colon,FT,1,0,0,0,0,0,0,0,0,0,0,0,0
2,112526,Bartolo Colon,SL,1,0,1,0,0,0,0,0,0,0,0,0,0
3,112526,Bartolo Colon,FF,1,1,1,0,0,0,0,0,0,0,0,0,0
4,112526,Bartolo Colon,FT,2,1,1,0,0,0,0,0,0,0,0,0,0


What about average px/pz per pitch type, over the last x pitches a pitcher has thrown:

In [49]:
# Pitch-type codes that each get their own rolling average-px column.
pitch_types = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']

# One window-function column per pitch type. The CASE yields NULL for every other
# pitch type and avg() ignores NULLs, so each column averages px over only the
# pitches of its own type inside the window (NULL when there are none).
avg_px_columns = ',\n'.join(
    f"avg(CASE WHEN pitch_type = '{pt}' THEN px END) OVER last_10 AS avg_px_{pt.lower()}"
    for pt in pitch_types
)

query = f'''
--first, selecting the identifying columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--selecting avg px per pitch type, over the last 10 pitches the pitcher has thrown:
{avg_px_columns}

FROM full_pitch_data

--the 10 pitches preceding the current one (current pitch excluded), per pitcher:
WINDOW last_10 AS (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC ROWS 10 PRECEDING EXCLUDE CURRENT ROW)

--explicit ordering, so that the LIMIT returns the same rows on every run:
ORDER BY pitcher_id, ab_id, pitch_num
LIMIT 1000
;
'''
# Run the query through the shared engine; overwrites the scratch `df`.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,avg_px_ff,avg_px_sl,avg_px_ft,avg_px_ch,avg_px_cu,avg_px_si,avg_px_fc,avg_px_kc,avg_px_fs,avg_px_kn,avg_px_ep,avg_px_fo,avg_px_sc
0,112526,Bartolo Colon,FF,,,,,,,,,,,,,
1,112526,Bartolo Colon,FT,0.445,,,,,,,,,,,,
2,112526,Bartolo Colon,SL,0.445,,-0.296,,,,,,,,,,
3,112526,Bartolo Colon,FF,0.445,0.748,-0.296,,,,,,,,,,
4,112526,Bartolo Colon,FT,0.751,0.748,-0.296,,,,,,,,,,


Same, but for pz:

In [50]:
# Pitch-type codes that each get their own rolling average-pz column.
pitch_types = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']

# One window-function column per pitch type. The CASE yields NULL for every other
# pitch type and avg() ignores NULLs, so each column averages pz over only the
# pitches of its own type inside the window (NULL when there are none).
avg_pz_columns = ',\n'.join(
    f"avg(CASE WHEN pitch_type = '{pt}' THEN pz END) OVER last_10 AS avg_pz_{pt.lower()}"
    for pt in pitch_types
)

query = f'''
--first, selecting the identifying columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--selecting avg pz per pitch type, over the last 10 pitches the pitcher has thrown:
{avg_pz_columns}

FROM full_pitch_data

--the 10 pitches preceding the current one (current pitch excluded), per pitcher:
WINDOW last_10 AS (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC ROWS 10 PRECEDING EXCLUDE CURRENT ROW)

--explicit ordering, so that the LIMIT returns the same rows on every run:
ORDER BY pitcher_id, ab_id, pitch_num
LIMIT 1000
;
'''
# Run the query through the shared engine; overwrites the scratch `df`.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,avg_pz_ff,avg_pz_sl,avg_pz_ft,avg_pz_ch,avg_pz_cu,avg_pz_si,avg_pz_fc,avg_pz_kc,avg_pz_fs,avg_pz_kn,avg_pz_ep,avg_pz_fo,avg_pz_sc
0,112526,Bartolo Colon,FF,,,,,,,,,,,,,
1,112526,Bartolo Colon,FT,2.705,,,,,,,,,,,,
2,112526,Bartolo Colon,SL,2.705,,1.189,,,,,,,,,,
3,112526,Bartolo Colon,FF,2.705,1.26,1.189,,,,,,,,,,
4,112526,Bartolo Colon,FT,3.1155,1.26,1.189,,,,,,,,,,


These work and likely can paint a good picture of where the pitcher is locating the ball, but will have to handle the NaN - a fair way could be middle of the strikezone (0 for px, ~1.85 for pz), since I don't want to lose those rows.  

Columns with None value can be removed from that pitcher's modeling, or changed to 0 - ultimately, it won't matter since they don't throw that pitch.

## Merging the Above in one query:

In [56]:
# Pitch-type codes that each get a rolling-count, an avg-px and an avg-pz column.
pitch_types = ['FF', 'SL', 'FT', 'CH', 'CU', 'SI', 'FC', 'KC', 'FS', 'KN', 'EP', 'FO', 'SC']

# Counts of each pitch type over the pitcher's previous 100 pitches.
last_100_columns = [
    f"count(CASE WHEN pitch_type = '{pt}' THEN pitch_type END) OVER last_100 AS last_100_{pt.lower()}"
    for pt in pitch_types
]
# Average horizontal location (px) per pitch type over the pitcher's previous 3 pitches.
avg_px_columns = [
    f"avg(CASE WHEN pitch_type = '{pt}' THEN px END) OVER last_3 AS avg_px_{pt.lower()}"
    for pt in pitch_types
]
# Average vertical location (pz) per pitch type over the pitcher's previous 3 pitches.
avg_pz_columns = [
    f"avg(CASE WHEN pitch_type = '{pt}' THEN pz END) OVER last_3 AS avg_pz_{pt.lower()}"
    for pt in pitch_types
]
# Column order matches the original hand-written query: counts, then px, then pz.
feature_columns = ',\n'.join(last_100_columns + avg_px_columns + avg_pz_columns)

query = f'''
--first, selecting the identifying columns:
SELECT pitcher_id, pitcher_full_name, pitch_type,
--then the rolling per-pitch-type counts (last 100 pitches) and avg px / avg pz (last 3 pitches):
{feature_columns}

FROM full_pitch_data

--both windows look only at the pitcher's earlier pitches (current pitch excluded):
WINDOW
    last_100 AS (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC ROWS 100 PRECEDING EXCLUDE CURRENT ROW),
    last_3 AS (PARTITION BY pitcher_id ORDER BY ab_id, pitch_num ASC ROWS 3 PRECEDING EXCLUDE CURRENT ROW)

ORDER BY ab_id, pitch_num ASC
;
'''
# No LIMIT here: this loads the rolling features for every row of full_pitch_data.
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,pitcher_id,pitcher_full_name,pitch_type,last_100_ff,last_100_sl,last_100_ft,last_100_ch,last_100_cu,last_100_si,last_100_fc,...,avg_pz_ch,avg_pz_cu,avg_pz_si,avg_pz_fc,avg_pz_kc,avg_pz_fs,avg_pz_kn,avg_pz_ep,avg_pz_fo,avg_pz_sc
0,452657,Jon Lester,FF,0,0,0,0,0,0,0,...,,,,,,,,,,
1,452657,Jon Lester,FF,1,0,0,0,0,0,0,...,,,,,,,,,,
2,452657,Jon Lester,FF,2,0,0,0,0,0,0,...,,,,,,,,,,
3,452657,Jon Lester,FF,3,0,0,0,0,0,0,...,,,,,,,,,,
4,452657,Jon Lester,CU,4,0,0,0,0,0,0,...,,,,,,,,,,


In [59]:
# Spot-check a single row by position (a hard-coded index deep into the result set).
df.iloc[2870371]

pitcher_id                  591693
pitcher_full_name    Edubray Ramos
pitch_type                      SL
last_100_ff                     37
last_100_sl                     58
last_100_ft                      2
last_100_ch                      3
last_100_cu                      0
last_100_si                      0
last_100_fc                      0
last_100_kc                      0
last_100_fs                      0
last_100_kn                      0
last_100_ep                      0
last_100_fo                      0
last_100_sc                      0
avg_px_ff                      0.9
avg_px_sl                    1.505
avg_px_ft                      NaN
avg_px_ch                      NaN
avg_px_cu                      NaN
avg_px_si                      NaN
avg_px_fc                      NaN
avg_px_kc                      NaN
avg_px_fs                      NaN
avg_px_kn                      NaN
avg_px_ep                      NaN
avg_px_fo                      NaN
avg_px_sc           

In [54]:
# Exponentially weighted mean (center of mass = 1) of last_100_ff, taken in row order.
# NOTE(review): no grouping by pitcher is applied, so rows of different pitchers that
# are adjacent in `df` are smoothed together - confirm that this is intended.
df['last_100_ff'].ewm(com=1).mean()

0       0.000000
1       0.666667
2       0.857143
3       0.933333
4       1.483871
         ...    
995    37.710686
996    37.355343
997    37.177671
998    37.088836
999    36.544418
Name: last_100_ff, Length: 1000, dtype: float64