# Predicting Probability of Pitch Type X - Machine Learning with Baseball Data#

## Molly Gibson ##
## August 9, 2018 ##

In [25]:
import pandas as pd
import numpy as np
import math
from column_report import get_column_report
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample


# so we can view large chunks of the dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


pitches = pd.read_csv('pitches', low_memory=False)

metadata = pd.read_csv('pitch_by_pitch_metadata.csv', encoding='latin1')

In [4]:
# the first thing I like to do is get a sense of the dataset we're working 
# with. Let's look at the dimensions and a quick overview of what 
# each column looks like column_report is a module written by a 
# team member at Code for San Francsico

print(pitches.shape)


get_column_report(pitches)

(718961, 125)


Unnamed: 0,Column Name,Data Type,Unique Count,Sample Value,NaNs,% NaN
0,uid,int64,718961,14143226,0,0.0
22,stand,object,2,L,0,0.0
23,b_height,object,18,5-8,0,0.0
24,pitcher_id,int64,662,460024,0,0.0
25,p_throws,object,2,R,0,0.0
26,at_bat_des,object,129102,"Maicer Izturis grounds out, second baseman Chr...",0,0.0
27,event,object,30,Groundout,0,0.0
31,away_team_runs,int64,20,0,0,0.0
21,batter_id,int64,936,430895,0,0.0
32,home_team_runs,int64,23,0,0,0.0


In [37]:
# so let's get rid of these columns that are 100% NaN values
cols = [c for c in pitches.columns if c[:6]!='runner']

pitches = pitches[cols]

In [26]:
# FEATURE SELECTION: 

# construct our feature set: Since we're predicting in real-time, we'll 
# only have access to features available prior to pitch - as well as 
# prior probabilities we calculate on our own

# we'll want to know what inning it is, the number of pitches he's 
# thrown at bat, 
# total number of pitches thrown, balls, strikes, fouls, outs, 
# batter height, which side batter stands on,
# number of runs for both teams, batter_id, pitcher_id, and team ids.
# then calculate priors for the batter: batting average (which we'll
# need 'event' for), 
# and for the pitcher: 
# previous pitch type, previous pitch outcome (pitch_des), and 
# pitcher stats such as OOPS, 
# K/9IP, K/BB, WHIP, 
# OOPS, ERA (it's unlikely that I'll have time to calculate all of these)

# let's take a look at the variables we'll have for each 
# pitch before it's thrown
# and keep track of the features we'll want to use
features = list(metadata[metadata.available_prior_to_pitch=='Yes'].column_name)



# drop = set(['uid', 'year', 'date', 'game_pk', 'start_tfs', 'start_tfs_zulu', 
#                     'pitch_id', 'on_1b', 'on_2b', 'on_3b', 'home_team_runs', 
#                     'away_team_runs'])

# features = [feat for feat in features if feat not in drop]

# print(features)

metadata[metadata.available_prior_to_pitch=='Yes']


Unnamed: 0,column_name,available_prior_to_pitch,description
0,uid,Yes,unique id
1,game_pk,Yes,unique game id
2,year,Yes,year
3,date,Yes,date
4,team_id_b,Yes,team_id for the batting team
5,team_id_p,Yes,team_id for the pitching team
6,inning,Yes,inning number
7,top,Yes,binary: is top half of inning
8,at_bat_num,Yes,incrementing at bat count for game
9,pcount_at_bat,Yes,pitches thrown in at bat


In [12]:
print('Number of pitchers in our dataset: %i' % 
      len(pitches.pitcher_id.unique()))

print('Average number of pitches per pitcher in our dataset: %.2f' % 
      pitches.groupby('pitcher_id').size().mean())

Number of pitchers in our dataset: 662
Average number of pitches per pitcher in our dataset: 1086.04


In [151]:
# FEATURE ENGINEERING:
# (I'll come back and calculate more pitcher statistics)

def calculate_batting_avg(row):
    ''' Find player at bat's batting average 
        based on games played previous to current one 
        
        Batting average: sum of singles, doubles, triples, 
        home runs/ total number of at bats'''
    
    tmstmp = row['start_tfs_zulu']
    batter_id = row['batter_id']
    
    # we're interested in the batter's activity before the current game 
    # (if I have extra time I'll go back and calculate his batting avg
    # up to this point including earlier in
    # this current game; but we don't have pitch_tfs prior to the pitch 
    # so we'd have to look at the previous 
    # pitch's timestamp; for simplicity I'll just calculate in previous games)
    minidf = pitches[(pitches.batter_id==batter_id) 
                     & (pitches.start_tfs_zulu < tmstmp)]
    
    # total number of at_bats:
    at_bats = len(minidf.groupby(['at_bat_num', 'game_pk']).size())
    # sum of singles, doubles, triples, and home runs:
    hits = len(minidf[minidf.event.isin(['Single', 'Double', 'Triple', 'Home Run'])]\
               .groupby(['game_pk', 'at_bat_num']).size())
    
    try:
        return hits / float(at_bats)
    except ZeroDivisionError:
        return None

    
def calculate_era(row):
    ''' Find pitcher's Earned Run Average in our data prior to current game.
    
        ERA: 9 * (Earned Runs Allowed / Innings Pitched) '''
    
    tmstmp = row['start_tfs_zulu']
    pitcher_id = row['pitcher_id']
    
    minidf = pitches[(pitches.pitcher_id==pitcher_id) 
                     & (pitches.start_tfs_zulu < tmstmp)]
    
    # this will have one row for each unique game he pitched the first inning, second inning, etc. 
    # so will give us the total number of innings pitched to date
    innings = len(minidf.groupby(['inning', 'game_pk']))
    
    # this is a proxy because it doesn't account for pitches in which multiple runs are scored
    runs_allowed = len(minidf[minidf.pitch_des.str.contains('run(s)', regex=False, flags=re.IGNORECASE)])
    
    try: 
        return 9 * (runs_allowed / float(innings))
    except ZeroDivisionError:
        return None
    
def are_bases_loaded(row):
    ''' Return 1 if bases are loaded, 0 otherwise '''
    
    if (row['on_1b']!=0) & (row['on_2b']!=0) & (row['on_3b']!=0):
        return 1
    else:
        return 0
    
def score_differential(row):
    ''' Calculate absolute value of different between scores ''' 
    
    return abs(row['away_team_runs'] - row['home_team_runs'])



def convert_height(height):
    ''' Batter heights in string format "5-8" - convert to inches '''
    
    height = height.split('-')
    
    inches = int(height[0])*12 + int(height[1])
    
    return inches


    

In [28]:
# construct new features and add to dataframe

batting_averages = pitches.apply(calculate_batting_avg, axis=1)

bases_loaded = pitches.apply(are_bases_loaded, axis=1)

score_differentials = pitches.apply(score_differential, axis=1)

earned_run_averages = pitches.apply(calculate_era, axis=1)


pitches['b_height'] = pitches['b_height'].apply(convert_height)
pitches['batting_avg'] = batting_averages
pitches['bases_loaded'] = bases_loaded
pitches['score_differential'] = score_differentials
pitches['pitcher_era'] = earned_run_averages

# add new features to list
features.extend(['batting_avg', 'bases_loaded', 'score_differential', 'pitcher_era'])

# store dataframe because calculating batting averages took a long time 
# (let run overnight)
%store pitches

Stored 'pitches' (DataFrame)


In [1]:
%store -r pitches

In [14]:
# # fill null pitcher_era values with mean ERA for each pitcher
era_means = {}

for pid in pitches.pitcher_id.unique():
    avg_era = pitches[(~pitches.pitcher_era.isnull()) & (pitches.pitcher_id==pid)].pitcher_era.mean()
    era_means[pid] = avg_era
    
    
for index, row in pitches[pitches.pitcher_era.isnull()].iterrows():
    pitcher_id = row.pitcher_id
    mean_era = era_means[pitcher_id]
    pitches.loc[index, 'pitcher_era'] = mean_era

    
print(len(pitches[pitches.pitcher_era.isnull()]))    

11


In [16]:
# and null batting_avg values with batter's mean batting average

batting_avg_means = {}

for bid in pitches.batter_id.unique():
    mean_batting_avg = pitches[(~pitches.batting_avg.isnull()) & (pitches.batter_id==bid)].batting_avg.mean()
    batting_avg_means[bid] = mean_batting_avg
    
for index, row in pitches[pitches.batting_avg.isnull()].iterrows():
    batter_id = row.batter_id
    mean_batting_avg = batting_avg_means[batter_id]
    pitches.loc[index, 'batting_avg'] = mean_batting_avg
    
print(len(pitches[pitches.batting_avg.isnull()]))

223


In [43]:

# the only columns (that we're ultimately concerned with) that have NaN values are batting_avg and pitcher_era
# where we have no data for the given batter/pitcher
# and there are only 223 rows & 11 rows, respectively, of NaN's, so we'll just drop these rows

pitches.dropna(subset=['pitcher_era', 'batting_avg'], inplace=True)



In [44]:
# get a sense of the class balance : only 1/5 pitches are in play
print('%.2f percent of records are pitch type X \n' % 
                  float(100*len(pitches[pitches.type=='X'])/float(len(pitches))))

# so let's upsample our dataframe to even out the frequency of positive and negative labels

df_x = pitches[pitches.type=='X']
df_sb = pitches[pitches.type!='X']

df_x_upsampled = resample(df_x, replace=True, n_samples=582748, random_state=415)

df_upsampled = pd.concat([df_sb, df_x_upsampled])


print(pitches.type.value_counts())
print(df_upsampled.type.value_counts())



18.95 percent of records are pitch type X 

S    318171
B    264363
X    136193
Name: type, dtype: int64
X    582748
S    318171
B    264363
Name: type, dtype: int64


In [45]:
# Now that we've constructed a preliminary feature set and resampled records,
# let's start to build our model
# The goal is to predict the probability of pitch type X in real-time
# We'll use Logistic Regression to build a binary classifier and predict 
# probabilities

# we're interested in binary classifier: pitch type X or not
y = df_upsampled.type.map({'X':1, 'B':0, 'S':0})

X = df_upsampled[features].copy()

# and let's encode our categorical variables:  (0: Right, 1: Left)
hmap = {'R':0, 'L':1}

X.replace({'stand':hmap, 'p_throws':hmap}, inplace=True)
 

# get rid of columns that aren't predictor variables
X.drop(['uid', 'year', 'date', 'game_pk', 'start_tfs', 'start_tfs_zulu', 
                    'pitch_id', 'on_1b', 'on_2b', 'on_3b', 'home_team_runs', 
                    'away_team_runs'], axis=1, inplace=True)

# manually add intercept
X['const'] = 1


X.head()


Unnamed: 0,team_id_b,team_id_p,inning,top,at_bat_num,pcount_at_bat,pcount_pitcher,balls,strikes,fouls,outs,batter_id,stand,b_height,pitcher_id,p_throws,batting_avg,bases_loaded,score_differential,pitcher_era,const
0,108,118,1,1,1,1,1,0,0,0,0,430895,1,68,460024,0,0.274556,0,0,3.811815,1
1,108,118,1,1,1,2,2,1,0,0,0,430895,1,68,460024,0,0.274556,0,0,3.811815,1
2,108,118,1,1,1,3,3,2,0,0,0,430895,1,68,460024,0,0.274556,0,0,3.811815,1
4,108,118,1,1,2,1,5,0,0,0,1,435062,0,70,460024,0,0.276916,0,0,0.0,1
5,108,118,1,1,2,2,6,0,1,0,1,435062,0,70,460024,0,0.276916,0,0,0.0,1


In [46]:


# split data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                    test_size=0.30, random_state=415)

X_validate, X_test, y_validate, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=608)

In [47]:
# I like to use statsmodels for the R-like statistical summary of the 
# regression, which gives us a feel
# for the importance of each predictor variable and their coefficients

logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary())

probabilities = result.predict(X_validate)

predictions = [1 if p > 0.5 else 0 for p in probabilities]


print(classification_report(y_validate, predictions, digits=3))

Optimization terminated successfully.
         Current function value: 0.673784
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                   type   No. Observations:               815697
Model:                          Logit   Df Residuals:                   815676
Method:                           MLE   Df Model:                           20
Date:                Mon, 27 Aug 2018   Pseudo R-squ.:                 0.02793
Time:                        09:18:34   Log-Likelihood:            -5.4960e+05
converged:                       True   LL-Null:                   -5.6540e+05
                                        LLR p-value:                     0.000
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
team_id_b           9.641e-06      0.000      0.061      0.952      -0.000       0.000
team_

In [48]:
# But that's not good!!! Why is the model NEVER predicting that a 
# pitch will be in play?!
# it might have something to do with the class imbalance; I'll 
# try using scikit-learn where I can 
# specify class_weight='balanced' in the input parameters

logreg = LogisticRegression()

logreg.fit(X_train, y_train)

train_accuracy = logreg.score(X_train, y_train)
val_accuracy = logreg.score(X_validate, y_validate)

probabilities = logreg.predict_proba(X=X_train)

predictions = logreg.predict(X_validate)


print('Logistic regression classifier accuracy (train set): %.3f' % 
                  train_accuracy)
print('Logistic regression classifier accuracy (validation set): %.3f\n' % 
                  val_accuracy)

print(classification_report(predictions, y_validate))

Logistic regression classifier accuracy (train set): 0.574
Logistic regression classifier accuracy (validation set): 0.573

             precision    recall  f1-score   support

          0       0.62      0.57      0.59     96513
          1       0.52      0.58      0.55     78279

avg / total       0.58      0.57      0.57    174792



In [49]:
from sklearn.model_selection import GridSearchCV

logreg = LogisticRegression()

penalty = ['l1', 'l2']

C = np.logspace(0, 4, 10)

hyperparams = dict(C=C, penalty=penalty)

clf = GridSearchCV(logreg, hyperparams, cv=5, verbose=0)

best_model = clf.fit(X_train, y_train)

# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

preds = best_model.predict(X_validate)

print(classification_report(preds, y_validate))

KeyboardInterrupt: 

In [None]:
# Ok, that's a bit better - still not great but at least now it's 
# predicting almost 90K pitches of type X

# Recall is over 50% for both positive and negative cases,
# meaning the model was able to pick out the majority of 
# both cases. Precision was right above 50%, meaning 
# just over half of the pitches predicted to
# be in play actually were. 

# Moving forward, I would test the collinearity of the input 
# variables and focus more on feature selection to 
# ensure that highly correlated columns aren't negatively 
# affecting the model's performance.
# I would also calculate statistics for each pitcher, such 
# as WHIP (Walks plus Hits per Inning Pitched),
# and calculate prior probabilities for each pitcher/batter pair
# I would tune the hyperparameters of my model using 
# GridSearchCV & cross validation 
# Then I'd try out different classifiers to see how they perform 
# differently. The next models I'd try
# would be Linear SVM and Random Forest
