In [41]:
## load the necessities

import pandas as pd
import numpy as np
import pickle

import time

## Load sklearn modules

### model selection and evaluation modules
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix

### pre-processing and pipeline steps
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### estimators
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier



## Are you checking out my code?

The csv data is scraped from Statcast. The datafiles are huge (about 350MB each) and not on my github repository. You can use the following cells 
to test the functionality of the code yourself, either by a scraper to download the above data or by using the data subset file.

In [46]:
## Commnet out the last lines in this cell, then uncomment an option below to run this notebook.


## OPTION 1:
## Slow, but complete.
## Uncomment and run the following code to scrape a file identical to the one used above to verify this code as-is.

#import datetime
#import scraper ##this is one of my files, found in the same repo as this ipynb.

#start = datetime.date(2019,3,20)
#end = datetime.date(2019,10,1)
#df1 = scraper.statcast_scrape(start,end)
#df = df1.dropna(subset=['events'])


## OPTION 2: 
## Fast, but the models will be fitting a smaller amount of data and your results may vary
## from mine if you do.
## The 'statcast2019_sample.csv' was produced with DataFrame.sample(50_000, random_state = 1). It is smaller and on github.
## Uncomment below to read that file as sample data to inspect this code.

#df = pd.read_csv('data/statcast2019_sample.csv')

## OPTION 3: 
## You already have these files, saved in the right way. (99% chance you are me, if so.)
## This code occasionally crashes my old, slow computer. Hence the pickling (which is also faster.)

#df = pd.read_csv('data/statcast_dumps/statcast2019.csv').dropna(subset=['events'])
#df
#all_data = df.copy()
#df = df.sample(n=50_000)

##So I'm creating a pickle to replace it.

#pickle_out = open('pickles/hr_clf_data_subset.pickle','wb')
#pickle.dump(df, pickle_out)

##And now I can load that data with

pickle_in = open('pickles/hr_clf_data_subset.pickle','rb')
df = pickle.load(pickle_in)

In [3]:
#print(df['des'].unique())
for i in df.columns:
    print("'{}'',".format(i))

'Unnamed: 0'',
'index'',
'pitch_type'',
'game_date'',
'release_speed'',
'release_pos_x'',
'release_pos_z'',
'player_name'',
'batter'',
'pitcher'',
'events'',
'description'',
'spin_dir'',
'spin_rate_deprecated'',
'break_angle_deprecated'',
'break_length_deprecated'',
'zone'',
'des'',
'game_type'',
'stand'',
'p_throws'',
'home_team'',
'away_team'',
'type'',
'hit_location'',
'bb_type'',
'balls'',
'strikes'',
'game_year'',
'pfx_x'',
'pfx_z'',
'plate_x'',
'plate_z'',
'on_3b'',
'on_2b'',
'on_1b'',
'outs_when_up'',
'inning'',
'inning_topbot'',
'hc_x'',
'hc_y'',
'tfs_deprecated'',
'tfs_zulu_deprecated'',
'fielder_2'',
'umpire'',
'sv_id'',
'vx0'',
'vy0'',
'vz0'',
'ax'',
'ay'',
'az'',
'sz_top'',
'sz_bot'',
'hit_distance_sc'',
'launch_speed'',
'launch_angle'',
'effective_speed'',
'release_spin_rate'',
'release_extension'',
'game_pk'',
'pitcher.1'',
'fielder_2.1'',
'fielder_3'',
'fielder_4'',
'fielder_5'',
'fielder_6'',
'fielder_7'',
'fielder_8'',
'fielder_9'',
'release_pos_y'',
'estimated_ba_usin

In [4]:
#Checking for in the park HR.

df[df['events'] == 'home_run'].dropna(subset=['hit_location'])['des']

14335     Adalberto Mondesi hits an inside-the-park home...
200750    George Springer hits an inside-the-park home r...
203754    Kevin Kiermaier hits an inside-the-park home r...
223013    Avisail Garcia hits an inside-the-park home ru...
289952    Hunter Pence hits an inside-the-park home run ...
318774    Ian Desmond hits an inside-the-park home run (...
348009    Ben Gamel hits an inside-the-park home run (5)...
356763    Tommy La Stella hits an inside-the-park home r...
415030    Royals challenged (tag play), call on the fiel...
471236    Yuli Gurriel hits an inside-the-park home run ...
503374    Ketel Marte hits an inside-the-park home run (...
524797    Whit Merrifield hits an inside-the-park home r...
633839    Scott Kingery hits an inside-the-park home run...
Name: des, dtype: object

In [5]:
## Make a list of batted ball data. I'm also including batter handedness and home team (which is a proxy for the park.)
batted_ball_data = [
    #des,
    #game_type,
    #'events',
    'stand',
    'p_throws',
    'home_team',
    #away_team,
    #type,
    #'hit_location,
    #'hc_x',
    #'hc_y',
    'launch_speed',
    'launch_angle',
    'field_angle'
]

In [6]:
## Add a categorical variable for home runs.
df['is_homer'] = df['events'].apply(lambda x: True if x == 'home_run' else False)
df['stand'] = df['stand'].apply(lambda x: True if x == 'R' else False)
df['p_throws'] = df['p_throws'].apply(lambda x: True if x == 'R' else False)


In [7]:
##hc_x and hc_y are unusual. An internet search suggests that they are coordinates for
##displying the hit location on a field map for MLB's app.
##That's obvious data leakage, but I do want the relative angle that these numbers can imply.

## Running the code below shows that they aren't scaled in the same way, both have a minimum of 0.

df[df['events'] == 'home_run'][['hc_x','hc_y']].dropna().sort_values(by='hc_x', ascending=False)

## Scale these
## I'm assuming that the max of hc_y is about the same actual distance as the max of hc_x. 
df['hc_x'] = df['hc_x'] / df['hc_x'].max()
df['hc_y'] = df['hc_y'] / df['hc_y'].max()
df['field_angle'] = np.arctan(df['hc_y']/df['hc_x'])

In [8]:
## Before going any further, I want to see how well a basic model
## can classify this data based on the most obvious features: launch angle and launch speed

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier


## Get small test and train splits to train a decision tree.
temp = df.dropna(subset=['launch_angle','launch_speed','is_homer'])
X_train, X_test, y_train, y_test = train_test_split(temp[['launch_angle','launch_speed']], 
                                                    temp['is_homer'], 
                                                    test_size=5_000, 
                                                    train_size=10_000,
                                                    random_state = 12
                                                    )

baseline_clf = DecisionTreeClassifier(max_depth = 8).fit(X_train,y_train)
dummy_clf = DummyClassifier().fit(X_train, y_train)
print(y_test.sum()/len(y_test))
print("Dummy score: {}".format(dummy_clf.score(X_test,y_test)))
print("Train: {}".format(baseline_clf.score(X_train,y_train)))
print("Test: {}".format(baseline_clf.score(X_test,y_test)))
print("Cofusion:\n{}".format(confusion_matrix(y_test,baseline_clf.predict(X_test))))


0.057
Dummy score: 0.8968
Train: 0.9792
Test: 0.9638
Cofusion:
[[4653   62]
 [ 119  166]]


#### That was surprisingly bad

I though a decision tree would do better than that, but it only correctly classifies 156 of 216 home runs, which is a pretty weak recall rate. Precision is even worse, with 256 classifications (about .6) 

In [10]:
temp = df.dropna(subset =batted_ball_data) ## variables named 'temp' are always temporary and not expected to be used in later cells.
temp = temp[batted_ball_data+['is_homer']]
X_train, X_test, y_train, y_test = train_test_split(temp[batted_ball_data], 
                                                    temp['is_homer'], 
                                                    test_size=5_000, 
                                                    train_size=10_000,
                                                    random_state = 12
                                                    )

col_transf = ColumnTransformer([
    ('one_hot',OneHotEncoder(sparse=False),['home_team'])
    ],
    remainder = 'passthrough'
)
tree_clf_pipe = Pipeline(
    [('column_transformer', col_transf),
    ('tree_clf', DecisionTreeClassifier())]
)

In [11]:
tree_clf_pipe.fit(X_train,y_train)

dummy_clf = DummyClassifier().fit(X_train, y_train)
print(y_test.sum()/len(y_test))
print("Dummy score: {}".format(dummy_clf.score(X_test,y_test)))
print("Train: {}".format(tree_clf_pipe.score(X_train,y_train)))
print("Test: {}".format(tree_clf_pipe.score(X_test,y_test)))
print("Cofusion:\n{}".format(confusion_matrix(y_test,tree_clf_pipe.predict(X_test))))

## Let's turn that into something quicker to re-use.
def performance_summary(model,X,y,X_dev,y_dev):
    
    dummy_clf = DummyClassifier().fit(X, y)
    print(y_dev.sum()/len(y_dev))
    print("Dummy score: {}".format(dummy_clf.score(X_dev,y_dev)))
    print("Train: {}".format(model.score(X,y)))
    print("Test: {}".format(model.score(X_dev,y_dev)))
    print("Train Cofusion:\n{}".format(confusion_matrix(y,model.predict(X))))
    print("Cofusion:\n{}".format(confusion_matrix(y_dev,model.predict(X_dev))))

0.0504
Dummy score: 0.9054
Train: 1.0
Test: 0.9648
Cofusion:
[[4653   95]
 [  81  171]]


In [12]:

mlp_6 = MLPClassifier(
    hidden_layer_sizes=(6),
    activation = 'relu',
    max_iter = 1000,
    solver = 'sgd',
    learning_rate_init = .03,
    alpha = 0
)

col_transf = ColumnTransformer([
    ('one_hot',OneHotEncoder(sparse=False),['home_team']),
    ('scaler',StandardScaler(),['launch_speed','launch_angle','field_angle'])
    ],
    remainder = 'passthrough'
)

mlp_6_pipe = Pipeline(
    [('column_transformer', col_transf),
    ('mlp_clf', mlp_6)]
)



In [13]:
mlp_6_pipe.fit(X_train,y_train)

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot',
                                                  OneHotEncoder(sparse=False),
                                                  ['home_team']),
                                                 ('scaler', StandardScaler(),
                                                  ['launch_speed',
                                                   'launch_angle',
                                                   'field_angle'])])),
                ('mlp_clf',
                 MLPClassifier(alpha=0, hidden_layer_sizes=6,
                               learning_rate_init=0.03, max_iter=1000,
                               solver='sgd'))])

In [14]:
performance_summary(mlp_6_pipe,X_train,y_train,X_test,y_test)

0.0504
Dummy score: 0.9016
Train: 0.981
Test: 0.9742
Train Cofusion:
[[9385  102]
 [  88  425]]
Cofusion:
[[4676   72]
 [  57  195]]


## First MLP does about the same as a decision tree (but overfits less.)

Let's try giving it more data.

In [15]:
## We up the training data to the full set.

X_train, X_test, y_train, y_test = train_test_split(temp[batted_ball_data], 
                                                    temp['is_homer'], 
                                                    test_size=10_000, 
                                                    ##train_size=10_000,
                                                    random_state = 12
                                                    )
X_test, X_final_test, y_test, y_final_test = train_test_split(X_test, y_test, test_size = 5000)##I know the trandional is X_dev and X_test but I didn't start out this way
mlp_6_pipe.fit(X_train,y_train)
performance_summary(mlp_6_pipe,X_train,y_train,X_test,y_test)

## Fun, no so fun side note: on my first pass with more data, there was a total failure: 
## it predicted the majority class. (Uhg.)
## I forgot to scale the data, which immediately eliminated the issue.


0.0502
Dummy score: 0.9096
Train: 0.9773509502389902
Test: 0.9792
Train Cofusion:
[[106278   1255]
 [  1318   4752]]
Cofusion:
[[4702   47]
 [  57  194]]


In [17]:
## Let's see if a bigger network can do more...



k_fold = KFold(n_splits=2) #saves time; let's find something that looks promising, then get more rigorous.

parameters = {'hidden_layer_sizes':[6,10,20]}
mlp_grid = GridSearchCV(mlp_6,param_grid=parameters,return_train_score=True, cv=k_fold)
mlp_grid.fit(col_transf.fit_transform(X_train),y_train)

GridSearchCV(cv=KFold(n_splits=2, random_state=None, shuffle=False),
             estimator=MLPClassifier(alpha=0, hidden_layer_sizes=6,
                                     learning_rate_init=0.03, max_iter=1000,
                                     solver='sgd'),
             param_grid={'hidden_layer_sizes': [6, 10, 20]},
             return_train_score=True)

In [19]:
## A larger network slightly over fits but does better on the training set.
mlp_grid.cv_results_

{'mean_fit_time': array([14.41100788, 14.17461848, 24.47932947]),
 'std_fit_time': array([4.05109358, 1.9933486 , 8.08766687]),
 'mean_score_time': array([0.01457548, 0.0177834 , 0.02378428]),
 'std_score_time': array([0.00104523, 0.00062966, 0.00161564]),
 'param_hidden_layer_sizes': masked_array(data=[6, 10, 20],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'hidden_layer_sizes': 6},
  {'hidden_layer_sizes': 10},
  {'hidden_layer_sizes': 20}],
 'split0_test_score': array([0.9743671 , 0.97508891, 0.97635647]),
 'split1_test_score': array([0.97503565, 0.97649689, 0.97720111]),
 'mean_test_score': array([0.97470138, 0.9757929 , 0.97677879]),
 'std_test_score': array([0.00033428, 0.00070399, 0.00042232]),
 'rank_test_score': array([3, 2, 1], dtype=int32),
 'split0_train_score': array([0.97702505, 0.97769405, 0.97975388]),
 'split1_train_score': array([0.97581071, 0.97582832, 0.97776487]),
 'mean_train_score': array([0.97641788,

In [24]:
## Let's try a deeper network. See if we can really capture the training data.


parameters = {'hidden_layer_sizes':[(20,8),(30,10),(40,12)],
                'learning_rate_init' : [.01, .005]} #lower the learning rate. In my experience, deeper networks need to learn a little slower.
mlp_grid = GridSearchCV(mlp_6,param_grid=parameters,return_train_score=True, cv=k_fold)
mlp_grid.fit(col_transf.fit_transform(X_train),y_train)

GridSearchCV(cv=KFold(n_splits=2, random_state=None, shuffle=False),
             estimator=MLPClassifier(alpha=0, hidden_layer_sizes=6,
                                     learning_rate_init=0.03, max_iter=1000,
                                     solver='sgd'),
             param_grid={'hidden_layer_sizes': [(20, 8), (30, 10), (40, 12)],
                         'learning_rate_init': [0.01, 0.005]},
             return_train_score=True)

In [25]:
mlp_grid.cv_results_

{'mean_fit_time': array([29.85540533, 25.91176689, 21.42548037, 36.91480339, 46.65487754,
        37.54543984]),
 'std_fit_time': array([4.81753135, 2.44942605, 5.53138924, 0.11891854, 9.84136808,
        4.8721987 ]),
 'mean_score_time': array([0.03223228, 0.0301404 , 0.03638661, 0.03563738, 0.04789138,
        0.04614401]),
 'std_score_time': array([0.00418973, 0.00170326, 0.00021374, 0.0006547 , 0.00229836,
        0.00048256]),
 'param_hidden_layer_sizes': masked_array(data=[(20, 8), (20, 8), (30, 10), (30, 10), (40, 12),
                    (40, 12)],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate_init': masked_array(data=[0.01, 0.005, 0.01, 0.005, 0.01, 0.005],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'hidden_layer_sizes': (20, 8), 'learning_rate_init': 0.01},
  {'hidden_layer_sizes': (20, 8), 'learning_ra

In [32]:
## Why not?
# 
parameters = {'hidden_layer_sizes':[(60,18,10)],
            #'learning_rate':['invscaling'],
            'learning_rate_init':[.008],
            'random_state':[5]}
k_fold.n_splits = 5 ## Let's make sure these are getting about the same results.
mlp_grid = GridSearchCV(mlp_6,param_grid=parameters,return_train_score=True, cv=k_fold)
mlp_grid.fit(col_transf.fit_transform(X_train),y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=MLPClassifier(alpha=0, hidden_layer_sizes=6,
                                     learning_rate_init=0.03, max_iter=1000,
                                     solver='sgd'),
             param_grid={'hidden_layer_sizes': [(60, 18, 10)],
                         'learning_rate_init': [0.003], 'random_state': [5]},
             return_train_score=True)

In [33]:
mlp_grid.cv_results_

{'mean_fit_time': array([86.45864501]),
 'std_fit_time': array([19.61016358]),
 'mean_score_time': array([0.02781482]),
 'std_score_time': array([0.00113451]),
 'param_hidden_layer_sizes': masked_array(data=[(60, 18, 10)],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_learning_rate_init': masked_array(data=[0.003],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_random_state': masked_array(data=[5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'hidden_layer_sizes': (60, 18, 10),
   'learning_rate_init': 0.003,
   'random_state': 5}],
 'split0_test_score': array([0.97623344]),
 'split1_test_score': array([0.97720171]),
 'split2_test_score': array([0.97491308]),
 'split3_test_score': array([0.97869718]),
 'split4_test_score': array([0.97583627]),
 'mean_test_score': array([0.97657633]),
 'std_test_score': array([0.00129002]),
 'rank_test_score': array([1], dty

In [37]:
parameters = {'hidden_layer_sizes':[(80,30,15), (120,45,22)],
            #'learning_rate':['invscaling'],
            'learning_rate_init':[.005, .002],#,.01,.02],
            'random_state':[5]}
k_fold.n_splits = 2 ## Let's make sure these are getting about the same results.
mlp_grid = GridSearchCV(mlp_6,param_grid=parameters,return_train_score=True, cv=k_fold)

start = time.time()
mlp_grid.fit(col_transf.fit_transform(X_train),y_train)
stop = time.time()
print(stop-start)

1211.920559167862


In [40]:
#results_array = []
#results_array.append(mlp_grid.cv_results_)
#mlp_grid.cv_results_
results_array

[{'mean_fit_time': array([ 65.08505464, 141.37754381, 154.3692373 , 193.35038435]),
  'std_fit_time': array([ 1.42266822, 15.15124643, 89.1627599 ,  9.17383134]),
  'mean_score_time': array([0.09056962, 0.12514114, 0.22130871, 0.20579088]),
  'std_score_time': array([0.00115216, 0.00059223, 0.01740718, 0.05881989]),
  'param_hidden_layer_sizes': masked_array(data=[(80, 30, 15), (80, 30, 15), (120, 45, 22),
                     (120, 45, 22)],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'param_learning_rate_init': masked_array(data=[0.005, 0.002, 0.005, 0.002],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'param_random_state': masked_array(data=[5, 5, 5, 5],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'hidden_layer_sizes': (80, 30, 15),
    'learning_rate_init': 0.005,
    'random_state': 5},


In [17]:
mlp_grid.cv_results_

{'mean_fit_time': array([131.34892829]),
 'std_fit_time': array([34.87962894]),
 'mean_score_time': array([0.2677265]),
 'std_score_time': array([0.00543774]),
 'param_hidden_layer_sizes': masked_array(data=[(200, 80)],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_learning_rate_init': masked_array(data=[0.005],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_random_state': masked_array(data=[5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'hidden_layer_sizes': (200, 80),
   'learning_rate_init': 0.005,
   'random_state': 5}],
 'split0_test_score': array([0.97678779]),
 'split1_test_score': array([0.97781768]),
 'split2_test_score': array([0.97691922]),
 'mean_test_score': array([0.9771749]),
 'std_test_score': array([0.00045767]),
 'rank_test_score': array([1]),
 'split0_train_score': array([0.98118439]),
 'split1_train_score': array([0.98015449]),
 'split

In [148]:
mlp_100_40 = MLPClassifier(
    hidden_layer_sizes=(100,40),
    activation = 'relu',
    max_iter = 1000,
    learning_rate_init = .03,
    alpha = 0
)

mlp_100_40_pipe = Pipeline(
    [('column_transformer', col_transf),
    ('mlp_clf', mlp_100_40)]
)

In [147]:
mlp_100_40_pipe.fit(X_train,y_train)
performance_summary(mlp_100_40_pipe,X_train,y_train,X_test,y_test)

0.0506
Dummy score: 0.9016
Train: 0.9465683124565373
Test: 0.9494
Train Cofusion:
[[107533      0]
 [  6070      0]]
Cofusion:
[[4747    0]
 [ 253    0]]
