In [1]:
from pybaseball import *

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from tqdm.auto import tqdm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import TensorBoard

import pandas as pd
import numpy as np
import os

%matplotlib inline

Using TensorFlow backend.


In [2]:
raw_data = pd.read_csv('data/data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
speed_data = pd.read_csv('data/sprint_speed.csv')

In [4]:
speed_data.head()

Unnamed: 0,last_name,first_name,player_id,team_id,team,position,age,competitive_runs,hp_to_1b,sprint_speed
0,Buxton,Byron,621439,142,MIN,CF,26,38,,30.3
1,Galloway,Isaac,543194,146,MIA,CF,30,19,4.13,30.2
2,Gore,Terrance,605253,118,KC,RF,28,14,,30.2
3,Berti,Jon,542932,146,MIA,3B,29,24,,30.1
4,Mondesi,Adalberto,609275,118,KC,SS,24,55,,30.0


In [5]:
speed_data.rename({
    'last_name': 'batter_last_name',
    ' first_name': 'batter_first_name'
}, axis='columns', inplace=True)

In [6]:
speed_data.columns

Index(['batter_last_name', 'batter_first_name', 'player_id', 'team_id', 'team',
       'position', 'age', 'competitive_runs', 'hp_to_1b', 'sprint_speed'],
      dtype='object')

In [7]:
raw_data.shape

(2189706, 90)

In [8]:
raw_data.columns

Index(['index', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
     

In [9]:
raw_data.bb_type.unique()

array([nan, 'ground_ball', 'fly_ball', 'popup', 'line_drive'],
      dtype=object)

In [10]:
raw_data.events.unique()

array(['strikeout', nan, 'field_out', 'grounded_into_double_play', 'walk',
       'home_run', 'force_out', 'single', 'triple', 'double',
       'hit_by_pitch', 'double_play', 'sac_bunt', 'sac_fly',
       'field_error', 'caught_stealing_2b', 'strikeout_double_play',
       'catcher_interf', 'fielders_choice_out', 'other_out',
       'fielders_choice', 'pickoff_1b', 'batter_interference',
       'sac_fly_double_play', 'run', 'caught_stealing_home',
       'pickoff_caught_stealing_2b', 'caught_stealing_3b', 'pickoff_2b',
       'pickoff_caught_stealing_3b', 'triple_play',
       'pickoff_caught_stealing_home', 'pickoff_3b',
       'sac_bunt_double_play', 'intent_walk'], dtype=object)

In [11]:
data = raw_data.merge(speed_data[['batter_last_name', 'batter_first_name', 'player_id', 'sprint_speed']], left_on='batter', right_on='player_id')
data[['batter_last_name', 'batter_first_name', 'sprint_speed']].head()

Unnamed: 0,batter_last_name,batter_first_name,sprint_speed
0,Machado,Manny,26.6
1,Machado,Manny,26.6
2,Machado,Manny,26.6
3,Machado,Manny,26.6
4,Machado,Manny,26.6


## Preprocessing

In [12]:
# Create outcome feature as target variable. Include sac flies as outs.

is_hit = ['single', 'double', 'triple', 'home_run']
is_out = ['field_out', 'grounded_into_double_play', 'force_out', 'sac_fly', 'fielders_choice_out', 'fielders_choice', 'field_error', 'triple_play']

data = data[data.events.isin(is_hit) | data.events.isin(is_out)]
data['outcome'] = np.where(data.events.isin(is_hit), data.events, 'out')

In [13]:
# Grab only the records and features we're interested in, drop nans

features_of_interest = ['launch_speed', 'launch_angle', 'hc_x', 'hc_y', 'sprint_speed', 'home_team', 'outcome']
numerical_features = ['launch_speed', 'launch_angle', 'hc_x', 'hc_y', 'sprint_speed']
categorical_features = ['home_team', 'outcome']
bb_events = ['ground_ball', 'fly_ball', 'popup', 'line_drive']
data = data[data.bb_type.isin(bb_events)][features_of_interest]
print(data.shape)
data.dropna(inplace=True)
print(data.shape)
data.head()

(271595, 7)
(271166, 7)


Unnamed: 0,launch_speed,launch_angle,hc_x,hc_y,sprint_speed,home_team,outcome
4,99.2,9.0,101.62,143.38,26.6,LAD,out
21,79.9,-9.0,98.79,171.35,26.6,LAD,out
25,104.8,12.0,159.0,64.82,26.6,LAD,single
34,85.2,37.0,161.17,84.05,26.6,LAD,out
40,59.2,68.0,135.29,165.78,26.6,LAD,out


In [14]:
# Standardize numeric data

scaled_data = data.copy()
scaler = StandardScaler().fit(data[numerical_features])
scaled_data[numerical_features] = scaler.transform(data[numerical_features])

scaled_data.head()

Unnamed: 0,launch_speed,launch_angle,hc_x,hc_y,sprint_speed,home_team,outcome
4,0.797615,-0.103445,-0.624566,0.503374,-0.011243,LAD,out
21,-0.586662,-0.768667,-0.694438,1.15211,-0.011243,LAD,out
25,1.199271,0.007425,0.792121,-1.318744,-0.011243,LAD,single
34,-0.206524,0.931344,0.845697,-0.872724,-0.011243,LAD,out
40,-2.071354,2.077004,0.206731,1.022919,-0.011243,LAD,out


In [15]:
# Transform home_team and outcome to categories. Consider adding drop_first=True kwarg to remove one rank from encoding.

# enc = OneHotEncoder().fit(scaled_data[categorical_features])
# print(enc.categories_)
# pd.DataFrame(enc.transform(scaled_data[categorical_features]).toarray()).head()

scaled_data = pd.get_dummies(scaled_data)

## Phew. The data is preprocessed.

In [16]:
scaled_data.head()

Unnamed: 0,launch_speed,launch_angle,hc_x,hc_y,sprint_speed,home_team_ARI,home_team_ATL,home_team_BAL,home_team_BOS,home_team_CHC,...,home_team_STL,home_team_TB,home_team_TEX,home_team_TOR,home_team_WSH,outcome_double,outcome_home_run,outcome_out,outcome_single,outcome_triple
4,0.797615,-0.103445,-0.624566,0.503374,-0.011243,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
21,-0.586662,-0.768667,-0.694438,1.15211,-0.011243,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
25,1.199271,0.007425,0.792121,-1.318744,-0.011243,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
34,-0.206524,0.931344,0.845697,-0.872724,-0.011243,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
40,-2.071354,2.077004,0.206731,1.022919,-0.011243,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# Split the data into X and y variables, get train/test split

seed = 34
np.random.seed(seed)

X = scaled_data.iloc[:, :-5]
y = scaled_data.iloc[:, -5:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
print(X_train.shape)
print(y_train.shape)

(189816, 35)
(189816, 5)


In [18]:
def create_model(optimizer='adam', init='glorot_uniform'):
    # Create model
    model = Sequential()
    model.add(Dense(20, input_shape=(35,), kernel_initializer=init, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(10, kernel_initializer=init, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(5, kernel_initializer=init, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [31]:
# Do a GridSearch

model = KerasClassifier(build_fn=create_model, verbose=10, callbacks=TensorBoard())
# grid search epochs, batch size and optimizer
optimizers = ['adam']
init = ['glorot_uniform']
# epochs = [50, 100, 150]
# batches = [5, 10, 20]
epochs = [1]
batches = [20]
param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=10)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

RuntimeError: Cannot clone object <keras.wrappers.scikit_learn.KerasClassifier object at 0x000001F380ED91D0>, as the constructor either does not set or modifies parameter callbacks

In [None]:
# Don't do a grid search

In [22]:
model = KerasClassifier(build_fn=create_model, epochs=100, verbose=10, callbacks=[TensorBoard()])
model_result = model.fit(X_train, y_train)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


KeyboardInterrupt: 