In [315]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer

Using TensorFlow backend.


In [265]:
# Event labels that are counted as a positive outcome when the binary
# `obp` column is derived from the `events` column.
obp = [
    'single',
    'double',
    'home_run',
    'hit_by_pitch',
    'field_error',
    'walk',
    'triple',
    'fielders_choice',
    'intent_walk',
    'run',
    'catcher_interf',
]

In [266]:
# Seed the combined frame with the first pitcher file.
df = pd.read_csv('pitcher_data/545333_data.csv')

# Keep only the columns at positions 5..8 and discard rows with missing values.
unused_columns = df.columns[:5].append(df.columns[9:])
df = df.drop(columns=unused_columns).dropna()

# Remove rows whose event text mentions caught / pickoff / defensive.
is_excluded = df['events'].str.contains('caught|pickoff|defensive')
df = df[~is_excluded]

# 1 when the event is one of the labels listed in `obp`, otherwise 0.
df['obp'] = df['events'].isin(obp).astype(int)
full_df = df

In [267]:
# Load every remaining pitcher CSV, apply the same cleaning as for the first
# file, and stack everything onto `full_df`.
#
# Frames are collected in a list and concatenated once: `DataFrame.append`
# was removed in pandas 2.0 and re-copied the accumulated frame on every
# iteration. The directory listing is sorted so the row order is deterministic.
frames = [full_df]
for filename in sorted(os.listdir('pitcher_data/')):
    # The first file is already in `full_df`; skip it and any non-CSV entry.
    if not filename.endswith('.csv') or filename == '545333_data.csv':
        continue
    df = pd.read_csv('pitcher_data/' + filename)
    # Keep only the columns at positions 5..8 and discard incomplete rows.
    df = df.drop(columns=df.columns[:5].append(df.columns[9:])).dropna()
    # Remove caught / pickoff / defensive events; `.copy()` makes the result
    # an independent frame so adding a column below is not a chained assignment.
    df = df[~df['events'].str.contains('caught|pickoff|defensive')].copy()
    df['obp'] = df['events'].isin(obp).astype(int)
    frames.append(df)
full_df = pd.concat(frames)

In [348]:
# Player lookup tables; `batters.player_id` is used later to restrict the
# match-ups to listed batters.
pitchers, batters = map(pd.read_csv, ('2019_pitchers.csv', '2019_batters.csv'))

In [329]:
# Collapse the row-level data to one row per (batter, pitcher) pair.
matchup_keys = ['batter', 'pitcher']

# Keep only pairs that have more than 10 rows.
df = full_df.groupby(matchup_keys).filter(lambda group: len(group) > 10)

# Average the numeric columns per pair, so `obp` becomes the fraction of rows
# flagged 1. `numeric_only=True` is required on pandas >= 2.0, where averaging
# the string `events` column raises TypeError instead of being dropped silently.
new_df = df.groupby(matchup_keys, as_index=False).mean(numeric_only=True)

# Restrict to batters present in the `batters` table and renumber the rows.
final_df = new_df[new_df['batter'].isin(batters['player_id'])]
final_df = final_df.reset_index(drop=True)

In [330]:
# Map each raw player id to a dense 0-based integer code so the ids can be
# used as Embedding indices.
pitcher_codes, _ = pd.factorize(final_df['pitcher'])
batter_codes, _ = pd.factorize(final_df['batter'])
final_df['pitcher_factorized'] = pitcher_codes
final_df['batter_factorized'] = batter_codes

# Vocabulary sizes for the two embedding tables.
num_batters = len(final_df['batter'].unique())
num_pitchers = len(final_df['pitcher'].unique())

# Model

In [340]:
def build_model():
    """Build and compile the pitcher/batter embedding regressor.

    Each of the two inputs is a single integer id that is looked up in its
    own 5-dimensional embedding table (sized by the module-level
    ``num_pitchers`` / ``num_batters``). The two embeddings are flattened,
    concatenated, and passed through dense ReLU layers of 200, 100, 50 and 20
    units, followed by a single linear output unit.

    The model is compiled with mean-squared-error loss and the Adam
    optimizer. Mean absolute error is reported as the metric: the output is
    a continuous value, so classification accuracy carries no information.

    Returns:
        The compiled Keras ``Model`` taking ``[pitcher_ids, batter_ids]``.
        The layer summary is printed as a side effect.
    """
    input_pitcher = layers.Input(shape=(1,))
    pitcher_embedding = layers.Embedding(num_pitchers, 5)(input_pitcher)
    input_batter = layers.Input(shape=(1,))
    batter_embedding = layers.Embedding(num_batters, 5)(input_batter)

    # Embedding output is (batch, 1, 5); flatten to (batch, 5) before merging.
    pitcher_flat = layers.Flatten()(pitcher_embedding)
    batter_flat = layers.Flatten()(batter_embedding)
    hidden = layers.Concatenate()([pitcher_flat, batter_flat])

    # Progressively narrower fully connected stack.
    for units in (200, 100, 50, 20):
        hidden = layers.Dense(units, activation='relu')(hidden)

    pred = layers.Dense(1, activation="linear")(hidden)
    model = Model([input_pitcher, input_batter], pred)
    # 'mae' replaces the original 'accuracy', which is meaningless for regression.
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()
    return model

In [341]:
# Hold out 10% of the (pitcher, batter) pairs for validation. The three
# arrays are split with the same shuffle so rows stay aligned. A fixed
# `random_state` makes the split identical on every run.
(train_pitcher, test_pitcher,
 train_batter, test_batter,
 train_labels, test_labels) = train_test_split(
    final_df['pitcher_factorized'],
    final_df['batter_factorized'],
    final_df['obp'],
    test_size=0.1,
    random_state=42,
)

In [342]:
# Drop the reference to any previously built model before creating a new one.
model = None
model = build_model()

# Train on the training pairs and evaluate on the held-out pairs each epoch.
training_inputs = [train_pitcher, train_batter]
held_out = ([test_pitcher, test_batter], test_labels)
history = model.fit(
    training_inputs,
    train_labels,
    validation_data=held_out,
    epochs=100,
    batch_size=32,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 5)         2065        input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 5)         2225        input_6[0][0]                    
__________________________________________________________________________________________________
flatten_5 

Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100


Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [343]:
# Predicted value for every held-out (pitcher, batter) pair.
held_out_inputs = [test_pitcher, test_batter]
preds = model.predict(held_out_inputs)

In [344]:
# First 20 predictions with the singleton output axis removed.
preds[:20].squeeze()

array([0.55220854, 0.3218457 , 0.33141193, 0.33235413, 0.33718342,
       0.3272677 , 0.213367  , 0.22204569, 0.25352243, 0.33906928,
       0.45192206, 0.13417628, 0.48530632, 0.11167203, 0.3653856 ,
       0.32060146, 0.12423144, 0.28989902, 0.22881377, 0.5126    ],
      dtype=float32)

In [345]:
# First 20 held-out target values, for side-by-side comparison with the predictions.
test_labels.iloc[:20].to_numpy()

array([0.41176471, 0.42105263, 0.29411765, 0.25      , 0.42857143,
       0.28571429, 0.5       , 0.31578947, 0.25490196, 0.53846154,
       0.42424242, 0.4375    , 0.2       , 0.46428571, 0.27272727,
       0.34782609, 0.21428571, 0.27272727, 0.24390244, 0.3125    ])