Pitch Predictor yay!

Step 1) Get the goods: mount Google Drive and clone the pitch-predictor repository

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Clone the project into ./pitchpredictor unless that directory already exists,
# then rebase the checkout onto the latest upstream commits.
![ -d pitchpredictor ] || git clone https://github.com/pdex/pitch-predictor pitchpredictor
!(cd pitchpredictor; git pull --rebase)

Cloning into 'pitchpredictor'...
remote: Enumerating objects: 87, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 87 (delta 40), reused 61 (delta 23), pack-reused 0[K
Unpacking objects: 100% (87/87), done.
Already up to date.
Current branch master is up to date.


Step 2) Get the data (install the project's requirements first)

In [1]:
# Install the dependencies listed in the cloned repository; this needs the
# ./pitchpredictor checkout created by the clone cell to exist first.
!pip install -r pitchpredictor/requirements.txt



# load game years

This defines the functionality to load games from disk. The actual loading will happen further down.

In [0]:
# define event classes



class XPlayEvent:
  """Plain record holding the fields of a single play event.

  Every constructor argument is optional and defaults to ``None``; values are
  stored as given, under an attribute of the same name, with no validation or
  conversion.
  """

  def __init__(self, max_pitches=None, start_time=None, play_result=None,
               inning=None, away_bat=None, away_score=None, home_score=None,
               pitcher_id=None, batter_id=None, men_on=None, pitcher_team=None,
               batter_team=None, pitcher=None, batter=None, batter_score=None,
               pitcher_score=None):
    # Listed in signature order so attributes are created in that same order.
    field_values = (
      ("max_pitches", max_pitches),
      ("start_time", start_time),
      ("play_result", play_result),
      ("inning", inning),
      ("away_bat", away_bat),
      ("away_score", away_score),
      ("home_score", home_score),
      ("pitcher_id", pitcher_id),
      ("batter_id", batter_id),
      ("men_on", men_on),
      ("pitcher_team", pitcher_team),
      ("batter_team", batter_team),
      ("pitcher", pitcher),
      ("batter", batter),
      ("batter_score", batter_score),
      ("pitcher_score", pitcher_score),
    )
    for field_name, field_value in field_values:
      setattr(self, field_name, field_value)


class XPitchEvent:
  """Plain record holding the fields of a single pitch.

  ``pe`` is the play-level record this pitch belongs to; the rest of this file
  reads ``pe.pitcher``, ``pe.batter``, ``pe.start_time`` and similar fields
  through it. All arguments default to ``None`` and are stored unchanged.
  """

  def __init__(self, pe=None, balls=None, strikes=None, outs=None,
               pitch_num=None, pitch=None, prior_seq=None):
    # Listed in signature order so attributes are created in that same order.
    field_values = (
      ("pe", pe),
      ("balls", balls),
      ("strikes", strikes),
      ("outs", outs),
      ("pitch_num", pitch_num),
      ("pitch", pitch),
      ("prior_seq", prior_seq),
    )
    for field_name, field_value in field_values:
      setattr(self, field_name, field_value)


In [4]:
from pitchpredictor import EventsLoader
#from pitchpredictor import XPlayEvent
#from pitchpredictor import XPitchEvent
from pitchpredictor import Fetcher
from pitchpredictor import GameParser
from pitchpredictor import setup_parameters
import random
from collections import Counter

from tqdm.auto import tqdm


def group_play_events(events):
  """Return the distinct play records referenced by a list of pitch events.

  Plays are keyed by their ``start_time``: when several pitch events carry a
  play with the same start time, the play from the last such event is kept,
  at the position where that start time was first seen.
  """
  plays_by_start = {}
  for pitch_event in tqdm(events):
    play = pitch_event.pe
    plays_by_start[play.start_time] = play
  return [*plays_by_start.values()]


def extract_players(events):
  """Build the sorted vocabularies used by the rest of the pipeline.

  Args:
    events: pitch events; each exposes ``pitch`` and a play record ``pe``
      with ``pitcher``, ``batter`` and ``play_result``.

  Returns:
    A tuple ``(all_players, all_pitchtypes, all_outcometypes)`` of sorted
    lists: every pitcher or batter, every pitch type, and every play result
    seen in ``events``. A short summary of each list is printed as well.
  """
  all_players = {x.pe.pitcher for x in tqdm(events)}
  all_players.update(x.pe.batter for x in tqdm(events))
  all_players = sorted(all_players)
  # random.sample raises ValueError when asked for more items than the
  # population holds, so cap the preview size for small datasets.
  example_players = random.sample(all_players, min(3, len(all_players)))
  print("ALL PLAYERS[",len(all_players),"] ( e.g., ",example_players,")")

  all_pitchtypes = sorted({x.pitch for x in tqdm(events)})
  print("ALL PITCHTYPES[",len(all_pitchtypes),"]:", all_pitchtypes)

  all_outcometypes = sorted({x.pe.play_result for x in tqdm(events)})
  print("ALL RESULTS[",len(all_outcometypes),"]:", all_outcometypes)
  return all_players, all_pitchtypes, all_outcometypes

def vectorize_players(all_players, xpitch_list, xplay_list, all_pitchtypes, all_outcometypes):
  """Build one count vector per player from pitch and play events.

  Each pitch is counted for both its pitcher and its batter, and each play
  result is likewise counted for both participants.

  Args:
    all_players: list of player names; each becomes a key of the result.
    xpitch_list: pitch events exposing ``pitch`` and ``pe.pitcher``/``pe.batter``.
    xplay_list: play records exposing ``pitcher``, ``batter``, ``play_result``.
    all_pitchtypes: list of pitch types fixing the order of the first
      ``len(all_pitchtypes)`` vector entries.
    all_outcometypes: list of play results fixing the order of the remaining
      entries.

  Returns:
    Dict mapping each player to a list of ints: pitch-type counts followed by
    outcome counts.

  Raises:
    KeyError: if an event references a player missing from ``all_players``.
  """
  print("Vectorizing players...",flush=True)
  # Tally straight into Counters rather than first storing one list entry per
  # pitch/play for every player and counting afterwards.
  all_pitch_counters = {player: Counter() for player in all_players}
  all_outcome_counters = {player: Counter() for player in all_players}
  for elt in xpitch_list:
    all_pitch_counters[elt.pe.pitcher][elt.pitch] += 1
    all_pitch_counters[elt.pe.batter][elt.pitch] += 1
  for elt in xplay_list:
    all_outcome_counters[elt.pitcher][elt.play_result] += 1
    all_outcome_counters[elt.batter][elt.play_result] += 1

  all_vectors = {}
  for player in tqdm(all_pitch_counters):
    pitch_counts = all_pitch_counters[player]
    outcome_counts = all_outcome_counters[player]
    all_vectors[player] = [pitch_counts[pitchtype] for pitchtype in all_pitchtypes]
    all_vectors[player].extend([outcome_counts[outcometype] for outcometype in all_outcometypes])

  # Show one random player's vector under a header row as a sanity check;
  # skipped when there are no players, since random.choice([]) raises IndexError.
  if all_players:
    random_player = random.choice(all_players)
    print(",".join(all_pitchtypes + all_outcometypes))
    print(",".join(map(lambda x : "{:2d}".format(x), all_vectors[random_player])))
  print("DONE.",flush=True)
  return all_vectors


# modeling
from collections import Counter
from os.path import dirname, exists
import os
from os import path, makedirs
from sklearn.model_selection import train_test_split
from time import strftime, localtime
import bz2
import copy
import numpy as np
import pickle
import random
from tqdm.keras import TqdmCallback
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Activation, BatchNormalization
from keras.models import Sequential
from keras.optimizers import SGD
from keras.utils import to_categorical


def encode(data, vocab):
  """One-hot encode ``data`` by its position in ``vocab``.

  The position comes from ``vocab.index(data)``, so a value that is absent
  from ``vocab`` raises ``ValueError``. The encoding has ``len(vocab)`` classes.
  """
  position = vocab.index(data)
  return to_categorical(position, num_classes=len(vocab))


def decode(datum, vocab):
  """Return the ``vocab`` entry at the position of the largest value in ``datum``."""
  best_position = np.argmax(datum)
  return vocab[best_position]


def pitch_event_to_array(pitch_event, all_pitchtypes, all_vectors):
  """Turn one pitch event into the model's input feature vector.

  Layout of the returned array: balls, strikes, outs, batter score, pitcher
  score, inning, index of the previous pitch type (``-1`` when there is no
  previous pitch), then the batter's vector followed by the pitcher's vector.

  Args:
    pitch_event: event exposing ``balls``, ``strikes``, ``outs``,
      ``prior_seq`` and a play record ``pe``.
    all_pitchtypes: list of pitch types; gives the index of the previous pitch.
    all_vectors: dict mapping player to that player's count vector.

  Returns:
    A 1-D ``numpy`` array of the concatenated features.

  Raises:
    ValueError: if the previous pitch is not in ``all_pitchtypes``.
    KeyError: if the batter or pitcher has no entry in ``all_vectors``.
  """
  prior_seq = pitch_event.prior_seq
  # XPitchEvent defaults prior_seq to None, so treat None like an empty
  # history instead of crashing in len().
  if prior_seq is not None and len(prior_seq) > 0:
    last_pitch_id = all_pitchtypes.index(prior_seq[-1])
  else:
    last_pitch_id = -1
  out = [
    pitch_event.balls,
    pitch_event.strikes,
    pitch_event.outs,
    pitch_event.pe.batter_score,
    pitch_event.pe.pitcher_score,
    pitch_event.pe.inning,
    last_pitch_id
  ]
  out.extend(all_vectors[pitch_event.pe.batter])
  out.extend(all_vectors[pitch_event.pe.pitcher])
  return np.array(out)


def pitch_event_to_result(pitch_event, all_pitchtypes):
  """Return the training target for an event: its pitch type encoded via ``encode``."""
  thrown_pitch = pitch_event.pitch
  return encode(thrown_pitch, all_pitchtypes)


def train_model(base_dir, xevent_list, all_pitchtypes, all_vectors, start_time_str, num_epochs):
  """Train a dense softmax classifier that predicts each event's pitch type.

  10% of the events are held out and returned for later evaluation; of the
  remainder, 5% is used as validation data during fitting. A checkpoint is
  written through ``ModelCheckpoint`` under ``<base_dir>/ckpt`` and the final
  model is saved to ``<base_dir>/models``.

  Args:
    base_dir: root folder for the ``ckpt`` and ``models`` output folders.
    xevent_list: pitch events to learn from.
    all_pitchtypes: list of pitch types; defines the output classes.
    all_vectors: dict mapping player to that player's count vector.
    start_time_str: string embedded in checkpoint and model file names.
    num_epochs: number of training epochs.

  Returns:
    A tuple ``(model, eval_xevent_list)``: the fitted model and the held-out
    events.
  """
  # Print one random feature vector as a quick visual sanity check.
  print(pitch_event_to_array(random.choice(xevent_list), all_pitchtypes, all_vectors), flush=True)

  print("Splitting for evaluation...",end="",flush=True)
  xevent_list, eval_xevent_list = train_test_split(xevent_list,test_size=0.1)
  print("DONE.",flush=True)


  print("Making input/output arrays...",end="",flush=True)
  input_array = np.array([pitch_event_to_array(x, all_pitchtypes, all_vectors) for x in xevent_list])
  output_array = np.array([pitch_event_to_result(x, all_pitchtypes) for x in xevent_list])
  print("DONE.",flush=True)

  print("Building model...",end="",flush=True)
  # NOTE(review): only the last hidden Dense layer is followed by an
  # activation; the layers before it have none. Confirm this is intended.
  model = Sequential()
  model.add(Dense(1000, input_shape=input_array[0].shape))
  model.add(BatchNormalization())
  model.add(Dense(1000))
  model.add(Dense(1000))
  model.add(Activation('relu'))
  model.add(Dense(len(output_array[0]), activation='softmax'))
  optimizer = SGD()
  print("DONE.")
  # summary() prints the table itself; wrapping it in print() would add a
  # stray "None" line after it.
  model.summary()

  model.compile(loss='kullback_leibler_divergence', metrics=['accuracy'], optimizer=optimizer)

  checkpoint_path = os.path.join(base_dir, "ckpt/pp-" + start_time_str + "-{epoch:04d}.ckpt")
  model_path = os.path.join(base_dir, "models/model-" + start_time_str + ".h5")
  # Create both output folders up front so a missing directory cannot make a
  # checkpoint or the final save fail after training has already run.
  for output_dir in (dirname(checkpoint_path), dirname(model_path)):
    os.makedirs(output_dir, exist_ok=True)

  cp_callback = ModelCheckpoint(filepath=checkpoint_path)

  # this split is more for just basic sanity checking at checkpoints.
  X_train, X_test, y_train, y_test = train_test_split(input_array, output_array, test_size=0.05)
  print("len(x)", len(X_train))
  print("len(y)", len(y_train))
  batch_size = 32
  tqdm_callback = TqdmCallback(epochs=num_epochs, data_size=len(X_train), batch_size=batch_size, verbose=2)
  callbacks = [cp_callback, tqdm_callback]
  # verbose=0 silences Keras' own progress output; TqdmCallback reports instead.
  model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=0,
    callbacks=callbacks,
    validation_data=(X_test,y_test)
  )
  model.save(model_path)
  # Scored on the same split that served as validation data during fit.
  loss, acc = model.evaluate(X_test,y_test, verbose=2)
  print("Model accuracy: {:5.2f}%".format(100*acc))
  return model, eval_xevent_list


def evaluate_model(model, eval_xevent_list, all_pitchtypes, all_vectors):
  """Score the model on held-out pitch events and print summary percentages.

  For every event the predicted probability of the pitch actually thrown is
  checked against fixed probability thresholds (5%, 10%, 20%), against the
  top-3 cut-off, and against the single most likely class.

  Args:
    model: object whose ``predict`` takes a 2-D feature array and returns one
      row of class probabilities per input row.
    eval_xevent_list: held-out pitch events; each exposes ``pitch``.
    all_pitchtypes: list of pitch types, in the order of the model outputs.
    all_vectors: dict mapping player to that player's count vector.

  Returns:
    None; results are printed.
  """
  thresholds = [0.05,0.1,0.2]
  place_threshold = 3
  success_above_threshold = Counter()
  success_above_rank = 0
  exact_success = 0
  validation_sample_size = len(eval_xevent_list)

  print("Evaluating against {} samples...".format(validation_sample_size),end="",flush=True)
  if validation_sample_size == 0:
    # Nothing to score; the percentages below would divide by zero.
    print("DONE.")
    print("No evaluation samples; skipping metrics.")
    return

  # Build all feature rows first and predict in one call rather than invoking
  # model.predict once per sample.
  feature_matrix = np.array(
    [pitch_event_to_array(x, all_pitchtypes, all_vectors) for x in tqdm(eval_xevent_list)])
  prediction_rows = np.asarray(model.predict(feature_matrix)).tolist()

  for x, prediction_list in zip(eval_xevent_list, prediction_rows):
    index_pitch = all_pitchtypes.index(x.pitch)
    actual_pitch_prob = prediction_list[index_pitch]
    index_predmax = np.argmax(prediction_list)
    ranked_probs = sorted(prediction_list,reverse=True)
    # The probability just outside the top `place_threshold` is the bar to
    # beat; with too few classes every class is already inside the top ranks.
    if len(ranked_probs) > place_threshold:
      nth_rank = ranked_probs[place_threshold]
    else:
      nth_rank = float("-inf")
    for threshold in thresholds:
      if actual_pitch_prob > threshold:
        success_above_threshold[threshold] += 1
    if actual_pitch_prob > nth_rank:
      success_above_rank += 1
    if index_pitch == index_predmax:
      exact_success += 1
  print("DONE.")

  for threshold in thresholds:
    percent_above_threshold = (100.0*success_above_threshold[threshold])/validation_sample_size
    print("{:2.2f}% above {:2.0f}% predicted chance".format(percent_above_threshold,(100*threshold)))
  percent_above_rank = (100.0*success_above_rank)/validation_sample_size
  print("{:2.2f}% at rank {} or better".format(percent_above_rank,place_threshold))
  percent_exact_correct = (100.0*exact_success)/validation_sample_size
  print("{:2.2f}% exact success".format(percent_exact_correct))


def main(base_dir):
  """Run the whole pipeline: fetch, load, vectorize, train, then evaluate.

  ``base_dir`` is passed to ``Fetcher``, ``EventsLoader`` and ``train_model``
  as the folder they work under.
  """
  data_years = [2016,2017,2018,2019]

  # Call fetch_year for every season before any loading starts.
  downloader = Fetcher(base_dir)
  for season in data_years:
    downloader.fetch_year(season)

  game_parser = GameParser(XPitchEvent, XPlayEvent)
  events_loader = EventsLoader(base_dir, game_parser)
  # setup_parameters decides the years and epoch count actually used
  # (a manual override would be: years = data_years, num_epochs = 50).
  years, num_epochs = setup_parameters(data_years, testing=False)
  _valid_years, pitch_events = events_loader.load_years(years)

  play_events = group_play_events(pitch_events)
  all_players, all_pitchtypes, all_outcometypes = extract_players(pitch_events)
  vectors = vectorize_players(
    all_players, pitch_events, play_events, all_pitchtypes, all_outcometypes)

  # Timestamp embedded in checkpoint and model file names.
  start_time_str = strftime("%Y%m%d-%H%M",localtime())
  model, eval_xevent_list = train_model(
    base_dir, pitch_events, all_pitchtypes, vectors, start_time_str, num_epochs)
  evaluate_model(model, eval_xevent_list, all_pitchtypes, vectors)


# Relative path of the project folder; handed to main() as its base_dir.
drive_dir = "drive/My Drive/pitch-predictor"

# Run the full pipeline defined above.
main(drive_dir)

2016 :  drive/My Drive/pitch-predictor/data/schedule-p4-2016.pkl.bz2
drive/My Drive/pitch-predictor/data/schedule-p4-2016.pkl.bz2 exists. Loading...DONE.


HBox(children=(FloatProgress(value=0.0, max=2967.0), HTML(value='')))


DONE.
2017 :  drive/My Drive/pitch-predictor/data/schedule-p4-2017.pkl.bz2
drive/My Drive/pitch-predictor/data/schedule-p4-2017.pkl.bz2 exists. Loading...DONE.


HBox(children=(FloatProgress(value=0.0, max=3020.0), HTML(value='')))


DONE.
2018 :  drive/My Drive/pitch-predictor/data/schedule-p4-2018.pkl.bz2
drive/My Drive/pitch-predictor/data/schedule-p4-2018.pkl.bz2 exists. Loading...DONE.


HBox(children=(FloatProgress(value=0.0, max=2969.0), HTML(value='')))


DONE.
2019 :  drive/My Drive/pitch-predictor/data/schedule-p4-2019.pkl.bz2
drive/My Drive/pitch-predictor/data/schedule-p4-2019.pkl.bz2 exists. Loading...DONE.


HBox(children=(FloatProgress(value=0.0, max=2965.0), HTML(value='')))


DONE.
year(2017): 713821 events loaded
year(2019): 695296 events loaded
year(2016): 707611 events loaded
year(2018): 691778 events loaded
YEARS LOADED: 2017 2019 2016 2018


HBox(children=(FloatProgress(value=0.0, max=2808506.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2808506.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2808506.0), HTML(value='')))


ALL PLAYERS[ 3227 ] ( e.g.,  ['Will Craig', 'Colin Bray', 'Louie Lechich'] )


HBox(children=(FloatProgress(value=0.0, max=2808506.0), HTML(value='')))


ALL PITCHTYPES[ 16 ]: ['AB', 'CH', 'CU', 'EP', 'FC', 'FF', 'FO', 'FS', 'FT', 'IN', 'KC', 'KN', 'PO', 'SC', 'SI', 'SL']


HBox(children=(FloatProgress(value=0.0, max=2808506.0), HTML(value='')))


ALL RESULTS[ 4 ]: ['field_out', 'hit', 'strikeout', 'walk']
Vectorizing players...


HBox(children=(FloatProgress(value=0.0, max=3227.0), HTML(value='')))


AB,CH,CU,EP,FC,FF,FO,FS,FT,IN,KC,KN,PO,SC,SI,SL,field_out,hit,strikeout,walk
 0,143,150, 0,87,501, 0,21,183, 0,17, 0, 1, 0,111,340,206,105,122,18
DONE.
[   0    1    0    5    0    5   -1    0  893  732    2  453 3140    1
  129  990    0  210   11    3    0  645 1273 1093  478  438  185    0
    8    0    0   94  293    0    0    0    0    0    0    0    0    0
   87   59   25   28   13]
Splitting for evaluation...DONE.
Making input/output arrays...DONE.
Building model...DONE.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1000)              48000     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000)              4000      
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
________________________________

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75039.75), HTML(value='')))



Model accuracy: 45.35%
Evaluating against 280851 samples...

HBox(children=(FloatProgress(value=0.0, max=280851.0), HTML(value='')))


DONE.
97.47% above  5% predicted chance
91.74% above 10% predicted chance
71.69% above 20% predicted chance
86.57% at rank 3 or better
45.53% exact success
