## NOTES

---

Features:
- Shifting?
- Home/Away

Per pitcher models

Exponential Smoothing

Permutation Importances

In [1]:
%%capture
# The cell magic above must stay on the first line; it hides this cell's output.
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    # NOTE(review): category_encoders is pinned to 2.*, pybaseball is unpinned
    !pip install category_encoders==2.*
    !pip install pybaseball

# If you're working locally:
else:
    # Only defined on the local branch; not referenced in the cells visible here
    DATA_PATH = '../data/'

In [48]:
# import os
import numpy as np
import pandas as pd
import category_encoders as ce
from pybaseball import statcast_pitcher

# Remove pandas' display limits so wide/long frames are shown without truncation.
# NOTE(review): with max_rows=None, displaying a large frame prints every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## DATA PROCESSING

---


In [3]:
def pitch_encoding(df):
  '''
  One-hot encodes the 'previous_pitch' column into separate indicator columns.

  The input frame is not modified; a new frame is returned containing the
  original columns plus one encoder-generated column per previous-pitch
  category. The indicator for a missing previous pitch
  ('previous_pitch_nan') is removed when the encoder produced it.

  Parameters
  ----------
  df : DataFrame with a 'previous_pitch' column

  Returns
  -------
  DataFrame with the indicator columns appended
  '''
  df = df.copy()

  encoder = ce.OneHotEncoder(use_cat_names=True)
  encoded = encoder.fit_transform(df['previous_pitch'])
  # careful on the 'on' parameter, may cause issues later
  # (the merge key comes back as a 'key_0' column, removed just below)
  df = pd.merge(df, encoded, how='outer', on=df.index)
  df = df.drop('key_0', axis=1)

  # The NaN indicator only exists when some row has no previous pitch, so
  # tolerate its absence instead of raising KeyError.
  # MAY WANT TO CHANGE THIS NAN SOMETIME
  df = df.drop('previous_pitch_nan', axis=1, errors='ignore')

  return df

In [4]:
def pitch_tendency(df, window):
  '''
  Adds a rolling-mean "tendency" column for each previous-pitch indicator.

  For every column whose name starts with 'previous_pitch_', a column named
  '<last two characters of that name>_tendency_<window>' is added holding
  the mean of the indicator over the trailing `window` rows.

  Original author's note: this is an MVP. Because min_periods=1 is used, rows
  earlier than `window` are averaged over fewer rows, and those averages
  include the first row (all zeros), so the tendencies do not sum to 1.0
  there. Removing min_periods=1 was suggested as a possible change.
  '''
  result = df.copy()

  prefix = 'previous_pitch_'
  indicator_names = [name for name in result.columns
                     if name.startswith(prefix)]

  for name in indicator_names:
    trailing_mean = result[name].rolling(window, min_periods=1).mean()
    result[f'{name[-2:]}_tendency_{window}'] = trailing_mean

  return result

In [5]:
def pitch_counter(df):
  '''
  Adds a cumulative count column for each specific pitch type.

  For every column whose name starts with 'previous_pitch_', a column named
  'count_<last two characters of that name>' is added holding the running
  number of rows (up to and including the current one) where that indicator
  equals 1. The input frame is not modified.
  '''
  df = df.copy()

  pitches = [col for col in df if col.startswith('previous_pitch_')]

  for pitch in pitches:
    # boolean mask -> running total of True values
    df[f'count_{pitch[-2:]}'] = (df[pitch] == 1).cumsum()

  return df

In [6]:
def total_pitches(df):
  '''
  Adds 'pitch_total_prior': for each row, how many earlier rows share its
  'game_date' (0 for the first row of each date), i.e. the pitches thrown
  before the current/target pitch in that game.
  '''
  rows_before = df.groupby('game_date').cumcount()

  # assign() returns a new frame, leaving the caller's frame untouched
  return df.assign(pitch_total_prior=rows_before)

In [7]:
def score_margin(df):
  '''
  Replaces 'fld_score' and 'bat_score' with a single 'score_margin' column
  equal to fld_score minus bat_score (the margin for the pitcher's team).
  '''
  margin = df['fld_score'] - df['bat_score']

  without_scores = df.drop(columns=['fld_score', 'bat_score'])
  return without_scores.assign(score_margin=margin)

In [8]:
def pitcher_advantage(df):
  '''
  Generates the categorical 'pitcher_advantage' variable from the count.

  Each (balls, strikes) pair is looked up in a fixed table and labelled
  'ahead', 'neutral' or 'behind'. Pairs that are not in the table (including
  rows where either value is missing or non-numeric) get NaN.

  The lookup is done on numeric values, so integer, float (e.g. 1.0) and
  numeric-string columns all map the same way. No helper column is written
  to the frame, and the input frame is not modified.
  '''
  df = df.copy()

  advantages = {(0, 0): 'neutral', (0, 1): 'ahead', (0, 2): 'ahead',
                (1, 0): 'neutral', (1, 1): 'neutral', (1, 2): 'ahead',
                (2, 0): 'behind', (2, 1): 'neutral', (2, 2): 'ahead',
                (3, 0): 'behind', (3, 1): 'behind', (3, 2): 'neutral'}

  balls = pd.to_numeric(df['balls'], errors='coerce')
  strikes = pd.to_numeric(df['strikes'], errors='coerce')

  # 1.0 == 1 and both hash alike, so float counts hit the integer keys
  df['pitcher_advantage'] = [advantages.get(count, np.nan)
                             for count in zip(balls, strikes)]

  return df

In [9]:
def baserunners_mapping(df):
  '''
  Converts the three base columns to booleans (True where a value is
  present) and adds 'baserunner', which is True when any base is occupied.
  '''
  mapped = df.copy()

  bases = ('on_1b', 'on_2b', 'on_3b')
  for base in bases:
    mapped[base] = mapped[base].notna()

  # row-wise OR across the three boolean base columns
  mapped['baserunner'] = mapped[list(bases)].any(axis=1)

  return mapped

In [10]:
def at_bat_counter(df):
  '''
  Replaces 'at_bat_number' with 'at_bat_count', which numbers the distinct
  at_bat_number values 1, 2, 3, ... in ascending order (the number of
  batters faced).
  '''
  batter_rank = df.groupby('at_bat_number').ngroup() + 1

  numbered = df.assign(at_bat_count=batter_rank)
  return numbered.drop(columns='at_bat_number')

In [11]:
def pitch_cleaning(df):
  '''
  Cleans the target column 'pitch_type': rows with a missing pitch_type
  (bad data) are dropped, the codes are lower-cased, pitch outs ('po') are
  removed, and the 'pitch_name' column is discarded.
  '''
  labelled = df.dropna(subset=['pitch_type'])
  labelled = labelled.assign(pitch_type=labelled['pitch_type'].str.lower())

  unwanted = ['po']
  is_unwanted = labelled['pitch_type'].isin(unwanted)

  return labelled[~is_unwanted].drop(columns='pitch_name')

In [12]:
def pitch_map(df):
  '''
  Generates a dictionary mapping lower-cased 'pitch_type' codes to
  'pitch_name'.

  Rows missing either value are ignored. Each code is paired with the name
  found on the first row where that code appears, so the mapping stays
  correct even when several codes share one name or the two columns have a
  different number of distinct values.

  Requires both 'pitch_type' and 'pitch_name' columns.
  '''
  pairs = (df[['pitch_type', 'pitch_name']]
           .dropna()
           .drop_duplicates(subset='pitch_type'))

  return {code.lower(): name
          for code, name in zip(pairs['pitch_type'], pairs['pitch_name'])}

In [13]:
def wrangle_game(df):
  '''
  Data wrangling and feature engineering for the rows of a single game.

  Steps, in order: reverse the row order and renumber the index, clean the
  target, add lag features from the preceding row (previous pitch type,
  whether it was a strike, its zone), one-hot encode the previous pitch,
  then add rolling tendencies (windows 5, 10, 20), cumulative pitch counts,
  prior-pitch totals, score margin, count advantage, baserunner flags and
  the batters-faced counter. The input frame is not modified.
  '''
  # REORDERS FIRST PITCH TO LAST (row order is reversed, index renumbered)
  game = df.reindex(index=df.index[::-1])
  game = game.reset_index(drop=True)

  game = pitch_cleaning(game)

  # Lag features: each row sees the values of the row before it
  game['previous_pitch'] = game['pitch_type'].shift(1)
  game['strike_previous'] = game['type'].shift(1) == 'S'
  # this may need to be encoded somehow
  game['pitch_location_previous'] = game['zone'].shift(1)

  game = pitch_encoding(game)
  for window in (5, 10, 20):
    game = pitch_tendency(game, window)

  remaining_steps = (pitch_counter, total_pitches, score_margin,
                     pitcher_advantage, baserunners_mapping, at_bat_counter)
  for step in remaining_steps:
    game = step(game)

  return game

In [109]:
# Load the hosted sample CSV and keep a single game to exercise wrangle_game.
sample = 'https://raw.githubusercontent.com/michael-rowland/Pitch-Predictions/master/sample.csv'
df = pd.read_csv(sample)
df = df[df['game_date'] == '2018-09-28']
# NOTE(review): `features` is assigned in a later cell (In [15], further down),
# so on a fresh top-to-bottom run this line raises NameError unless that cell
# has been executed first.
df = df[features]
df = wrangle_game(df)

In [110]:
def strike_encoding(df):
  '''
  One-hot encodes the previous pitch type, but only where the previous
  pitch was a strike.

  A 'strike_pitch' column is added holding the value of 'previous_pitch'
  where 'strike_previous' is True and False elsewhere. That column is then
  one-hot encoded into encoder-generated indicator columns; the indicator
  for the False category ('strike_pitch_False') is removed when present.
  The 'strike_pitch' column itself is kept. The input frame is not modified.

  Requires 'strike_previous' and 'previous_pitch' columns.
  '''
  df = df.copy()

  was_strike = df['strike_previous'] == True
  # where the mask is True take the pitch type, otherwise keep False
  df['strike_pitch'] = was_strike.mask(was_strike, df['previous_pitch'])

  encoder = ce.OneHotEncoder(use_cat_names=True)
  encoded = encoder.fit_transform(df['strike_pitch'])
  # careful on the 'on' parameter, may cause issues later
  # (the merge key comes back as a 'key_0' column, removed just below)
  df = pd.merge(df, encoded, how='outer', on=df.index)
  df = df.drop('key_0', axis=1)

  # The False indicator is absent when every row follows a strike, so
  # tolerate that instead of raising KeyError.
  df = df.drop('strike_pitch_False', axis=1, errors='ignore')

  return df

# Apply to the single-game sample frame (rebinds `df`, so re-running this cell
# feeds the already-encoded frame back in).
df = strike_encoding(df)

In [115]:
def strike_percentage(df, percentage=True):
  '''
  Creates, for each pitch type, a column with the running share (or raw
  running count) of that pitch thrown for a strike.

  For every column whose name starts with 'strike_pitch_', a column named
  '<last two characters of that name>_cumulative_strikes' is added.

  Parameters
  ----------
  df : DataFrame with 'strike_pitch_*' indicator columns and, when
    percentage=True, a 'pitch_total_prior' column
  percentage : if True, the running count is divided by
    'pitch_total_prior' (giving a fraction, not a value scaled to 100);
    rows where 'pitch_total_prior' is 0 get NaN rather than inf. If False,
    the raw running count is stored.

  COULD FURTHER ENGINEER STRIKE PERCENTAGE TO BE BASED OFF OF NUMBER OF STRIKES
  RATHER THAN TOTAL PITCHES
  '''
  df = df.copy()

  pitches = [col for col in df if col.startswith('strike_pitch_')]

  prior_pitches = None
  if percentage and pitches:
    prior_pitches = df['pitch_total_prior']
    # mask zero denominators so the division yields NaN instead of inf
    prior_pitches = prior_pitches.where(prior_pitches != 0)

  for i in pitches:
    strikes_so_far = (df[i] == 1).cumsum()
    if percentage:
      df[f'{i[-2:]}_cumulative_strikes'] = strikes_so_far / prior_pitches
    else:
      df[f'{i[-2:]}_cumulative_strikes'] = strikes_so_far

  return df

# Add the running strike share per pitch type to the sample frame.
df = strike_percentage(df, percentage=True)

In [130]:
def strike_counter(df, window):
  '''
  Creates, for each pitch type, a column with how many of the previous
  n=window strikes were thrown with that pitch.

  Only rows where 'strike_pitch' is not False take part in the rolling sum.
  The result is written back by index into a column named
  '<last two characters of the indicator name>_strike_tendency_<window>',
  and the remaining rows are forward-filled from the most recent strike
  row. Rows before the first strike row stay NaN.

  Requires a 'strike_pitch' column and 'strike_pitch_*' indicator columns.
  '''
  df = df.copy()

  pitches = [col for col in df if col.startswith('strike_pitch_')]

  # same mask for every pitch type, so compute it once
  strike_rows = df['strike_pitch'] != False

  for i in pitches:
    target = f'{i[-2:]}_strike_tendency_{window}'
    # assignment aligns on the index; non-strike rows are NaN until filled
    df[target] = df.loc[strike_rows, i].rolling(window, min_periods=1).sum()
    df[target] = df[target].ffill()

  return df

# Count strikes per pitch type over the last 5 strikes, then preview the first
# 15 rows transposed so every feature is visible as a row.
df = strike_counter(df, 5)
df.head(15).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
pitch_type,ff,ch,ff,ff,ff,ff,cu,ff,cu,ff,cu,cu,ff,ft,cu
game_date,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28,2018-09-28
stand,L,L,L,L,L,L,L,L,R,R,R,R,R,L,R
balls,0,0,1,1,0,0,0,1,0,0,1,2,2,0,0
strikes,0,1,1,2,0,1,2,2,0,1,1,1,2,0,0
on_3b,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
on_2b,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
on_1b,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
outs_when_up,0,0,0,0,1,1,1,1,2,2,2,2,2,0,1
inning,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2


In [None]:
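# %%capture
# raw_data = statcast_pitcher('2015-01-01', '2020-01-01', 621244)
# NOTE(review): the game loop further down reads `raw_data`, which is only
# defined by the commented-out call above; re-enable it before a fresh run.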

In [15]:
# Label column, and the raw columns selected from the source data before it is
# passed to wrangle_game. The list includes the label itself ('pitch_type')
# and 'pitch_name', which pitch_cleaning drops.
# NOTE(review): the sample-loading cell earlier in the notebook also uses
# `features`, so this cell has to run before it.
target = 'pitch_type'
features = ['pitch_name', 'pitch_type', 'game_date', 'stand', 'balls', 
            'strikes', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 
            'at_bat_number', 'bat_score', 'fld_score', 'type', 'zone']

In [None]:
# Wrangle every game separately, then stack the results into one frame.
games = raw_data['game_date'].unique().tolist()

# this game is all NaN's, need to remove or write exception catcher
# (filtering tolerates the date being absent, unlike list.remove)
skipped_games = {'2018-04-18'}
games = [game for game in games if game not in skipped_games]

# Collect the per-game frames and concatenate once; growing a frame with
# DataFrame.append inside the loop copies it on every iteration, and that
# method no longer exists in pandas 2.x.
game_frames = []
for game in games:
  game_data = wrangle_game(raw_data[raw_data['game_date'] == game][features])
  game_frames.append(game_data)

if game_frames:
  df = pd.concat(game_frames, ignore_index=True, sort=False)
else:
  # pd.concat raises on an empty list; keep the original empty-frame result
  df = pd.DataFrame()

## MODEL

---



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [None]:
# Keep an independent snapshot: a plain `backup = df` would only be a second
# name for the same object, so in-place edits to df would change it too.
backup = df.copy()

In [None]:
# Two-stage split: 80% / 20% into train+val / test, then 80% / 20% of the
# remainder into train / val, i.e. roughly 64% / 16% / 20% of all rows.
# NOTE(review): rows are split individually, so pitches from the same game can
# end up in different splits.
train, test = train_test_split(df, train_size=0.8, random_state=42)
train, val = train_test_split(train, train_size=0.8, random_state=42)
train.shape, val.shape, test.shape

In [None]:
target = 'pitch_type'
# Every column except the label and the game date is used as a model input
features = df.columns.drop([target, 'game_date'])

# Feature matrix / label vector for each split
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [None]:
# Preprocessing pipeline: category_encoders' OrdinalEncoder followed by median
# imputation. It is fitted on the training split only and then reused, so the
# validation split is transformed with the training-set encoding and medians.
processor = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median')
)
X_train_processed = processor.fit_transform(X_train)
X_val_processed = processor.transform(X_val)

# Fixed random_state for repeatable fits; n_jobs=-1 uses all available cores
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_processed, y_train)

# The test split is not transformed or scored in this cell
print(f'Train accuracy: {model.score(X_train_processed, y_train):.5f}')
print(f'Validation accuracy: {model.score(X_val_processed, y_val):.5f}')

In [None]:
def log_model(txt_file='model_log.txt'):
    '''
    Appends a record of the current model run to a text log.

    Reads the notebook-level names `model`, `X_val_processed`, `y_val` and
    `X_train`, and writes the validation accuracy, the feature names and the
    model's parameters, followed by a blank line.

    Parameters
    ----------
    txt_file : path of the log file; opened in append mode, so it is created
        if missing and earlier entries are kept
    '''
    # the with-block closes the file, so no explicit close() is needed
    with open(txt_file, 'a') as file:
        file.write(f'Validation accuracy: {model.score(X_val_processed, y_val):.5f}\n')
        # a plain list is written in full, without an Index wrapper
        file.write(f'Features: {list(X_train.columns)}\n')
        # get_params must be called; the bare attribute only logs the
        # bound-method repr, not the parameter values
        file.write(f'{model.get_params()}\n')
        file.write('\n')