## NOTES

---

Features:
- Shifting?
- Home/Away

Per pitcher models

Exponential Smoothing

Permutation Importances

In [0]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    !pip install category_encoders==2.*
    !pip install pybaseball

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [0]:
import pandas as pd
import category_encoders as ce
from pybaseball import statcast_pitcher

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## DATA PROCESSING

---


In [0]:
def pitch_encoding(df):
  '''
  uses OneHotEncoder to encode pitch_name to seperate columns
  '''
  df = df.copy()

  encoder = ce.OneHotEncoder(use_cat_names=True)
  encoded = encoder.fit_transform(df['previous_pitch'])
  # careful on the 'on' parameter, may cause issues later
  df = pd.merge(df, encoded, how='outer', on=df.index)
  df = df.drop(['key_0', 'previous_pitch_nan'], axis=1)
  
  # MAY WANT TO CHANGE THIS NAN SOMETIME
  # df.loc[0, 'previous_pitch']
  
  return df

In [0]:
def pitch_tendency(df, window):
  '''
  THIS IS INCOMPLETE, BUT MVP, FIRST ROWS OF WINDOW WHILE ROW < WINDOW
  AVERAGES INCLUDE FIRST ROW, WHICH ARE ALL ZEROS, SO SUM OF ALL TENDENCIES
  DOES NOT EQUAL 1.0. THIS IS ONLY AN ISSUE FOR FIRST 5, AND COULD BE UPDATED 
  BY REMOVING MIN_PERIODS=1 PARAMETER
  '''
  df = df.copy()

  pitches = df[[col for col in df if col.startswith('previous_pitch_')]]
  
  for i in pitches.columns:
    df[f'{i[-2:]}_tendency_{window}'] = df[i].rolling(
        window, min_periods=1).mean()
    
  return df

In [0]:
def pitch_counter(df):
  '''
  adds columns of cumulative count for each specific pitch type
  '''
  df = df.copy()

  pitches = df[[col for col in df if col.startswith('previous_pitch_')]]

  for i in pitches.columns:
    counter = 0
    target = f'{i[-2:]}_count'
    df[target] = 0
    for j in range(len(df)):
      if df.loc[j, i] == 1:
        counter += 1
      df.loc[j, target] = counter
  
  return df

In [0]:
def total_pitches(df):
  '''
  calculates total pitches (prior to current/target pitch) since entering
  the game
  '''
  df = df.copy()

  df['pitch_total_prior'] = df.groupby('game_date').cumcount(ascending=True)

  return df

In [0]:
def score_margin(df):
  '''
  calculates the score margin for the pitcher's team
  '''
  df = df.copy()

  df['score_margin'] = df['fld_score'] - df['bat_score']
  df = df.drop(['fld_score', 'bat_score'], axis=1)

  return df

In [0]:
def pitcher_advantage(df):
  '''
  generates categorical 'pitcher_advantage' variable based on the count
  '''
  df = df.copy()

  advantages = {'0-0':'neutral', '0-1':'ahead', '0-2':'ahead', '1-0':'neutral', 
                '1-1':'neutral', '1-2':'ahead', '2-0':'behind', '2-1':'neutral',
                '2-2':'ahead', '3-0':'behind', '3-1':'behind', '3-2':'neutral'}

  df['count'] = df['balls'].astype(str) + '-' + df['strikes'].astype(str)
  df['pitcher_advantage'] = df['count'].map(advantages)
  df = df.drop('count', axis=1)

  return df

In [0]:
def baserunners_mapping(df):
  '''
  maps baserunner to boolean values
  '''
  df = df.copy()

  df['on_1b'] = df['on_1b'].notnull()
  df['on_2b'] = df['on_2b'].notnull()
  df['on_3b'] = df['on_3b'].notnull()

  return df

In [0]:
def at_bat_counter(df):
  '''
  generates column for number of batters faced
  '''
  df = df.copy()

  df['at_bat_count'] = df.groupby('at_bat_number').ngroup(ascending=True)+1
  df = df.drop('at_bat_number', axis=1)

  return df

In [0]:
def pitch_cleaning(df):
  '''
  cleans target ('pitch_type'), dropping NaN's (bad data) and pitch out's
  '''
  df = df.copy()

  df = df.dropna(subset=['pitch_type'])
  df['pitch_type'] = df['pitch_type'].str.lower()
  remove_pitches = ['po']
  df = df[~df['pitch_type'].isin(remove_pitches)]
  df = df.drop('pitch_name', axis=1)

  return df

In [0]:
def pitch_map(df):
  '''
  generates dictionary mapping 'pitch_type' to 'pitch_name'
  '''
  df = df.copy()

  df = df.dropna(subset=['pitch_type', 'pitch_name'])
  pitch_types = df['pitch_name'].unique().tolist()
  pitch_types_abv = [i.lower() for i in df['pitch_type'].unique().tolist()]
  
  return dict(zip(pitch_types_abv, pitch_types))

In [0]:
def wrangle_game(df):
  '''
  this function does data wrangling and feature engineering on a per game basis
  '''
  df = df.copy()

  # REORDERS FIRST PITCH TO LAST
  df = df.reindex(index=df.index[::-1])
  df = df.reset_index(drop=True)

  df = pitch_cleaning(df)
  df['previous_pitch'] = df['pitch_type'].shift(1)
  df = pitch_encoding(df)
  df = pitch_tendency(df, 5)
  df = pitch_tendency(df, 10)
  df = pitch_tendency(df, 20)
  df = pitch_counter(df)
  df = total_pitches(df)
  df = score_margin(df)
  df = pitcher_advantage(df)
  df = baserunners_mapping(df)
  df = at_bat_counter(df)
  
  return df

In [0]:
%%capture
raw_data = statcast_pitcher('2015-01-01', '2020-01-01', 621244)

In [0]:
target = 'pitch_type'
features = ['pitch_name', 'pitch_type', 'game_date', 'stand', 'balls', 
            'strikes', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 
            'at_bat_number', 'bat_score', 'fld_score']

In [0]:
df = pd.DataFrame()

games = raw_data['game_date'].unique().tolist()
# this game is all NaN's, need to remove or write exception catcher
games.remove('2018-04-18')
for game in games:
  game_data = wrangle_game(raw_data[raw_data['game_date'] == game][features])
  df = df.append(game_data, ignore_index=True, sort=False)

## MODEL

---



In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [0]:
backup = df

In [31]:
train, test = train_test_split(df, train_size=0.8, random_state=42)
train, val = train_test_split(train, train_size=0.8, random_state=42)
train.shape, val.shape, test.shape

((6266, 35), (1567, 35), (1959, 35))

In [0]:
target = 'pitch_type'
features = df.columns.drop([target, 'game_date'])

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

In [33]:
processor = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median')
)
X_train_processed = processor.fit_transform(X_train)
X_val_processed = processor.transform(X_val)

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_processed, y_train)

print(f'Train accuracy: {model.score(X_train_processed, y_train):.5f}')
print(f'Validation accuracy: {model.score(X_val_processed, y_val):.5f}')

Train accuracy: 0.99346
Validation accuracy: 0.37907


In [34]:
model.get_params


<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)>