In [1]:
import pandas as pd
from matplotlib import __version__ as mpv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklv
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.metrics import log_loss, average_precision_score, make_scorer, precision_score, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.svm import SVC
import hdbscan
from sklearn import metrics
from sklearn.cluster import OPTICS
import numpy as np

sns.set()
%matplotlib inline

print('Using version %s of pandas' % pd.__version__)
print('Using version %s of matplotlib' % mpv)
print('Using version %s of seaborn' % sns.__version__)
print('Using version %s of sklearn' % sklv)
print('Using version %s of numpy' % np.__version__)

Using version 1.0.5 of pandas
Using version 3.2.2 of matplotlib
Using version 0.10.1 of seaborn
Using version 0.23.1 of sklearn
Using version 1.19.0 of numpy


In [2]:
def hithardness(val):
    if val == 'soft':
        return 1
    elif val == 'medium':
        return 2
    elif val == 'hard':
        return 3

In [3]:
def target_cat_muti(val):
    if (val == 'Pop Out'):
        return 1
    elif (val in ['Sac Bunt','Bunt Groundout','Bunt Pop Out','Bunt Lineout']):
        return 2
    elif (val in ['Groundout', 'Grounded Into DP', 'Forceout', 'Triple Play', 'Fielders Choice Out', 'Fielders Choice']):
        return 3
    elif (val in ['Flyout', 'Sac Fly','Double Play', 'Sac Fly Double Play']):
        return 4
    elif (val in ['Fan Interference', 'Catcher Interference','Batter Interference']):
        return 5
    elif (val == 'Field Error'):
        return 6
    elif (val == 'Lineout'):
        return 7
    elif (val == 'Single'):
        return 8
    elif (val == 'Double'):
        return 9
    elif (val == 'Triple'):
        return 10
    elif (val == 'Home Run'):
        return 11
    else:
        return 0

In [4]:
def target_cat_bin(val):
    if (val in ['Sac Bunt','Bunt Groundout','Bunt Pop Out','Bunt Lineout','Pop Out',
                 'Groundout', 'Grounded Into DP', 'Forceout', 'Triple Play', 'Fielders Choice Out', 'Fielders Choice',
                 'Flyout', 'Sac Fly','Double Play', 'Sac Fly Double Play','Lineout'
                 'Fan Interference', 'Catcher Interference','Batter Interference','Field Error']):
        return 0
    elif (val in ['Single', 'Double', 'Triple', 'Home Run']):
        return 1
    else:
        return 0

In [5]:
# Start with 2015 as then is when hit data started to become widely available
df2015 = pd.read_csv('pitch_data_2015.csv')

In [6]:
df2015.columns

Index(['Unnamed: 0', 'season', 'game', 'batter', 'batSide', 'pitcher',
       'pitchHand', 'count_balls', 'count_strikes', 'pitch_result',
       'pitch_type', 'breakY', 'break_length', 'spin_direction', 'spin_rate',
       'end_speed', 'nasty_factor', 'start_speed', 'pitch_number', 'result',
       'result_rbi', 'result_type', 'hit_hardness', 'hit_angle', 'hit_speed',
       'hit_distance', 'hit_trajectory'],
      dtype='object')

In [7]:
# remove records that did not have a result of a ball hit
df2015.dropna(subset=['result','hit_hardness'], inplace=True)

In [8]:
# Saved wrong, so drop an unneeded column
df2015.drop(['Unnamed: 0'], axis=1, inplace=True)

# While some of these columns may be useful for additional analysis, going to drop them for this project
df2015.drop(['season','game','batter','batSide','pitcher','pitchHand','breakY','pitch_number',
             'result_rbi', 'result_type','hit_trajectory','pitch_result','pitch_type'
            ], axis=1, inplace=True)

# Each season there are a few pitches that don't register, drop them
df2015.dropna(subset=['break_length', 'spin_direction','spin_rate',
                 'end_speed','start_speed','hit_distance','nasty_factor'], inplace=True)

df2015['count_balls'] = df2015['count_balls'].astype('category')
df2015['count_strikes'] = df2015['count_strikes'].astype('category')
df2015['hhcode'] = df2015['hit_hardness'].apply(hithardness)
df2015['hhcode'] = df2015['hhcode'].astype('category')
df2015.drop(['hit_hardness'], axis=1, inplace=True)

In [9]:
# create train/test sets by using different seasons of data
# train will be 2015 data
X_train = df2015.drop(['result'], axis=1)
Y_train = df2015['result']

In [10]:
# Map hit results to integer values and convert to categorical
y_cat_train = Y_train.map(target_cat_bin)
#y_cat_train = y_cat_train.astype('category')

In [11]:
df2016 = pd.read_csv('pitch_data_2016.csv')

In [12]:
# 2016 is my test
# remove records that did not have a result of a ball hit
df2016.dropna(subset=['result','hit_hardness'], inplace=True)

df2016.drop(['Unnamed: 0'], axis=1, inplace=True)

# While some of these columns may be useful for additional analysis, going to drop them for this project
df2016.drop(['season','game','batter','batSide','pitcher','pitchHand','breakY','pitch_number',
             'result_rbi', 'result_type','hit_trajectory','pitch_result','pitch_type'
            ], axis=1, inplace=True)

# Each season there are a few pitches that don't register, drop them
df2016.dropna(subset=['break_length', 'spin_direction','spin_rate',
                 'end_speed','start_speed','hit_distance','nasty_factor'], inplace=True)

df2016['count_balls'] = df2016['count_balls'].astype('category')
df2016['count_strikes'] = df2016['count_strikes'].astype('category')
df2016['hhcode'] = df2016['hit_hardness'].apply(hithardness)
df2016['hhcode'] = df2016['hhcode'].astype('category')
df2016.drop(['hit_hardness'], axis=1, inplace=True)

In [13]:
# test will be 2016 data
X_test = df2016.drop(['result'], axis=1)
Y_test = df2016['result']

In [15]:
from sklearn.preprocessing import label_binarize
#y_cat_test = Y_test.map(target_cat)
y_cat_test = Y_test.map(target_cat_muti)

#y_cat_test = y_cat_test.astype('category')

y_cat_test = label_binarize(y_cat_test, classes=[1,2,3,4,5,6,7,8,9,10,11])



In [16]:
y_cat_test

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [78]:
clf = Pipeline([('select_best', SelectKBest()),
                #('lr', LogisticRegression(class_weight="balanced",multi_class="multinomial"))])
                ('lr', LogisticRegression(class_weight="balanced"))])

params = {'lr__C': [10**i for i in range(-3,3)],
          'lr__max_iter': [100,250,500],
          'select_best__k': [7,9,11,13,'all']
         }

grid = GridSearchCV(clf,
                       param_grid=params,
                       scoring={'avr': make_scorer(precision_score, needs_proba=True),
                                'll': make_scorer(log_loss, greater_is_better=False, needs_proba=True)},
                       n_jobs=-1,
                       cv=5,
                   refit='avr')

In [79]:
grid.fit(X_train, y_cat_train)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [17]:
lr = LogisticRegression(C=0.1,random_state=0).fit(X_train, y_cat_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
lr.coef_

array([[-2.83349197e-03, -3.97340956e-03, -6.96365706e-02,
        -1.70939648e-04,  3.65535533e-06, -2.19807980e-02,
         1.05788870e-03, -2.56506127e-02, -2.95574934e-02,
         2.67964433e-02,  8.12722376e-03, -1.18519343e-03]])

In [82]:
lr.predict(X_train.iloc[60].values.reshape(1,-1))

array([0], dtype=int64)

In [83]:
y_cat_train.iloc[60]

0

In [84]:
lr.score(X_train,y_cat_train)

0.7141719814120925

In [85]:
from sklearn.model_selection import cross_val_score
cross_val_score(lr, X_train, y_cat_train, cv=3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.68566978, 0.72416926, 0.72356499])

In [92]:
average_precision_score(y_cat_train, lr.predict(X_train))

0.4646040582224984

In [90]:
log_loss(y_cat_train, lr.predict_log_proba(X_train))

0.6931471805599453

In [20]:
np.unique(lr.predict(X_train), return_counts=True)

(array([0, 1], dtype=int64), array([91308, 24251], dtype=int64))

In [None]:
np.