In [182]:
import pandas as pd
from matplotlib import __version__ as mpv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.metrics import log_loss, average_precision_score, make_scorer, precision_score, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.svm import SVC
import hdbscan
from sklearn import metrics
from sklearn.cluster import OPTICS
import numpy as np

sns.set()
%matplotlib inline

print('Using version %s of pandas' % pd.__version__)
print('Using version %s of matplotlib' % mpv)
print('Using version %s of seaborn' % sns.__version__)
print('Using version %s of sklearn' % sklv)
print('Using version %s of numpy' % np.__version__)

Using version 1.0.5 of pandas
Using version 3.2.2 of matplotlib
Using version 0.10.1 of seaborn
Using version 0.23.1 of sklearn
Using version 1.19.0 of numpy


In [96]:
# Start with 2015 as then is when hit data started to become widely available
df2015 = pd.read_csv('pitch_data_2015.csv')

In [97]:
df2015.columns

Index(['Unnamed: 0', 'season', 'game', 'batter', 'batSide', 'pitcher',
       'pitchHand', 'count_balls', 'count_strikes', 'pitch_result',
       'pitch_type', 'breakY', 'break_length', 'spin_direction', 'spin_rate',
       'end_speed', 'nasty_factor', 'start_speed', 'pitch_number', 'result',
       'result_rbi', 'result_type', 'hit_hardness', 'hit_angle', 'hit_speed',
       'hit_distance', 'hit_trajectory'],
      dtype='object')

In [98]:
# View a sample
df2015.sample(n=20) 

Unnamed: 0.1,Unnamed: 0,season,game,batter,batSide,pitcher,pitchHand,count_balls,count_strikes,pitch_result,...,start_speed,pitch_number,result,result_rbi,result_type,hit_hardness,hit_angle,hit_speed,hit_distance,hit_trajectory
201116,201116,2015,414347,518626,R,502188,R,0,1,Swinging Strike,...,85.8,1,,,,,,,,
456932,456932,2015,415228,476883,L,502239,R,1,2,Swinging Strike,...,93.6,3,,,,,,,,
58640,58640,2015,413848,461235,L,456051,R,1,1,Swinging Strike,...,85.5,2,,,,,,,,
218843,218843,2015,414399,543228,R,407845,R,3,1,Ball,...,96.1,4,,,,,,,,
648551,648551,2015,415862,460060,R,489119,L,1,0,Ball In Dirt,...,91.2,1,,,,,,,,
576596,576596,2015,415636,611177,R,543045,L,2,1,Foul Tip,...,90.1,3,,,,,,,,
42898,42898,2015,413795,573627,L,446372,R,0,1,Foul,...,93.7,1,,,,,,,,
43660,43660,2015,413789,472528,L,501925,R,0,1,Called Strike,...,85.7,1,,,,,,,,
693962,693962,2015,416019,545361,R,623430,R,2,1,Ball,...,82.6,3,,,,,,,,
536979,536979,2015,415501,408045,L,592332,R,1,1,Ball,...,93.7,2,,,,,,,,


In [99]:
# remove records that did not have a result of a ball hit
df2015.dropna(subset=['result','hit_hardness'], inplace=True)

In [100]:
# Saved wrong, so drop an unneeded column
df2015.drop(['Unnamed: 0'], axis=1, inplace=True)

# While some of these columns may be useful for additional analysis, going to drop them for this project
df2015.drop(['season','game','batter','batSide','pitcher','pitchHand','breakY','pitch_number',
             'result_rbi', 'result_type','hit_trajectory','pitch_result'
            ], axis=1, inplace=True)

# Each season there are a few pitches that don't register, drop them
df2015.dropna(subset=['break_length', 'spin_direction','spin_rate',
                 'end_speed','start_speed','hit_distance','nasty_factor'], inplace=True)

In [101]:
df2015

Unnamed: 0,count_balls,count_strikes,pitch_type,break_length,spin_direction,spin_rate,end_speed,nasty_factor,start_speed,result,hit_hardness,hit_angle,hit_speed,hit_distance
20,1,0,Cutter,6.0,167.0,1307.0,80.7,18.14,87.5,Double,soft,24.21,73.53,209.00
33,2,1,Cutter,7.2,144.0,845.0,80.7,32.93,87.5,Groundout,medium,-12.85,96.46,12.23
54,0,0,Cutter,6.0,177.0,1012.0,80.9,46.86,87.5,Single,medium,13.41,83.62,234.12
58,2,0,Cutter,4.8,159.0,1486.0,80.8,34.86,87.2,Double,medium,24.57,85.00,259.69
67,0,1,Four-Seam Fastball,4.8,190.0,1675.0,81.8,38.88,89.2,Flyout,medium,30.92,99.23,364.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714127,0,0,Changeup,7.2,204.0,943.0,76.9,48.97,82.8,Lineout,medium,12.07,82.42,151.65
714134,2,2,Cutter,7.2,114.0,934.0,84.2,44.74,90.3,Field Error,medium,44.49,77.52,230.83
714138,1,2,Knuckle Curve,13.2,51.0,1762.0,72.9,54.84,79.2,Groundout,medium,4.11,70.76,60.92
714142,0,2,Knuckle Curve,12.0,60.0,1601.0,73.8,47.60,79.5,Groundout,medium,-24.03,93.46,128.97


In [102]:
df2015['count_balls'] = df2015['count_balls'].astype('category')
df2015['count_strikes'] = df2015['count_strikes'].astype('category')
df2015['pitch_type'] = df2015['pitch_type'].astype('category')
df2015['hit_hardness'] = df2015['hit_hardness'].astype('category')

In [103]:
df2015.isnull().sum()

count_balls       0
count_strikes     0
pitch_type        0
break_length      0
spin_direction    0
spin_rate         0
end_speed         0
nasty_factor      0
start_speed       0
result            0
hit_hardness      0
hit_angle         0
hit_speed         0
hit_distance      0
dtype: int64

In [104]:
# create train/test sets by using different seasons of data
# train will be 2015 data
X_train = df2015.drop(['result'], axis=1)
Y_train = df2015['result']

In [106]:
df2016 = pd.read_csv('pitch_data_2016.csv')

In [107]:
# 2016 is my test
# remove records that did not have a result of a ball hit
df2016.dropna(subset=['result','hit_hardness'], inplace=True)

df2016.drop(['Unnamed: 0'], axis=1, inplace=True)

# While some of these columns may be useful for additional analysis, going to drop them for this project
df2016.drop(['season','game','batter','batSide','pitcher','pitchHand','breakY','pitch_number',
             'result_rbi', 'result_type','hit_trajectory','pitch_result'
            ], axis=1, inplace=True)

# Each season there are a few pitches that don't register, drop them
df2016.dropna(subset=['break_length', 'spin_direction','spin_rate',
                 'end_speed','start_speed','hit_distance','nasty_factor'], inplace=True)

df2016['count_balls'] = df2016['count_balls'].astype('category')
df2016['count_strikes'] = df2016['count_strikes'].astype('category')
df2016['pitch_type'] = df2016['pitch_type'].astype('category')
df2016['hit_hardness'] = df2016['hit_hardness'].astype('category')

In [108]:
df2016.head()

Unnamed: 0,count_balls,count_strikes,pitch_type,break_length,spin_direction,spin_rate,end_speed,nasty_factor,start_speed,result,hit_hardness,hit_angle,hit_speed,hit_distance
1,1,0,Two-Seam Fastball,7.2,106.0,2484.0,83.4,48.69,90.4,Groundout,medium,2.99,94.65,114.24
2,0,0,Two-Seam Fastball,6.0,121.0,2587.0,82.1,34.22,89.6,Groundout,soft,-69.98,56.79,2.64
10,0,1,Cutter,7.2,126.0,877.0,83.2,34.68,88.4,Groundout,medium,6.39,82.15,140.49
20,3,2,Sinker,7.2,250.0,1434.0,83.4,26.32,90.4,Single,medium,-13.52,100.77,220.72
22,1,0,Four-Seam Fastball,6.0,215.0,876.0,82.3,50.96,88.9,Lineout,medium,11.83,84.15,126.94


In [109]:
# test will be 2013 data
X_test = df2016.drop(['result'], axis=1)
Y_test = df2016['result']

In [110]:
X_train.head()

Unnamed: 0,count_balls,count_strikes,pitch_type,break_length,spin_direction,spin_rate,end_speed,nasty_factor,start_speed,hit_hardness,hit_angle,hit_speed,hit_distance
20,1,0,Cutter,6.0,167.0,1307.0,80.7,18.14,87.5,soft,24.21,73.53,209.0
33,2,1,Cutter,7.2,144.0,845.0,80.7,32.93,87.5,medium,-12.85,96.46,12.23
54,0,0,Cutter,6.0,177.0,1012.0,80.9,46.86,87.5,medium,13.41,83.62,234.12
58,2,0,Cutter,4.8,159.0,1486.0,80.8,34.86,87.2,medium,24.57,85.0,259.69
67,0,1,Four-Seam Fastball,4.8,190.0,1675.0,81.8,38.88,89.2,medium,30.92,99.23,364.35


In [153]:
def hithardness(val):
    if val == 'soft':
        return 1
    elif val == 'medium':
        return 2
    elif val == 'hard':
        return 3
    

In [156]:
X_train['hhcode'] = X_train['hit_hardness'].apply(hithardness)

In [162]:
X_train.drop(['hit_hardness'], axis=1, inplace=True)

In [159]:
X_train.isnull().sum()

count_balls       0
count_strikes     0
break_length      0
spin_direction    0
spin_rate         0
end_speed         0
nasty_factor      0
start_speed       0
hit_hardness      0
hit_angle         0
hit_speed         0
hit_distance      0
hhcode            0
dtype: int64

In [112]:
X_test.isnull().sum()

count_balls       0
count_strikes     0
pitch_type        0
break_length      0
spin_direction    0
spin_rate         0
end_speed         0
nasty_factor      0
start_speed       0
hit_hardness      0
hit_angle         0
hit_speed         2
hit_distance      0
dtype: int64

In [113]:
Y_train.value_counts()

Groundout               29665
Single                  26307
Flyout                  19567
Lineout                 11605
Double                   8069
Home Run                 4864
Pop Out                  4088
Grounded Into DP         3377
Forceout                 3353
Field Error              1380
Sac Fly                  1187
Triple                    925
Double Play               433
Fielders Choice Out       251
Sac Bunt                  202
Bunt Groundout            116
Fielders Choice            83
Fan Interference           42
Sac Fly Double Play        20
Bunt Pop Out               13
Triple Play                 4
Bunt Lineout                3
Batter Interference         2
Catcher Interference        2
Sac Bunt Double Play        1
Name: result, dtype: int64

In [114]:
Y_test.value_counts()

Groundout               28609
Single                  26038
Flyout                  19826
Lineout                 10652
Double                   8085
Home Run                 5547
Pop Out                  4571
Grounded Into DP         3371
Forceout                 3285
Field Error              1335
Sac Fly                  1168
Triple                    855
Double Play               413
Sac Bunt                  223
Fielders Choice Out       220
Bunt Groundout            115
Fielders Choice            80
Fan Interference           43
Sac Fly Double Play        22
Bunt Pop Out               13
Triple Play                 6
Catcher Interference        5
Batter Interference         4
Bunt Lineout                2
Name: result, dtype: int64

In [115]:
def target_cat(val):
    if (val == 'Pop Out'):
        return 1
    elif (val in ['Sac Bunt','Bunt Groundout','Bunt Pop Out','Bunt Lineout']):
        return 2
    elif (val in ['Groundout', 'Grounded Into DP', 'Forceout', 'Triple Play', 'Fielders Choice Out', 'Fielders Choice']):
        return 3
    elif (val in ['Flyout', 'Sac Fly','Double Play', 'Sac Fly Double Play']):
        return 4
    elif (val in ['Fan Interference', 'Catcher Interference','Batter Interference']):
        return 5
    elif (val == 'Field Error'):
        return 6
    elif (val == 'Lineout'):
        return 7
    elif (val == 'Single'):
        return 8
    elif (val == 'Double'):
        return 9
    elif (val == 'Triple'):
        return 10
    elif (val == 'Home Run'):
        return 11
    else:
        return 0

In [116]:
# integer encode
#label_encoder = LabelEncoder()
#y_train_encode = label_encoder.fit_transform(Y_train)
#y_train_encode
y_cat_train = Y_train.map(target_cat)

In [117]:
y_cat_train.value_counts(dropna=False)

3     36733
8     26307
4     21207
7     11605
9      8069
11     4864
1      4088
6      1380
10      925
2       334
5        46
0         1
Name: result, dtype: int64

In [118]:
### Ignore the next few cells. I didn't know what I wanted to do so I tried a number of things. Memory was a problem...

In [119]:
# Keep getting memory error, trying this solution: https://github.com/scikit-learn/scikit-learn/issues/16027
#class BirchChunked(Birch):
#    def predict(self, X):
#        # the original code
#        X = check_array(X, accept_sparse='csr')
#        self._check_fit(X)
#        return self.subcluster_labels_[pairwise_distances_argmin(X, self.subcluster_centers_)]

In [120]:
## birch didn't work. keep running out of memory
#birch_grid = GridSearchCV(estimator=BirchChunked(),
#                       param_grid={'threshold': [0.25,.5,.75], 'branching_factor': [10,50,80],
#                                  'n_clusters': None},
#                        param_grid={'eps': [0.1,.5,.75], 'min_samples': [50,100],
#                                  'n_clusters': None},
#                       scoring="adjusted_rand_score",
#                       n_jobs=-1,
#                       cv=5)

In [121]:
#optics_grid = GridSearchCV(estimator=OPTICS(),
#                           param_grid={'max_eps': [0.1,.5,.75], 'min_samples': [50,100]},
#                       scoring="adjusted_rand_score",
#                       n_jobs=-1,
#                       cv=5)

In [122]:
#birch_grid.fit(X_train, y_train_encode)
#optics_grid.fit(X_train, y_train_encode)

In [123]:
## I don't think clustering really works with GridSearchCV which sucks because it looks/sounds neat and want to use it
#clusterer = hdbscan.HDBSCAN(min_cluster_size=100)

In [124]:
#clusterer.fit(X_train)

In [125]:
#np.unique(clusterer.labels_)
#np.zeros_like(db.labels_, dtype=int)
#core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#core_samples_mask[db.core_sample_indices_] = True
#labels = db.labels_

In [126]:
#len(clusterer.probabilities_)

In [127]:
#clusterer.condensed_tree_.plot(select_clusters=True,selection_palette=sns.color_palette('deep', 8))

In [128]:
#g = clusterer.condensed_tree_.to_networkx()
#g.number_of_nodes()

In [129]:
#clusterer.condensed_tree_.to_pandas()

In [130]:
#clusterer.single_linkage_tree_.plot()

In [145]:
X_train.drop(['pitch_type'],axis=1,inplace=True)

In [166]:
clf = Pipeline([('select_best', SelectKBest()),
                ('lr', LogisticRegression(class_weight="balanced"))])

In [193]:
params = {'lr__C': [10**i for i in range(-3,3)],
          'select_best__k': [7,9,11,13,'all']
         }

grid = GridSearchCV(clf,
                       param_grid=params,
                       scoring={'mse': make_scorer(mean_squared_error),
                               'r2s': make_scorer(r2_score)},
                       n_jobs=-1,
                       cv=5,
                   refit='mse')

In [194]:
grid.fit(X_train, y_cat_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('select_best', SelectKBest()),
                                       ('lr',
                                        LogisticRegression(class_weight='balanced'))]),
             n_jobs=-1,
             param_grid={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'select_best__k': [7, 9, 11, 13, 'all']},
             refit='mse',
             scoring={'mse': make_scorer(mean_squared_error),
                      'r2s': make_scorer(r2_score)})

In [195]:
grid.cv_results_

{'mean_fit_time': array([21.70440183, 22.19619575, 22.51319633,  0.23619943, 23.11579986,
        21.82119765, 21.79626217, 21.81219778,  0.24620152, 22.15039878,
        21.70419731, 22.06179924, 22.41360021,  0.25899701, 22.42019982,
        21.76679797, 22.10779948, 22.45619826,  0.26579866, 22.20439939,
        21.69360032, 22.14820013, 22.35723977,  0.2489984 , 22.24159966,
        21.90346627, 22.12939839, 21.78699789,  0.24939919, 20.57319789]),
 'std_fit_time': array([0.82710755, 0.63149428, 0.50394256, 0.00549167, 0.54580731,
        0.50107612, 0.5928434 , 0.52207491, 0.01558833, 0.73684216,
        0.72130596, 0.48443407, 0.73535491, 0.00626059, 0.64445535,
        0.75183333, 0.42799745, 0.86052243, 0.0163377 , 0.75841199,
        0.76801868, 0.54783933, 0.81194604, 0.01681564, 0.78157352,
        0.46713113, 0.47636252, 0.63944826, 0.00458504, 0.8730355 ]),
 'mean_score_time': array([0.09080091, 0.1001996 , 0.11000009, 0.        , 0.10139856,
        0.09219971, 0.08653765

In [196]:
grid.best_score_, grid.best_params_

(8.530936365557952, {'lr__C': 0.1, 'select_best__k': 9})

In [197]:
grid.best_index_

11

In [205]:
grid.score

<bound method BaseSearchCV.score of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('select_best', SelectKBest()),
                                       ('lr',
                                        LogisticRegression(class_weight='balanced',
                                                           multi_class='multinomial'))]),
             n_jobs=-1,
             param_grid={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'select_best__k': [3, 5, 7, 9, 'all']},
             refit='avr',
             scoring={'avr': make_scorer(precision_score, needs_proba=True),
                      'll': make_scorer(log_loss, greater_is_better=False, needs_proba=True)})>

In [198]:
grid.cv_results_['params'][grid.best_index_]

{'lr__C': 0.1, 'select_best__k': 9}

In [201]:
clf = Pipeline([('select_best', SelectKBest()),
                ('lr', LogisticRegression(class_weight="balanced",multi_class="multinomial"))])

params = {'lr__C': [10**i for i in range(-3,3)],
          'select_best__k': [3,5,7,9,'all']
         }

grid = GridSearchCV(clf,
                       param_grid=params,
                       scoring={'avr': make_scorer(precision_score, needs_proba=True),
                                'll': make_scorer(log_loss, greater_is_better=False, needs_proba=True)},
                       n_jobs=-1,
                       cv=5,
                   refit='avr')

In [202]:
grid.fit(X_train, y_cat_train)



ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [206]:
y_cat_train.value_counts()

3     36733
8     26307
4     21207
7     11605
9      8069
11     4864
1      4088
6      1380
10      925
2       334
5        46
0         1
Name: result, dtype: int64

In [204]:
np.unique(y_cat_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)