## Predictive Machine Learning Senior Capstone
### Predicting pitches in Major League Baseball 
#### Code Contributors: Dylan Mullican and Matt Kline 

In [2]:
# import packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv("Kershaw_15_to_20.csv")
df.tail()

Unnamed: 0,pitch_type,batter,stand,p_throws,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,pitcher,pitch_number,bat_score,fld_score
15590,FF,519083,R,L,1,0,0,0,571976,0,1,Top,477132,2,0,0
15591,FF,519083,R,L,0,0,0,0,571976,0,1,Top,477132,1,0,0
15592,FF,571976,R,L,0,2,0,0,0,0,1,Top,477132,3,0,0
15593,FF,571976,R,L,0,1,0,0,0,0,1,Top,477132,2,0,0
15594,FF,571976,R,L,0,0,0,0,0,0,1,Top,477132,1,0,0


In [4]:
# Data Shape 
df.shape

(15595, 16)

In [5]:
df.describe()

Unnamed: 0,batter,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,inning,pitcher,pitch_number,bat_score,fld_score
count,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0,15595.0
mean,532664.498172,0.772684,0.939981,31951.112793,72733.540366,127535.000834,0.967938,3.852837,477132.0,2.846105,0.861173,1.77294
std,78536.558497,0.911119,0.827645,127115.678051,185042.11842,229715.675376,0.814372,2.046705,0.0,1.70719,1.141756,2.080079
min,112526.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,477132.0,1.0,0.0,0.0
25%,457803.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,477132.0,1.0,0.0,0.0
50%,542583.0,1.0,1.0,0.0,0.0,0.0,1.0,4.0,477132.0,3.0,0.0,1.0
75%,595879.0,1.0,2.0,0.0,0.0,0.0,2.0,5.0,477132.0,4.0,1.0,3.0
max,671250.0,3.0,2.0,668942.0,669374.0,671250.0,2.0,9.0,477132.0,12.0,5.0,12.0


In [6]:
from sklearn import datasets, linear_model
#Target Vector
y = df['pitch_type']
y.shape

(15595,)

In [7]:
#Feature Matrix
X = df[['bat_score', 'strikes', 'outs_when_up', 'pitch_number', 'on_1b', 'on_2b', 'on_3b', 'batter', 'inning', 'fld_score', 'p_throws', 'pitcher', 'stand', 'inning_topbot', 'balls']]
X.shape

(15595, 15)

In [8]:
#Convert the text data to numerical
df.p_throws = df.p_throws.replace(['R', 'L'], [0,1])
df.stand = df.stand.replace(['R', 'L'], [0,1])
df.inning_topbot = df.inning_topbot.replace(['Bot', 'Top'], [0,1])

In [9]:
#convert features dataframe to a numpy array 
y = df['pitch_type'].values
X = df[['bat_score', 'strikes', 'outs_when_up', 'pitch_number', 'on_1b', 'on_2b', 'on_3b', 'batter', 'inning', 'fld_score', 'p_throws', 'pitcher', 'stand', 'inning_topbot', 'balls']].values
print(y)
print("")
print(X)

['FF' 'FF' 'SL' ... 'FF' 'FF' 'FF']

[[2 0 1 ... 1 0 0]
 [2 0 0 ... 0 0 0]
 [2 2 2 ... 0 0 3]
 ...
 [0 2 0 ... 0 1 0]
 [0 1 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]]


In [10]:
#Encoding the correct labels for each pitch-type classification
from sklearn.preprocessing import LabelEncoder

l_encoder = LabelEncoder()
l_encoder.fit(y)
l_encoder.classes_

array(['CH', 'CU', 'FF', 'SL'], dtype=object)

In [11]:
#Assigning numerical values to pitch labels
y_enc = l_encoder.transform(y)
np.unique(y_enc)

array([0, 1, 2, 3])

In [12]:
print(y_enc)

[2 2 3 ... 2 2 2]


In [13]:
#Splitting data into a training (75%) and testing (25%) dataset 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [14]:
print('X_train Shape of the data : {}'.format(X_train.shape))
print('y_train Shape of the data : {}'.format(y_train.shape))

X_train Shape of the data : (11696, 15)
y_train Shape of the data : (11696,)


In [15]:
print('X_test Shape of the data : {}'.format(X_test.shape))
print('y_test Shape of the data : {}'.format(y_test.shape))

X_test Shape of the data : (3899, 15)
y_test Shape of the data : (3899,)


In [16]:
#separate features from the target for exploration
features = df.drop('pitch_type', axis=1)

In [17]:
features.tail()

Unnamed: 0,batter,stand,p_throws,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,pitcher,pitch_number,bat_score,fld_score
15590,519083,0,1,1,0,0,0,571976,0,1,1,477132,2,0,0
15591,519083,0,1,0,0,0,0,571976,0,1,1,477132,1,0,0
15592,571976,0,1,0,2,0,0,0,0,1,1,477132,3,0,0
15593,571976,0,1,0,1,0,0,0,0,1,1,477132,2,0,0
15594,571976,0,1,0,0,0,0,0,0,1,1,477132,1,0,0


In [18]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

### Build a KNN Classification Model 

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

for k in [10,100,200]:
    knn= KNeighborsClassifier(n_neighbors=k)

    knn.fit(X_train_std,y_train)
    y_pred=knn.predict(X_test_std)

    print('Test accuracy (using knn.score() for k = {0} is:{1:0.3f}'.format(k, knn.score(X_test_std, y_test)))

    print("Model's Predictive Accuracy for k ={0} is: {1:0.2f}".format(k,metrics.accuracy_score(y_test, y_pred)))

    print('Misclassified samples for k={0} are {1}\n'.format(k, (y_pred != y_test).sum()))

Test accuracy (using knn.score() for k = 10 is:0.475
Model's Predictive Accuracy for k =10 is: 0.48
Misclassified samples for k=10 are 2046

Test accuracy (using knn.score() for k = 100 is:0.505
Model's Predictive Accuracy for k =100 is: 0.50
Misclassified samples for k=100 are 1931

Test accuracy (using knn.score() for k = 200 is:0.506
Model's Predictive Accuracy for k =200 is: 0.51
Misclassified samples for k=200 are 1928



In [20]:
y_pred = knn.predict(X_test_std)
print("Test set predictions:\n {}".format(y_pred))

Test set predictions:
 ['SL' 'FF' 'SL' ... 'FF' 'SL' 'FF']


In [21]:
print('\nTest set accuracy: {0:0.2f}%'.format(100*knn.score(X_test_std, y_test)))


Test set accuracy: 50.55%


In [22]:
print('The predicted class membership probability is: {0}'.format(knn.predict_proba(X_test_std[:10,:])))
# Column for each pitch type 

The predicted class membership probability is: [[0.005 0.235 0.37  0.39 ]
 [0.005 0.19  0.605 0.2  ]
 [0.    0.215 0.315 0.47 ]
 [0.015 0.105 0.69  0.19 ]
 [0.01  0.11  0.69  0.19 ]
 [0.    0.17  0.45  0.38 ]
 [0.    0.1   0.62  0.28 ]
 [0.    0.105 0.55  0.345]
 [0.    0.13  0.605 0.265]
 [0.    0.285 0.33  0.385]]


In [24]:
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names = ['CH','CU', 'FF', 'SL']))

Classification Report:
              precision    recall  f1-score   support

          CH       0.00      0.00      0.00        20
          CU       0.36      0.01      0.02       692
          FF       0.53      0.78      0.63      1814
          SL       0.45      0.40      0.42      1373

    accuracy                           0.51      3899
   macro avg       0.34      0.30      0.27      3899
weighted avg       0.47      0.51      0.45      3899



### Build a Logistic Regression Classification Model

#### Import and Instantiate the Logistic Regression Model in SciKit-Learn 

In [25]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='newton-cg',
                       multi_class='multinomial',
                       random_state=0)

#### Train model by calling fit function

In [26]:
lr.fit(X_train_std, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

#### Use the test set to create the model's predictions. Name the prediction vector *y_pred* 

In [27]:
y_pred = lr.predict(X_test_std)
print("Test set predictions:\n {}".format(y_pred))

Test set predictions:
 ['FF' 'FF' 'SL' ... 'FF' 'SL' 'FF']


In [28]:
print('\nTest set accuracy: {0:0.2f}%'.format(100*lr.score(X_test_std, y_test)))


Test set accuracy: 48.94%


In [29]:
print("Model's performance accuracy: {0:0.3f}".format(lr.score(X_test_std, y_test)))

Model's performance accuracy: 0.489


In [30]:
print('The predicted probability for belonging to the classes[class 0 = CH, class 1 = CU, class 2 = FF, class 3 = SI, class 4 = SL] are {0}'.format(lr.predict_proba(X_test_std[:10])))


The predicted probability for belonging to the classes[class 0 = CH, class 1 = CU, class 2 = FF, class 3 = SI, class 4 = SL] are [[2.35406385e-03 1.30844763e-01 4.92500795e-01 3.74300378e-01]
 [3.24576198e-04 1.76383448e-01 5.98706215e-01 2.24585761e-01]
 [3.98073451e-03 2.89555021e-01 3.05318276e-01 4.01145969e-01]
 [8.22496865e-03 9.72989151e-02 6.55902680e-01 2.38573436e-01]
 [4.31285729e-03 9.23593945e-02 6.23521448e-01 2.79806300e-01]
 [2.14090583e-06 1.29084814e-01 4.37436087e-01 4.33476959e-01]
 [2.45243754e-04 6.12928740e-02 7.34467681e-01 2.03994201e-01]
 [4.22235718e-05 7.31637666e-02 6.11112909e-01 3.15681101e-01]
 [1.16547270e-03 6.92773344e-02 6.51495210e-01 2.78061982e-01]
 [5.17469527e-03 1.74410218e-01 4.00382610e-01 4.20032476e-01]]


In [32]:
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names = ['CH','CU', 'FF', 'SL']))


Classification Report:
              precision    recall  f1-score   support

          CH       0.00      0.00      0.00        20
          CU       0.31      0.12      0.17       692
          FF       0.54      0.75      0.63      1814
          SL       0.41      0.34      0.37      1373

    accuracy                           0.49      3899
   macro avg       0.32      0.30      0.29      3899
weighted avg       0.45      0.49      0.45      3899



### Build Linear Support Vector Classifier Model

#### Import and instantiate the Linear Support Vector Classifier Model in SciKit-Learn: 

In [33]:
from sklearn import svm

lsv = svm.SVC(probability=True, kernel='linear')

In [34]:
# Train the model 
lsv.fit(X_train_std, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

#### Use the test set to create the model's predictions. Name the prediction vector "y_pred"

In [37]:
y_pred = lsv.predict(X_test_std)
print("Test set predictions:\n {}".format(y_pred))
print('\nTest set accuracy: {0:0.2f}%'.format(100*lsv.score(X_test_std, y_test)))

Test set predictions:
 ['FF' 'FF' 'SL' ... 'FF' 'SL' 'FF']

Test set accuracy: 50.06%


In [38]:
print("Model's performance accuracy: {0:0.3f}".format(lsv.score(X_test_std, y_test)))

Model's performance accuracy: 0.501


Use SciKit learn's built-in predict method to test the model's predictive performance for the first row of data in X_test_std

In [39]:
lsv.predict_proba(X_test_std[1:2])

array([[0.00392935, 0.14763314, 0.54407027, 0.30436724]])

In [40]:
print('Number of misclassified samples = {0}\n'.format((y_pred != y_test).sum()))

Number of misclassified samples = 1947



In [41]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[   0    0   17    3]
 [   0    0  308  384]
 [   0    0 1275  539]
 [   0    0  696  677]]


In [56]:
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names = ['CH','CU', 'FF','SL']))

Classification Report:
              precision    recall  f1-score   support

          CH       0.00      0.00      0.00        18
          CU       0.30      0.35      0.32       664
          FF       0.54      0.59      0.57      1837
          SL       0.39      0.33      0.36      1380

    accuracy                           0.45      3899
   macro avg       0.31      0.32      0.31      3899
weighted avg       0.45      0.45      0.45      3899



### **Build a MLPClassifier Model**

In [43]:
from sklearn.neural_network import MLPClassifier
# Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(solver='adam', alpha=0.005, batch_size=200, epsilon=1e-08, hidden_layer_sizes=(400,),
                    learning_rate='adaptive', max_iter=700)

In [44]:
# Train the model
model.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.005, batch_size=200, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(400,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=700, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [58]:
# Predict for the test set
y_pred=model.predict(X_test)
print(y_pred)

['FF' 'FF' 'CU' ... 'CU' 'CU' 'SL']


In [59]:
from sklearn.metrics import accuracy_score
# Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 39.19%


In [60]:
#Predict pitch using in-game scenario displayed in index 22 of dataset
print('The predicted pitch type for this scenario is: {}'
      .format(model.predict([X_test[22]])[0]))

The predicted pitch type for this scenario is: CU


In [61]:
print('The true pitch type is: {}'.format(y_test[22]))

The true pitch type is: CU


In [67]:
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names = ['CH','CU', 'FF', 'SL']))

Classification Report:
              precision    recall  f1-score   support

          CH       0.00      0.00      0.00        18
          CU       0.30      0.35      0.32       664
          FF       0.54      0.59      0.57      1837
          SL       0.39      0.33      0.36      1380

    accuracy                           0.45      3899
   macro avg       0.31      0.32      0.31      3899
weighted avg       0.45      0.45      0.45      3899



### **Random Forest Model**

In [63]:
from sklearn.ensemble import RandomForestClassifier

 
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=0)

forest = RandomForestClassifier(n_estimators=5, random_state=2)
#Train the model
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [64]:
# the difference between X_test_std and X_test
# Prediction using X_test_std (std: standard deviation)
y_pred = lsv.predict(X_test_std)
print("Test set predictions:\n {}".format(y_pred))
print('\nTest set accuracy: {0:0.2f}%'.format(100*forest.score(X_test_std, y_test)))

Test set predictions:
 ['FF' 'FF' 'SL' ... 'FF' 'SL' 'FF']

Test set accuracy: 46.55%


In [65]:
# Predict for the test set using X_test
y_pred=forest.predict(X_test)
print(y_pred)

['FF' 'FF' 'FF' ... 'SL' 'SL' 'CU']


In [66]:
print("Model's performance accuracy: {0:0.3f}".format(forest.score(X_test_std, y_test)))

Model's performance accuracy: 0.466
