In [1]:
import pandas as pd
import sqlite3

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Open the pitching database read-only. A plain sqlite3.connect(path) creates a
# new, empty database file when the path does not exist, so a wrong path would
# only show up later when a query runs. With a mode=ro URI the connect call
# itself fails on a missing file, and the data cannot be modified by accident
# (this notebook only issues SELECT statements through this connection).
conn = sqlite3.connect('file:pitching.sqlite?mode=ro', uri=True)

In [3]:
# Load the training split: one DataFrame of features, one of labels.
# NOTE(review): the saved preview of tr_features lists a leading "Unnamed: 0"
# column. If that is a stored row index rather than a display artifact, it is
# being passed to the model as a feature -- confirm against the table schema.
tr_features, tr_labels = (
    pd.read_sql(query, conn)
    for query in ('select * from train_features', 'select * from train_labels')
)

In [4]:
# Preview the first rows of the training features to sanity-check the loaded columns.
tr_features.head()

Unnamed: 0,batter_r,pitcher_r,inning,score_diff,is_behind,is_ahead,is_even,on_1b_ind,on_2b_ind,on_3b_ind,pitch_offset_fb_1
0,1,0,3,-1,0,0,1,0,0,1,0
1,1,0,1,0,0,0,1,0,0,0,0
2,0,1,1,0,0,0,1,0,0,1,0
3,0,0,2,0,0,0,1,0,0,0,0
4,0,1,5,2,0,0,1,0,0,0,0


In [5]:
# Baseline forest with default hyperparameters. random_state is fixed so the
# cross-validation scores computed from this estimator are reproducible after
# Restart & Run All (the forest is otherwise seeded differently on every run).
rf = RandomForestClassifier(random_state=42)

In [6]:
# 5-fold cross-validation of the baseline forest on the training split.
# The labels frame is flattened to a 1-D array before being handed to sklearn.
scores = cross_val_score(
    estimator=rf,
    X=tr_features,
    y=tr_labels.to_numpy().ravel(),
    cv=5,
)

In [7]:
# Display the per-fold scores returned by cross_val_score above.
scores

array([0.56810408, 0.56837324, 0.5684181 , 0.57252285, 0.57129559])

In [8]:
def print_results(results):
    """Print the best parameters and a per-candidate score summary of a fitted search.

    Args:
        results: Object exposing ``best_params_`` and a ``cv_results_`` mapping
            with ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
            entries (the call site passes a fitted ``GridSearchCV``).

    Returns:
        None. Everything is written to stdout: the best parameters, a blank
        line, then one line per candidate showing the mean score and twice the
        standard deviation, both rounded to 3 decimals.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    summary = results.cv_results_
    candidates = zip(
        summary['mean_test_score'],
        summary['std_test_score'],
        summary['params'],
    )
    for avg_score, score_std, candidate in candidates:
        # The spread is reported as two standard deviations around the mean.
        spread = round(score_std * 2, 3)
        print('{} (+/-{}) for {}'.format(round(avg_score, 3), spread, candidate))

In [9]:
# Grid-search the number of trees and the tree depth with 5-fold CV.
# random_state is fixed so every candidate is trained from the same seed and
# the resulting ranking is reproducible after Restart & Run All; without it the
# near-tied candidates can swap places from one run to the next.
rf = RandomForestClassifier(random_state=42)
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None]  # None places no limit on tree depth
}

# NOTE: `cv` names the GridSearchCV object itself, not the fold count (cv=5).
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 10, 'n_estimators': 100}

0.57 (+/-0.0) for {'max_depth': 2, 'n_estimators': 5}
0.57 (+/-0.0) for {'max_depth': 2, 'n_estimators': 50}
0.57 (+/-0.0) for {'max_depth': 2, 'n_estimators': 100}
0.585 (+/-0.003) for {'max_depth': 10, 'n_estimators': 5}
0.586 (+/-0.003) for {'max_depth': 10, 'n_estimators': 50}
0.586 (+/-0.003) for {'max_depth': 10, 'n_estimators': 100}
0.568 (+/-0.003) for {'max_depth': 20, 'n_estimators': 5}
0.57 (+/-0.003) for {'max_depth': 20, 'n_estimators': 50}
0.57 (+/-0.003) for {'max_depth': 20, 'n_estimators': 100}
0.567 (+/-0.004) for {'max_depth': None, 'n_estimators': 5}
0.569 (+/-0.003) for {'max_depth': None, 'n_estimators': 50}
0.57 (+/-0.003) for {'max_depth': None, 'n_estimators': 100}


In [10]:
# Candidate 1: 50 trees, depth capped at 10. random_state pins the forest's
# internal randomness so the fitted model, and every metric computed from it
# further down, is the same on each run.
rf1 = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
rf1.fit(tr_features, tr_labels.values.ravel())

In [11]:
# Candidate 2: 5 trees, depth capped at 10. random_state pins the forest's
# internal randomness so the fitted model is the same on each run.
# NOTE(review): the saved output of the validation cell reports this model as
# "MAX DEPTH: 20 / # OF EST: 100", which does not match the hyperparameters
# below -- that output is stale; re-run with Restart & Run All before relying
# on the reported numbers.
rf2 = RandomForestClassifier(n_estimators=5, max_depth=10, random_state=42)
rf2.fit(tr_features, tr_labels.values.ravel())

In [12]:
# Candidate 3: 100 trees, depth capped at 10. random_state pins the forest's
# internal randomness so the fitted model is the same on each run.
rf3 = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf3.fit(tr_features, tr_labels.values.ravel())

In [13]:
# Load the validation split used to compare the candidate forests.
val_features, val_labels = (
    pd.read_sql(query, conn)
    for query in ('select * from val_features', 'select * from val_labels')
)

In [14]:
# Score each candidate forest on the validation split and print one summary
# line per model: its depth / tree-count settings, accuracy, precision, recall.
for mdl in (rf1, rf2, rf3):
    y_pred = mdl.predict(val_features)
    accuracy = round(accuracy_score(val_labels, y_pred), 3)
    # average=None keeps one precision / recall value per class rather than
    # collapsing them into a single aggregate number.
    precision, recall = (
        metric(val_labels, y_pred, average=None)
        for metric in (precision_score, recall_score)
    )
    report = 'MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(
        mdl.max_depth, mdl.n_estimators, accuracy, precision, recall
    )
    print(report)

MAX DEPTH: 10 / # OF EST: 50 -- A: 0.585 / P: [0.54337949 0.59490718] / R: [0.23532443 0.85027136]
MAX DEPTH: 20 / # OF EST: 100 -- A: 0.572 / P: [0.50559975 0.59553418] / R: [0.30667604 0.77294172]
MAX DEPTH: 10 / # OF EST: 100 -- A: 0.585 / P: [0.5427756  0.59510856] / R: [0.23758882 0.84846228]


In [15]:
# Load the held-out test split, used only for the final accuracy check.
test_features, test_labels = (
    pd.read_sql(query, conn)
    for query in ('select * from test_features', 'select * from test_labels')
)

In [16]:
# Predict on the held-out test features with rf1 only; rf2 and rf3 are not
# evaluated on this split. This rebinds y_pred, which the validation loop
# last set to the predictions of its final model.
y_pred = rf1.predict(test_features)

In [17]:
# Test-set accuracy of the rf1 predictions, rounded to 3 decimals for display.
raw_accuracy = accuracy_score(test_labels, y_pred)
accuracy = round(raw_accuracy, 3)
print(accuracy)

0.584


In [None]:
## All 3 models correctly predicted pitch type about 58% of the time.
## Considering MLB avg. fastball usage is 57-58%, this is not much better than if someone were to just guess "fastball" on every pitch.
## Next steps would be to do more work profiling hitters and pitchers. Right now we consider pitchers and batters via their Statcast ID.
## It would be more powerful to group hitters and pitchers by their tendencies.
## For example, categorizing pitchers by offspeed usage, or hitters by how well they hit fastballs.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=766ac258-4b25-4aeb-9ca3-1c0ba0991c26' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>