### Import Modules

In [15]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, lars_path, SGDRegressor
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import scipy.stats as stats
sns.set()

### Load and Clean Data

In [36]:
df['pitcher_age'] = [int(str(abs(pd.to_datetime(age)
                            - pd.to_datetime(dt.datetime.now().strftime("%Y-%m-%d")))).split()[0]) for age
                                in df['pitcher_age']]

pitch_types = pd.get_dummies(df['pitch_type.x'])
df = pd.concat([df, pitch_types], axis=1)
df['target_speed'] = df['start_speed']
df['pos_x'] = df['px']
df['pos_y'] = df['pz']
df = df.rename(columns={"pitch_speed": "average_speed"})
df = df.drop(['pitch_type.x', 'stand', 'Unnamed: 0', 'start_speed', 'px', 'pz'], axis=1)
df = df.dropna(axis=0)

In [38]:
df.head(20)

Unnamed: 0,pitcher_name,batter_name,pitcher_age,average_speed,batter_BB,batter_K,batter_AVG,batter_OBP,batter_SLG,batter_ISO,...,CU,FC,FF,FT,KC,SI,SL,target_speed,pos_x,pos_y
0,Anibal Sanchez,Bryce Harper,12931,90.4,0.252481,0.291437,0.219572,0.413122,0.445472,0.288218,...,0,0,1,0,0,0,0,90.3,0.657117,1.042848
1,Anibal Sanchez,Bryce Harper,12931,90.4,0.252481,0.291437,0.219572,0.413122,0.445472,0.288218,...,0,0,1,0,0,0,0,89.3,1.215628,1.250705
2,Anibal Sanchez,Ian Desmond,12931,90.4,0.133072,0.29712,0.216684,0.327548,0.380902,0.209339,...,0,0,1,0,0,0,0,88.2,-1.030648,4.479985
3,Anibal Sanchez,Nolan Arenado,12931,90.4,0.156542,0.233255,0.273277,0.387771,0.52095,0.308659,...,0,0,1,0,0,0,0,89.9,-0.628675,1.396618
4,Anibal Sanchez,Ian Desmond,12931,90.4,0.133072,0.29712,0.216684,0.327548,0.380902,0.209339,...,0,0,1,0,0,0,0,87.1,-0.293118,3.231231
5,Anibal Sanchez,Nolan Arenado,12931,90.4,0.156542,0.233255,0.273277,0.387771,0.52095,0.308659,...,0,0,1,0,0,0,0,90.2,-2.155751,3.353314
6,Anibal Sanchez,Nolan Arenado,12931,90.4,0.156542,0.233255,0.273277,0.387771,0.52095,0.308659,...,0,0,1,0,0,0,0,88.8,0.164625,3.035604
7,Anibal Sanchez,Nolan Arenado,12931,90.4,0.156542,0.233255,0.273277,0.387771,0.52095,0.308659,...,0,0,1,0,0,0,0,90.5,0.029887,3.07624
8,Anibal Sanchez,Nolan Arenado,12931,90.4,0.156542,0.233255,0.273277,0.387771,0.52095,0.308659,...,0,0,1,0,0,0,0,91.1,0.22253,1.175243
9,Anibal Sanchez,Nolan Arenado,12931,90.4,0.156542,0.233255,0.273277,0.387771,0.52095,0.308659,...,0,0,1,0,0,0,0,91.3,0.158271,3.345362


### Data pre-processing

In [9]:
X = df.loc[:, 'pitcher_age': 'SL']
y = df.loc[:, 'target_speed']

In [10]:
# create polynomial features
poly_degree = 2

# normalize data
std = StandardScaler()

# split linear features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# split polynomial features
pf = PolynomialFeatures(degree = poly_degree)
features = df.loc[:, 'pitcher_age': 'SL'].columns.tolist()
pf.fit(df[features])
feat_array = pf.transform(df[features])
df_poly = pd.DataFrame(feat_array, columns = pf.get_feature_names(input_features=features))
X_poly = df_poly.loc[:,pf.get_feature_names(input_features=features)]
y_poly = df['target_speed']
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(X_poly, y_poly, test_size=0.2, random_state=42)

# transform linear features
std.fit(X_train.values)
X_tr = std.transform(X_train.values)
X_te = std.transform(X_test.values)

# transform polynomial features
std.fit(X_poly_train.values)
X_poly_tr = std.transform(X_poly_train.values)
X_poly_te = std.transform(X_poly_test.values)

### Cross-validation for Model Selection

In [16]:
linear = LinearRegression()
ridge = Ridge(alpha = 0.1)
lasso = Lasso(alpha = 0.1)
grad = SGDRegressor(max_iter=1000, tol=1e-3)

In [17]:
def CV_best(X, y):
    
    # model selection
    model_list = [linear, ridge, lasso, grad]
    model_results = {}
    
    # cross-validation method
    kf = KFold(n_splits=5, shuffle=True, random_state = 1000)
    
    # test each model
    for model in model_list:
        model_results[str(str(model)[0:5] + '^' + str(1))] = np.mean(cross_val_score(model, X_tr, y_train, cv=kf, scoring='r2'))
        model_results[str(str(model)[0:5] + '^' + str(poly_degree))] = np.mean(cross_val_score(model, X_poly_tr, y_poly_train, cv=kf, scoring='r2'))

    results = pd.DataFrame()
    results['Model'] = model_results.keys()
    results['R'] = model_results.values()
    results = results.sort_values(by = ['R'], ascending=False).reset_index().drop(['index'], axis=1)

    return display(results.head(len(model_list) * 2))

In [18]:
print(CV_best(X, y))

Unnamed: 0,Model,R
0,Linea^2,0.9433754
1,Ridge^2,0.9423941
2,Ridge^1,0.941238
3,Linea^1,0.941238
4,Lasso^2,0.9405582
5,Lasso^1,0.9405377
6,SGDRe^1,0.9403223
7,SGDRe^2,-1.066663e+20


None


In [29]:
model = lasso
poly = 0

### Model Training

In [30]:
if poly:
    model.fit(X_poly_tr, y_poly_train);
    coef = model.coef_;
    variables = df_poly.columns.to_list()
    df_coef = pd.DataFrame()
    df_coef['Variable'] = variables
    df_coef['Coef'] = coef
    print(df_coef.sort_values(by=['Coef'], ascending=False).head(df_coef.shape[0]).reset_index().drop(['index'], axis=1))
else:
    model.fit(X_tr, y_train);
    coef = model.coef_;
    variables = df.loc[:, 'pitcher_age': 'SL'].columns.to_list()
    df_coef = pd.DataFrame()
    df_coef['Variable'] = variables
    df_coef['Coef'] = coef
    print(df_coef.sort_values(by=['Coef'], ascending=False).head(df_coef.shape[0]).reset_index().drop(['index'], axis=1))

               Variable      Coef
0         average_speed  5.196373
1           pitcher_age -0.000000
2    split_batter_zone1 -0.000000
3    split_batter_zone8  0.000000
4    split_batter_zone7  0.000000
5    split_batter_zone6  0.000000
6    split_batter_zone5 -0.000000
7    split_batter_zone4 -0.000000
8    split_batter_zone3 -0.000000
9    split_batter_zone2  0.000000
10   split_pitcher_HRFB  0.000000
11     split_pitcher_FB -0.000000
12     split_pitcher_LD -0.000000
13    split_pitcher_FIP -0.000000
14   split_pitcher_WHIP -0.000000
15    split_pitcher_ISO -0.000000
16    split_pitcher_SLG -0.000000
17    split_pitcher_OBP -0.000000
18   split_batter_zone9  0.000000
19  split_pitcher_zone1 -0.000000
20  split_pitcher_zone2  0.000000
21  split_pitcher_zone3  0.000000
22                   SI  0.000000
23                   KC -0.000000
24                   FT  0.000000
25                   FF  0.000000
26                   FC -0.000000
27                   CU -0.000000
28            

### Model Evaluation

In [33]:
# evaluation metrics formulas

def MAE(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

def SSE(actuals, preds):
    return np.sum((actuals - preds)**2)

def RMSE(actuals, preds): #root mean squared error
    return np.sqrt(np.mean((actuals - preds)**2))

def SST(y):
    return np.sum((y - np.mean(y))**2)

def R2(actuals, preds):
    return 1 - SSE(actuals, preds) / SST(actuals)

In [34]:
if poly:
    preds = model.predict(X_poly_te)
    print(f'MAE: {MAE(y_poly_test, preds):.3f}')
    print(f'SSE: {SSE(y_poly_test, preds):.3f}')
    print(f'RMSE: {RMSE(y_poly_test, preds):.3f}')
    print(f'SST: {SST(preds):.3f}')
    print(f'R^2: {R2(y_poly_test, preds):.3f}')
    df_test = pd.DataFrame(X_poly_te, columns = df_poly.columns.values[0:len(df_poly.columns)])
    df_test['target_speed'] = df['target_speed']
    df_test['preds'] = preds
    df_test['diff'] = abs(df_test['target_speed'] - df_test['preds'])
    print(f'Average Error: {np.mean(df_test["diff"]):.2f}')
    print(f'Within 10 MPH: {((df_test[df_test["diff"] < 10].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 5 MPH: {((df_test[df_test["diff"] < 5].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 3 MPH: {((df_test[df_test["diff"] < 3].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 1 MPH: {((df_test[df_test["diff"] < 1].count()[0] / df_test["diff"].count()) * 100):.2f} %')

else:
    preds = model.predict(X_te)
    print(f'MAE: {MAE(y_test, preds):.3f}')
    print(f'SSE: {SSE(y_test, preds):.3f}')
    print(f'RMSE: {RMSE(y_test, preds):.3f}')
    print(f'SST: {SST(preds):.3f}')
    print(f'R^2: {R2(y_test, preds):.3f}')
    df_test = pd.DataFrame(X_te, columns = df.columns.values[1:X_te.shape[1] + 1])
    df_test['target_speed'] = df['target_speed']
    df_test['preds'] = preds
    df_test['diff'] = abs(df_test['target_speed'] - df_test['preds'])
    print(f'Average Error: {np.mean(df_test["diff"]):.2f}')
    print(f'Within 10 MPH: {((df_test[df_test["diff"] < 10].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 5 MPH: {((df_test[df_test["diff"] < 5].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 3 MPH: {((df_test[df_test["diff"] < 3].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 1 MPH: {((df_test[df_test["diff"] < 1].count()[0] / df_test["diff"].count()) * 100):.2f} %')

MAE: 1.028
SSE: 60560.965
RMSE: 1.339
SST: 907026.627
R^2: 0.940
Average Error: 5.78
Within 10 MPH: 80.67 %
Within 5 MPH: 53.68 %
Within 3 MPH: 36.08 %
Within 1 MPH: 12.70 %


### Model Implementation

In [None]:
df.head(1).to_csv('testing.csv')
inplay = pd.read_csv('testing.csv').drop(['Unnamed: 0'], axis=1)
inplay.head()

In [None]:
current_info = inplay.loc[:, 'pitcher_age': 'SL']

std.fit(current_info.values)
current_info = std.transform(current_info.values)

In [None]:
display_model = linear
display_model.fit(X_tr, y_train);

In [None]:
if inplay['CH'].values == 1:
    pitch = 'changeup'
if inplay['CU'].values == 1:
    pitch = 'curveball'
if inplay['FC'].values == 1:
    pitch = 'cutter'
if inplay['FF'].values == 1:
    pitch = 'four-seamer'
if inplay['FT'].values == 1:
    pitch = 'two-seamer'
if inplay['KC'].values == 1:
    pitch = 'knuckle-curve'
if inplay['SI'].values == 1:
    pitch = 'sinker'
if inplay['SL'].values == 1:
    pitch = 'slider'
    
print(f'Incoming {pitch} at: {display_model.predict(current_info)[0]:.1f} MPH.')