In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, lars_path
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import scipy.stats as stats
sns.set()

### Load and Clean Data

In [7]:
df = pd.read_csv('uncorrelated_regression_data.csv')

df['pitcher_age'] = [int(str(abs(pd.to_datetime(age)
                            - pd.to_datetime(dt.datetime.now().strftime("%Y-%m-%d")))).split()[0]) for age
                                in df['pitcher_age']]

pitch_types = pd.get_dummies(df['pitch_type.x'])
df = pd.concat([df, pitch_types], axis=1)
df['target_speed'] = df['start_speed']
df['pos_x'] = df['px']
df['pos_y'] = df['pz']
df = df.drop(['Row.names', 'pitch_type.x', 'pitcher_name', 'batter_name', 'stand', 'Unnamed: 0', 'start_speed', 'px', 'pz'], axis=1)
df = df.dropna(axis=0)
df.head()

Unnamed: 0,pitcher_age,batter_BB,batter_K,batter_AVG,batter_wRC,batter_LD,batter_FB,batter_HRFB,pitcher_BB,pitcher_K,...,CU,FC,FF,FT,KC,SI,SL,target_speed,pos_x,pos_y
0,12929,0.252481,0.291437,0.219572,470.840389,0.300555,0.298969,0.297082,0.121653,0.324566,...,0,0,1,0,0,0,0,90.3,0.657117,1.042848
1,12929,0.156542,0.233255,0.273277,458.07137,0.269206,0.253066,0.336735,0.121653,0.324566,...,0,0,1,0,0,0,0,91.3,0.158271,3.345362
2,9683,0.102081,0.123537,0.25434,191.246234,0.255507,0.295154,0.131343,0.143976,0.184133,...,0,0,1,0,0,0,0,92.4,0.212772,1.844342
3,10421,0.071709,0.263752,0.226466,162.141372,0.263994,0.187595,0.076613,0.143773,0.295788,...,0,0,0,0,0,0,1,85.8,-0.18491,1.03831
4,9330,0.12068,0.330675,0.22037,218.198317,0.265531,0.252505,0.265873,0.135742,0.250977,...,0,0,1,0,0,0,0,93.2,-0.833162,4.468802


### Data pre-processing

In [8]:
X = df.loc[:, 'pitcher_age': 'SL']
y = df.loc[:, 'target_speed']

In [9]:
# create polynomial features
poly_degree = 2

# normalize data
std = StandardScaler()

# split linear features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# split polynomial features
pf = PolynomialFeatures(degree = poly_degree)
features = df.loc[:, 'pitcher_age': 'SL'].columns.tolist()
pf.fit(df[features])
feat_array = pf.transform(df[features])
df_poly = pd.DataFrame(feat_array, columns = pf.get_feature_names(input_features=features))
X_poly = df_poly.loc[:,pf.get_feature_names(input_features=features)]
y_poly = df['target_speed']
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(X_poly, y_poly, test_size=0.2, random_state=42)

# transform linear features
std.fit(X_train.values)
X_tr = std.transform(X_train.values)
X_te = std.transform(X_test.values)

# transform polynomial features
std.fit(X_poly_train.values)
X_poly_tr = std.transform(X_poly_train.values)
X_poly_te = std.transform(X_poly_test.values)

### Cross-validation for Model Selection

In [11]:
linear = LinearRegression()
ridge = Ridge(alpha = 0.1)
lasso = Lasso(alpha = 0.1)

In [12]:
def CV_best(X, y):
    
    # model selection
    model_list = [lr, ridge, lasso]
    model_results = {}
    
    # cross-validation method
    kf = KFold(n_splits=5, shuffle=True, random_state = 1000)
    
    # test each model
    for model in model_list:
        model_results[str(str(model)[0:5] + '^' + str(1))] = np.mean(cross_val_score(model, X_tr, y_train, cv=kf, scoring='r2'))
        model_results[str(str(model)[0:5] + '^' + str(poly_degree))] = np.mean(cross_val_score(model, X_poly_tr, y_poly_train, cv=kf, scoring='r2'))

    results = pd.DataFrame()
    results['Model'] = model_results.keys()
    results['R'] = model_results.values()
    results = results.sort_values(by = ['R'], ascending=False).reset_index().drop(['index'], axis=1)

    return display(results.head(len(model_list) * 2))

In [13]:
print(CV_best(X, y))

Unnamed: 0,Model,R
0,Linea^2,0.880199
1,Ridge^2,0.879917
2,Lasso^2,0.787421
3,Linea^1,0.764511
4,Ridge^1,0.764511
5,Lasso^1,0.757357


None


In [14]:
model = linear
poly = 2

### Model Training

In [16]:
if poly:
    model.fit(X_poly_tr, y_poly_train);
    coef = model.coef_;
    variables = df_poly.columns.to_list()
    df_coef = pd.DataFrame()
    df_coef['Variable'] = variables
    df_coef['Coef'] = coef
    print(df_coef.sort_values(by=['Coef'], ascending=False).head(df_coef.shape[0]).reset_index().drop(['index'], axis=1))
else:
    model.fit(X_tr, y_train);
    coef = model.coef_;
    variables = df.loc[:, 'pitcher_age': 'SL'].columns.to_list()
    df_coef = pd.DataFrame()
    df_coef['Variable'] = variables
    df_coef['Coef'] = coef
    print(df_coef.sort_values(by=['Coef'], ascending=False).head(df_coef.shape[0]).reset_index().drop(['index'], axis=1))

                    Variable          Coef
0                batter_K FF  1.296326e+10
1                   movement  1.181413e+10
2        split_pitcher_LD FF  9.901317e+09
3              pitcher_FB FF  9.893163e+09
4                batter_K SL  8.673138e+09
5      split_batter_zone8 FF  8.451232e+09
6                batter_K FT  7.944540e+09
7      split_batter_zone6 FF  7.939790e+09
8          split_batter_HRFB  7.603331e+09
9     split_pitcher_zone6 FF  6.956384e+09
10             pitcher_FB SL  6.591416e+09
11       split_pitcher_LD SL  6.483273e+09
12               batter_K SI  6.457070e+09
13               batter_K CH  6.382703e+09
14               batter_K FC  6.083714e+09
15       split_pitcher_LD FT  5.894861e+09
16               batter_K CU  5.801259e+09
17              batter_BB FF  5.657640e+09
18             pitcher_FB FT  5.557701e+09
19     split_batter_zone8 SL  5.466175e+09
20                        CU  5.447225e+09
21     split_batter_zone6 SL  5.177125e+09
22         

### Model Evaluation

In [18]:
# evaluation metrics formulas

def MAE(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

def SSE(actuals, preds):
    return np.sum((actuals - preds)**2)

def RMSE(actuals, preds): #root mean squared error
    return np.sqrt(np.mean((actuals - preds)**2))

def SST(y):
    return np.sum((y - np.mean(y))**2)

def R2(actuals, preds):
    return 1 - SSE(actuals, preds) / SST(actuals)

In [19]:
if poly:
    preds = model.predict(X_poly_te)
    print(f'MAE: {MAE(y_poly_test, preds):.3f}')
    print(f'SSE: {SSE(y_poly_test, preds):.3f}')
    print(f'RMSE: {RMSE(y_poly_test, preds):.3f}')
    print(f'SST: {SST(preds):.3f}')
    print(f'R^2: {R2(y_poly_test, preds):.3f}')
    df_test = pd.DataFrame(X_poly_te, columns = df_poly.columns.values[0:3003])
    df_test['target_speed'] = df['target_speed']
    df_test['preds'] = preds
    df_test['diff'] = abs(df_test['target_speed'] - df_test['preds'])
    print(f'Average Error: {np.mean(df_test["diff"]):.2f}')
    print(f'Within 10 MPH: {((df_test[df_test["diff"] < 10].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 5 MPH: {((df_test[df_test["diff"] < 5].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 3 MPH: {((df_test[df_test["diff"] < 3].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 1 MPH: {((df_test[df_test["diff"] < 1].count()[0] / df_test["diff"].count()) * 100):.2f} %')

else:
    preds = model.predict(X_te)
    print(f'MAE: {MAE(y_test, preds):.3f}')
    print(f'SSE: {SSE(y_test, preds):.3f}')
    print(f'RMSE: {RMSE(y_test, preds):.3f}')
    print(f'SST: {SST(preds):.3f}')
    print(f'R^2: {R2(y_test, preds):.3f}')
    df_test = pd.DataFrame(X_te, columns = df.columns.values[1:77])
    df_test['target_speed'] = df['target_speed']
    df_test['preds'] = preds
    df_test['diff'] = abs(df_test['target_speed'] - df_test['preds'])
    print(f'Average Error: {np.mean(df_test["diff"]):.2f}')
    print(f'Within 10 MPH: {((df_test[df_test["diff"] < 10].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 5 MPH: {((df_test[df_test["diff"] < 5].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 3 MPH: {((df_test[df_test["diff"] < 3].count()[0] / df_test["diff"].count()) * 100):.2f} %')
    print(f'Within 1 MPH: {((df_test[df_test["diff"] < 1].count()[0] / df_test["diff"].count()) * 100):.2f} %')

MAE: 1.452
SSE: 119405.066
RMSE: 1.880
SST: 893042.369
R^2: 0.882
Average Error: 5.86
Within 10 MPH: 80.19 %
Within 5 MPH: 52.46 %
Within 3 MPH: 35.19 %
Within 1 MPH: 12.60 %


### Model Implementation

In [23]:
df.head(1).to_csv('testing.csv')
inplay = pd.read_csv('testing.csv').drop(['Unnamed: 0'], axis=1)
inplay.head()

Unnamed: 0,pitcher_age,batter_BB,batter_K,batter_AVG,batter_wRC,batter_LD,batter_FB,batter_HRFB,pitcher_BB,pitcher_K,...,CU,FC,FF,FT,KC,SI,SL,target_speed,pos_x,pos_y
0,12929,0.252481,0.291437,0.219572,470.840389,0.300555,0.298969,0.297082,0.121653,0.324566,...,0,0,1,0,0,0,0,90.3,0.657117,1.042848


In [24]:
current_info = inplay.loc[:, 'pitcher_age': 'SL']

std.fit(current_info.values)
current_info = std.transform(current_info.values)

In [25]:
display_model = linear
display_model.fit(X_tr, y_train);

In [26]:
if inplay['CH'].values == 1:
    pitch = 'changeup'
if inplay['CU'].values == 1:
    pitch = 'curveball'
if inplay['FC'].values == 1:
    pitch = 'cutter'
if inplay['FF'].values == 1:
    pitch = 'four-seamer'
if inplay['FT'].values == 1:
    pitch = 'two-seamer'
if inplay['KC'].values == 1:
    pitch = 'knuckle-curve'
if inplay['SI'].values == 1:
    pitch = 'sinker'
if inplay['SL'].values == 1:
    pitch = 'slider'
    
print(f'Incoming {pitch} at: {display_model.predict(current_info)[0]:.1f} MPH.')

Incoming four-seamer at: 90.0 MPH.


In [27]:
df.head(1).to_csv('testing.csv')
inplay = pd.read_csv('testing.csv').drop(['Unnamed: 0'], axis=1)
inplay.head()

current_info = inplay.loc[:, 'pitcher_age': 'SL']

std.fit(current_info.values)
current_info = std.transform(current_info.values)

display_model = linear
display_model.fit(X_tr, y_train);

if inplay['CH'].values == 1:
    pitch = 'changeup'
if inplay['CU'].values == 1:
    pitch = 'curveball'
if inplay['FC'].values == 1:
    pitch = 'cutter'
if inplay['FF'].values == 1:
    pitch = 'four-seamer'
if inplay['FT'].values == 1:
    pitch = 'two-seamer'
if inplay['KC'].values == 1:
    pitch = 'knuckle-curve'
if inplay['SI'].values == 1:
    pitch = 'sinker'
if inplay['SL'].values == 1:
    pitch = 'slider'
    
print(f'Incoming {pitch} at: {display_model.predict(current_info)[0]:.1f} MPH.')

Incoming four-seamer at: 90.0 MPH.
