# Whoo hoo statcast

Could it get better? Physics data available for me to play with!

In [1]:
import pandas as pd 
import numpy as np


In [3]:
statcast = pd.read_csv('../data/hitters_statcast_since2016.csv').dropna()
statcast['Barrel%'] = statcast['Barrel%'].apply(lambda x: float(x.strip('%')))
statcast['HardHit%'] = statcast['HardHit%'].apply(lambda x: float(x.strip('%')))

##Introduce a number of physical quantities, very approximate.
statcast['cosLA'] = np.cos(statcast['LA']/180*np.pi)
statcast['sinLA'] = np.sin(statcast['LA']/180*np.pi)
statcast['EV_x'] = statcast['EV']*statcast['cosLA']
statcast['EV_y'] = statcast['EV']*statcast['sinLA']
acceleration = -32 * 60 * 60 / 5280
func = (lambda x: -2 * x / acceleration)
statcast['Hangtime'] = func(statcast['EV_y'])
statcast['Distance'] = statcast['Hangtime']*statcast['EV_x']

standard = pd.read_csv('../data/hitters_since_1947.csv')
standard = standard[standard['Season'] >= 2016]

statcast_cols = statcast.select_dtypes(exclude = 'object').drop(['playerid','Season'], axis = 1).columns 
standard_cols = standard.select_dtypes(exclude = 'object').drop(['playerid','Season'], axis = 1).columns

df = pd.merge(statcast, standard.select_dtypes(exclude='object'), on = ['playerid','Season'], how = 'left').dropna()

Index(['G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB',
       'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG'],
      dtype='object')

In [4]:
def multiseason_lines(df, stats, num_seasons):
    """
    Adds columns to rows in a DataFrame for previous seasons; useful for making past seasons features
    in ML applicaitons.

    Parameters
    ----------
    df: DataFrame
        A DataFrame that includes multiple seasons of player data in separate rows.
        The DataFrame must include a unique 'playerid' column (per Fangraphs) and 
        the seasons must be labeled 'Season' with int-type data.
    stats: list-like
        The stats to include from previous seasons.

    num_seasons: int
        The number of past seasons to include as past stats.

    Returns
    -------
        A DataFrame with the columns from df and additional columns from stats
        labeled with suffixes '_1', '_2'...'_x' for the past season's stats.
    """
    out = df.copy()
    for n in range(num_seasons):
        df1 = df.copy()
        df1['Season'] = df1['Season'] + 1 + n
        out = pd.merge(out, df1[stats], how = 'left', on=['playerid','Season'],
            suffixes=("","_"+str(1+n)))
    return out



In [5]:
df['Events']

0       3
1       1
2       1
3       2
4       2
       ..
4034    1
4035    1
4036    1
4037    1
4038    1
Name: Events, Length: 3561, dtype: int64

In [6]:
stats = df.select_dtypes(exclude='object').columns
df = multiseason_lines(df,stats,1).dropna()
df = df[(df['Events']>70) & (df['Events_1']>70)]

In [7]:
correlations = df.corr()
df['PA_x'].min()

87

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

target = df['HR']/df['Events']
data = df[[ x for x in df.select_dtypes(exclude='object').columns if x.endswith("_1")]]
X_train, X_test, y_train, y_test = train_test_split(data, target)

In [52]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
for e in data.columns:
    x = scaler.fit_transform(target.to_frame())
    y = scaler.fit_transform(data[e].to_frame())
    print("{}: {:.2f}".format(e, r2_score(x,y)))

PA_x_1: -0.67
Events_1: -0.92
EV_1: 0.07
maxEV_1: 0.05
LA_1: -0.23
Barrels_1: 0.07
Barrel%_1: 0.33
HardHit_1: -0.46
HardHit%_1: 0.12
cosLA_1: -1.76
sinLA_1: -0.24
EV_x_1: -0.51
EV_y_1: -0.17
Hangtime_1: -0.17
Distance_1: -0.11
G_1: -0.73
AB_1: -0.72
PA_y_1: -0.67
H_1: -0.81
1B_1: -1.14
2B_1: -0.69
3B_1: -1.19
HR_1: -0.02
R_1: -0.57
RBI_1: -0.32
BB_1: -0.42
IBB_1: -0.62
SO_1: -0.19
HBP_1: -0.79
SF_1: -0.82
SH_1: -1.64
GDP_1: -0.86
SB_1: -1.29
CS_1: -1.30
AVG_1: -1.19


# Estblishing a Baseline

Any decent information we get should improve on using a simple linear regression of a previous season's HR total.
So, we findout how that scores.


In [53]:
linear = LinearRegression().fit(X_train['HR_1'].to_frame(), y_train)

print(linear.score(X_train['HR_1'].to_frame(), y_train),
    linear.score(X_test['HR_1'].to_frame(), y_test)) ##rough result: .35 and .39 (!)

0.2674801009709844 0.158082159316845


In [56]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha = .01).fit(X_train,y_train)
#cols = ['maxEV_1', 'Barrels_1', 'HardHit_1','HardHit%_1']
linear.fit(X_train,y_train)
print(linear.score(X_train, y_train),
    linear.score(X_test, y_test))
print(lasso.score(X_train, y_train),
    lasso.score(X_test,y_test))

0.5290183360492855 0.4168192840454249
0.4912840921324404 0.43239241175752063


In [57]:
print(lasso.intercept_)
list(zip(X_train.columns,lasso.coef_))
#data.columns

-0.05604306913010315


[('PA_x_1', 0.0),
 ('Events_1', -5.397609953416596e-05),
 ('EV_1', 0.0),
 ('maxEV_1', 0.0005152190149085961),
 ('LA_1', -0.0),
 ('Barrels_1', 0.00035294536247633927),
 ('Barrel%_1', 0.0),
 ('HardHit_1', 0.0),
 ('HardHit%_1', 0.0007021854367475714),
 ('cosLA_1', 0.0),
 ('sinLA_1', -0.0),
 ('EV_x_1', 0.0),
 ('EV_y_1', 0.0),
 ('Hangtime_1', 0.0),
 ('Distance_1', 0.0001256171449374855),
 ('G_1', -0.0),
 ('AB_1', 0.0),
 ('PA_y_1', 0.0),
 ('H_1', -0.0),
 ('1B_1', -2.5119933737901395e-05),
 ('2B_1', 0.0),
 ('3B_1', -0.0),
 ('HR_1', 0.0),
 ('R_1', 0.0),
 ('RBI_1', 8.259670003708295e-05),
 ('BB_1', 4.387571330121394e-05),
 ('IBB_1', 0.0),
 ('SO_1', 0.00011921598636100388),
 ('HBP_1', -0.0),
 ('SF_1', -0.0),
 ('SH_1', -0.0),
 ('GDP_1', 0.0),
 ('SB_1', -0.0),
 ('CS_1', -0.0),
 ('AVG_1', -0.0)]

## Hmmm...

The above cell show some of the usual linear regression perversion: R_1 and RBI_1 have non-zero coefs while HR_1 is 0!

Let's try a minimal set of features.

In [58]:
physical_data = [x+'_1' for x in statcast_cols if x not in ['PA','Events']]
minimal = physical_data + ['HR_1']
print(physical_data)

['EV_1', 'maxEV_1', 'LA_1', 'Barrels_1', 'Barrel%_1', 'HardHit_1', 'HardHit%_1', 'cosLA_1', 'sinLA_1', 'EV_x_1', 'EV_y_1', 'Hangtime_1', 'Distance_1']


In [60]:
linear.fit(X_train[minimal], y_train)
linear.score(X_test[minimal],y_test)

for i in [.02,.1,1,10]:
    lasso.alpha = i
    lasso.fit(X_train[minimal],y_train)
    print(lasso.score(X_train[minimal],y_train),lasso.score(X_test[minimal],y_test))


0.45641209375597713 0.40039136410436127
0.2619294790412363 0.20106895116110235
0.0 -0.0011264423226786668
0.0 -0.0011264423226786668


In [61]:
print(lasso.intercept_)
lasso.coef_
list(zip(minimal,lasso.coef_))

0.04892412034495372


[('EV_1', 0.0),
 ('maxEV_1', 0.0),
 ('LA_1', 0.0),
 ('Barrels_1', 0.0),
 ('Barrel%_1', 0.0),
 ('HardHit_1', 0.0),
 ('HardHit%_1', 0.0),
 ('cosLA_1', -0.0),
 ('sinLA_1', 0.0),
 ('EV_x_1', 0.0),
 ('EV_y_1', 0.0),
 ('Hangtime_1', 0.0),
 ('Distance_1', 0.0),
 ('HR_1', 0.0)]

In [68]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

lasso_pipe = Pipeline(
    steps=[('scaler',scaler),
        #('poly',PolynomialFeatures(degree=2)),
        ('lasso',lasso)
    ]

)

lasso.alpha = .001
lasso_pipe.fit(X_train[minimal],y_train)
lasso_pipe.score(X_test[minimal],y_test)

0.46074970953739347

In [73]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=25)
knn_pipe = Pipeline(
    steps=[('scaler',scaler),
        #('poly',PolynomialFeatures(degree=2)),
        ('knn_reg',knn)
    ]
)
knn_pipe.fit(X_train[minimal],y_train)
knn_pipe.score(X_test[minimal],y_test)

0.4228787956483231

In [84]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

svr = SVR(C=.00001)
svr_pipe = Pipeline(
    steps=[('scaler',scaler),
        #('poly',PolynomialFeatures(degree=2)),
        ('svr_reg',svr)
    ]
)
svr_pipe.fit(X_train[minimal],y_train)
svr_pipe.score(X_test[minimal],y_test)
##Is this broken? am I missing something? This is the worst result I've ever had.

-0.8717217302246587

In [88]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=100)

gbr.fit(X_train[minimal],y_train)
gbr.score(X_test[minimal], y_test)

0.4415848200154988

In [86]:
GradientBoostingRegressor?

[0;31mInit signature:[0m
[0mGradientBoostingRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloss[0m[0;34m=[0m[0;34m'ls'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msubsample[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'friedman_mse'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m