# Whoo hoo statcast

Could it get better? Physics data available for me to play with!

In [1]:
import pandas as pd 
import numpy as np


In [208]:
statcast = pd.read_csv('../data/hitters_statcast_since2016.csv').dropna()
statcast['Barrel%'] = statcast['Barrel%'].apply(lambda x: float(x.strip('%')))
statcast['HardHit%'] = statcast['HardHit%'].apply(lambda x: float(x.strip('%')))

##Introduce a number of physical quantities, very approximate.
statcast['cosLA'] = np.cos(statcast['LA']/180*np.pi)
statcast['sinLA'] = np.sin(statcast['LA']/180*np.pi)
statcast['EV_x'] = statcast['EV']*statcast['cosLA']
statcast['EV_y'] = statcast['EV']*statcast['sinLA']
acceleration = -32 * 60 * 60 / 5280
func = (lambda x: -2 * x / acceleration)
statcast['Hangtime'] = func(statcast['EV_y'])
statcast['Distance'] = statcast['Hangtime']*statcast['EV_x']

standard = pd.read_csv('../data/hitters_since_1947.csv')
standard = standard[standard['Season'] >= 2016]

statcast_cols = statcast.select_dtypes(exclude = 'object').drop(['playerid','Season'], axis = 1).columns 
standard_cols = standard.select_dtypes(exclude = 'object').drop(['playerid','Season'], axis = 1).columns

df = pd.merge(statcast, standard.select_dtypes(exclude='object'), on = ['playerid','Season'], how = 'left').dropna()

Index(['G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB',
       'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG'],
      dtype='object')

In [210]:
def multiseason_lines(df, stats, num_seasons):
    """
    Adds columns to rows in a DataFrame for previous seasons; useful for making past seasons features
    in ML applicaitons.

    Parameters
    ----------
    df: DataFrame
        A DataFrame that includes multiple seasons of player data in separate rows.
        The DataFrame must include a unique 'playerid' column (per Fangraphs) and 
        the seasons must be labeled 'Season' with int-type data.
    stats: list-like
        The stats to include from previous seasons.

    num_seasons: int
        The number of past seasons to include as past stats.

    Returns
    -------
        A DataFrame with the columns from df and additional columns from stats
        labeled with suffixes '_1', '_2'...'_x' for the past season's stats.
    """
    out = df.copy()
    for n in range(num_seasons):
        df1 = df.copy()
        df1['Season'] = df1['Season'] + 1 + n
        out = pd.merge(out, df1[stats], how = 'left', on=['playerid','Season'],
            suffixes=("","_"+str(1+n)))
    return out



In [211]:
df['Events']

0       3
1       1
2       1
3       2
4       2
       ..
4034    1
4035    1
4036    1
4037    1
4038    1
Name: Events, Length: 3561, dtype: int64

In [212]:
stats = df.select_dtypes(exclude='object').columns
df = multiseason_lines(df,stats,1).dropna()
df = df[(df['Events']>70) & (df['Events_1']>70)]

In [213]:
correlations = df.corr()
df['PA_x'].min()

87

In [218]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

target = 'HR'
data = [ x for x in df.select_dtypes(exclude='object').columns if x.endswith("_1")]
X_train, X_test, y_train, y_test = train_test_split(df[data], df[target])

In [229]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
for e in data:
    x = scaler.fit_transform(df[target].to_frame())
    y = scaler.fit_transform(df[e].to_frame())
    print("{}: {:.2f}".format(e, r2_score(x,y)))

PA_x_1: -0.16
Events_1: -0.30
EV_1: -0.03
maxEV_1: -0.02
LA_1: -0.43
Barrels_1: 0.24
Barrel%_1: 0.06
HardHit_1: 0.02
HardHit%_1: -0.03
cosLA_1: -1.54
sinLA_1: -0.42
EV_x_1: -0.47
EV_y_1: -0.36
Hangtime_1: -0.36
Distance_1: -0.30
G_1: -0.31
AB_1: -0.19
PA_y_1: -0.16
H_1: -0.20
1B_1: -0.51
2B_1: -0.15
3B_1: -0.86
HR_1: 0.21
R_1: -0.05
RBI_1: 0.08
BB_1: -0.16
IBB_1: -0.36
SO_1: -0.11
HBP_1: -0.69
SF_1: -0.50
SH_1: -1.51
GDP_1: -0.48
SB_1: -0.94
CS_1: -0.96
AVG_1: -0.67


# Estblishing a Baseline

Any decent information we get should improve on using a simple linear regression of a previous season's HR total.
So, we findout how that scores.


In [231]:
linear = LinearRegression().fit(X_train['HR_1'].to_frame(), y_train)

print(linear.score(X_train['HR_1'].to_frame(), y_train),
    linear.score(X_test['HR_1'].to_frame(), y_test)) ##rough result: .35 and .39 (!)

0.34847086602244337 0.3926728091292663


In [236]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha = 2).fit(X_train,y_train)
#cols = ['maxEV_1', 'Barrels_1', 'HardHit_1','HardHit%_1']
linear.fit(X_train,y_train)
print(linear.score(X_train, y_train),
    linear.score(X_test, y_test))
print(lasso.score(X_train, y_train),
    lasso.score(X_test,y_test))

0.45292133046899385 0.4310399850536407
0.42789230560645664 0.44701669099531927


In [238]:
print(lasso.intercept_)
list(zip(X_train.columns,lasso.coef_))
#data.columns

-20.929127501386198


[('PA_x_1', 0.0),
 ('Events_1', 0.00886084042773968),
 ('EV_1', 0.0),
 ('maxEV_1', 0.20274536670999443),
 ('LA_1', -0.0),
 ('Barrels_1', 0.14746640715732365),
 ('Barrel%_1', 0.0),
 ('HardHit_1', 0.020242721352404833),
 ('HardHit%_1', 0.12579228386404634),
 ('cosLA_1', 0.0),
 ('sinLA_1', -0.0),
 ('EV_x_1', 0.0),
 ('EV_y_1', -0.0),
 ('Hangtime_1', -0.0),
 ('Distance_1', 0.03132805402819768),
 ('G_1', -0.07750943445587502),
 ('AB_1', 0.004409999801235575),
 ('PA_y_1', 0.0),
 ('H_1', 0.0),
 ('1B_1', -0.03426368853924428),
 ('2B_1', 0.010831327430470753),
 ('3B_1', -0.0),
 ('HR_1', 0.0),
 ('R_1', 0.07299581507597651),
 ('RBI_1', 0.04756948783125929),
 ('BB_1', 0.0),
 ('IBB_1', 0.0),
 ('SO_1', -0.0),
 ('HBP_1', -0.0),
 ('SF_1', -0.0),
 ('SH_1', -0.0),
 ('GDP_1', 0.0),
 ('SB_1', 0.0),
 ('CS_1', -0.0),
 ('AVG_1', -0.0)]

## Hmmm...

The above cell show some of the usual linear regression perversion: R_1 and RBI_1 have non-zero coefs while HR_1 is 0!

Let's try a minimal set of features.

In [239]:
physical_data = [x+'_1' for x in statcast_cols].drop(['PA','Events'])
minimal = physical_data.append('HR_1')

Index(['PA', 'Events', 'EV', 'maxEV', 'LA', 'Barrels', 'Barrel%', 'HardHit',
       'HardHit%', 'cosLA', 'sinLA', 'EV_x', 'EV_y', 'Hangtime', 'Distance'],
      dtype='object')

In [143]:
print(lasso.intercept_)
lasso.coef_
list(zip(data.columns,lasso.coef_))

110.01273209549072


[('EV_1', -0.0),
 ('maxEV_1', 2.295988404194982),
 ('LA_1', -0.0),
 ('Barrels_1', -0.0),
 ('Barrel%_1', 0.2081895715076427),
 ('HardHit_1', -0.0),
 ('HardHit%_1', 0.3388837911819282)]

In [152]:
from sklearn.preprocessing import PolynomialFeatures

lasso.alpha = 10
poly = PolynomialFeatures()
non_lin = poly.fit_transform(X_train)
lasso.fit(non_lin,y_train)
lasso.score(poly.fit_transform(X_test),y_test)

0.6555520785003144

In [153]:
print(lasso.intercept_)
lasso.coef_
#list(zip(data.columns,lasso.coef_))

78.47589693759062


array([ 0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -1.05732519e-03, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  2.88450136e-03,
        0.00000000e+00,  4.81417570e-04,  0.00000000e+00,  1.60774063e-05,
        1.39440054e-03, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -3.51339694e-04,  0.00000000e+00, -2.12805133e-04, -0.00000000e+00,
       -2.73185500e-05, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  5.04156548e-05, -4.91719105e-04, -0.00000000e+00])