# Making predictions with StatCast

Our goal is to make some prediction of subsequent season box score data with StatCast data.

## Data Cleaning and Prep first.

In [5]:
import pandas as pd 
import numpy as np


#### Start with adding some approximate physical quantities

We know that we're dealing with physical data, so we introduce a few tirgonometric functions to represent component velocities and some approximate kinematics to represent other features of baseballs in the air.

These are definitely "first approximations."

Definitions:
EV_x: the velocity of the ball in the direction of the fences.
EV_y: the velocity of the ball upward.
cosLA: cosine of the launch angle
sinLA: sine of the launch angle
Hangtime: the (approximate) time required for the ball to return to the level of the playing field. Calulate by basic kinematics on EV_y.
Distance: This is the approximate distance from homeplate when the ball reaches the level of the playing field. No drag calculations here. 

In [6]:
statcast = pd.read_csv('../data/hitters_statcast_since2016.csv').dropna()
statcast['Barrel%'] = statcast['Barrel%'].apply(lambda x: float(x.strip('%')))
statcast['HardHit%'] = statcast['HardHit%'].apply(lambda x: float(x.strip('%')))

##Introduce a number of physical quantities, very approximate.
statcast['cosLA'] = np.cos(statcast['LA']/180*np.pi)
statcast['sinLA'] = np.sin(statcast['LA']/180*np.pi)
statcast['EV_x'] = statcast['EV']*statcast['cosLA']
statcast['EV_y'] = statcast['EV']*statcast['sinLA']
acceleration = -32 * 60 * 60 / 5280
func = (lambda x: -2 * x / acceleration)
statcast['Hangtime'] = func(statcast['EV_y'])
statcast['Distance'] = statcast['Hangtime']*statcast['EV_x']

## We know that the optimal LA for homeruns is not 0 degrees.
## Below, we found that an average LA of 20 was very good for HR
## hitting. We score this by the cosine squared.

statcast['LA_optimality'] = np.cos((statcast['LA'] - 20)/180*np.pi)**2

##In particluar, that LA_optimality times av. exist velocity correlated well with 
##HR%. So, let's add that too



standard = pd.read_csv('../data/hitters_since_1947.csv')
standard = standard[standard['Season'] >= 2016]
standard.drop(['G','AB','AVG','R','RBI'], axis = 1, inplace=True)

statcast_cols = statcast.select_dtypes(exclude = 'object').drop(['playerid','Season'], axis = 1).columns 
standard_cols = standard.select_dtypes(exclude = 'object').drop(['playerid','Season'], axis = 1).columns

df = pd.merge(statcast, standard.select_dtypes(exclude='object'), on = ['playerid','Season'], how = 'left').dropna()

In [7]:
df.rename({'PA_y':'PA'}, axis=1, inplace = True)
for s in standard_cols:
    df[s+'%'] = df[s]/df['Events']
df.head()



Unnamed: 0,Season,Name,Team,PA_x,Events,EV,maxEV,LA,Barrels,Barrel%,...,HR%,BB%,IBB%,SO%,HBP%,SF%,SH%,GDP%,SB%,CS%
0,2018,Ryder Jones,Giants,8,3,109.1,112.1,18.3,2.0,66.7,...,0.666667,0.0,0.0,1.666667,0.0,0.0,0.0,0.0,0.0,0.0
1,2016,Eric Young Jr.,Yankees,1,1,107.9,107.9,12.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2017,Aaron Blair,Braves,2,1,106.8,106.8,-2.5,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017,Erik Kratz,Yankees,2,2,106.0,108.6,5.5,1.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019,Jesus Tinoco,Rockies,6,2,106.0,106.0,-17.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.5,0.0,0.0,0.0


In [8]:
df

Unnamed: 0,Season,Name,Team,PA_x,Events,EV,maxEV,LA,Barrels,Barrel%,...,HR%,BB%,IBB%,SO%,HBP%,SF%,SH%,GDP%,SB%,CS%
0,2018,Ryder Jones,Giants,8,3,109.1,112.1,18.3,2.0,66.7,...,0.666667,0.0,0.0,1.666667,0.0,0.0,0.0,0.0,0.0,0.0
1,2016,Eric Young Jr.,Yankees,1,1,107.9,107.9,12.5,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0
2,2017,Aaron Blair,Braves,2,1,106.8,106.8,-2.5,0.0,0.0,...,0.000000,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,2017,Erik Kratz,Yankees,2,2,106.0,108.6,5.5,1.0,50.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,2019,Jesus Tinoco,Rockies,6,2,106.0,106.0,-17.0,0.0,0.0,...,0.000000,0.0,0.0,2.000000,0.0,0.0,0.5,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,2016,Jumbo Diaz,Reds,1,1,51.0,51.0,-7.7,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4035,2018,Austin Voth,Nationals,2,1,45.3,45.3,-31.6,0.0,0.0,...,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0
4036,2016,Dean Kiekhefer,Cardinals,1,1,44.1,44.1,-68.9,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4037,2017,Drew Storen,Reds,1,1,36.7,36.7,20.7,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def multiseason_lines(df, stats, num_seasons):
    """
    Adds columns to rows in a DataFrame for previous seasons; useful for making past seasons features
    in ML applicaitons.

    Parameters
    ----------
    df: DataFrame
        A DataFrame that includes multiple seasons of player data in separate rows.
        The DataFrame must include a unique 'playerid' column (per Fangraphs) and 
        the seasons must be labeled 'Season' with int-type data.
    stats: list-like
        The stats to include from previous seasons.

    num_seasons: int
        The number of past seasons to include as past stats.

    Returns
    -------
        A DataFrame with the columns from df and additional columns from stats
        labeled with suffixes '_1', '_2'...'_x' for the past season's stats. Players
        without stats from the previous season are NaN valued.

    Example
    -------
        >>> df
        playerid | Season | Player | Batting Average
        001        2001     Ichiro   .350
        001        2002     Ichiro   .321
        >>> multiseason_lines(df, ['Batting_Average'], 1)
        playerid | Season | Player | Batting Average | Batting Average_1
        001        2002     Ichiro   .350              .321
    """
    out = df.copy()
    for n in range(num_seasons):
        df1 = df.copy()
        df1['Season'] = df1['Season'] + 1 + n ##we set df1 to match df, except that all season numbers are incremented.
        out = pd.merge(out, df1[stats], how = 'left', on=['playerid','Season'], #then we can align a previous season with a current one.
            suffixes=("","_"+str(1+n)))
    return out



In [10]:
stats = df.select_dtypes(exclude='object').columns
df = multiseason_lines(df,stats,1).dropna()
event_threshold = 100
df = df[(df['Events'] > event_threshold) & (df['Events_1']> event_threshold)]
df.columns

Index(['Season', 'Name', 'Team', 'PA_x', 'Events', 'EV', 'maxEV', 'LA',
       'Barrels', 'Barrel%', 'HardHit', 'HardHit%', 'playerid', 'cosLA',
       'sinLA', 'EV_x', 'EV_y', 'Hangtime', 'Distance', 'LA_optimality', 'PA',
       'H', '1B', '2B', '3B', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH',
       'GDP', 'SB', 'CS', 'PA%', 'H%', '1B%', '2B%', '3B%', 'HR%', 'BB%',
       'IBB%', 'SO%', 'HBP%', 'SF%', 'SH%', 'GDP%', 'SB%', 'CS%', 'PA_x_1',
       'Events_1', 'EV_1', 'maxEV_1', 'LA_1', 'Barrels_1', 'Barrel%_1',
       'HardHit_1', 'HardHit%_1', 'cosLA_1', 'sinLA_1', 'EV_x_1', 'EV_y_1',
       'Hangtime_1', 'Distance_1', 'LA_optimality_1', 'PA_1', 'H_1', '1B_1',
       '2B_1', '3B_1', 'HR_1', 'BB_1', 'IBB_1', 'SO_1', 'HBP_1', 'SF_1',
       'SH_1', 'GDP_1', 'SB_1', 'CS_1', 'PA%_1', 'H%_1', '1B%_1', '2B%_1',
       '3B%_1', 'HR%_1', 'BB%_1', 'IBB%_1', 'SO%_1', 'HBP%_1', 'SF%_1',
       'SH%_1', 'GDP%_1', 'SB%_1', 'CS%_1'],
      dtype='object')

In [11]:
correlations = df.corr()
df['PA_x'].min()

144

## Lots of data cleaning and prep above.

Our goal: let's predict subsequent season HR based on previous season stat cast data. We will also compare this to using previous season box score data. 

In [12]:
statcast_cols
statcast_cols_prev = [x + '_1' for x in statcast_cols]
statcast_cols_prev

['PA_1',
 'Events_1',
 'EV_1',
 'maxEV_1',
 'LA_1',
 'Barrels_1',
 'Barrel%_1',
 'HardHit_1',
 'HardHit%_1',
 'cosLA_1',
 'sinLA_1',
 'EV_x_1',
 'EV_y_1',
 'Hangtime_1',
 'Distance_1',
 'LA_optimality_1']

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

y = df['HR%']
X = df[[ x for x in df.select_dtypes(exclude='object').columns if x.endswith("_1")]]
X = X[statcast_cols_prev]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
X.columns

Index(['PA_1', 'Events_1', 'EV_1', 'maxEV_1', 'LA_1', 'Barrels_1', 'Barrel%_1',
       'HardHit_1', 'HardHit%_1', 'cosLA_1', 'sinLA_1', 'EV_x_1', 'EV_y_1',
       'Hangtime_1', 'Distance_1', 'LA_optimality_1'],
      dtype='object')

# Establishing a Baseline

Any decent information we get should improve on using a simple linear regression of a previous season's HR total.
So, we findout how that scores.

...

They scored roughly .37 and .40; the test was actually a little higher than the train, which is just a result of the split (random chance.)


In [15]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

## The most basic model, HR% = HR%_1
temp = df[[ x for x in df.select_dtypes(exclude='object').columns if x.endswith("_1")]]
print(r2_score(y, temp['HR%_1']))
print(mean_absolute_error(temp['HR%_1'], y))
print(mean_squared_error(temp['HR%_1'], y))


0.24026066677237912
0.017379435517525698
0.0005303982062377767


In [16]:
from scipy.stats import pearsonr

In [17]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression().fit(X_train, y_train)
linear.fit(X_train,y_train)
print(linear.score(X_train, y_train),
    linear.score(X_test, y_test))

0.5396376384510931 0.43953426011833696


### The basic linear regression is good

It improves performance over a simple linear regression on past season HR%.LinearRegression

Next we look at PCA regressions. 

### PCA

PCA (Principle Component Analysis) is a dimensionality reduction algorithm that produces produces axes capturing as much of the data variance is possible. Each dimesion is independent of the others, i.e. the second dimension captures as much of variance that the first dimension don't capture. Our data dimensionality is small, but the following will show:
- The statcast data can be efficiently reduced to about 3 dimensions without loss of accuracy.
- With PCA, polynomial features improve training (but not testing) performance, suggesting that there are not important interaction among the variables. (i.e., polynomial features allow us to overfit out training data.)

In [18]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
#pca = PCA(n_components=20)

pca_model = Pipeline(
    steps=[('scaler', StandardScaler()),
        #('poly_features', PolynomialFeatures(degree=2)),
        ('pca', PCA(n_components=3)),
        ('linear', LinearRegression())]
)

In [19]:
pca_model.fit(X_train, y_train)
print(pca_model.score(X_train, y_train))
pca_model.score(X_test, y_test)

0.4967397073384652


0.43246683486402193

In [20]:
from sklearn.linear_model import Ridge

poly_no_pca  = Pipeline(
    steps=[('scaler', StandardScaler()),
        ('poly_features', PolynomialFeatures(degree=2)),
        #('pca', PCA(n_components=100)),
        ('linear', Ridge(alpha=5))]
)
pca_ridge = Pipeline(
    steps=[('scaler', StandardScaler()),
        ('poly_features', PolynomialFeatures(degree=2)),
        ('pca', PCA(n_components=15)),
        ('linear', Ridge(alpha=.01))]
)

In [21]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {'linear__alpha': np.logspace(1,3,num=1000), 'pca__n_components':np.linspace(20,100,num=1000).astype(int)}
grid = RandomizedSearchCV(pca_ridge, param_grid, n_iter=100)

grid.fit(X_train, y_train)


RandomizedSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('poly_features',
                                              PolynomialFeatures()),
                                             ('pca', PCA(n_components=15)),
                                             ('linear', Ridge(alpha=0.01))]),
                   n_iter=100,
                   param_distributions={'linear__alpha': array([  10.        ,   10.04620421,   10.09262191,   10.13925408,
         10.1861017 ,   10.23316578,   10.28044732,   10.32794732,
         10.37566679,   10.42360674,   10.47176819,...
        90,  90,  90,  91,  91,  91,  91,  91,  91,  91,  91,  91,  91,
        91,  91,  91,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,
        92,  92,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,
        93,  93,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,
        94,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,
  

In [22]:
results = pd.DataFrame(grid.cv_results_)[['params','rank_test_score', 'mean_test_score']].sort_values('mean_test_score', ascending=False)
results['Components'] = results['params'].apply(lambda x: x['pca__n_components'])
results['L2'] = results['params'].apply(lambda x: x['linear__alpha'])

In [23]:
results.head(10)

Unnamed: 0,params,rank_test_score,mean_test_score,Components,L2
67,"{'pca__n_components': 27, 'linear__alpha': 73....",1,0.507715,27,73.598145
75,"{'pca__n_components': 30, 'linear__alpha': 112...",2,0.505809,30,112.993394
65,"{'pca__n_components': 45, 'linear__alpha': 102...",3,0.505159,45,102.096066
62,"{'pca__n_components': 70, 'linear__alpha': 125...",4,0.505105,70,125.63166
13,"{'pca__n_components': 66, 'linear__alpha': 112...",5,0.505087,66,112.473718
91,"{'pca__n_components': 53, 'linear__alpha': 106...",6,0.504984,53,106.912634
7,"{'pca__n_components': 85, 'linear__alpha': 147...",7,0.504918,85,147.628147
24,"{'pca__n_components': 89, 'linear__alpha': 95....",8,0.504873,89,95.715215
37,"{'pca__n_components': 72, 'linear__alpha': 154...",9,0.504815,72,154.592774
35,"{'pca__n_components': 67, 'linear__alpha': 157...",10,0.504767,67,157.469771


In [24]:
grid.param_distributions['linear__alpha'] = np.logspace(1,2.3,num=100)
grid.param_distributions['pca__n_components'] = [25,30,35]
grid.fit(X_train, y_train)

RandomizedSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('poly_features',
                                              PolynomialFeatures()),
                                             ('pca', PCA(n_components=15)),
                                             ('linear', Ridge(alpha=0.01))]),
                   n_iter=100,
                   param_distributions={'linear__alpha': array([ 10.        ,  10.30697715,  10.62337779,  10.94949121,
        11.28561557,  11.63205818,  11.98913578,  12.35717485,
        12.73651188,  13.12749369,  13.53047775...
        88.19714876,  90.90459967,  93.69516314,  96.57139053,
        99.53591153, 102.59143655, 105.7407592 , 108.98675886,
       112.3324033 , 115.78075137, 119.33495585, 122.99826628,
       126.77403197, 130.66570504, 134.67684358, 138.81111491,
       143.07229892, 147.46429154, 151.9911083 , 156.65688798,
       161.46589644, 166.42253047, 171.53132184, 176.796941

In [25]:
results = pd.DataFrame(grid.cv_results_)[['params','rank_test_score', 'mean_test_score']].sort_values('mean_test_score', ascending=False)
results['Components'] = results['params'].apply(lambda x: x['pca__n_components'])
results['L2'] = results['params'].apply(lambda x: x['linear__alpha'])

In [26]:
results.head()

Unnamed: 0,params,rank_test_score,mean_test_score,Components,L2
99,"{'pca__n_components': 25, 'linear__alpha': 52....",1,0.51048,25,52.749971
11,"{'pca__n_components': 25, 'linear__alpha': 48....",2,0.51047,25,48.175728
24,"{'pca__n_components': 25, 'linear__alpha': 45....",3,0.510439,25,45.348785
1,"{'pca__n_components': 25, 'linear__alpha': 65....",4,0.510433,25,65.184061
59,"{'pca__n_components': 25, 'linear__alpha': 73....",5,0.510337,25,73.564225


In [27]:
pca_ridge = grid.best_estimator_
pca_ridge.fit(X_train, y_train)
pca_ridge.score(X_train, y_train)
pca_ridge.score(X_test, y_test)

0.42169339247547544

In [28]:
from sklearn.cross_decomposition import PLSRegression

plsr = PLSRegression(n_components=2)

plsr = Pipeline(
    steps=[('scaler', StandardScaler()),
        ('poly_features', PolynomialFeatures(degree=2)),
        ('plsr', plsr)
    ]
        
)

In [29]:
from sklearn.model_selection import GridSearchCV
plsr_grid = GridSearchCV(plsr, param_grid={'plsr__n_components' :np.arange(2,20)})
plsr_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('poly_features', PolynomialFeatures()),
                                       ('plsr', PLSRegression())]),
             param_grid={'plsr__n_components': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])})

In [30]:
plsr = plsr_grid.best_estimator_

In [31]:
plsr.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('poly_features', PolynomialFeatures()),
                ('plsr', PLSRegression(n_components=10))])

In [32]:
plsr.score(X_train, y_train)
plsr.score(X_test, y_test)
#pearsonr(plsr.predict(X_test), y_test)


0.415183689739943

## Summary

There's no distinct advantage to using PCA regression or PLS to derive insights from StatCast data.

In [33]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(max_iter=1000,
        learning_rate_init=.0003,
        solver='lbfgs',
        activation='logistic',
        hidden_layer_sizes=80,
        random_state=1,
        alpha=.01
        )
mlp = Pipeline(steps=[('scaler', StandardScaler()),
                ('mlp', mlp)])

param_dist = {'mlp__learning_rate_init': [.0003],      
                #'mlp__activation': ['logistic'],
                #'mlp__solver': ['lbfgs', 'adam'],
                #'mlp__hidden_layer_sizes': [80, (80, 40)],
                'mlp__alpha': np.logspace(-5,1)}
mlp_search = RandomizedSearchCV(mlp, return_train_score=True, param_distributions=param_dist, n_iter=20)

In [34]:
mlp_search.fit(X_train, y_train)

RandomizedSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('mlp',
                                              MLPRegressor(activation='logistic',
                                                           alpha=0.01,
                                                           hidden_layer_sizes=80,
                                                           learning_rate_init=0.0003,
                                                           max_iter=1000,
                                                           random_state=1,
                                                           solver='lbfgs'))]),
                   n_iter=20,
                   param_distributions={'mlp__alpha': array([1.00000000e-05, 1.32571137e-05, 1.75751062e-05, 2.32995181e-05,
       3.08884360e-05, 4....
       2.68269580e-02, 3.55648031e-02, 4.71486636e-02, 6.25055193e-02,
       8.28642773e-02, 1.09854114e-01, 1.45634848e-01, 1.93069773e-0

In [35]:
results = pd.DataFrame(mlp_search.cv_results_).sort_values('mean_test_score', ascending=False)
cols = [col for col in results if col[:5]!='split']
results[cols]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__learning_rate_init,param_mlp__alpha,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
4,0.029276,0.000488,0.002021,2.2e-05,0.0003,0.00868511,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508591,0.087366,1,0.530703,0.022834
8,0.030315,0.002405,0.00237,0.000498,0.0003,0.0152642,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.50859,0.087535,2,0.53064,0.022841
14,0.028495,0.001021,0.002793,0.000754,0.0003,0.0202359,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508541,0.087739,3,0.530536,0.022845
13,0.029297,0.000489,0.002007,2.8e-05,0.0003,0.000517947,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508529,0.087313,4,0.5307,0.022826
18,0.032162,0.000385,0.002591,0.00049,0.0003,0.000390694,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508528,0.087314,5,0.530699,0.022826
2,0.029294,0.000786,0.002418,0.000493,0.0003,0.0002223,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508526,0.087315,6,0.530698,0.022826
9,0.029676,0.000398,0.002001,1e-05,0.0003,0.000167683,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508526,0.087315,7,0.530698,0.022826
7,0.028889,0.000649,0.002799,0.000739,0.0003,0.000126486,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508525,0.087315,8,0.530697,0.022826
0,0.03319,0.005644,0.002404,0.000482,0.0003,5.42868e-05,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508524,0.087316,9,0.530697,0.022826
12,0.028684,0.000402,0.002005,2.8e-05,0.0003,4.09492e-05,"{'mlp__learning_rate_init': 0.0003, 'mlp__alph...",0.508524,0.087316,10,0.530697,0.022826


In [36]:
print(mlp.named_steps)
mlp.fit(X_train, y_train)
print(mlp.score(X_train, y_train))
mlp.score(X_test, y_test)

{'scaler': StandardScaler(), 'mlp': MLPRegressor(activation='logistic', alpha=0.01, hidden_layer_sizes=80,
             learning_rate_init=0.0003, max_iter=1000, random_state=1,
             solver='lbfgs')}
0.5288422036139779


0.4238314216524087

### MLP does no better

At least simple MLP Regression doesn't improve on the linear regression model above.

In [37]:
from marcel import MarcelForecaster

In [43]:
pitcher_data = pd.read_csv('../data/pitchers_since_1947.csv')
pitcher_data = pitcher_data[pitcher_data['Season'] >= 2016]
marcel = MarcelForecaster(pitcher_data, standard, as_pandas=True)

In [56]:
X_train['PA_1'].min()

141.0

## Let's compare with Marcel

In [89]:
ids = standard[(standard['PA'] > 100) & (standard['Season'] == 2018)]['playerid'].unique()
ids in ids
marcel.hitters = marcel.hitters[marcel.hitters['playerid'].isin(ids)]
results = marcel.project_hitters(2019, apply_age=False).drop('playerid',axis=1).reset_index()

In [90]:
results.head()

Unnamed: 0,playerid,Season,Name,Team,PA,H,1B,2B,3B,HR,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,OBP
0,393,2019,Victor Martinez,Tigers,497.5,118.352205,84.269869,19.739272,0.383359,13.959705,37.888979,4.252455,69.423185,3.668681,4.069768,0.190575,16.087523,1.173136,0.443662,0.321427
1,639,2019,Adrian Beltre,Rangers,479.4,122.729532,78.97017,23.753715,1.191349,18.814299,38.771245,2.88028,77.141648,5.134518,5.466013,0.189581,9.95999,1.977005,0.645063,0.347591
2,785,2019,Todd Frazier,White Sox,493.6,94.970929,54.140773,17.833912,0.604028,22.392217,54.356929,1.419104,113.286493,7.515606,5.569606,0.36472,9.23629,7.717252,3.368932,0.317754
3,1159,2019,Andrew Romine,Tigers,300.3,64.370961,45.972688,11.64373,1.927432,4.827111,20.830144,0.521647,65.055638,3.523968,1.249433,3.130148,6.03145,5.496097,1.769875,0.295455
4,1177,2019,Albert Pujols,Angels,512.6,118.538265,80.677287,17.541915,0.352812,19.966251,33.683374,3.726265,73.62877,2.252311,3.558135,0.175389,16.313351,2.890606,0.40831,0.301354


In [99]:
h2019 = marcel.hitters[marcel.hitters['Season'] == 2019].sort_values('playerid')
results
results['HR%'] = results['HR']/results['PA']
x_pred= results[results['playerid'].isin(h2019['playerid'])]
h2019['HR%'] = h2019['HR']/h2019['PA']
r2_score(h2019['HR%'], x_pred['HR%'])

0.297559582377932

## The models were stronger than Marcel

I need to double check that. It looks too good to be true.