In [23]:
import numpy as np
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import math
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
# from bayes_opt import BayesianOptimization
# from xgboost import XGBRegressor
# import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
data = pd.read_csv('../datasets/baseball/FG_2010_2018_inning_6_load.csv')

## Explore the Data

In [6]:
data.head(2)

Unnamed: 0,Season,Name,Team,G,Age,PA,HR,R,RBI,SB,...,f_val_SB_rate,f_val_rate,ny_f_val_rate,HR_rate_std,R_rate_std,RBI_rate_std,SB_rate_std,AVG_std,f_val_std,ny_f_val_std
0,2017,Mookie Betts,Red Sox,153,24,712,24,101,102,26,...,1.83,2.43,0.74546,0.077212,1.205684,1.226771,1.542399,-0.087713,3.964353,12.516023
1,2017,Christian Yelich,Marlins,156,25,695,18,100,81,16,...,0.71,0.77,0.816703,-0.543295,1.037601,0.094554,0.596226,0.5346,1.719685,10.721773


In [7]:
data.describe()

Unnamed: 0,Season,G,Age,PA,HR,R,RBI,SB,BB%,K%,...,f_val_SB_rate,f_val_rate,ny_f_val_rate,HR_rate_std,R_rate_std,RBI_rate_std,SB_rate_std,AVG_std,f_val_std,ny_f_val_std
count,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,...,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0,1154.0
mean,2013.55026,141.068458,28.214038,579.262565,18.353553,72.337955,69.570191,10.006932,0.085828,0.183038,...,0.049159,0.195347,0.029685,0.106438,0.175737,0.127342,0.044009,0.127762,0.581287,0.260231
std,2.314325,16.16898,3.662962,86.555025,9.940838,18.773729,22.253659,11.004136,0.030436,0.056472,...,1.147934,1.342655,0.521885,1.015281,0.980659,1.006717,1.01402,0.971037,3.076915,3.240759
min,2010.0,93.0,19.0,400.0,0.0,26.0,17.0,0.0,0.021,0.058,...,-1.0,-2.62,-1.0,-2.080068,-2.711601,-2.717538,-0.96994,-3.869888,-7.862697,-7.862697
25%,2012.0,130.0,26.0,511.0,11.0,58.0,52.0,2.0,0.063,0.142,...,-0.77,-0.7275,-0.36688,-0.69543,-0.534081,-0.62722,-0.666457,-0.535968,-1.690974,-2.160351
50%,2014.0,145.0,28.0,587.5,17.0,71.5,68.0,6.0,0.083,0.179,...,-0.34,-0.015,-0.005117,0.062385,0.141844,0.120951,-0.299616,0.133874,0.417967,0.045613
75%,2016.0,155.0,31.0,652.0,25.0,85.0,85.0,14.0,0.105,0.219,...,0.5275,0.9575,0.366121,0.8225,0.81774,0.809049,0.457565,0.772735,2.528986,2.423202
max,2017.0,162.0,40.0,754.0,59.0,137.0,139.0,68.0,0.206,0.372,...,7.14,6.98,1.888168,3.66598,3.934791,3.489644,6.18938,3.507457,11.463261,12.516023


In [8]:
data.columns

Index(['Season', 'Name', 'Team', 'G', 'Age', 'PA', 'HR', 'R', 'RBI', 'SB',
       'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'EV',
       'BsR', 'Off', 'O-Swing%', 'Z-Swing%', 'O-Contact%', 'Z-Contact%',
       'GB/FB', 'LD%', 'GB%', 'HR/FB', 'playerid', 'f_val_HR', 'f_val_R',
       'f_val_RBI', 'f_val_SB', 'f_val_BA', 'f_val', 'ny_f_val', 'HR_rate',
       'R_rate', 'RBI_rate', 'SB_rate', 'f_val_HR_rate', 'f_val_R_rate',
       'f_val_RBI_rate', 'f_val_SB_rate', 'f_val_rate', 'ny_f_val_rate',
       'HR_rate_std', 'R_rate_std', 'RBI_rate_std', 'SB_rate_std', 'AVG_std',
       'f_val_std', 'ny_f_val_std'],
      dtype='object')

## Create data structures for use in models

In [9]:
# define columns that won't be used in predictive modeling
drop_cols = ['Name', 'Team', 'f_val_HR', 'f_val_R','f_val_RBI','f_val_SB','f_val_BA', 
             'f_val', 'ny_f_val', 'HR', 'R', 'RBI', 'SB', 'f_val_HR_rate', 'f_val_R_rate', 
             'f_val_RBI_rate', 'f_val_SB_rate', 'f_val_rate', 'ny_f_val_rate', 'HR_rate_std', 
             'R_rate_std', 'RBI_rate_std','SB_rate_std', 'AVG_std', 'f_val_std']

# create DataFrame for use in data visualization
data2 = data.drop(drop_cols, axis=1)

In [10]:
# drop column EV due to missing entries (shows in in-depth data exploration notebook)
data2 = data2.drop('EV', axis=1)

In [11]:
# dropping columns using methodology from in-depth data exploration notebook
drop_cols2 = ['G', 'Age', 'BB%', 'K%', 'BABIP', 'BsR', 'O-Swing%', 
           'Z-Swing%', 'O-Contact%', 'Z-Contact%', 'GB/FB', 'LD%', 
           'GB%', 'SB_rate', 'ISO', 'OBP', 'wOBA', 'wRC+', 'SLG', 'HR/FB']

data3 = data2.drop(drop_cols2, axis=1)

In [14]:
# test on 2 most recent years of data (index 300 is cut off)
# used for all models except Elastic Net
df_train = data3[300:]
df_test = data3[:300]

# use to retain more features for Elastic Net
df_train_enet = data2[300:]
df_test_enet = data2[:300]

In [15]:
# remove target from X
X_train = df_train.drop('ny_f_val_std', axis=1)
X_test = df_test.drop('ny_f_val_std', axis=1)

# use for Elastic Net
X_train_enet = df_train_enet.drop('ny_f_val_std', axis=1)
X_test_enet = df_test_enet.drop('ny_f_val_std', axis=1)

# create target y
y_train = df_train['ny_f_val_std']
y_test = df_test['ny_f_val_std']

In [16]:
# hold onto season column
train_season = X_train['Season']
test_season = X_test['Season']

#hold onto playerid column
train_ID = X_train['playerid']
test_ID = X_test['playerid']

#Now drop the  'playerid' and 'Season' columns since they are unnecessary for the prediction process
X_train.drop(["playerid", "Season"], axis=1, inplace = True)
X_test.drop(["playerid", "Season"], axis=1, inplace = True)
X_train_enet.drop(["playerid", "Season"], axis=1, inplace = True)
X_test_enet.drop(["playerid", "Season"], axis=1, inplace = True)

In [17]:
X_train.head(1)

Unnamed: 0,PA,AVG,Off,HR_rate,R_rate,RBI_rate
300,654,0.291,23.1,0.124138,0.634483,0.531034


In [18]:
y_train.head()

300    8.210223
301    7.567880
302    6.856675
303    6.657771
304    6.631371
Name: ny_f_val_std, dtype: float64

### Scale the data

In [24]:
# retain column names before changing data structure
X_train_cols = X_train.columns
X_train_enet_cols = X_train_enet.columns

# Create normalized inputs for LinearRegression, XGBoost, and LGBoost
norm_scaler = MinMaxScaler()
X_train_normal = norm_scaler.fit_transform(X_train)
X_test_normal = norm_scaler.transform(X_test)

# create standardized inputs for ENet
stand_scaler = StandardScaler()
X_train_standard = stand_scaler.fit_transform(X_train_enet)
X_test_standard = stand_scaler.transform(X_test_enet)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  del sys.path[0]
