In [132]:
import pandas as pd
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from tensorflow import keras
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split)

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.decomposition import PCA

In [133]:
data = pd.read_csv("data/b_combined.csv", index_col = 0)
data.head()

Unnamed: 0,Name,Fpoints_G,year,xwoba,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,...,edge_percent,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,f_strike_percent,flyballs_percent,linedrives_percent,sprint_speed
0,"Ortiz, David",3.543046,2016,0.42,0.474,0.364,93.0,15.7,13.1,49.1,...,41.9,23.2,44.7,41.9,36.4,21.7,52.1,25.6,28.7,23.1
1,"Rodriguez, Alex",1.215385,2016,0.368,0.436,0.338,91.3,12.2,10.9,43.9,...,42.2,32.0,43.9,38.4,43.9,16.9,60.3,24.9,24.9,23.9
2,"Beltre, Adrian",3.137255,2016,0.36,0.38,0.333,89.5,12.6,5.5,40.4,...,42.6,16.8,48.1,35.3,41.4,23.3,61.9,18.4,31.6,26.4
3,"Beltran, Carlos",2.543046,2016,0.346,0.381,0.334,90.6,15.6,5.8,41.9,...,43.5,18.1,45.4,40.6,38.3,19.3,57.3,26.6,28.1,25.6
4,"Werth, Jayson",2.160839,2016,0.331,0.384,0.324,90.0,15.3,7.9,40.7,...,42.7,24.2,37.9,33.6,33.6,32.8,57.7,23.3,31.2,26.0


In [134]:
data['years_ago'] = 2023 - data['year']
decay_rate = 0.005
data['weight_decay_0.1'] = np.exp(-decay_rate * data['years_ago'])
data[['Name', 'year', 'Fpoints_G', 'weight_decay_0.1']].tail()

Unnamed: 0,Name,year,Fpoints_G,weight_decay_0.1
2621,"Carroll, Corbin",2023,2.993548,1.0
2622,"Henderson, Gunnar",2023,2.42,1.0
2623,"Vaughn, Andrew",2023,1.953947,1.0
2624,"Pasquantino, Vinnie",2023,2.377049,1.0
2625,"Massey, Michael",2023,1.48062,1.0


In [135]:
data_sorted = data.sort_values(['Name', 'year'])
data_sorted['Fpoints_G_2'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=2, min_periods = 1).mean())
data_sorted['Fpoints_G_3'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=3, min_periods = 1).mean())
data_sorted['Fpoints_G_1'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=1, min_periods = 1).mean())
data_sorted['Fpoints_G_4'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=4, min_periods = 1).mean())
data_sorted['Fpoints_G_5'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=5, min_periods = 1).mean())
data_sorted['Fpoints_G_6'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=6, min_periods = 1).mean())
league_trends = data.groupby('year')['Fpoints_G'].mean().rename('league_avg_Fpoints')
full_data = data_sorted.merge(league_trends, on='year')
# feature_columns = [col for col in full_data.columns if col not in ['Name', 'year', 'Unnamed: 0', 'Fpoints_G', 'weight_decay_0.1', 'years_ago']]
# for column in feature_columns:
#     full_data[f'weighted_{column}'] = full_data[column] * full_data['weight_decay_0.1']
full_data = full_data.drop(columns = ['years_ago', 'weight_decay_0.1'])

full_data.head()


Unnamed: 0,Name,Fpoints_G,year,xwoba,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,...,flyballs_percent,linedrives_percent,sprint_speed,Fpoints_G_2,Fpoints_G_3,Fpoints_G_1,Fpoints_G_4,Fpoints_G_5,Fpoints_G_6,league_avg_Fpoints
0,"Abrams, CJ",2.251656,2023,0.266,0.297,0.292,86.5,6.8,2.1,30.7,...,21.0,18.5,29.0,2.251656,2.251656,2.251656,2.251656,2.251656,2.251656,1.769912
1,"Abreu, José",1.921986,2023,0.373,0.41,0.361,92.2,8.0,9.5,51.8,...,24.0,25.1,26.1,2.171184,2.311491,1.921986,2.608618,2.606392,2.578243,1.769912
2,"Acuña Jr., Ronald",4.446541,2023,0.365,0.444,0.371,91.2,10.8,12.8,49.7,...,24.7,21.2,28.5,3.357724,3.384824,4.446541,3.375575,3.311998,3.311998,1.769912
3,"Adames, Willy",1.939597,2023,0.323,0.414,0.335,88.9,18.9,13.0,43.6,...,33.4,22.7,27.8,2.156849,2.14028,1.939597,2.017247,1.929587,1.929587,1.769912
4,"Adams, Riley",1.431818,2023,0.274,0.349,0.292,90.2,13.5,9.4,45.8,...,28.1,18.8,26.7,1.070076,1.070076,1.431818,1.070076,1.070076,1.070076,1.769912


In [136]:
full_data.isna().sum()

Name                     0
Fpoints_G                0
year                     0
xwoba                    0
xwobacon                 0
xbacon                   0
exit_velocity_avg        0
launch_angle_avg         0
barrel_batted_rate       0
hard_hit_percent         0
avg_best_speed           0
avg_hyper_speed          0
z_swing_percent          0
z_swing_miss_percent     0
oz_swing_percent         0
oz_swing_miss_percent    0
iz_contact_percent       0
edge_percent             0
whiff_percent            0
swing_percent            0
pull_percent             0
straightaway_percent     0
opposite_percent         0
f_strike_percent         0
flyballs_percent         0
linedrives_percent       0
sprint_speed             0
Fpoints_G_2              0
Fpoints_G_3              0
Fpoints_G_1              0
Fpoints_G_4              0
Fpoints_G_5              0
Fpoints_G_6              0
league_avg_Fpoints       0
dtype: int64

In [137]:
X = full_data.drop(columns = ['Name', 'year', 'Fpoints_G'])
y = full_data['Fpoints_G']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size = 0.7, random_state = 123)
num_feats = X.columns.tolist()
ct = make_column_transformer(
    (StandardScaler(), num_feats))
ct

transformed_x_train = ct.fit_transform(X_train)
transformed_x_val = ct.transform(X_val)
transformed_x_test = ct.transform(X_test)



In [138]:
colnames = num_feats
X_train_transformed = pd.DataFrame(transformed_x_train, columns = colnames)
X_val_transformed = pd.DataFrame(transformed_x_val, columns = colnames)
X_test_transformed = pd.DataFrame(transformed_x_test, columns = colnames)
n_feats = len(colnames)
n_feats

31

In [139]:
# num_pca = 15
# pca = PCA(n_components = num_pca)
# X_train_transformed1 = pca.fit_transform(X_train_transformed)
# X_val_transformed1 = pca.transform(X_val_transformed)
# X_test_transformed1 = pca.transform(X_test_transformed)

In [141]:
def build_model(hp):
  model = Sequential()
  counter = 0
  for i in range(hp.Int('num_layers',min_value=1,max_value=8)):
    if counter == 0:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=100,step=4),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid']),input_dim=num_pca))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0,0.1,0.2,0.3,0.4,0.5,0.6])))
    else:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=100,step=2),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid'])))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0,0.1,0.2,0.3,0.4,0.5,0.6])))
    counter+=1
  model.add(Dense(1,activation='linear'))
  model.compile(optimizer=hp.Choice('optimizer',values=['adagrad','adam','sgd']),
                  loss='mse',
                  metrics=[MeanSquaredError()])
  return model

tuner = kt.Hyperband(build_model, objective = 'val_mean_squared_error', max_epochs = 40, overwrite = True)
tuner.search(X_train_transformed1, y_train, epochs = 40, validation_data = (X_val_transformed1, y_val))
print(tuner.get_best_hyperparameters()[0].values)
model = tuner.get_best_models()[0]
es = EarlyStopping(monitor = 'val_loss', patience = 10)
model.fit(X_train_transformed1, y_train, epochs = 40, verbose = 2, validation_split = 0.3, callbacks = [es])
model.save("models/lotsfeat_pca_nn")

Trial 90 Complete [00h 00m 04s]
val_mean_squared_error: 0.4600257873535156

Best val_mean_squared_error So Far: 0.07617422938346863
Total elapsed time: 00h 02m 09s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


{'num_layers': 4, 'units0': 76, 'activation0': 'sigmoid', 'dropout0': 0.4, 'optimizer': 'adam', 'units1': 46, 'activation1': 'sigmoid', 'dropout1': 0.3, 'units2': 40, 'activation2': 'tanh', 'dropout2': 0.2, 'units3': 64, 'activation3': 'tanh', 'dropout3': 0.0, 'units4': 68, 'activation4': 'sigmoid', 'dropout4': 0.0, 'units5': 60, 'activation5': 'relu', 'dropout5': 0.5, 'units6': 86, 'activation6': 'tanh', 'dropout6': 0.4, 'units7': 82, 'activation7': 'sigmoid', 'dropout7': 0.4, 'tuner/epochs': 40, 'tuner/initial_epoch': 14, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}
Epoch 1/40
29/29 - 1s - loss: 0.1028 - mean_squared_error: 0.1028 - val_loss: 0.0610 - val_mean_squared_error: 0.0610 - 616ms/epoch - 21ms/step
Epoch 2/40
29/29 - 0s - loss: 0.0915 - mean_squared_error: 0.0915 - val_loss: 0.0606 - val_mean_squared_error: 0.0606 - 52ms/epoch - 2ms/step
Epoch 3/40
29/29 - 0s - loss: 0.0946 - mean_squared_error: 0.0946 - val_loss: 0.0712 - val_mean_squared_error: 0.0712 - 

2024-03-14 18:45:25.957866: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,76]
	 [[{{node inputs}}]]
2024-03-14 18:45:25.964915: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,46]
	 [[{{node inputs}}]]
2024-03-14 18:45:25.971893: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,40]
	 [[{{node inputs}}]]
2024-03-14 18:45:26

INFO:tensorflow:Assets written to: models/lotsfeat_pca_nn/assets


INFO:tensorflow:Assets written to: models/lotsfeat_pca_nn/assets


In [53]:
# Including all features + decay + engineered features (0.1 decay)
model = keras.models.load_model("models/decay_featureeng_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.27835274316007974

In [62]:
# No engineered features but all decay + features (0.1)
model = keras.models.load_model("models/decay_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.5810911541239634

In [71]:
# Original data.. no engineered features or decay. 
model = keras.models.load_model("models/og_data_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.5822079175088242

In [80]:
# Engineered features but no decay
model = keras.models.load_model("models/eng_feat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.27316737917681927

In [89]:
# Two rolling averages + 0.5 decay vs 0.1
model = keras.models.load_model("models/0.5decay_newfeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.5678935030433923

In [107]:
# Two rolling averages + 0.005 decay 
model = keras.models.load_model("models/0.005decay_newfeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.26121842330552364

In [114]:
# rolling averages up to 6 years, no decay
model = keras.models.load_model("models/lotsfeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.24974797036071666

In [142]:
# same rolling averages w/ PCA reducing to 15 dimensions
model = keras.models.load_model("models/lotsfeat_pca_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.2555150658183186

For this section we decide on the model with 6 rolling averages, no decay or pca. 