In [40]:
from pykalman import KalmanFilter
import pandas as pd
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from tensorflow import keras
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split)

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.decomposition import PCA

In [41]:
data = pd.read_csv("data/b_combined.csv", index_col = 0)

In [42]:
data_sorted = data.sort_values(['Name', 'year'])
data_sorted['Fpoints_G_2'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=2, min_periods = 1).mean())
data_sorted['Fpoints_G_3'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=3, min_periods = 1).mean())
data_sorted['Fpoints_G_1'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=1, min_periods = 1).mean())
data_sorted['Fpoints_G_4'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=4, min_periods = 1).mean())
data_sorted['Fpoints_G_5'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=5, min_periods = 1).mean())
data_sorted['Fpoints_G_6'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=6, min_periods = 1).mean())
league_trends = data.groupby('year')['Fpoints_G'].mean().rename('league_avg_Fpoints')
full_data = data_sorted.merge(league_trends, on='year')

full_data.head()

Unnamed: 0,Name,Fpoints_G,year,xwoba,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,...,flyballs_percent,linedrives_percent,sprint_speed,Fpoints_G_2,Fpoints_G_3,Fpoints_G_1,Fpoints_G_4,Fpoints_G_5,Fpoints_G_6,league_avg_Fpoints
0,"Abrams, CJ",2.251656,2023,0.266,0.297,0.292,86.5,6.8,2.1,30.7,...,21.0,18.5,29.0,2.251656,2.251656,2.251656,2.251656,2.251656,2.251656,1.769912
1,"Abreu, José",1.921986,2023,0.373,0.41,0.361,92.2,8.0,9.5,51.8,...,24.0,25.1,26.1,2.171184,2.311491,1.921986,2.608618,2.606392,2.578243,1.769912
2,"Acuña Jr., Ronald",4.446541,2023,0.365,0.444,0.371,91.2,10.8,12.8,49.7,...,24.7,21.2,28.5,3.357724,3.384824,4.446541,3.375575,3.311998,3.311998,1.769912
3,"Adames, Willy",1.939597,2023,0.323,0.414,0.335,88.9,18.9,13.0,43.6,...,33.4,22.7,27.8,2.156849,2.14028,1.939597,2.017247,1.929587,1.929587,1.769912
4,"Adams, Riley",1.431818,2023,0.274,0.349,0.292,90.2,13.5,9.4,45.8,...,28.1,18.8,26.7,1.070076,1.070076,1.431818,1.070076,1.070076,1.070076,1.769912


In [50]:
target = 'Fpoints_G'
features = [col for col in data.columns if col not in ['Name', 'year', target]]

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)
train_data, val_data = train_test_split(train_data, test_size = 0.3, random_state = 123)

# Scale features within each set separately
scaler = StandardScaler()
train_data[features] = scaler.fit_transform(train_data[features])
val_data[features] = scaler.transform(val_data[features])
test_data[features] = scaler.transform(test_data[features])

# Define a function to create sequences
def create_sequences(data, max_steps):
    X, y = [], []
    for player, player_data in data.groupby('Name'):
        for i in range(len(player_data)):
            end_ix = i + max_steps
            if end_ix > len(player_data):
                break
            seq_x, seq_y = player_data.iloc[i:end_ix][features].values, player_data.iloc[end_ix-1][target]
            X.append(seq_x)
            y.append(seq_y)
    return np.array(X), np.array(y)

# Determine the maximum number of available years for any player
max_steps = max(data.groupby('Name').size())
X_train, y_train = create_sequences(train_data, n_steps)
X_val, y_val = create_sequences(val_data, n_steps)
X_test, y_test = create_sequences(test_data, n_steps)


X_train.shape[2]

24

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import MeanSquaredError
import keras_tuner as kt

def build_model(hp):
    model = Sequential()
    model.add(LSTM(hp.Int('units0', min_value=8, max_value=100, step=8),
                   activation=hp.Choice('activation0', values=['relu', 'tanh', 'sigmoid']),
                   input_shape=(X_train.shape[1], X_train.shape[2]),
                   return_sequences=True if hp.Int('num_layers', min_value=1, max_value=8) > 1 else False))
    model.add(Dropout(hp.Choice('dropout0', values=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6])))

    for i in range(1, hp.Int('num_layers', min_value=1, max_value=8)):
        model.add(LSTM(hp.Int('units' + str(i), min_value=8, max_value=100, step=8),
                       activation=hp.Choice('activation' + str(i), values=['relu', 'tanh', 'sigmoid']),
                       return_sequences=True if i < hp.Int('num_layers', min_value=1, max_value=8) - 1 else False))
        model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6])))

    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=hp.Choice('optimizer', values=['adagrad', 'adam', 'sgd']),
                  loss='mse',
                  metrics=[MeanSquaredError()])
    return model

# Assuming X_train_transformed1, y_train, X_val_transformed1, y_val are properly defined and preprocessed
tuner = kt.Hyperband(build_model,
                     objective='val_mean_squared_error',
                     max_epochs=40,
                     overwrite=True)

tuner.search(X_train, y_train, epochs=40, validation_data=(X_val, y_val))
print(tuner.get_best_hyperparameters()[0].values)
model = tuner.get_best_models()[0]

es = EarlyStopping(monitor='val_loss', patience=10)
model.fit(X_train, y_train, epochs=40, verbose=2, validation_split=0.3, callbacks=[es])
model.save("models/feature_eng_lstm")

Trial 90 Complete [00h 00m 05s]
val_mean_squared_error: 1.2599488496780396

Best val_mean_squared_error So Far: 0.20825420320034027
Total elapsed time: 00h 07m 22s
INFO:tensorflow:Oracle triggered exit
{'units0': 88, 'activation0': 'relu', 'num_layers': 1, 'dropout0': 0.2, 'optimizer': 'sgd', 'units1': 56, 'activation1': 'sigmoid', 'dropout1': 0.4, 'units2': 16, 'activation2': 'relu', 'dropout2': 0.0, 'units3': 24, 'activation3': 'tanh', 'dropout3': 0.3, 'units4': 72, 'activation4': 'tanh', 'dropout4': 0.1, 'units5': 32, 'activation5': 'tanh', 'dropout5': 0.0, 'units6': 88, 'activation6': 'sigmoid', 'dropout6': 0.4, 'units7': 96, 'activation7': 'tanh', 'dropout7': 0.6, 'tuner/epochs': 40, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
Epoch 1/40
10/10 - 1s - loss: 0.3218 - mean_squared_error: 0.3218 - val_loss: 0.2848 - val_mean_squared_error: 0.2848 - 782ms/epoch - 78ms/step
Epoch 2/40
10/10 - 0s - loss: 0.3289 - mean_squared_error: 0.3289 - val_loss: 0.2875 - val_mea

2024-03-14 19:35:52.794366: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,24]
	 [[{{node inputs}}]]
2024-03-14 19:35:52.799274: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,24]
	 [[{{node inputs}}]]
2024-03-14 19:35:52.853953: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'lstm_input' with dtype float and shape [?,3,24]
	 [[{{node lstm_input}}]]
2024-

INFO:tensorflow:Assets written to: models/feature_eng_lstm/assets


INFO:tensorflow:Assets written to: models/feature_eng_lstm/assets


In [53]:
# First attempt w/ lstm model
model = keras.models.load_model("models/feature_eng_lstm")
preds = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, preds))



0.5752059113380945

Worse than the basic RNN, will move back to that. 