In [6]:
import pandas as pd
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from tensorflow import keras
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split)

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.decomposition import PCA
import unicodedata
from fuzzywuzzy import process

In [7]:
data = pd.read_csv("data/b_combined.csv", index_col = 0)
data.head()

Unnamed: 0,Name,Fpoints_G,year,xwoba,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,...,edge_percent,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,f_strike_percent,flyballs_percent,linedrives_percent,sprint_speed
0,"Ortiz, David",3.543046,2016,0.42,0.474,0.364,93.0,15.7,13.1,49.1,...,41.9,23.2,44.7,41.9,36.4,21.7,52.1,25.6,28.7,23.1
1,"Rodriguez, Alex",1.215385,2016,0.368,0.436,0.338,91.3,12.2,10.9,43.9,...,42.2,32.0,43.9,38.4,43.9,16.9,60.3,24.9,24.9,23.9
2,"Beltre, Adrian",3.137255,2016,0.36,0.38,0.333,89.5,12.6,5.5,40.4,...,42.6,16.8,48.1,35.3,41.4,23.3,61.9,18.4,31.6,26.4
3,"Beltran, Carlos",2.543046,2016,0.346,0.381,0.334,90.6,15.6,5.8,41.9,...,43.5,18.1,45.4,40.6,38.3,19.3,57.3,26.6,28.1,25.6
4,"Werth, Jayson",2.160839,2016,0.331,0.384,0.324,90.0,15.3,7.9,40.7,...,42.7,24.2,37.9,33.6,33.6,32.8,57.7,23.3,31.2,26.0


In [8]:
data['years_ago'] = 2023 - data['year']
decay_rate = 0.005
data['weight_decay_0.1'] = np.exp(-decay_rate * data['years_ago'])
mean_fpts = data[data['year']==2023]['Fpoints_G'].mean()
print(mean_fpts)
data[['Name', 'year', 'Fpoints_G', 'weight_decay_0.1']].tail()


1.7699118887617884


Unnamed: 0,Name,year,Fpoints_G,weight_decay_0.1
2621,"Carroll, Corbin",2023,2.993548,1.0
2622,"Henderson, Gunnar",2023,2.42,1.0
2623,"Vaughn, Andrew",2023,1.953947,1.0
2624,"Pasquantino, Vinnie",2023,2.377049,1.0
2625,"Massey, Michael",2023,1.48062,1.0


In [9]:
data_sorted = data.sort_values(['Name', 'year'])
data_sorted['Fpoints_G_2'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=2, min_periods = 1).mean())
data_sorted['Fpoints_G_1'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=1, min_periods = 1).mean())
# data_sorted['Fpoints_G_6'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=6, min_periods = 1).mean())
league_trends = data.groupby('year')['Fpoints_G'].mean().rename('league_avg_Fpoints')
full_data = data_sorted.merge(league_trends, on='year')
latest_rolling_averages = full_data.drop_duplicates(subset=['Name'], keep='last')[['Name', 'Fpoints_G_2', 'Fpoints_G_1','league_avg_Fpoints']]
# feature_columns = [col for col in full_data.columns if col not in ['Name', 'year', 'Unnamed: 0', 'Fpoints_G', 'weight_decay_0.1', 'years_ago']]
# for column in feature_columns:
#     full_data[f'weighted_{column}'] = full_data[column] * full_data['weight_decay_0.1']
full_data = full_data.drop(columns = ['years_ago', 'weight_decay_0.1'])

full_data.head()


Unnamed: 0,Name,Fpoints_G,year,xwoba,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,...,pull_percent,straightaway_percent,opposite_percent,f_strike_percent,flyballs_percent,linedrives_percent,sprint_speed,Fpoints_G_2,Fpoints_G_1,league_avg_Fpoints
0,"Abrams, CJ",2.251656,2023,0.266,0.297,0.292,86.5,6.8,2.1,30.7,...,32.8,39.9,27.3,65.9,21.0,18.5,29.0,2.251656,2.251656,1.769912
1,"Abreu, José",1.921986,2023,0.373,0.41,0.361,92.2,8.0,9.5,51.8,...,34.5,39.2,26.3,57.7,24.0,25.1,26.1,2.171184,1.921986,1.769912
2,"Acuña Jr., Ronald",4.446541,2023,0.365,0.444,0.371,91.2,10.8,12.8,49.7,...,40.4,39.5,20.1,60.8,24.7,21.2,28.5,3.357724,4.446541,1.769912
3,"Adames, Willy",1.939597,2023,0.323,0.414,0.335,88.9,18.9,13.0,43.6,...,37.7,36.7,25.7,59.8,33.4,22.7,27.8,2.156849,1.939597,1.769912
4,"Adams, Riley",1.431818,2023,0.274,0.349,0.292,90.2,13.5,9.4,45.8,...,45.8,28.1,26.0,65.2,28.1,18.8,26.7,1.070076,1.431818,1.769912


In [10]:
full_data.isna().sum()

Name                     0
Fpoints_G                0
year                     0
xwoba                    0
xwobacon                 0
xbacon                   0
exit_velocity_avg        0
launch_angle_avg         0
barrel_batted_rate       0
hard_hit_percent         0
avg_best_speed           0
avg_hyper_speed          0
z_swing_percent          0
z_swing_miss_percent     0
oz_swing_percent         0
oz_swing_miss_percent    0
iz_contact_percent       0
edge_percent             0
whiff_percent            0
swing_percent            0
pull_percent             0
straightaway_percent     0
opposite_percent         0
f_strike_percent         0
flyballs_percent         0
linedrives_percent       0
sprint_speed             0
Fpoints_G_2              0
Fpoints_G_1              0
league_avg_Fpoints       0
dtype: int64

In [11]:
X = full_data.drop(columns = ['Name', 'year', 'Fpoints_G'])
y = full_data['Fpoints_G']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size = 0.7, random_state = 123)
num_feats = X.columns.tolist()
ct = make_column_transformer(
    (StandardScaler(), num_feats))
ct

transformed_x_train = ct.fit_transform(X_train)
transformed_x_val = ct.transform(X_val)
transformed_x_test = ct.transform(X_test)



In [12]:
colnames = num_feats
X_train_transformed1 = pd.DataFrame(transformed_x_train, columns = colnames)
X_val_transformed1 = pd.DataFrame(transformed_x_val, columns = colnames)
X_test_transformed1 = pd.DataFrame(transformed_x_test, columns = colnames)
n_feats = len(colnames)
n_feats

27

In [77]:
# num_pca = 15
# pca = PCA(n_components = num_pca)
# X_train_transformed1 = pca.fit_transform(X_train_transformed)
# X_val_transformed1 = pca.transform(X_val_transformed)
# X_test_transformed1 = pca.transform(X_test_transformed)

In [78]:
def build_model(hp):
  model = Sequential()
  counter = 0
  for i in range(hp.Int('num_layers',min_value=1,max_value=8)):
    if counter == 0:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=100,step=4),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid']),input_dim=n_feats))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0,0.1,0.2,0.3,0.4,0.5,0.6])))
    else:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=100,step=2),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid'])))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0,0.1,0.2,0.3,0.4,0.5,0.6])))
    counter+=1
  model.add(Dense(1,activation='linear'))
  model.compile(optimizer=hp.Choice('optimizer',values=['adagrad','adam','sgd']),
                  loss='mse',
                  metrics=[MeanSquaredError()])
  return model

tuner = kt.Hyperband(build_model, objective = 'val_mean_squared_error', max_epochs = 40, overwrite = True)
tuner.search(X_train_transformed1, y_train, epochs = 40, validation_data = (X_val_transformed1, y_val))
print(tuner.get_best_hyperparameters()[0].values)
model = tuner.get_best_models()[0]
es = EarlyStopping(monitor = 'val_loss', patience = 10)
model.fit(X_train_transformed1, y_train, epochs = 40, verbose = 2, validation_split = 0.3, callbacks = [es])
model.save("models/1_2_rolling_nn")

Trial 90 Complete [00h 00m 03s]
val_mean_squared_error: 0.048792801797389984

Best val_mean_squared_error So Far: 0.008269142359495163
Total elapsed time: 00h 01m 58s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


{'num_layers': 5, 'units0': 44, 'activation0': 'relu', 'dropout0': 0.0, 'optimizer': 'adam', 'units1': 52, 'activation1': 'sigmoid', 'dropout1': 0.0, 'units2': 96, 'activation2': 'relu', 'dropout2': 0.6, 'units3': 42, 'activation3': 'tanh', 'dropout3': 0.3, 'units4': 90, 'activation4': 'tanh', 'dropout4': 0.2, 'units5': 62, 'activation5': 'sigmoid', 'dropout5': 0.0, 'units6': 72, 'activation6': 'relu', 'dropout6': 0.1, 'units7': 74, 'activation7': 'sigmoid', 'dropout7': 0.4, 'tuner/epochs': 40, 'tuner/initial_epoch': 14, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}
Epoch 1/40
29/29 - 1s - loss: 0.0588 - mean_squared_error: 0.0588 - val_loss: 0.0070 - val_mean_squared_error: 0.0070 - 725ms/epoch - 25ms/step
Epoch 2/40
29/29 - 0s - loss: 0.0524 - mean_squared_error: 0.0524 - val_loss: 0.0075 - val_mean_squared_error: 0.0075 - 53ms/epoch - 2ms/step
Epoch 3/40
29/29 - 0s - loss: 0.0542 - mean_squared_error: 0.0542 - val_loss: 0.0047 - val_mean_squared_error: 0.0047 - 52m

2024-03-14 21:00:13.481943: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,96]
	 [[{{node inputs}}]]
2024-03-14 21:00:13.489176: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,42]
	 [[{{node inputs}}]]
2024-03-14 21:00:13.496889: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,90]
	 [[{{node inputs}}]]
2024-03-14 21:00:13

INFO:tensorflow:Assets written to: models/1_2_rolling_nn/assets


INFO:tensorflow:Assets written to: models/1_2_rolling_nn/assets


In [53]:
# Including all features + decay + engineered features (0.1 decay)
model = keras.models.load_model("models/decay_featureeng_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.27835274316007974

In [62]:
# No engineered features but all decay + features (0.1)
model = keras.models.load_model("models/decay_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.5810911541239634

In [71]:
# Original data.. no engineered features or decay. 
model = keras.models.load_model("models/og_data_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.5822079175088242

In [80]:
# Engineered features but no decay
model = keras.models.load_model("models/eng_feat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.27316737917681927

In [89]:
# Two rolling averages + 0.5 decay vs 0.1
model = keras.models.load_model("models/0.5decay_newfeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.5678935030433923

In [107]:
# Two rolling averages + 0.005 decay 
model = keras.models.load_model("models/0.005decay_newfeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.26121842330552364

In [37]:
# rolling averages up to 6 years, no decay
model = keras.models.load_model("models/lotsfeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.07451840644916961

In [142]:
# same rolling averages w/ PCA reducing to 15 dimensions
model = keras.models.load_model("models/lotsfeat_pca_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.2555150658183186

In [59]:
# rolling averages for 2,3,6 years, no decay
model = keras.models.load_model("models/somefeat_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.2683192941272255

In [79]:
# 1 and 2 year rolling avg
model = keras.models.load_model("models/1_2_rolling_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.08465170188990537

For this section we decide on the model with 6 rolling averages, no decay or pca. 

In [25]:

met2023 = pd.read_csv("data/b2023metrics.csv")
met2023.head()
met2023['year'] = met2023['year'] + 1
met2023= met2023.rename(columns = {'last_name, first_name':'Name'})
met2023new = met2023.merge(latest_rolling_averages, on = 'Name', how = 'left')
met2023new = met2023new.fillna(mean_fpts)
model = keras.models.load_model("models/1_2_rolling_nn")
met2023x = met2023new.drop(columns = ['player_id', 'year', 'Name'])
met2023t = ct.transform(met2023x)
preds = model.predict(met2023t)
met2023['Pred_fpoints_G'] = preds
met2023['rank'] = met2023['Pred_fpoints_G'].rank(ascending = False)
met2023sort = met2023.sort_values(by = 'rank', ascending = True)
met2023sort[['Name', 'Pred_fpoints_G', 'rank']].head(50)
met2023.shape



(461, 29)

This favors older players declining more than it should be... will remove some of the rolling averages. 

In [26]:
games = pd.read_csv("data/bgamesplayed.csv")
games.head()

Unnamed: 0,#,Name,Team,Bats,ESPN,YAHOO,G,PA,AB,R,...,HBP,SF,SH,CS,AVG,OBP,SLG,OPS,BABIP,RazzID
0,1,Mookie Betts,LAD,R,2B/OF,2B/SS/ OF,150,673,571,115.4,...,9.0,4.2,6.4,3.8,0.281,0.371,0.529,0.9,0.294,13611
1,2,Marcus Semien,TEX,R,2B,2B,150,663,590,100.3,...,4.8,4.2,6.2,4.4,0.267,0.333,0.465,0.798,0.284,12533
2,3,Kyle Schwarber,PHI,L,OF,OF,150,654,539,103.9,...,6.5,4.2,5.7,1.9,0.224,0.349,0.482,0.83,0.259,16478
3,4,Gunnar Henderson,BAL,L,SS/3B,SS/3B,152,654,574,96.2,...,5.2,4.9,2.0,4.3,0.261,0.339,0.484,0.823,0.31,683002
4,5,Luis Arraez,MIA,L,2B,1B/2B,151,653,588,88.9,...,4.6,4.2,5.5,2.3,0.316,0.368,0.422,0.79,0.329,650333


In [29]:
def normalize_name(name):
    # Remove accents and convert to lower case
    name = unicodedata.normalize('NFD', name).encode('ascii', 'ignore').decode('utf-8').lower()
    # Remove common titles, suffixes, or other variations as needed
    name = name.replace('jr.', '').replace('sr.', '').strip()
    return name

# Assuming df1 and df2 are your DataFrames and 'name' is the column with the names
games['Name_n'] = games['Name'].apply(normalize_name)
met2023['Name_n'] = met2023['Name'].apply(normalize_name)
games.head()

Unnamed: 0,#,Name,Team,Bats,ESPN,YAHOO,G,PA,AB,R,...,SH,CS,AVG,OBP,SLG,OPS,BABIP,RazzID,Name_x,Name_n
0,1,Mookie Betts,LAD,R,2B/OF,2B/SS/ OF,150,673,571,115.4,...,6.4,3.8,0.281,0.371,0.529,0.9,0.294,13611,mookie betts,mookie betts
1,2,Marcus Semien,TEX,R,2B,2B,150,663,590,100.3,...,6.2,4.4,0.267,0.333,0.465,0.798,0.284,12533,marcus semien,marcus semien
2,3,Kyle Schwarber,PHI,L,OF,OF,150,654,539,103.9,...,5.7,1.9,0.224,0.349,0.482,0.83,0.259,16478,kyle schwarber,kyle schwarber
3,4,Gunnar Henderson,BAL,L,SS/3B,SS/3B,152,654,574,96.2,...,2.0,4.3,0.261,0.339,0.484,0.823,0.31,683002,gunnar henderson,gunnar henderson
4,5,Luis Arraez,MIA,L,2B,1B/2B,151,653,588,88.9,...,5.5,2.3,0.316,0.368,0.422,0.79,0.329,650333,luis arraez,luis arraez


In [33]:
gp = games[['Name_n', 'G']]
def find_best_match(name, choices, score_cutoff=70):
    best_match = process.extractOne(name, choices, score_cutoff=score_cutoff)
    return best_match[0] if best_match else None

# Find best matches for names in df1 from names in df2
choices = gp['Name_n'].unique()
met2023['new_Name'] = met2023['Name_n'].apply(lambda name: find_best_match(name, choices))

# Merge based on the best match found
merged_df = met2023.merge(gp, left_on='new_Name', right_on='Name_n')
merged_df['Fpoints'] = merged_df['G'] * merged_df['Pred_fpoints_G']

In [34]:
finaldf = merged_df[['Name', 'Fpoints']]
finaldf['rank'] = finaldf['Fpoints'].rank(ascending=False)
finaldf.sort_values(by='rank').head(50
                                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finaldf['rank'] = finaldf['Fpoints'].rank(ascending=False)


Unnamed: 0,Name,Fpoints,rank
139,"Ramírez, José",512.477374,1.0
26,"Freeman, Freddie",507.695425,2.0
91,"Judge, Aaron",503.65355,3.0
19,"Goldschmidt, Paul",496.258914,4.0
126,"Betts, Mookie",495.630348,5.0
295,"Tatis Jr., Fernando",487.452247,6.0
406,"Witt Jr., Bobby",483.452268,7.0
363,"Alvarez, Yordan",475.17274,8.0
437,"Carroll, Corbin",466.809317,9.0
297,"Soto, Juan",462.638351,10.0
