In [1]:
import pandas as pd
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from tensorflow import keras
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split)

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.decomposition import PCA
import unicodedata
from fuzzywuzzy import process

2024-03-18 11:08:54.036940: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv("data/p_combined.csv", index_col = 0)
data.head()

Unnamed: 0,Name,Fpoints_IP,year,xwoba,xobp,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,...,oz_swing_percent,oz_swing_miss_percent,iz_contact_percent,edge_percent,whiff_percent,swing_percent,f_strike_percent,groundballs_percent,flyballs_percent,linedrives_percent
0,"Colon, Bartolo",1.692469,2016,0.314,0.294,0.368,0.326,88.9,11.8,34.1,...,31.8,20.4,87.0,42.1,14.4,48.6,66.6,44.1,23.2,27.0
1,"Grilli, Jason",2.169492,2016,0.275,0.275,0.38,0.337,88.4,22.1,34.1,...,34.8,43.1,75.7,46.7,29.9,52.1,61.4,27.1,30.6,29.4
2,"Benoit, Joaquin",2.1875,2016,0.271,0.276,0.316,0.272,84.8,11.3,26.5,...,37.0,49.1,75.9,41.6,33.6,52.1,53.9,47.6,20.5,22.9
3,"Belisle, Matt",2.217391,2016,0.297,0.335,0.308,0.299,86.9,7.4,34.0,...,28.0,30.2,83.7,43.4,20.0,45.5,67.1,54.7,21.7,21.7
4,"Sabathia, CC",1.655134,2016,0.304,0.311,0.345,0.318,87.8,10.7,29.1,...,31.0,34.6,84.2,46.3,21.4,46.2,62.5,48.2,20.5,25.3


In [12]:
mean_fpts = data[data['year']==2023]['Fpoints_IP'].quantile(0.65)
data_sorted = data.sort_values(['Name', 'year'])
data_sorted['Fpoints_G_2'] = data_sorted.groupby('Name')['Fpoints_IP'].transform(lambda x: x.rolling(window=2, min_periods = 1).mean())
data_sorted['Fpoints_G_1'] = data_sorted.groupby('Name')['Fpoints_IP'].transform(lambda x: x.rolling(window=1, min_periods = 1).mean())
# data_sorted['Fpoints_G_6'] = data_sorted.groupby('Name')['Fpoints_G'].transform(lambda x: x.rolling(window=6, min_periods = 1).mean())
league_trends = data.groupby('year')['Fpoints_IP'].mean().rename('league_avg_Fpoints')
full_data = data_sorted.merge(league_trends, on='year')
latest_rolling_averages = full_data.drop_duplicates(subset=['Name'], keep='last')[['Name', 'Fpoints_G_2', 'Fpoints_G_1','league_avg_Fpoints']]
# feature_columns = [col for col in full_data.columns if col not in ['Name', 'year', 'Unnamed: 0', 'Fpoints_G', 'weight_decay_0.1', 'years_ago']]
# for column in feature_columns:
#     full_data[f'weighted_{column}'] = full_data[column] * full_data['weight_decay_0.1']
# full_data = full_data.drop(columns = ['years_ago', 'weight_decay_0.1'])

full_data.head()

Unnamed: 0,Name,Fpoints_IP,year,xwoba,xobp,xwobacon,xbacon,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,...,edge_percent,whiff_percent,swing_percent,f_strike_percent,groundballs_percent,flyballs_percent,linedrives_percent,Fpoints_G_2,Fpoints_G_1,league_avg_Fpoints
0,"Abad, Fernando",1.722944,2016,0.311,0.301,0.37,0.298,88.6,15.9,32.1,...,43.8,24.0,48.5,52.7,40.7,24.3,27.9,1.722944,1.722944,1.614885
1,"Albers, Matt",0.495108,2016,0.261,0.263,0.297,0.262,85.6,3.4,25.2,...,41.7,17.4,42.6,59.1,58.6,17.1,19.8,0.495108,0.495108,1.614885
2,"Allen, Cody",2.720588,2016,0.261,0.278,0.36,0.33,88.2,16.6,38.8,...,39.2,31.9,44.6,59.8,35.0,23.8,31.9,2.720588,2.720588,1.614885
3,"Alvarez, Jose",1.616462,2016,0.283,0.305,0.318,0.297,85.5,8.3,33.2,...,38.2,24.6,47.7,68.6,51.5,17.3,27.6,1.616462,1.616462,1.614885
4,"Anderson, Chase",1.439153,2016,0.309,0.309,0.346,0.313,88.4,13.2,32.2,...,42.4,19.3,47.0,61.6,44.4,19.9,28.8,1.439153,1.439153,1.614885


In [4]:
full_data.isna().sum()

Name                     0
Fpoints_IP               0
year                     0
xwoba                    0
xobp                     0
xwobacon                 0
xbacon                   0
exit_velocity_avg        0
launch_angle_avg         0
sweet_spot_percent       0
barrel_batted_rate       0
hard_hit_percent         0
avg_best_speed           0
avg_hyper_speed          0
z_swing_percent          0
z_swing_miss_percent     0
oz_swing_percent         0
oz_swing_miss_percent    0
iz_contact_percent       0
edge_percent             0
whiff_percent            0
swing_percent            0
f_strike_percent         0
groundballs_percent      0
flyballs_percent         0
linedrives_percent       0
Fpoints_G_2              0
Fpoints_G_1              0
league_avg_Fpoints       0
dtype: int64

In [5]:
X = full_data.drop(columns = ['Name', 'year', 'Fpoints_IP'])
y = full_data['Fpoints_IP']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size = 0.7, random_state = 123)
num_feats = X.columns.tolist()
ct = make_column_transformer(
    (StandardScaler(), num_feats))
ct

transformed_x_train = ct.fit_transform(X_train)
transformed_x_val = ct.transform(X_val)
transformed_x_test = ct.transform(X_test)

In [6]:
colnames = num_feats
X_train_transformed1 = pd.DataFrame(transformed_x_train, columns = colnames)
X_val_transformed1 = pd.DataFrame(transformed_x_val, columns = colnames)
X_test_transformed1 = pd.DataFrame(transformed_x_test, columns = colnames)
n_feats = len(colnames)
n_feats

26

In [7]:
def build_model(hp):
  model = Sequential()
  counter = 0
  for i in range(hp.Int('num_layers',min_value=1,max_value=8)):
    if counter == 0:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=100,step=4),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid']),input_dim=n_feats))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0,0.1,0.2,0.3,0.4,0.5,0.6])))
    else:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=100,step=2),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid'])))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.0,0.1,0.2,0.3,0.4,0.5,0.6])))
    counter+=1
  model.add(Dense(1,activation='linear'))
  model.compile(optimizer=hp.Choice('optimizer',values=['adagrad','adam','sgd']),
                  loss='mse',
                  metrics=[MeanSquaredError()])
  return model

tuner = kt.Hyperband(build_model, objective = 'val_mean_squared_error', max_epochs = 40, overwrite = True)
tuner.search(X_train_transformed1, y_train, epochs = 40, validation_data = (X_val_transformed1, y_val))
print(tuner.get_best_hyperparameters()[0].values)
model = tuner.get_best_models()[0]
es = EarlyStopping(monitor = 'val_loss', patience = 10)
model.fit(X_train_transformed1, y_train, epochs = 40, verbose = 2, validation_split = 0.3, callbacks = [es])
model.save("models/pitch_1_2_rolling_nn")

Trial 90 Complete [00h 00m 05s]
val_mean_squared_error: 0.06504791975021362

Best val_mean_squared_error So Far: 0.008742892183363438
Total elapsed time: 00h 02m 00s
INFO:tensorflow:Oracle triggered exit
{'num_layers': 2, 'units0': 76, 'activation0': 'relu', 'dropout0': 0.2, 'optimizer': 'adam', 'units1': 20, 'activation1': 'tanh', 'dropout1': 0.1, 'units2': 48, 'activation2': 'tanh', 'dropout2': 0.2, 'units3': 92, 'activation3': 'tanh', 'dropout3': 0.1, 'units4': 74, 'activation4': 'sigmoid', 'dropout4': 0.1, 'units5': 94, 'activation5': 'relu', 'dropout5': 0.3, 'units6': 86, 'activation6': 'relu', 'dropout6': 0.1, 'units7': 94, 'activation7': 'sigmoid', 'dropout7': 0.1, 'tuner/epochs': 40, 'tuner/initial_epoch': 14, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}
Epoch 1/40
28/28 - 0s - loss: 0.0365 - mean_squared_error: 0.0365 - val_loss: 0.0060 - val_mean_squared_error: 0.0060 - 475ms/epoch - 17ms/step
Epoch 2/40
28/28 - 0s - loss: 0.0355 - mean_squared_error: 0.035

2024-03-18 11:13:11.748046: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,76]
	 [[{{node inputs}}]]
2024-03-18 11:13:11.756175: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,20]
	 [[{{node inputs}}]]
2024-03-18 11:13:11.849037: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,76]
	 [[{{node inputs}}]]
2024-03-18 11:13:11

INFO:tensorflow:Assets written to: models/pitch_1_2_rolling_nn/assets


INFO:tensorflow:Assets written to: models/pitch_1_2_rolling_nn/assets


In [8]:
# 1 and 2 year rolling avg
model = keras.models.load_model("models/pitch_1_2_rolling_nn")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



0.08118580194049002

In [14]:
met2023 = pd.read_csv("data/p2023metrics.csv")
met2023.head()
met2023['year'] = met2023['year'] + 1
met2023= met2023.rename(columns = {'last_name, first_name':'Name'})
met2023new = met2023.merge(latest_rolling_averages, on = 'Name', how = 'left')
met2023new = met2023new.fillna(mean_fpts)
model = keras.models.load_model("models/pitch_1_2_rolling_nn")
met2023x = met2023new.drop(columns = ['player_id', 'year', 'Name'])
met2023t = ct.transform(met2023x)
preds = model.predict(met2023t)
met2023['Pred_fpoints_IP'] = preds
met2023['rank'] = met2023['Pred_fpoints_IP'].rank(ascending = False)
met2023sort = met2023.sort_values(by = 'rank', ascending = True)
met2023sort[['Name', 'Pred_fpoints_IP', 'rank']].head(50)



Unnamed: 0,Name,Pred_fpoints_IP,rank
252,"Bautista, Félix",3.342067,1.0
359,"Helsley, Ryan",3.259363,2.0
18,"Yates, Kirby",3.25936,3.0
201,"Hader, Josh",3.208422,4.0
410,"Bieber, Shane",3.177835,5.0
319,"Clase, Emmanuel",3.049321,6.0
111,"deGrom, Jacob",3.03086,7.0
294,"Swanson, Erik",3.020239,8.0
310,"Lamet, Dinelson",3.018507,9.0
2,"Verlander, Justin",3.011797,10.0


In [15]:
games = pd.read_csv("data/pgamesplayed.csv")
games.head()

Unnamed: 0,#,Name,Team,POS,R/L,G,GS,QS,TBF,IP,...,H,HBP,ER,R,HR,GB%,FB%,LD%,BABIP,RazzID
0,1,Framber Valdez,HOU,SP,L,30.8,30.8,18.8,781.4,187.9,...,168.2,9.5,71.2,78.8,16,56.7,23.3,20.0,0.299,664285
1,2,Zack Wheeler,PHI,SP,R,30.8,30.8,18.2,760.0,184.8,...,168.6,8.8,72.0,78.4,23,43.9,36.2,19.9,0.292,10310
2,3,Chris Bassitt,TOR,SP,R,30.8,30.8,16.4,788.2,184.8,...,179.2,10.1,87.4,94.2,26,43.3,36.2,20.5,0.288,12304
3,4,Luis Castillo,SEA,SP,R,30.8,30.8,17.4,779.3,184.8,...,165.4,7.2,78.9,85.5,24,43.6,36.5,19.9,0.282,15689
4,5,Aaron Nola,PHI,SP,R,30.8,30.8,17.6,765.3,184.8,...,171.0,6.5,77.1,83.4,26,42.5,37.2,20.3,0.288,16149


In [16]:
def normalize_name(name):
    # Remove accents and convert to lower case
    name = unicodedata.normalize('NFD', name).encode('ascii', 'ignore').decode('utf-8').lower()
    # Remove common titles, suffixes, or other variations as needed
    name = name.replace('jr.', '').replace('sr.', '').strip()
    return name

# Assuming df1 and df2 are your DataFrames and 'name' is the column with the names
games['Name_n'] = games['Name'].apply(normalize_name)
met2023['Name_n'] = met2023['Name'].apply(normalize_name)
games.head()

Unnamed: 0,#,Name,Team,POS,R/L,G,GS,QS,TBF,IP,...,HBP,ER,R,HR,GB%,FB%,LD%,BABIP,RazzID,Name_n
0,1,Framber Valdez,HOU,SP,L,30.8,30.8,18.8,781.4,187.9,...,9.5,71.2,78.8,16,56.7,23.3,20.0,0.299,664285,framber valdez
1,2,Zack Wheeler,PHI,SP,R,30.8,30.8,18.2,760.0,184.8,...,8.8,72.0,78.4,23,43.9,36.2,19.9,0.292,10310,zack wheeler
2,3,Chris Bassitt,TOR,SP,R,30.8,30.8,16.4,788.2,184.8,...,10.1,87.4,94.2,26,43.3,36.2,20.5,0.288,12304,chris bassitt
3,4,Luis Castillo,SEA,SP,R,30.8,30.8,17.4,779.3,184.8,...,7.2,78.9,85.5,24,43.6,36.5,19.9,0.282,15689,luis castillo
4,5,Aaron Nola,PHI,SP,R,30.8,30.8,17.6,765.3,184.8,...,6.5,77.1,83.4,26,42.5,37.2,20.3,0.288,16149,aaron nola


In [17]:
gp = games[['Name_n', 'IP']]
def find_best_match(name, choices, score_cutoff=70):
    best_match = process.extractOne(name, choices, score_cutoff=score_cutoff)
    return best_match[0] if best_match else None

# Find best matches for names in df1 from names in df2
choices = gp['Name_n'].unique()
met2023['new_Name'] = met2023['Name_n'].apply(lambda name: find_best_match(name, choices))

# Merge based on the best match found
merged_df = met2023.merge(gp, left_on='new_Name', right_on='Name_n')
merged_df['Fpoints'] = merged_df['IP'] * merged_df['Pred_fpoints_IP']

In [18]:
finaldf = merged_df[['Name', 'Fpoints']]
finaldf['rank'] = finaldf['Fpoints'].rank(ascending=False)
finaldf.sort_values(by='rank').head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finaldf['rank'] = finaldf['Fpoints'].rank(ascending=False)


Unnamed: 0,Name,Fpoints,rank
404,"Bieber, Shane",515.762618,1.0
394,"Burnes, Corbin",503.143026,2.0
141,"Nola, Aaron",475.523613,3.0
383,"Gallen, Zac",465.449791,4.0
440,"Strider, Spencer",457.239336,5.0
195,"Castillo, Luis",455.127648,6.0
29,"Darvish, Yu",453.553558,7.0
3,"Verlander, Justin",452.371974,8.0
34,"Kelly, Merrill",444.27282,9.0
97,"Gausman, Kevin",429.441649,10.0


In [20]:
finaldf.to_csv("data/2024pitcherpreds.csv")