In [1]:
import tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from tensorflow import keras

import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import statistics


from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import StackingClassifier
from sklearn.decomposition import PCA
import shap

2024-05-30 18:44:10.710539: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.
IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
data = pd.read_csv("data/data_preds.csv", index_col = 0)
data = data.drop(columns = ['Date','CT%_y', 'CTv_y', 'CB%_y', 'CBv_y', 'CH%_y', 
              'CHv_y', 'wCT_y', 'wCB_y', 'wCH_y', 'wCT/C_y', 'wCB/C_y', 'wCH/C_y',
             'FBv_y', 'SL%_y', 'SLv_y', 'SF%_y', 'SFv_y', 'XX%_y', 'wFB_y', 'wSL_y', 'wSF_y', 'wFB/C_y', 'wSL/C_y', 'wSF/C'])
data = data.dropna(axis = 1)
data = data.drop_duplicates()
data.shape

(4889, 240)

In [3]:
X = data.drop(columns = 'K')
y = data['K']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size = 0.7, random_state = 123)

In [4]:

cat_feats = ['Team_x', 'Opp']
num_feats = X.drop(columns = ['Team_x', 'Opp']).columns.tolist()

In [5]:
ct = make_column_transformer(
    (StandardScaler(), num_feats),
    (OneHotEncoder(handle_unknown = 'ignore'), cat_feats))
ct

In [6]:
transformed_x_train = ct.fit_transform(X_train)
transformed_x_val = ct.transform(X_val)
transformed_x_test = ct.transform(X_test)

In [7]:
colnames = num_feats + list(ct.named_transformers_['onehotencoder'].get_feature_names_out())

In [8]:
X_train_transformed = pd.DataFrame(transformed_x_train, columns = colnames)
X_val_transformed = pd.DataFrame(transformed_x_val, columns = colnames)
X_test_transformed = pd.DataFrame(transformed_x_test, columns = colnames)
n_features = len(colnames)

In [9]:
pca = PCA(n_components = 50)
X_train_transformed1 = pca.fit_transform(X_train_transformed)
X_val_transformed1 = pca.transform(X_val_transformed)
X_test_transformed1 = pca.transform(X_test_transformed)

In [10]:
def build_model(hp):
  model = Sequential()
  counter = 0
  for i in range(hp.Int('num_layers',min_value=1,max_value=30)):
    if counter == 0:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=300,step=8),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid']),input_dim=50))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.1,0.2,0.3,0.4,0.5,0.6])))
    else:
      model.add(Dense(hp.Int('units' + str(i), min_value=8, max_value=270,step=4),activation= hp.Choice('activation' + str(i), values=['relu','tanh','sigmoid'])))
      model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.1,0.2,0.3,0.4,0.5,0.6])))
    counter+=1
  model.add(Dense(1,activation='linear'))
  model.compile(optimizer=hp.Choice('optimizer',values=['adagrad','adam','sgd']),
                  loss='mse',
                  metrics=[MeanSquaredError()])
  return model

tuner = kt.Hyperband(build_model, objective = 'val_mean_squared_error', max_epochs = 100, overwrite = True)
tuner.search(X_train_transformed1, y_train, epochs = 100, validation_data = (X_val_transformed1, y_val))
print(tuner.get_best_hyperparameters()[0].values)
model = tuner.get_best_models()[0]
es = EarlyStopping(monitor = 'val_loss', patience = 10)
model.fit(X_train_transformed1, y_train, epochs = 100, verbose = 2, validation_split = 0.3, callbacks = [es])
model.save("models/K_model_pca_50")

Trial 254 Complete [00h 00m 18s]
val_mean_squared_error: 5.159441947937012

Best val_mean_squared_error So Far: 5.072523593902588
Total elapsed time: 00h 18m 29s
INFO:tensorflow:Oracle triggered exit
{'num_layers': 4, 'units0': 8, 'activation0': 'tanh', 'dropout0': 0.2, 'optimizer': 'adam', 'units1': 132, 'activation1': 'sigmoid', 'dropout1': 0.4, 'units2': 80, 'activation2': 'relu', 'dropout2': 0.1, 'units3': 192, 'activation3': 'relu', 'dropout3': 0.3, 'units4': 144, 'activation4': 'tanh', 'dropout4': 0.5, 'units5': 16, 'activation5': 'sigmoid', 'dropout5': 0.5, 'units6': 56, 'activation6': 'tanh', 'dropout6': 0.3, 'units7': 56, 'activation7': 'tanh', 'dropout7': 0.1, 'units8': 36, 'activation8': 'tanh', 'dropout8': 0.4, 'units9': 16, 'activation9': 'tanh', 'dropout9': 0.5, 'units10': 264, 'activation10': 'tanh', 'dropout10': 0.3, 'units11': 36, 'activation11': 'sigmoid', 'dropout11': 0.2, 'units12': 176, 'activation12': 'sigmoid', 'dropout12': 0.5, 'units13': 40, 'activation13': 'ta

2024-05-30 19:04:26.347127: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,8]
	 [[{{node inputs}}]]
2024-05-30 19:04:26.354474: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,132]
	 [[{{node inputs}}]]
2024-05-30 19:04:26.361403: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,80]
	 [[{{node inputs}}]]
2024-05-30 19:04:26

INFO:tensorflow:Assets written to: models/K_model_pca_50/assets


INFO:tensorflow:Assets written to: models/K_model_pca_50/assets


In [11]:
model = keras.models.load_model("models/K_model_pca_50")
preds = model.predict(X_test_transformed1)
np.sqrt(mean_squared_error(y_test, preds))



2.406175278170989

In [12]:
samples = shap.sample(X_train_transformed, 1)
e = shap.KernelExplainer(model, samples)
shap_values = e.shap_values(X_test_transformed[1:300])
shap.initjs()
# visualize the first prediction's explanation with a force plot
shap.summary_plot(shap_values[0], X_test_transformed[1:300], feature_names = colnames)

Provided model function fails when applied to the provided data set.


ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 50), found shape=(1, 295)

In [13]:
teambat = pd.read_csv('data/team_batting.csv', index_col = 0)
# print(teambat.head())
pitchseas = pd.read_csv('data/pitcher_season_stats.csv', index_col = 0)
df = pd.read_csv("data/all_stats.csv")
# print(len(teambat.columns))
# print(len(pitchseas.columns))
def predict_player(team_data, pitch_season_data, name,opponent, pca, model, ct):
    td = team_data[team_data['Team'] == opponent]
    psd = pitch_season_data[pitch_season_data['Name'] == name]
    psd['Opp'] = opponent
    team=psd['Team'].values[0]
    # print(len(psd.columns))
    # print(len(td.columns))
    full_df = psd.merge(td, left_on = 'Opp', right_on = 'Team')
    full_df = full_df.drop(columns = ['Name', 'CT%_y', 'CTv_y', 'CB%_y', 'CBv_y', 'CH%_y', 
              'CHv_y', 'wCT_y', 'wCB_y', 'wCH_y', 'wCT/C_y', 'wCB/C_y', 'wCH/C_y',
             'FBv_y', 'SL%_y', 'SLv_y', 'SF%_y', 'SFv_y', 'XX%_y', 'wFB_y', 'wSL_y','wSF_y', 'wFB/C_y', 'wSL/C_y', 'wSF/C',
                          'XX%_x', 'SF%_x', 'wCB/C_x', 'wSF_x', 'Dol', 'wCH_x', 'CB%_x', 'wCH/C_x', 'CBv_x', 'wCB_x', 'CTv_x', 
                           'Team_y', 'wSL_x', 'wCT_x', 'wCT/C_x', 'CT%_x', 'SL%_x', 'CH%_x', 'CHv_x', 'wSL/C_x', 'SFv_x', 'SLv_x'])
    full_df = full_df.dropna(axis = 1)
    full_df['Team_x'] = team
    print(full_df['Team_x'])
    print(full_df['Opp'])
    # print(full_df.columns.tolist())
    # print(len(full_df.columns))
    full_df1 = ct.transform(full_df)
    full_df = pd.DataFrame(full_df1, columns = num_feats + list(ct.named_transformers_['onehotencoder'].get_feature_names_out()))
    print(full_df.columns[full_df.isna().any()].tolist())
    df_pred = pca.transform(full_df)
    pred = model.predict(df_pred)
    return pred
predict_player(teambat, pitchseas, 'Carlos Rodon', 'Los Angeles', pca, model, ct)

0    NYY
1    NYY
Name: Team_x, dtype: object
0    Los Angeles
1    Los Angeles
Name: Opp, dtype: object
[]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([[4.1945095],
       [4.5004263]], dtype=float32)

In [14]:
df = pd.read_csv("data/all_stats.csv")
print(len(df.columns))

305
