# Code to make Hall of Fame predictions for specific players

In [144]:
import numpy as np
import pandas as pd

# Normalize 2010 - 2020 data (from Baseball Savant)

In [1]:
def preprocess_bs_table(filename):
    # select columns, drop 2020 (short season)
    batting_df = pd.read_csv(filename)
    batting_df['player_id'] = (batting_df[' first_name'] + batting_df['last_name']).str[1:]
    batting_df = batting_df[['player_id', 'b_ab', 'b_walk', 'b_double', 'b_game', 'b_total_hits', 'b_hit_by_pitch', 'b_home_run', 'r_run', 'b_rbi', 'r_total_stolen_base', 'b_total_sacrifices', 'b_strikeout', 'b_triple', 'year']]
    batting_df = batting_df[batting_df['year'] != 2020]
    
    # normalize
    yearly_means = batting_df.groupby('year').mean()
    yearly_stds = batting_df.groupby('year').std()
    batting_df_norm_year = batting_df.apply(lambda row: (row - yearly_means.loc[row['year']]) / yearly_stds.loc[row['year']], axis=1)
    batting_df_norm_year['year'] = batting_df['year']
    batting_df_norm_year['player_id'] = batting_df['player_id']
    
    # save
    batting_df_norm_year.to_csv('../data_normalized/batting_norm_2010_2020.csv')

In [12]:
preprocess_bs_table('../data/batting_2010_2020.csv')

# Build samples for specific player IDs

In [None]:
new_df_norm = pd.read_csv('../data_normalized/batting_norm_2010_2020.csv').drop(columns='Unnamed: 0')
old_df_norm = pd.read_csv('../data_normalized/batting_norm_batters_only.csv').drop(columns='Unnamed: 0')
def player_df(player_id, dataset='new'):
    """
    Returns DataFrame representing a player's stats
    
    dataset="new" if player played after 2009
    dataset="old" if player played before 2010
    """
    if dataset == 'new':
        p_df = new_df_norm[new_df_norm['player_id'] == player_id].sort_values('year').drop(columns=['player_id'])
    else:
        p_df = old_df_norm[old_df_norm['player_id'] == player_id].sort_values('year').drop(columns=['player_id'])
    p_df['years_played'] = np.arange(1, p_df.shape[0]+1)
    return p_df

In [3]:
def build_samples(player_ids, fmt='ts', dataset='new'):
    """
    Builds aggregate or time series test set, given a list of player ids, the format (ts or agg) and the dataset to use (new or old)
    """
    if fmt == 'ts':
        X_test = np.zeros((len(player_ids), 25, 13))
    else:
        X_test = np.zeros((len(player_ids), 14))
        
    for i, pid in enumerate(player_ids):
        p_df = player_df(pid, dataset=dataset)
        if fmt == 'ts':
            X_test[i, :p_df['years_played'].max()] = p_df.drop(columns=['years_played', 'year']).to_numpy()
        else:
            p_df_agg = p_df.drop(columns=['year', 'years_played']).sum(axis=0)
            X_test[i] = np.concatenate((p_df_agg.to_numpy(), [p_df['years_played'].max()]))
    
    return X_test

# make predictions using LSTM

In [148]:
def predict(model, player_ids, fmt='ts', dataset='new'):
    """
    Makes a prediction using a loaded model (must have .predict() function)
    """
    X_test = build_samples(player_ids, fmt=fmt, dataset=dataset)
    preds = model.predict(X_test)
    return list(sorted(zip(player_ids, 100 * model.predict(X_test)[:, 1]), key=lambda x:-x[1]))

In [149]:
from keras.models import load_model

In [150]:
lstm = load_model('good_lstm/')

In [151]:
started_after_2011 = (batting_df_norm_year.groupby('player_id').agg('min')['year'] >= 2011)
ids_after_2011 = started_after_2011.index[started_after_2011]

In [None]:
predict(lstm, [''], dataset='new')

In [197]:
# All players before 2010
predict(lstm, old_df_norm['player_id'].unique(), dataset='old')

[('yastrca01', 99.99292),
 ('morgajo02', 99.9925),
 ('kalinal01', 99.989914),
 ('sheffga01', 99.98927),
 ('aaronha01', 99.98672),
 ('bondsba01', 99.98339),
 ('musiast01', 99.98079),
 ('griffke02', 99.97782),
 ('robinbr01', 99.977104),
 ('hunteto01', 99.97638),
 ('ripkeca01', 99.96563),
 ('henderi01', 99.96538),
 ('thomeji01', 99.95883),
 ('jonesch06', 99.95764),
 ('fiskca01', 99.955734),
 ('rosepe01', 99.95361),
 ('brettge01', 99.95192),
 ('murraed02', 99.9507),
 ('palmera01', 99.95017),
 ('rodrial01', 99.94198),
 ('mayswi01', 99.936455),
 ('ortizda01', 99.93531),
 ('winfida01', 99.92783),
 ('mcgrifr01', 99.927444),
 ('jacksre01', 99.92624),
 ('mccovwi01', 99.901794),
 ('jeterde01', 99.90079),
 ('raineti01', 99.89952),
 ('evansdw01', 99.89479),
 ('thomafr04', 99.89083),
 ('gehrich01', 99.88813),
 ('ottme01', 99.883934),
 ('gonzalu01', 99.87595),
 ('biggicr01', 99.860504),
 ('stargwi01', 99.8558),
 ('ramirma02', 99.84808),
 ('willibi01', 99.84285),
 ('parkeda01', 99.81088),
 ('willite01

In [185]:
# A few interesting current players
predict(lstm, ['JoseAltuve', 'GeorgeSpringer', 'AlexBregman', 'CarlosCorrea', 'MichaelBrantley', 'KyleTucker', 'YordanAlvarez', 'AledmysDiaz', 
               'FernandoTatis Jr.', 'ChristianYelich', 'CodyBellinger', 'AnthonyRendon', 'JuanSoto', 'FranciscoLindor', 'PeteAlonso', 'MikeTrout', 'MattChapman'],
       dataset='new')

[('MikeTrout', 95.59606),
 ('GeorgeSpringer', 94.88754),
 ('ChristianYelich', 93.80228),
 ('CodyBellinger', 93.5683),
 ('PeteAlonso', 88.06546),
 ('MattChapman', 84.73477),
 ('FernandoTatis Jr.', 76.16439),
 ('YordanAlvarez', 47.43093),
 ('FranciscoLindor', 1.5245372),
 ('AnthonyRendon', 0.1106233),
 ('KyleTucker', 0.0724342),
 ('JuanSoto', 0.03192389),
 ('AlexBregman', 0.02042448),
 ('CarlosCorrea', 0.01938864),
 ('JoseAltuve', 0.018569259),
 ('AledmysDiaz', 0.016059069),
 ('MichaelBrantley', 0.013783091)]

In [94]:
# All players after 2010
predict(lstm, ids_after_2011)

[('FreddieFreeman', 96.52155),
 ('ToddFrazier', 96.35371),
 ('KhrisDavis', 96.202),
 ('BrianDozier', 95.88725),
 ('KyleSeager', 95.64071),
 ('MikeTrout', 95.59606),
 ('PaulGoldschmidt', 95.13227),
 ('EugenioSuarez', 95.12585),
 ('GeorgeSpringer', 94.88754),
 ('WilMyers', 94.72681),
 ('TrevorStory', 94.48759),
 ('YoenisCespedes', 94.44325),
 ('KrisBryant', 94.37288),
 ('JavierBaez', 94.292206),
 ('JocPederson', 93.98625),
 ('YasielPuig', 93.82613),
 ('ChristianYelich', 93.80228),
 ('CodyBellinger', 93.5683),
 ('MookieBetts', 93.119446),
 ('JoeyGallo', 92.901306),
 ('KyleSchwarber', 92.85364),
 ('StevenSouza Jr.', 92.0113),
 ('AnthonyRizzo', 91.51869),
 ('RonaldAcuna Jr.', 91.22084),
 ('YoanMoncada', 91.15048),
 ('BryceHarper', 90.93375),
 ('MichaelConforto', 90.65497),
 ('JasonKipnis', 90.44714),
 ('MattOlson', 89.90074),
 ('EvanGattis', 89.635315),
 ('MikeZunino', 89.29534),
 ('RhysHoskins', 89.08057),
 ('BrandonBelt', 88.97232),
 ('MitchHaniger', 88.14825),
 ('PeteAlonso', 88.06546),
