In [1]:
import pandas as pd
import numpy as np

## Remove pitchers from batting table

In [2]:
batting_df = pd.read_csv('data_normalized/batting_norm.csv')
hof_df = pd.read_csv('data_normalized/hall_of_fame.csv')
appearances_df = pd.read_csv('data/appearances.csv')

In [3]:
appearances_df[appearances_df['player_id'] == 'wynnea01']

Unnamed: 0,year,team_id,league_id,player_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr
28660,1939,WS1,AL,wynnea01,3.0,,3,3.0,3,0,...,0,0,0,0,0,0,0,,,
29782,1941,WS1,AL,wynnea01,5.0,,5,5.0,5,0,...,0,0,0,0,0,0,0,,,
30321,1942,WS1,AL,wynnea01,30.0,,30,30.0,30,0,...,0,0,0,0,0,0,0,,,
30878,1943,WS1,AL,wynnea01,38.0,,38,38.0,37,0,...,0,0,0,0,0,0,0,,,
31445,1944,WS1,AL,wynnea01,43.0,,43,43.0,33,0,...,0,0,0,0,0,0,0,,,
32708,1946,WS1,AL,wynnea01,25.0,,25,25.0,17,0,...,0,0,0,0,0,0,0,,,
33289,1947,WS1,AL,wynnea01,54.0,,54,54.0,33,0,...,0,0,0,0,0,0,0,,,
33862,1948,WS1,AL,wynnea01,73.0,,73,73.0,33,0,...,0,0,0,0,0,0,0,,,
34116,1949,CLE,AL,wynnea01,35.0,,35,35.0,26,0,...,0,0,0,0,0,0,0,,,
34685,1950,CLE,AL,wynnea01,39.0,,39,39.0,32,0,...,0,0,0,0,0,0,0,,,


In [4]:
games_played_df = appearances_df[['player_id', 'g_all', 'g_p']].groupby('player_id').sum()
batters_set = set(games_played_df[games_played_df['g_p'] < games_played_df['g_all'] / 2].index) # if someone pitches in over half the games they played in, they're a pitcher
batters_set

{'wiedeto01',
 'kieltbo01',
 'blatnjo01',
 'bowesfr01',
 'graype01',
 'youngjo02',
 'rizzuph01',
 'frobedo01',
 'lovitjo01',
 'bernacu01',
 'mullijo01',
 'peterha01',
 'lomasst01',
 'fioreje01',
 'sockach01',
 'dykstle01',
 'colliwi01',
 'kerwida01',
 'smithwi01',
 'bulloer01',
 'sauteal01',
 'kingdwe01',
 'whitmfr01',
 'howarfr01',
 'sanchor01',
 'duncapa01',
 'sketcbu01',
 'johnscl01',
 'leibeha01',
 'lisiri01',
 'cochrda01',
 'smithge01',
 'penaca01',
 'edwarbr01',
 'schilch01',
 'belarwa01',
 'ottme01',
 'whitemy01',
 'cecilda01',
 'bankser01',
 'gagnegr01',
 'lemonji01',
 'woulfji01',
 'voyleph01',
 'suttola01',
 'willire02',
 'fisheha01',
 'barnaba01',
 'maclilo01',
 'alexada01',
 'nunezab01',
 'santara01',
 'haydeja01',
 'wilforo01',
 'blackea01',
 'dortame01',
 'martihe02',
 'wynnema01',
 'wolfra01',
 'himesja01',
 'merried01',
 'connoch01',
 'guthbu01',
 'hargrmi01',
 'sweetri01',
 'lookbr01',
 'wilkibo01',
 'estrajo01',
 'duncama01',
 'tillejo01',
 'whitede04',
 'huffmbe01',


In [5]:
batting_df_batters_only = batting_df[batting_df['player_id'].isin(batters_set)]

In [6]:
batting_df_batters_only[batting_df_batters_only['player_id'] == 'ruthba01'].shape

(22, 16)

In [7]:
batting_df_batters_only.to_csv('data_normalized/batting_norm_batters_only.csv')

# Get set of current players 
## (have to exclude these players from train / test splits)

In [8]:
player_final_years = batting_df_batters_only.groupby('player_id')[['player_id', 'year']].max('year')
active_players = player_final_years[player_final_years['year'] == 2015].index
active_players

Index(['abreujo02', 'ackledu01', 'adamecr01', 'adamsma01', 'adriaeh01',
       'aguilje01', 'ahmedni01', 'alberha01', 'alcanar01', 'almonab01',
       ...
       'wongko01', 'wrighda03', 'yelicch01', 'ynoara01', 'youngch04',
       'youngde03', 'younger03', 'zimmery01', 'zobribe01', 'zuninmi01'],
      dtype='object', name='player_id', length=636)

In [9]:
np.save('data_ready/active_players.npy', active_players)

# Get set of Hall of Fame player ids

In [10]:
hof_players = set(hof_df[(hof_df['inducted'] == 'Y') & (hof_df['category'] == 'Player')]['player_id'].unique())
non_hof_players = set(hof_df[(hof_df['inducted'] == 'N') & (hof_df['category'] == 'Player')]['player_id'].unique())

### Add a few players who were good enough to be in, but weren't due to steroid use or something (our models don't care about that)
### Also add players who retired too recently to have been elected, or have been elected since dataset was updated

In [11]:
hof_players = hof_players.union(set(['bondsba01', 'rosepe01', 'suzukic01', 'jeterde01', 'rodrial01', 'pujolal01', 'palmera01', 'beltrad01', 'rodriiv01', 'raineti01', 'bagweje01', 'trammal01', 'thomeji01', 'morrija01', 'jonesla01', 'guerrvl01', 'martied01', 'baineha01', 'walkela01', 'simmote01']))

In [12]:
batting_df_agg = pd.read_csv('data_ready/batting_norm_agg.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data_ready/batting_norm_agg.csv'

# Aggregate stats method
## Split all players into one sample per year (first year of career, first 2 years, first 3, etc)

In [13]:
batting_df_agg = pd.DataFrame()
batters_set = batting_df_batters_only['player_id'].unique()

for batter in batters_set:
    batter_stats = batting_df_batters_only[batting_df_batters_only['player_id'] == batter]
    years = batter_stats['year']

    assert len(years) == len(set(years)) # fixed?
    
    for i in range(1, years.shape[0] + 1):
        sample_stats = batter_stats[batter_stats['year'].isin(years[:i])].drop(columns=['year', 'player_id', 'Unnamed: 0'])
        sample_agg = sample_stats.sum(axis=0)
        sample_agg['years_played'] = i
        sample_agg['player_id'] = batter
        
        batting_df_agg = batting_df_agg.append(sample_agg, ignore_index=True)

display(batting_df_agg)

Unnamed: 0,ab,bb,double,g,h,hbp,hr,player_id,r,rbi,sb,sh,so,triple,years_played
0,-0.661588,-0.438871,-0.640879,-0.705525,-0.583966,-0.539028,-0.388342,acostme01,-0.539369,-0.609924,-0.335788,-0.665060,-0.794309,-0.310270,1.0
1,-1.145500,-0.634319,-1.124076,-1.026475,-1.017650,-1.125153,-0.813317,acostme01,-0.903027,-1.153566,-0.636014,-1.424980,-0.892177,-0.404662,2.0
2,-1.161406,-0.031041,-1.378979,-0.693510,-1.160762,-0.056680,-1.235447,acostme01,-0.879855,-1.079926,-0.389503,-1.058649,-1.112142,-0.762685,3.0
3,-1.954610,-0.615781,-2.052017,-1.647343,-1.893377,-0.604941,-1.638442,acostme01,-1.575987,-1.775687,-0.956969,-1.764229,-2.108656,-1.364642,4.0
4,-1.744661,-0.229190,-2.314481,-1.438181,-1.531243,-1.148788,-2.011442,acostme01,-1.190280,-1.700775,-0.966303,-1.027140,-2.202136,-0.952990,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43686,-0.448198,-0.350041,-0.506754,-0.542378,-0.443520,-0.452528,-0.364299,turnetr01,-0.407728,-0.547344,0.027295,-0.478008,-0.390894,-0.427949,1.0
43687,0.781068,0.410928,0.186854,0.614127,0.573732,0.309030,0.325219,urshegi01,0.382690,0.258928,-0.357982,0.058954,0.748672,0.186402,1.0
43688,-0.659393,-0.567460,-0.605841,-1.099213,-0.623035,-0.452528,-0.502202,waldrky02,-0.605333,-0.587657,-0.357982,-0.478008,-0.663400,-0.427949,1.0
43689,-0.551088,-0.513105,-0.308580,-0.949296,-0.503358,-0.452528,-0.364299,willima07,-0.486770,-0.466716,-0.357982,-0.478008,-0.613853,-0.427949,1.0


In [209]:
batting_df_agg.to_csv('data_ready/batting_norm_agg.csv')

# Time series stats method
## Each sample is a matrix, where each row represents one year of play

In [269]:
batters_set = batting_df_batters_only['player_id'].unique()
max_years_played = 25
n_retired_samples = 7013 * 25
n_active_samples = 636 * 25
n_stats = 13

retired_players_ts = np.zeros((n_retired_samples, max_years_played, n_stats))
active_players_ts = np.zeros((n_active_samples, max_years_played, n_stats))

retired_seen, active_seen = 0, 0

for batter in batters_set:
    batter_stats = batting_df_batters_only[batting_df_batters_only['player_id'] == batter]
    years = batter_stats['year']

    assert len(years) == len(set(years)) # fixed?
    
    for i in range(1, years.shape[0] + 1):
        sample_stats = batter_stats[batter_stats['year'].isin(years[:i])].drop(columns=['year', 'player_id', 'Unnamed: 0'])
        
        ts_matrix = sample_stats.to_numpy()

        # route to correct dataset
        if years.max() == 2015:
            active_players_ts[active_seen, :len(ts_matrix), :] = ts_matrix
            active_seen += 1
        else:
            retired_players_ts[retired_seen, :len(ts_matrix), :] = ts_matrix
            retired_seen += 1

            
retired_players_ts = retired_players_ts[:40360]
active_players_ts = active_players_ts[:3331]
# display(retired_players_ts)

In [301]:
retired_players_ts = retired_players_ts[:40360]
active_players_ts = active_players_ts[:3331]

In [303]:
np.save('data_ready/retired_players_ts.npy', retired_players_ts)
np.save('data_ready/active_players_ts.npy', active_players_ts)

### cool thing: order is preserved between agg & time series methods

# Train / test splits (default = 5% HOF)

In [417]:
retired_players_ts = np.load('data_ready/retired_players_ts.npy')
batting_df_agg = pd.read_csv('data_ready/batting_norm_agg.csv')
active_players = np.load('data_ready/active_players.npy', allow_pickle=True)
batting_df_agg_retired = batting_df_agg[~batting_df_agg['player_id'].isin(active_players)]

FileNotFoundError: [Errno 2] No such file or directory: 'data_ready/retired_players_ts.npy'

In [327]:
batting_df_agg_retired.shape, retired_players_ts.shape

((40360, 18), (40360, 25, 13))

## train / test masks

In [344]:
np.random.seed(1)
n_samples = 40360
train_mask = np.array([1] * int(n_samples / 1.25) + ([0] * int(n_samples // 5)))
np.random.shuffle(train_mask)
test_mask = 1 - train_mask

### Aggregate

In [360]:
batting_df_agg_train = batting_df_agg_retired[train_mask == 1].drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
batting_df_agg_test = batting_df_agg_retired[test_mask == 1].drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [408]:
X_agg_test = batting_df_agg_test.drop(columns=['player_id', 'hof', 'years_played']).to_numpy()


In [392]:
X_agg_train = batting_df_agg_train.drop(columns=['player_id', 'hof', 'years_played']).to_numpy()
X_agg_test = batting_df_agg_test.drop(columns=['player_id', 'hof', 'years_played']).to_numpy()
y_agg_train = batting_df_agg_train['hof'].to_numpy(dtype=int)
y_agg_test = batting_df_agg_test['hof'].to_numpy(dtype=int)
train_player_ids = batting_df_agg_train['player_id'].to_numpy()
test_player_ids = batting_df_agg_test['player_id'].to_numpy()
train_years_played = batting_df_agg_train['years_played']
test_years_played = batting_df_agg_test['years_played']

In [409]:
for arr, name in zip((X_agg_train, X_agg_test, y_agg_train, y_agg_test, train_player_ids, test_player_ids, train_years_played, test_years_played), 
                     ('X_train', 'X_test', 'y_train', 'y_test', 'train_player_ids', 'test_player_ids', 'train_years_played', 'test_years_played')):
    np.save('data_ready/agg/' + name + '.npy', arr)

### Time series

In [399]:
X_ts_train = retired_players_ts[train_mask == 1]
X_ts_test = retired_players_ts[test_mask == 1]
np.save('data_ready/ts/X_train.npy', X_ts_train)
np.save('data_ready/ts/X_test.npy', X_ts_test)