In [1]:
import numpy as np
import pandas as pd

# Read the aggregated batting table from disk.
agg_df = pd.read_csv('../data_ready/agg/batting_norm_agg.csv')

## Distinct player ids per class, selected by the value of the `hof` flag
unique_not_hof_players = agg_df.loc[agg_df['hof'].eq(False), 'player_id'].unique()
unique_hof_players = agg_df.loc[agg_df['hof'].eq(True), 'player_id'].unique()


# Create training and test data by players

In [2]:
# Fixed seed so that Restart & Run All always produces the same player split
# (the original drew from the unseeded global NumPy random state).
SPLIT_SEED = 42
TRAIN_PERCENT = 70


def split_players(players, train_percent=TRAIN_PERCENT, rng=None):
    """Randomly partition an array of player ids into train and test ids.

    Parameters
    ----------
    players : array-like
        Unique player ids to partition.
    train_percent : int
        Percentage of ids drawn for training; the count is rounded down
        with integer division, i.e. ``(n * train_percent) // 100``.
    rng : numpy.random.Generator, optional
        Source of randomness. A fresh, unseeded generator is used when omitted.

    Returns
    -------
    (train, test) : tuple of numpy.ndarray
        ``train`` is sampled without replacement; ``test`` holds every id of
        ``players`` that is not in ``train`` (sorted, as returned by
        ``np.setdiff1d``).
    """
    if rng is None:
        rng = np.random.default_rng()
    players = np.asarray(players)
    n_train = (players.size * train_percent) // 100
    train = rng.choice(players, size=n_train, replace=False)
    test = np.setdiff1d(players, train)
    return train, test


# One generator is shared by both draws; each class is split separately so
# both the training and the test set contain players of either class.
split_rng = np.random.default_rng(SPLIT_SEED)
training_player_not_hof, test_player_not_hof = split_players(unique_not_hof_players, rng=split_rng)
training_player_hof, test_player_hof = split_players(unique_hof_players, rng=split_rng)

## Get the training and test data using the unique players above

In [12]:
# Columns that must not be fed to the model as features.
redundant_columns = ['Unnamed: 0', 'Unnamed: 0.1', 'player_id', 'hof', 'hof2']


def select_player_rows(frame, player_ids):
    """Return an independent copy of the rows of ``frame`` whose ``player_id`` is in ``player_ids``.

    The explicit ``.copy()`` detaches the result from ``frame``, so the label
    columns can be written afterwards without SettingWithCopyWarning and
    without switching off pandas' chained-assignment check globally.
    """
    return frame[frame['player_id'].isin(player_ids)].copy()


def add_label_columns(frame):
    """Encode ``hof`` as 0/1 integers and add the complementary ``hof2`` column (in place).

    ``hof`` becomes 1 where it was True and 0 otherwise; ``hof2`` is 1 exactly
    where ``hof`` is 0, giving a two-column one-hot target for cross entropy.
    The whole column is replaced at once instead of writing ints into a
    boolean column through ``.loc``, which newer pandas versions reject as an
    incompatible-dtype assignment.
    """
    frame['hof'] = frame['hof'].eq(True).astype(int)
    frame['hof2'] = 1 - frame['hof']
    return frame


def feature_matrix(frame):
    """Return every column of ``frame`` not listed in ``redundant_columns`` as a NumPy array.

    ``Index.difference`` returns the remaining column names sorted, so the
    feature order is the same for every frame that has the same columns.
    """
    return frame[frame.columns.difference(redundant_columns)].to_numpy()


def label_matrix(frame):
    """Return the ``hof`` / ``hof2`` columns of ``frame`` as a float array."""
    return frame[['hof', 'hof2']].to_numpy().astype(float)


# All rows of a player go to the same side of the split.
training_data = add_label_columns(
    select_player_rows(agg_df, np.union1d(training_player_not_hof, training_player_hof))
)
test_data = add_label_columns(
    select_player_rows(agg_df, np.union1d(test_player_not_hof, test_player_hof))
)

# get rid of unwanted columns and get the np arrays
test_numpy = feature_matrix(test_data)
train_numpy = feature_matrix(training_data)

test_labels = label_matrix(test_data)
train_labels = label_matrix(training_data)

In [51]:
## For every test player keep only the row with the largest `years_played`
indexes = test_data.groupby('player_id')['years_played'].idxmax()
last_years = test_data.loc[indexes]

## Restrict those rows to players labelled hof == 1
last_years_hof = last_years.loc[last_years['hof'] == 1]

## Features: every column not in `redundant_columns`; labels: the two one-hot columns
last_years_hof_numpy = last_years_hof.loc[
    :, last_years_hof.columns.difference(redundant_columns)
].to_numpy()
last_years_labels = last_years_hof.loc[:, ['hof', 'hof2']].to_numpy().astype(float)

## Write both arrays to the current working directory
np.save("last_years_hof.npy", last_years_hof_numpy)
np.save("last_years_hof_labels.npy", last_years_labels)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [14]:
# Write the label and feature arrays to .npy files in the current working directory.
for _npy_path, _npy_array in (
    ("train_labels.npy", train_labels),
    ("test_labels.npy", test_labels),
    ("train_numpy.npy", train_numpy),
    ("test_numpy.npy", test_numpy),
):
    np.save(_npy_path, _npy_array)


In [26]:
# Positions of the test rows whose first label column equals 1.
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
(test_labels[:, 0] == 1).nonzero()[0]

def prediction_to_accuracy(predictions):
    """Return the fraction of rows whose largest entry is in column 0.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array with one row per sample and one column per class score.

    Returns
    -------
    float
        Number of rows where ``argmax`` along axis 1 is 0, divided by the
        number of rows. Ties resolve to the first maximal column, so a tie
        that includes column 0 counts as column 0.

    Notes
    -----
    No ground-truth labels are consulted, so this equals accuracy only when
    every row passed in truly belongs to class 0.
    """
    n_rows = predictions.shape[0]
    picked_first_column = np.argmax(predictions, axis=1) == 0
    return picked_first_column.astype(int).sum() / n_rows

In [48]:
# Display every test row labelled hof == 1.
test_data.loc[test_data['hof'].eq(1)]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ab,bb,double,g,h,hbp,hr,player_id,r,rbi,sb,sh,so,triple,years_played,hof,hof2
1015,1015,1031,2.080127,2.907398,1.379983,1.911335,1.921572,0.251373,2.988810,bancrda01,2.493121,0.607633,0.965992,2.109969,2.285166,-0.104711,1.0,1,0
1016,1016,1032,3.809870,6.007041,1.891222,3.700394,3.143042,1.508947,4.249354,bancrda01,3.984375,1.521339,2.027955,3.507410,4.428393,-0.706668,2.0,1,0
1017,1017,1033,5.441091,7.578527,3.817320,5.120275,4.548701,0.954226,6.352408,bancrda01,5.504816,2.824168,3.091906,4.873839,5.861101,0.015451,3.0,1,0
1018,1018,1034,7.716436,10.194249,5.953824,7.080074,6.737132,0.987258,5.979408,bancrda01,8.135309,3.579550,4.050767,5.168493,7.860652,0.772976,4.0,1,0
1019,1019,1035,8.717725,11.249240,6.756326,7.994874,7.701477,0.411881,5.572967,bancrda01,9.234792,4.024298,4.558480,5.585808,9.017191,2.186588,5.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37805,37805,38311,25.379735,38.772301,33.477885,23.073886,31.420575,21.376493,50.339016,pujolal01,37.969268,40.188583,9.500268,-5.226504,11.516746,4.128242,11.0,1,0
37806,37806,38312,27.959195,40.915614,37.658484,25.232959,34.185553,22.960549,53.816902,pujolal01,40.585734,43.641229,10.360570,-5.730058,12.699813,3.698181,12.0,1,0
37807,37807,38313,29.380788,42.434429,38.889620,26.221434,35.550327,24.497809,55.773253,pujolal01,41.925017,45.634833,10.176007,-6.216021,13.369390,3.277531,13.0,1,0
37808,37808,38314,32.128052,44.486676,41.922252,28.509760,38.367432,25.954973,59.683742,pujolal01,44.966036,49.439402,10.672237,-6.710301,14.427626,3.505939,14.0,1,0
