In [1]:
import numpy as np
import pandas as pd

# Read the aggregated batting table (path is relative to the working directory).
agg_df = pd.read_csv('../data_ready/agg/batting_norm_agg.csv')

## distinct player ids in each class, selected by the value of the 'hof' column
unique_hof_players = agg_df.loc[agg_df['hof'].eq(True), 'player_id'].unique()
unique_not_hof_players = agg_df.loc[agg_df['hof'].eq(False), 'player_id'].unique()


# Create training and test sets, split by player

In [2]:
# Draw a 70/30 train/test split of the player ids, separately for each class.
# The split is made on player ids, not on rows, so rows are later assigned to
# a side by their 'player_id'.
# NOTE: the following cell reloads four arrays with these same names from
# .npy files and therefore overwrites the values computed here.
SPLIT_SEED = 42      # fixed seed so Restart & Run All reproduces the same split
TRAIN_PERCENT = 70   # integer percentage of players that go to the training set


def _split_players(player_ids, rng, train_percent=TRAIN_PERCENT):
    """Randomly partition ``player_ids`` into ``(train_ids, test_ids)``.

    Parameters
    ----------
    player_ids : np.ndarray
        1-D array of distinct player ids.
    rng : np.random.Generator
        Source of randomness; pass a seeded generator for reproducibility.
    train_percent : int
        Percentage of ids sampled (without replacement) into the training
        set; the count is floored with integer division.

    Returns
    -------
    tuple of np.ndarray
        ``train_ids`` in sampled order and ``test_ids`` as the sorted set
        difference ``player_ids - train_ids``.
    """
    n_train = (player_ids.size * train_percent) // 100
    train_ids = rng.choice(player_ids, size=n_train, replace=False)
    test_ids = np.setdiff1d(player_ids, train_ids)
    return train_ids, test_ids


# One generator for both draws, so the pair of splits is a function of the seed.
_split_rng = np.random.default_rng(SPLIT_SEED)
training_player_not_hof, test_player_not_hof = _split_players(unique_not_hof_players, _split_rng)
training_player_hof, test_player_hof = _split_players(unique_hof_players, _split_rng)

In [11]:
# Reload the four player-id arrays from .npy files in the working directory.
# CAUTION: allow_pickle=True makes np.load unpickle object arrays, which can
# execute arbitrary code -- only load files that you produced yourself.
training_player_not_hof, training_player_hof, test_player_not_hof, test_player_hof = (
    np.load(split_file, allow_pickle=True)
    for split_file in (
        'train_non_hof.npy',
        'train_hof.npy',
        'test_non_hof.npy',
        'test_hof.npy',
    )
)

## Get the training and test data using the unique players above

In [12]:
def _rows_for_players(frame, player_ids):
    """Return a new frame with the rows of ``frame`` whose 'player_id' is in
    ``player_ids``, with integer label columns.

    'hof' is cast to int (True/False -> 1/0) and 'hof2' is added as its
    complement (1 where hof == 0, else 0), so ['hof', 'hof2'] is a two-column
    one-hot label used for the cross-entropy error.

    ``DataFrame.assign`` returns a new object, so ``frame`` is left untouched
    and no SettingWithCopyWarning needs to be silenced. ``astype(int)`` raises
    if 'hof' contains missing values instead of silently passing them on.
    """
    selected = frame[frame['player_id'].isin(player_ids)]
    hof_as_int = selected['hof'].astype(int)
    return selected.assign(hof=hof_as_int, hof2=1 - hof_as_int)


# All rows of a player go to the side that the player's id was assigned to.
training_data = _rows_for_players(agg_df, np.union1d(training_player_not_hof, training_player_hof))
test_data = _rows_for_players(agg_df, np.union1d(test_player_not_hof, test_player_hof))

# get rid of unwanted columns and get the np arrays
# (Index.difference returns the remaining column names in sorted order and
# ignores names in the list that are not present in the frame)
redundant_columns = ['Unnamed: 0', 'Unnamed: 0.1','player_id','hof','hof2']

test_numpy = test_data[test_data.columns.difference(redundant_columns)].to_numpy()
train_numpy = training_data[training_data.columns.difference(redundant_columns)].to_numpy()

# two-column float labels: column 0 is 'hof', column 1 is 'hof2'
test_labels = test_data[['hof','hof2']].to_numpy().astype(float)
train_labels = training_data[['hof','hof2']].to_numpy().astype(float)

In [13]:
## keep, for every test player, only the row with the largest 'years_played'
indexes = test_data.groupby('player_id')['years_played'].idxmax()
last_years = test_data.loc[indexes]

# restrict those rows to the players labelled hof == 1
last_years_hof = last_years.loc[last_years['hof'].eq(1)]

# feature matrix (every column not listed in redundant_columns) and float labels
last_years_hof_numpy = last_years_hof.loc[
    :, last_years_hof.columns.difference(redundant_columns)
].to_numpy()
last_years_hof_labels = last_years_hof.loc[:, ['hof', 'hof2']].to_numpy().astype(float)

# write both arrays to .npy files in the working directory
np.save(file="last_years_hof.npy", arr=last_years_hof_numpy)
np.save(file="last_years_hof_labels.npy", arr=last_years_hof_labels)

In [18]:
# Feature matrix (columns not listed in redundant_columns) and two-column
# float labels for every row of last_years.
last_years_numpy = last_years.loc[:, last_years.columns.difference(redundant_columns)].to_numpy()
last_years_labels = last_years.loc[:, ['hof', 'hof2']].to_numpy().astype(float)


In [19]:
# Write the last_years feature matrix and its labels to .npy files.
np.save(file='last_years.npy', arr=last_years_numpy)
np.save(file='last_years_labels.npy', arr=last_years_labels)

In [14]:
# Write the train/test label and feature arrays to .npy files,
# in the same order as before: labels first, then features.
for npy_path, npy_array in (
    ("train_labels.npy", train_labels),
    ("test_labels.npy", test_labels),
    ("train_numpy.npy", train_numpy),
    ("test_numpy.npy", test_numpy),
):
    np.save(npy_path, npy_array)


In [16]:
# Positions of the test rows whose first label column ('hof') equals 1.
# The value is not displayed because it is not the last statement of the cell.
np.nonzero(test_labels[:, 0] == 1)[0]

def prediction_to_accuracy(predictions, labels=None):
    """Score class predictions given a 2-D array of per-class scores.

    Parameters
    ----------
    predictions : array-like, shape (n_samples, n_classes)
        Per-class scores; the predicted class of a row is its argmax.
    labels : array-like, shape (n_samples, n_classes), optional
        One-hot targets. In this notebook the label columns are
        ['hof', 'hof2'], so class 0 means "hall of fame".

    Returns
    -------
    float
        With ``labels``: the fraction of rows whose predicted class equals
        the argmax of the corresponding label row (true accuracy).
        Without ``labels`` (original behaviour): the fraction of rows
        predicted as class 0, which is an accuracy only when every row
        passed in truly belongs to class 0.

    Raises
    ------
    ValueError
        If ``labels`` is given with a different number of rows than
        ``predictions``.
    """
    predicted_class = np.argmax(predictions, axis=1)

    if labels is None:
        # Backward-compatible path: share of rows predicted as class 0.
        return np.mean(predicted_class == 0)

    true_class = np.argmax(np.asarray(labels), axis=1)
    if true_class.shape[0] != predicted_class.shape[0]:
        raise ValueError(
            "predictions and labels must have the same number of rows, got "
            f"{predicted_class.shape[0]} and {true_class.shape[0]}"
        )
    return np.mean(predicted_class == true_class)

In [17]:
# Display the test rows labelled hof == 1.
test_data.loc[test_data['hof'].eq(1)]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ab,bb,double,g,h,hbp,hr,player_id,r,rbi,sb,sh,so,triple,years_played,hof,hof2
697,697,708,0.086278,0.334338,0.198963,0.270196,-0.020047,0.289218,0.460889,heilmha01,0.202486,0.064317,-0.489063,0.851372,0.456511,-0.364084,1.0,1,0
698,698,709,1.676156,1.796478,3.078759,1.939128,1.709485,1.998252,1.166920,heilmha01,1.858825,2.928893,-0.078871,2.117374,1.663324,2.092280,2.0,1,0
699,699,710,3.717099,3.210280,5.004856,3.814351,3.871020,2.851889,3.904125,heilmha01,3.419559,6.263144,0.633379,3.600429,3.803959,4.432563,3.0,1,0
700,700,711,4.651196,4.649516,5.791941,4.671009,4.864466,3.461799,7.411913,heilmha01,4.342063,7.755701,1.868868,4.042562,3.626758,5.881833,4.0,1,0
701,701,712,6.781735,6.046454,8.571682,6.619782,7.390073,3.924485,10.489772,heilmha01,6.626686,11.384259,2.238009,5.589218,5.593536,9.645537,5.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35716,35716,36222,28.486382,16.045100,29.227875,24.754260,28.306761,15.447421,31.397848,beltrad01,25.084145,30.205564,12.412385,-1.865139,23.601504,10.323985,14.0,1,0
35717,35717,36223,31.049662,17.342599,31.777160,26.956577,31.485774,17.031478,35.673496,beltrad01,28.081844,33.542383,12.174804,-2.368693,24.933600,11.085290,15.0,1,0
35718,35718,36224,33.765350,19.389275,34.269031,29.280818,34.796817,19.373146,39.524178,beltrad01,30.981829,36.670060,11.990241,-2.854656,26.176783,10.664639,16.0,1,0
35719,35719,36225,36.057926,21.935658,36.908270,31.332346,37.734235,20.052842,42.017184,beltrad01,33.612064,39.301404,11.803443,-3.348936,27.309481,10.893048,17.0,1,0


# Evaluate 