# Preprocessing aggregate-format data to train a neural network

In [1]:
import os

import numpy as np
import pandas as pd

# Read the aggregate batting table from disk.
agg_df = pd.read_csv('data_ready/agg/batting_norm_agg.csv')

# Distinct player ids per class, selected on the `hof` flag.
is_not_hof = agg_df['hof'] == False
is_hof = agg_df['hof'] == True
unique_not_hof_players = agg_df.loc[is_not_hof, 'player_id'].unique()
unique_hof_players = agg_df.loc[is_hof, 'player_id'].unique()


# Create training and test sets by player

Players, not individual rows, are split, so all rows of one player end up in the same set.

In [2]:
# Fixed seed so the player split is the same on every run; a local RandomState
# is used so the global numpy random state is left untouched.
SPLIT_SEED = 42
# Percentage of each class's players that goes to the training set.
TRAIN_PERCENT = 70
split_rng = np.random.RandomState(SPLIT_SEED)

# Sample the training players without replacement, separately for each class,
# so TRAIN_PERCENT of the non-HoF players and TRAIN_PERCENT of the HoF players
# are used for training. The players not sampled form the test set.
training_player_not_hof = split_rng.choice(
    unique_not_hof_players,
    size=(unique_not_hof_players.size * TRAIN_PERCENT) // 100,
    replace=False,
)
test_player_not_hof = np.setdiff1d(unique_not_hof_players, training_player_not_hof)

training_player_hof = split_rng.choice(
    unique_hof_players,
    size=(unique_hof_players.size * TRAIN_PERCENT) // 100,
    replace=False,
)
test_player_hof = np.setdiff1d(unique_hof_players, training_player_hof)

In [11]:
# Reuse the player split stored on disk when it is available, so downstream
# arrays are built from the same players on every run. When the files exist,
# the loaded arrays replace the split sampled in the previous cell; when none
# exist (first run), the freshly sampled split is written out instead.
split_paths = [
    'data_ready/agg/nn/train_non_hof.npy',
    'data_ready/agg/nn/train_hof.npy',
    'data_ready/agg/nn/test_non_hof.npy',
    'data_ready/agg/nn/test_hof.npy',
]
missing_split_paths = [path for path in split_paths if not os.path.exists(path)]

if not missing_split_paths:
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # files that were produced by this project.
    training_player_not_hof = np.load('data_ready/agg/nn/train_non_hof.npy', allow_pickle=True)
    training_player_hof = np.load('data_ready/agg/nn/train_hof.npy', allow_pickle=True)
    test_player_not_hof = np.load('data_ready/agg/nn/test_non_hof.npy', allow_pickle=True)
    test_player_hof = np.load('data_ready/agg/nn/test_hof.npy', allow_pickle=True)
elif len(missing_split_paths) == len(split_paths):
    # No saved split yet: persist the one sampled above so later runs reuse it.
    np.save('data_ready/agg/nn/train_non_hof.npy', training_player_not_hof)
    np.save('data_ready/agg/nn/train_hof.npy', training_player_hof)
    np.save('data_ready/agg/nn/test_non_hof.npy', test_player_not_hof)
    np.save('data_ready/agg/nn/test_hof.npy', test_player_hof)
else:
    # A partial set cannot be mixed with a fresh sample without risking the
    # same player landing in both train and test, so fail loudly.
    raise FileNotFoundError(
        'incomplete saved player split, missing: ' + ', '.join(missing_split_paths)
    )

## Get the training and test data using the unique players above

In [12]:
# Select every row of agg_df that belongs to a training / test player.
# .copy() gives each split its own frame, so the label columns written below
# go to independent data instead of a slice of agg_df; this removes the need
# to silence pandas' chained-assignment warning globally.
training_player_ids = np.union1d(training_player_not_hof, training_player_hof)
test_player_ids = np.union1d(test_player_not_hof, test_player_hof)
training_data = agg_df[agg_df['player_id'].isin(training_player_ids)].copy()
test_data = agg_df[agg_df['player_id'].isin(test_player_ids)].copy()

for split_frame in (training_data, test_data):
    # Replace the booleans with 1s and 0s by converting the whole column at
    # once; writing ints into a bool column through .loc relies on silent
    # upcasting. astype(int) raises on missing labels instead of letting them
    # flow into the label arrays.
    split_frame['hof'] = split_frame['hof'].astype(int)
    # hof2 is the complement of hof (1 where hof is 0); together they form the
    # two-column label used for the cross entropy error.
    split_frame['hof2'] = 1 - split_frame['hof']

# Columns that are not model inputs: leftover CSV index columns, the player
# id, and the two label columns.
redundant_columns = ['Unnamed: 0', 'Unnamed: 0.1','player_id','hof','hof2']
label_columns = ['hof', 'hof2']

# Index.difference returns the remaining column names sorted; both frames
# carry the same columns, so one feature list serves both.
feature_columns = training_data.columns.difference(redundant_columns)

test_numpy = test_data[feature_columns].to_numpy()
train_numpy = training_data[feature_columns].to_numpy()

test_labels = test_data[label_columns].to_numpy().astype(float)
train_labels = training_data[label_columns].to_numpy().astype(float)

In [13]:
## For each test player keep only the row with the largest years_played
years_played_by_player = test_data.groupby(by='player_id')['years_played']
indexes = years_played_by_player.idxmax()
last_years = test_data.loc[indexes]

## Restrict those rows to players labelled hof == 1
hof_row_mask = last_years['hof'] == 1
last_years_hof = last_years[hof_row_mask]

## Feature matrix (non-redundant columns) and two-column labels for that subset
last_years_hof_features = last_years_hof.columns.difference(redundant_columns)
last_years_hof_numpy = last_years_hof[last_years_hof_features].to_numpy()
last_years_hof_labels = last_years_hof[['hof','hof2']].to_numpy().astype(float)

## Persist both arrays
for hof_path, hof_array in (
    ("data_ready/agg/nn/last_years_hof.npy", last_years_hof_numpy),
    ("data_ready/agg/nn/last_years_hof_labels.npy", last_years_hof_labels),
):
    np.save(hof_path, hof_array)

In [18]:
# Feature matrix and two-column labels for all rows of last_years.
last_years_features = last_years.columns.difference(redundant_columns)
last_years_numpy = last_years.loc[:, last_years_features].to_numpy()
last_years_labels = last_years.loc[:, ['hof', 'hof2']].to_numpy().astype(float)


In [19]:
# Write the last-year feature matrix and its labels to disk.
last_years_outputs = (
    ('data_ready/agg/nn/last_years.npy', last_years_numpy),
    ('data_ready/agg/nn/last_years_labels.npy', last_years_labels),
)
for last_years_path, last_years_array in last_years_outputs:
    np.save(last_years_path, last_years_array)

In [14]:
# Write the train/test label and feature arrays to disk, in the same order as
# before: labels first, then features.
train_test_outputs = (
    ("data_ready/agg/nn/train_labels.npy", train_labels),
    ("data_ready/agg/nn/test_labels.npy", test_labels),
    ("data_ready/agg/nn/train_numpy.npy", train_numpy),
    ("data_ready/agg/nn/test_numpy.npy", test_numpy),
)
for output_path, output_array in train_test_outputs:
    np.save(output_path, output_array)