In [1]:
import numpy as np
from sklearn import preprocessing

# Read the entire comma-separated file into a single 2-D array.
raw_csv = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Column 0 is left out of the inputs (presumably a row identifier -- confirm
# against the data description); the last column holds the targets and every
# column in between is an input feature.
unscaled_inputs, targets_all = raw_csv[:, 1:-1], raw_csv[:, -1]


In [2]:
# Balance the data set: keep as many 0-target rows as there are 1-target rows.
# Targets are assumed to be 0/1 so that their sum counts the 1-target rows.
num1_target = int(np.sum(targets_all))

# Row positions of every 0-target sample, in their original order.
zero_target_rows = np.flatnonzero(targets_all == 0)
zero_target_counter = len(zero_target_rows)

# The first `num1_target` zero rows stay; every later zero row is dropped.
indices_to_remove = zero_target_rows[num1_target:].tolist()

# Remove the surplus rows from inputs and targets alike so they stay aligned.
unscaled_inputs_equal_priors, target_equal_priors = (
    np.delete(unscaled_inputs, indices_to_remove, axis=0),
    np.delete(targets_all, indices_to_remove, axis=0),
)

In [3]:
# Standardizing: preprocessing.scale centres each input column and scales it
# to unit variance.
# NOTE(review): the scaling statistics are computed over all rows before the
# train/validation/test split below, so held-out rows influence the scaling.
# Consider fitting the scaler on the training rows only.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

# Shuffling: draw the row permutation from a locally seeded generator so that
# a fresh "Restart & Run All" reproduces the same split and the same saved
# files. A local RandomState is used so the global NumPy random state is left
# untouched.
shuffle_seed = 42
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.RandomState(shuffle_seed).shuffle(shuffled_indices)

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = target_equal_priors[shuffled_indices]

In [4]:
# Partition the shuffled rows into train, validation and test subsets.

sample_count = shuffled_inputs.shape[0]

# int() truncates, so train gets ~80%, validation ~10%, and the test subset
# takes whatever rows remain.
train_sample_count = int(0.8 * sample_count)
valid_sample_count = int(0.1 * sample_count)
test_sample_count = sample_count - train_sample_count - valid_sample_count

# Row offsets at which one subset ends and the next one begins.
train_end = train_sample_count
valid_end = train_end + valid_sample_count

# Slice inputs and targets with identical bounds so rows stay paired.
train_inputs, train_targets = shuffled_inputs[:train_end], shuffled_targets[:train_end]
valid_inputs, valid_targets = (
    shuffled_inputs[train_end:valid_end],
    shuffled_targets[train_end:valid_end],
)
test_inputs, test_targets = shuffled_inputs[valid_end:], shuffled_targets[valid_end:]

# Sanity check: for each subset print the sum of its targets, its row count
# and the ratio of the two (expected to be close to 0.5 for balanced data).
for subset_targets, subset_count in (
    (train_targets, train_sample_count),
    (valid_targets, valid_sample_count),
    (test_targets, test_sample_count),
):
    positive_total = np.sum(subset_targets)
    print(positive_total, subset_count, positive_total / subset_count)

1805.0 3579 0.5043308186644314
206.0 447 0.4608501118568233
226.0 448 0.5044642857142857


In [5]:
# Persist every subset as its own .npz archive holding two named arrays,
# 'inputs' and 'targets' (np.savez appends the .npz extension to the name).

for archive_name, subset_inputs, subset_targets in (
    ('Audiobooks_train_data', train_inputs, train_targets),
    ('Audiobooks_valid_data', valid_inputs, valid_targets),
    ('Audiobooks_test_data', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=subset_inputs, targets=subset_targets)