# Neural Network Practical Example - Preprocessing

### Extract the data from the CSV file

In [1]:
import numpy as np
from sklearn import preprocessing

# Read the whole comma-separated file into a single 2-D float array.
raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Inputs are every column except the first and the last; the last column
# holds the targets.
unscaled_inputs_all, targets_all = raw_csv_data[:, 1:-1], raw_csv_data[:, -1]

### Balance the dataset 

Make sure each target class has an equal number of samples in the dataset.

In [6]:
# Balance the two target classes by down-sampling whichever class is larger.
# The surplus rows are taken from the end of the larger class, so the earliest
# occurrences (in file order) of each class are the ones that are kept.
# Assumes targets are encoded as 0 / 1.
zero_indices = np.flatnonzero(targets_all == 0)
one_indices = np.flatnonzero(targets_all == 1)
num_targets_zero = int(zero_indices.size)
num_targets_one = int(one_indices.size)

# Every class keeps exactly this many rows.
num_per_class = min(num_targets_zero, num_targets_one)

# Pick the majority class and mark everything past the first `num_per_class`
# of its rows for removal. When the classes are already balanced the slice is
# empty and nothing is removed.
surplus_indices = zero_indices if num_targets_zero > num_targets_one else one_indices
indices_to_remove = surplus_indices[num_per_class:].tolist()

unscaled_inputs_balanced = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_balanced = np.delete(targets_all, indices_to_remove, axis=0)

### Standardize the input 

In [7]:
# Standardize the balanced inputs with scikit-learn's `preprocessing.scale`
# (called with its default arguments).
# NOTE(review): scaling is applied to all balanced samples before the
# train/validation/test split performed later in this notebook, so the scaling
# statistics presumably include validation and test rows — confirm this is
# intended.
scaled_inputs = preprocessing.scale(unscaled_inputs_balanced)

### Shuffle the data

In [8]:
# Fixed seed so that re-running the notebook from a fresh kernel reproduces
# exactly the same shuffle (and therefore the same saved datasets).
SHUFFLE_SEED = 42

# A local generator keeps the shuffle independent of (and does not disturb)
# NumPy's global random state.
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

# Random permutation of the row indices 0 .. len(scaled_inputs) - 1.
shuffled_indices = shuffle_rng.permutation(len(scaled_inputs))

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_balanced[shuffled_indices]

### Split the dataset into train, validation and test 

In [9]:
# Split sizes: 80% train, 10% validation, and whatever remains goes to test.
num_samples = len(shuffled_inputs)
num_train, num_validation = (int(fraction * num_samples) for fraction in (0.8, 0.1))
num_test = num_samples - (num_train + num_validation)

# Row ranges of the three consecutive, non-overlapping partitions.
validation_end = num_train + num_validation
train_rows = slice(0, num_train)
validation_rows = slice(num_train, validation_end)
test_rows = slice(validation_end, None)

# Inputs and targets are cut with the same row ranges so they stay aligned.
train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = (
    shuffled_inputs[validation_rows],
    shuffled_targets[validation_rows],
)
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

### Save the train, validation and test datasets as .npz files

In [10]:
# One archive per split; each stores its arrays under the keys
# `inputs` and `targets`.
datasets_to_save = (
    ('Audiobooks_train_data', train_inputs, train_targets),
    ('Audiobooks_validation_data', validation_inputs, validation_targets),
    ('Audiobooks_test_data', test_inputs, test_targets),
)

for archive_name, split_inputs, split_targets in datasets_to_save:
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)