### Extract the data from the CSV file

In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# Read the entire CSV file into a single NumPy array
csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# The first column is the "id" and the last column holds the "targets";
# every column in between is used as an input
inputs_all, targets_all = csv_data[:, 1:-1], csv_data[:, -1]

### Balancing the data set

In our data set, the number of entries with `targets = 0` is greater than the number of entries with `targets = 1`. <br>
Hence, we will balance the data set by removing the surplus `0` entries, so that both target values occur equally often.

In [3]:
# Summing the targets counts the entries whose target is 1.
# The sum is a "float", so it is converted to "int".
num_target_one = int(targets_all.sum())
num_target_one

2237

In [4]:
# Bookkeeping for the balancing step: the rows that will be dropped and
# the number of 0-targets seen so far
indices_to_remove = list()
zero_counter = 0

In [5]:
# Total number of data entries, i.e. the number of rows in our data set
len(targets_all)

14084

In [6]:
# Find the indices of the extra occurrences of 0, so that after deleting
# them the priors are balanced (as many 0-targets as 1-targets).
#
# Everything is derived directly from `targets_all` and `num_target_one`, so
# this cell is idempotent: re-running it gives the same result instead of
# continuing to count from a stale `zero_counter` and appending duplicate
# indices to an already filled `indices_to_remove`.

# Row positions of every 0-target, in ascending order
zero_indices = np.flatnonzero(targets_all == 0)

# Total number of 0-targets (the value the explicit counting loop ended with)
zero_counter = int(zero_indices.size)

# Keep the first `num_target_one` zeros and mark every later one for removal
indices_to_remove = zero_indices[num_target_one:].tolist()

In [7]:
# Drop the surplus rows (axis = 0) from the inputs and from the targets,
# using the same indices so that both stay aligned row by row
inputs_equal_priors, targets_equal_priors = (
    np.delete(full_array, indices_to_remove, axis = 0)
    for full_array in (inputs_all, targets_all)
)

### Standardizing the inputs

In [8]:
# Standardize the inputs: all data of a particular variable (column) get
# standardized together (per sklearn's default axis=0 - TODO confirm).
# NOTE(review): this runs on the whole balanced data set, before the
# train/validation/test split further down, so the scaling statistics also
# include the validation and test rows - consider computing them from the
# training rows only to avoid information leakage.
scaled_inputs = preprocessing.scale(inputs_equal_priors) 

Our targets consist only of 0s and 1s, so there is no need to standardize the "targets" data set.

### Shuffle the data

In [9]:
# Build an array holding the index of every remaining entry of our
# data set: 0, 1, ..., (number of rows - 1)
shuffled_indices = np.arange(len(targets_equal_priors))

In [10]:
# Fix the seed of NumPy's global random generator so that "Restart & Run All"
# reproduces the same shuffle (and therefore the same data set splits and
# saved files) on every run
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)

np.random.shuffle(shuffled_indices) # Shuffling the indices (in place)

Note that `shuffling_done = np.random.shuffle(shuffled_indices)` would not work. <br>
That assignment would store `None`, not the shuffled indices. <br>
`np.random.shuffle(array)` shuffles the passed array in place and returns `None`.

In [11]:
# Reorder inputs and targets with the same shuffled indices, so that every
# input row stays paired with its own target
shuffled_inputs, shuffled_targets = (
    scaled_inputs[shuffled_indices],
    targets_equal_priors[shuffled_indices],
)

### Splitting it into training, validation and testing data set

In [12]:
# axis 0 -> number of rows (samples), axis 1 -> number of columns (inputs)
for axis in (0, 1):
    print(shuffled_inputs.shape[axis])

4474
10


In [13]:
# Number of rows (samples) left after balancing
samples_count = len(shuffled_inputs)

Using a 75-15-10 split for training, validation and testing.

In [14]:
# Share of the samples kept for training and for validation
train_fraction, validation_fraction = 0.75, 0.15

# int() truncates, so any rounding leftover ends up in the testing split
training_count = int(train_fraction * samples_count)
validation_count = int(validation_fraction * samples_count)

# Whatever is not used for training or validation is kept for testing
testing_count = samples_count - (training_count + validation_count)

In [15]:
print(training_count)
print(validation_count)
print(testing_count)

3355
671
448


In [16]:
# The first `training_count` shuffled rows form the training split
training_rows = slice(0, training_count)

train_inputs = shuffled_inputs[training_rows]
train_targets = shuffled_targets[training_rows]

In [17]:
# The next `validation_count` rows, right after the training rows, form
# the validation split
validation_rows = slice(training_count, training_count + validation_count)

validation_inputs = shuffled_inputs[validation_rows]
validation_targets = shuffled_targets[validation_rows]

In [18]:
# Every row after the training and validation rows forms the testing split
testing_rows = slice(training_count + validation_count, None)

test_inputs = shuffled_inputs[testing_rows]
test_targets = shuffled_targets[testing_rows]

### Checking the balance of resultant data set

In [19]:
# For each split, print the share of targets equal to 1
# (sum of the targets divided by the number of samples in the split)
for split_targets, split_count in (
    (train_targets, training_count),
    (validation_targets, validation_count),
    (test_targets, testing_count),
):
    print(np.sum(split_targets) / split_count)

0.49538002980625934
0.4858420268256334
0.5558035714285714


As every split has priors close to 50% (i.e. roughly half of its targets are `0` and the other half are `1`), we can say that our data set splits are balanced.

### Saving the data in .npz file

In [20]:
# Save each split to its own .npz file, storing the arrays under the
# keys "inputs" and "targets"
for file_name, split_inputs, split_targets in (
    ('Audiobooks_training', train_inputs, train_targets),
    ('Audiobooks_validation', validation_inputs, validation_targets),
    ('Audiobooks_testing', test_inputs, test_targets),
):
    np.savez(file_name, inputs = split_inputs, targets = split_targets)