# Audiobook Practical Example (step 1)

## Import Libraries

In [11]:
import numpy as np
from sklearn import preprocessing

## Extract data from csv

In [12]:
# Read the whole CSV into a single numeric array (comma-separated values).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists — consider a configurable data dir.
csv_path = 'C:/Users/iolley2/Desktop/DS Contd/TensorFlow/Business Case/Audiobooks_data.csv'
raw_csv_data = np.loadtxt(csv_path, delimiter=',')

# The last column holds the targets; the inputs are every column between the
# (skipped) first column and that last one.
targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, 1:-1]

# Report the shape of the raw array, i.e. before the two columns are dropped.
print(f'{raw_csv_data.shape}\nRecall targets and first column was removed')

(14084, 12)
Recall targets and first column was removed


## Balance dataset

In [13]:
# Balance the classes: keep every row whose target is non-zero and only the
# first `num_one_targets` rows (in file order) whose target is 0.
# The count is obtained by summing the targets, so it equals the number of
# target-1 rows only if the labels are strictly 0/1 — TODO confirm.
num_one_targets = int(np.sum(targets_all))  # rows with target 1 (customers that converted, per the original note)

# Positions of all target-0 rows, ascending. This replaces the original
# row-by-row Python loop with a single vectorized lookup.
zero_target_indices = np.flatnonzero(targets_all == 0)
zero_targets_counter = int(zero_target_indices.size)  # total number of target-0 rows, kept for compatibility

# Every target-0 row beyond the first `num_one_targets` of them is surplus.
# (An empty slice is fine: np.delete then simply returns a copy.)
indices_to_remove = zero_target_indices[num_one_targets:]

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

## Standardize the inputs

In [14]:
# Standardize the balanced inputs with scikit-learn's preprocessing.scale
# (per its documentation: each feature is centered to zero mean and scaled to
# unit variance).
# NOTE(review): this runs on all balanced rows before the train/validation/test
# split further down, so the scaling statistics include validation and test
# rows — confirm that is acceptable, or derive the scaling from training rows only.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

## Shuffle data

In [15]:
# Start from the identity ordering 0 .. n_rows-1: one index per row (axis 0)
# of the scaled inputs. The order is randomized in the next cell.
shuffled_indices = np.arange(len(scaled_inputs))
shuffled_indices

array([   0,    1,    2, ..., 4471, 4472, 4473])

In [16]:
# Draw the row order from a dedicated, seeded generator instead of the global
# unseeded one, so "Restart & Run All" always produces the same permutation and
# therefore the same train/validation/test membership. Building the permutation
# from scratch (rather than shuffling in place) also makes this cell idempotent:
# re-running it alone does not re-shuffle an already shuffled array.
SHUFFLE_SEED = 42  # arbitrary fixed seed; change it to draw a different split
shuffled_indices = np.random.RandomState(SHUFFLE_SEED).permutation(scaled_inputs.shape[0])


In [17]:
# Reorder the rows of the scaled inputs according to the shuffled index array.
shuffled_inputs = np.take(scaled_inputs, shuffled_indices, axis=0)
shuffled_inputs

array([[ 0.21053387, -0.18888517, -0.12759546, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [ 1.27894497,  0.41646744,  0.53870369, ..., -0.41569922,
        -0.20183481,  2.61858106],
       [ 1.27894497,  0.41646744,  0.60615619, ..., -0.41569922,
        -0.20183481, -0.80255852],
       ...,
       [-1.71260612, -1.27851988, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [-0.64419501, -0.67316726, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [ 0.21053387, -0.18888517, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.33471037]])

In [18]:

# Apply the same shuffled index array to the targets so that every target
# stays aligned with its input row.
shuffled_targets = np.take(targets_equal_priors, shuffled_indices, axis=0)
shuffled_targets

array([1., 1., 0., ..., 1., 1., 1.])

## Split the dataset into train, validation and test

In [19]:
samples_count = shuffled_inputs.shape[0]  # total number of shuffled rows

# 80-10-10 split. The train and validation sizes are truncated to whole rows;
# the test set receives whatever remains, so the three sizes sum to samples_count.
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Consecutive, non-overlapping row ranges: train first, then validation,
# then test through to the last row.
train_rows = slice(0, train_samples_count)
validation_rows = slice(train_samples_count, train_samples_count + validation_samples_count)
test_rows = slice(train_samples_count + validation_samples_count, None)

# The same row range is applied to inputs and targets so they stay aligned.
train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

In [20]:
# For each split print: sum of its targets, its row count, and their ratio
# (a quick check that each split is still roughly balanced).
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    target_sum = np.sum(split_targets)
    print(target_sum, split_size, target_sum / split_size)

1808.0 3579 0.5051690416317407
216.0 447 0.48322147651006714
213.0 448 0.47544642857142855


## Save 3 datasets in *.npz

In [32]:
# Write each split to its own .npz archive (np.savez adds the extension),
# storing the arrays under the keys 'inputs' and 'targets'.
for file_stem, split_inputs, split_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
):
    np.savez(file_stem, inputs=split_inputs, targets=split_targets)