## Imports

In [48]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(threshold=np.inf)



In [50]:
raw_data = np.loadtxt("Audiobooks_data.csv", delimiter=",")
raw_inputs = raw_data[:,1:-1]
raw_targets = raw_data[:,-1]
raw_inputs.shape


(14084, 10)

## shuffle data to avoid effect (eg day effect)

In [53]:
suffled_index = np.arange(raw_inputs.shape[0])
np.random.shuffle(suffled_index)
raw_inputs = raw_inputs[suffled_index]
raw_targets = raw_targets[suffled_index]
raw_targets.shape

(14084,)

## Balance the dataset

In [56]:

num_one_targets = int(np.sum(raw_targets))
zero_target_counter =0
indices_to_remove =[]
for i in range (raw_targets.shape[0]):
    if raw_targets[i] ==0:
        zero_target_counter +=1
        if zero_target_counter > num_one_targets:
            indices_to_remove.append(i)
balanced_targets = np.delete(raw_targets, indices_to_remove, axis=0)
balanced_inputs = np.delete(raw_inputs, indices_to_remove, axis=0)
balanced_targets.shape

(4474,)

### scale the inputs

In [59]:
from sklearn import preprocessing
scaled_inputs = preprocessing.scale(balanced_inputs)

### reshuffle data before spliting

In [62]:
shuffled_index = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_index)
shuffled_inputs = scaled_inputs[shuffled_index]
shuffled_targets = balanced_targets[shuffled_index]

### Split the dataset into train, validation, and test

In [65]:
samples_size = shuffled_inputs.shape[0]

train_size = int(0.8 * samples_size)
validation_size = int(0.1 * samples_size)
test_size =validation_size


train_inputs = shuffled_inputs[:train_size]
train_targets = shuffled_targets[:train_size]


validation_inputs = shuffled_inputs[train_size:train_size+validation_size]
validation_targets = shuffled_targets[train_size:train_size+validation_size]

test_inputs = shuffled_inputs[train_size+validation_size:]
test_targets = shuffled_targets[train_size+validation_size:]


# Print the number of targets that are 1s, the total number of samples, and the proportion for training, validation, and test.
print(np.sum(train_targets), train_size, np.sum(train_targets) / train_size)
print(np.sum(validation_targets), validation_size, np.sum(validation_targets) / validation_size)
print(np.sum(test_targets), test_size, np.sum(test_targets) / test_size)

1775.0 3579 0.49594858899133837
232.0 447 0.5190156599552572
230.0 447 0.5145413870246085


## save in npz

In [68]:
np.savez("Audiobook_data_train", inputs=train_inputs, targets=train_targets)
np.savez("Audiobook_data_validation", inputs=validation_inputs, targets=validation_targets)
np.savez("Audiobook_data_test", inputs=test_inputs, targets=test_targets)