# Audiobooks and Customer Conversion

In [1]:
import os

import numpy as np
from sklearn import preprocessing

In [2]:
# read the comma-separated audiobook dataset into a 2-D NumPy array
raw_data = np.loadtxt(fname='../data/audiobook_data.csv', delimiter=',')

In [3]:
# split the loaded matrix column-wise:
#   - the last column holds the targets
#   - every column between the first and the last is an input feature
#     (the first column is skipped)
targets_all = raw_data[:, -1]
unscaled_inputs_all = raw_data[:, 1:-1]

In [4]:
# NEW
# Let's shuffle the data BEFORE balancing instead this time.
# Seed NumPy's global RNG first so that this shuffle (and every later
# np.random call in the notebook) gives the same result on each
# Restart & Run All; without it the balanced subset and the
# train/validation/test split change on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# build the row indices 0..n-1 and permute them in place
shuffled_indices_all = np.arange(unscaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices_all)

In [5]:
# reorder inputs and targets with the same permutation so rows stay aligned
shuffled_inputs_all = np.take(unscaled_inputs_all, shuffled_indices_all, axis=0)
shuffled_targets_all = np.take(targets_all, shuffled_indices_all, axis=0)

In [6]:
# balance the dataset 50/50:
#   1. count how many samples have target == 1
#   2. later, keep only as many 0-target samples as there are 1s
# the targets are 0/1, so their sum is the number of 1s
num_ones = int(shuffled_targets_all.sum())
num_ones

2237

In [7]:
# sanity check: (number of samples, number of input features)
np.shape(shuffled_inputs_all)

(14084, 10)

In [8]:
# positions of every 0-target sample, in (shuffled) row order
zero_positions = np.flatnonzero(shuffled_targets_all == 0)

# total number of zeros seen, and the surplus zeros to drop: everything
# after the first `num_ones` zeros is removed so both classes end up
# with the same number of samples
zero_counter = int(zero_positions.size)
indices_to_remove = zero_positions[num_ones:].tolist()

# drop the surplus rows from inputs and targets alike
unscaled_inputs_equal_priors, targets_equal_priors = (
    np.delete(shuffled_inputs_all, indices_to_remove, axis=0),
    np.delete(shuffled_targets_all, indices_to_remove, axis=0),
)

In [9]:
# standardize every feature column of the balanced inputs.
# NOTE(review): the scaling statistics are computed over the whole
# balanced set, i.e. before the train/validation/test split below.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors, axis=0)
scaled_inputs.shape

(4474, 10)

In [10]:
# since we are going to batch, the balanced data must be shuffled as well:
# build the row indices and permute them in place
shuffled_indices = np.arange(len(scaled_inputs))
np.random.shuffle(shuffled_indices)

In [11]:
# apply the same permutation to inputs and targets so rows stay paired
shuffled_inputs, shuffled_targets = (
    rows[shuffled_indices] for rows in (scaled_inputs, targets_equal_priors)
)

In [12]:
# the scaled and shuffled dataset is split into train, validation and test;
# start from the total number of rows available
num_samples = len(shuffled_inputs)
num_samples

4474

In [13]:
# 80-10-10 split: train and validation sizes are truncated to whole
# samples, and the test set receives whatever is left over
num_validation_samples = int(0.1 * num_samples)
num_train_samples = int(0.8 * num_samples)
num_testing_samples = num_samples - (num_train_samples + num_validation_samples)

In [14]:
# show each subset's share of the dataset, in percent
# (order: train, validation, test)
for subset_size in (num_train_samples, num_validation_samples, num_testing_samples):
    print(100 * subset_size / num_samples)

79.99552972731337
9.991059454626733
10.013410818059901


In [15]:
# cut the shuffled dataset into three consecutive pieces:
#   [0, num_train_samples)            -> TRAINING DATA
#   [num_train_samples, split_limit)  -> VALIDATION DATA
#   [split_limit, end)                -> TESTING DATA
split_limit = num_train_samples + num_validation_samples
cut_points = [num_train_samples, split_limit]

train_inputs, validation_inputs, test_inputs = np.split(shuffled_inputs, cut_points)
train_targets, validation_targets, test_targets = np.split(shuffled_targets, cut_points)

In [16]:
# check that each subset is balanced: print the fraction of 1-targets
# (order: train, validation, test); values near 0.5 mean a balanced subset
subsets = (
    (train_targets, num_train_samples),
    (validation_targets, num_validation_samples),
    (test_targets, num_testing_samples),
)
for subset_targets, subset_size in subsets:
    print(np.sum(subset_targets) / subset_size)

0.4973456272701872
0.49440715883668906
0.5267857142857143


In [17]:
# save the data in tensor format using .npz files
# np.savez does not create missing directories, so make sure the output
# folder exists first (it may be absent on a fresh checkout)
os.makedirs('./out', exist_ok=True)

# each archive stores the arrays under the keys 'inputs' and 'targets';
# np.savez appends the '.npz' extension to the file name
np.savez('./out/audiobook_training_data', inputs=train_inputs, targets=train_targets)
np.savez('./out/audiobook_validation_data', inputs=validation_inputs, targets=validation_targets)
np.savez('./out/audiobook_testing_data', inputs=test_inputs, targets=test_targets)

### Create the ML Algorithm

In [18]:
import tensorflow as tf

In [19]:
# load the .npz data files that we had saved from part 1:
# inputs are cast to float and targets to int.
# The builtin `float` / `int` are used instead of the `np.float` / `np.int`
# aliases, which were deprecated in NumPy 1.20 and removed in 1.24; the
# aliases pointed at these builtins, so the resulting dtypes are unchanged.

train_npz = np.load('./out/audiobook_training_data.npz')
train_inputs = train_npz['inputs'].astype(float)
train_targets = train_npz['targets'].astype(int)

val_npz = np.load('./out/audiobook_validation_data.npz')
validation_inputs = val_npz['inputs'].astype(float)
validation_targets = val_npz['targets'].astype(int)

test_npz = np.load('./out/audiobook_testing_data.npz')
test_inputs = test_npz['inputs'].astype(float)
test_targets = test_npz['targets'].astype(int)

### MODEL

In [20]:
# network dimensions
# (input_size is recorded for reference only: no layer below is given an
# explicit input shape)
input_size = 10
output_size = 2
hidden_layer_size = 128

# feed-forward classifier: two ReLU hidden layers of equal width followed
# by a softmax layer with one unit per class
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

In [21]:
# configure training: sparse categorical cross-entropy loss (targets are
# integer class labels), Adam optimizer, and accuracy reported per epoch
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# training hyper-parameters
MAX_EPOCHS = 100
BATCH_SIZE = 10

# stop training once the validation loss has failed to improve for
# 2 consecutive epochs ('val_loss' is the callback's default monitor,
# spelled out here for clarity)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

In [23]:
# train the model, checking the validation set after every epoch;
# training ends after MAX_EPOCHS or earlier if early stopping triggers.
# `callbacks` is documented as a list of callbacks, so the single
# EarlyStopping instance is wrapped in a list.
# verbose=2 prints one summary line per epoch.
model.fit(train_inputs,
          train_targets,
          batch_size=BATCH_SIZE,
          epochs=MAX_EPOCHS,
          callbacks=[early_stopping],
          validation_data=(validation_inputs, validation_targets),
          verbose=2)

Epoch 1/100
358/358 - 1s - loss: 0.4469 - accuracy: 0.7550 - val_loss: 0.4139 - val_accuracy: 0.7673
Epoch 2/100
358/358 - 0s - loss: 0.3898 - accuracy: 0.7882 - val_loss: 0.3783 - val_accuracy: 0.8009
Epoch 3/100
358/358 - 0s - loss: 0.3800 - accuracy: 0.7991 - val_loss: 0.3689 - val_accuracy: 0.8098
Epoch 4/100
358/358 - 0s - loss: 0.3693 - accuracy: 0.8016 - val_loss: 0.3853 - val_accuracy: 0.7785
Epoch 5/100
358/358 - 0s - loss: 0.3652 - accuracy: 0.8033 - val_loss: 0.3761 - val_accuracy: 0.7673


<tensorflow.python.keras.callbacks.History at 0x2466cf5e3d0>

### TESTING

In [24]:
# score the trained model on the held-out test set; evaluate returns the
# loss followed by the metrics passed to compile (here: accuracy)
test_loss, test_acc = model.evaluate(x=test_inputs, y=test_targets)

