#Audiobooks business case

Create a machine learning algorithm that can predict if a customer will buy again

Preprocess the data. Balance the dataset. Create 3 datasets: training, validation, and test. Save the newly created sets in a tensor friendly format (e.g. *.npz)
Since we are dealing with real life data, we will need to preprocess it a bit. This is the relevant code, which is not that hard, but is crucial to creating a good model.

If you want to know how to do that, go through the code with comments. In any case, this should do the trick for most datasets organized in the way: many inputs, and then 1 cell containing the targets (supersized learning datasets). Keep in mind that a specific problem may require additional preprocessing.

Note that we have removed the header row, which contains the names of the categories. We simply want the data.

In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle
import tensorflow as tf

raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

##Balance the dataset

In [3]:
num_one_targets = int(np.sum(targets_all))

zero_targets_counter = 0

indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

##Standardize the inputs

In [4]:
scaler_deep_learning = StandardScaler()
scaled_inputs = scaler_deep_learning.fit_transform(unscaled_inputs_equal_priors)

##Shuffle the data

Shuffling the data means that we randomize the order in which the data is presented to the model during training. This helps the model to learn the general patterns in the data, rather than memorizing the specific order.

In [5]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

##Split the dataset into train, validation, and test

In [6]:
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)

test_samples_count = samples_count - train_samples_count - validation_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]

print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

1791.0 3579 0.5004191114836547
222.0 447 0.4966442953020134
224.0 448 0.5


##Save the three datasets in *.npz

In [7]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

##Save the scaler

In [8]:
pickle.dump(scaler_deep_learning, open('scaler_deep_learning.pickle', 'wb'))

##Data

In [9]:
npz = np.load('Audiobooks_data_train.npz')

In [10]:


train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)

npz = np.load('Audiobooks_data_validation.npz')
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

npz = np.load('Audiobooks_data_test.npz')
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  train_inputs = npz['inputs'].astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  train_targets = npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np

##model

In [11]:
#input_size = 10
output_size = 2
hidden_layer_size = 50

model = tf.keras.Sequential([
                            tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                            tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                            tf.keras.layers.Dense(output_size, activation='softmax')
                            ])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics = ['accuracy'])

batch_size = 100

max_epochs=100

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,
          train_targets,
          batch_size=batch_size,
          epochs=max_epochs,
          callbacks=[early_stopping],
          validation_data=(validation_inputs, validation_targets),
          verbose=2
          )


Epoch 1/100
36/36 - 1s - loss: 0.5056 - accuracy: 0.8215 - val_loss: 0.3824 - val_accuracy: 0.8702 - 1s/epoch - 38ms/step
Epoch 2/100
36/36 - 0s - loss: 0.3592 - accuracy: 0.8754 - val_loss: 0.3149 - val_accuracy: 0.8881 - 95ms/epoch - 3ms/step
Epoch 3/100
36/36 - 0s - loss: 0.3162 - accuracy: 0.8877 - val_loss: 0.2957 - val_accuracy: 0.8926 - 96ms/epoch - 3ms/step
Epoch 4/100
36/36 - 0s - loss: 0.2930 - accuracy: 0.8908 - val_loss: 0.2832 - val_accuracy: 0.8881 - 125ms/epoch - 3ms/step
Epoch 5/100
36/36 - 0s - loss: 0.2799 - accuracy: 0.8952 - val_loss: 0.2752 - val_accuracy: 0.8993 - 162ms/epoch - 5ms/step
Epoch 6/100
36/36 - 0s - loss: 0.2696 - accuracy: 0.8994 - val_loss: 0.2719 - val_accuracy: 0.8993 - 154ms/epoch - 4ms/step
Epoch 7/100
36/36 - 0s - loss: 0.2629 - accuracy: 0.9016 - val_loss: 0.2652 - val_accuracy: 0.9016 - 141ms/epoch - 4ms/step
Epoch 8/100
36/36 - 0s - loss: 0.2560 - accuracy: 0.9050 - val_loss: 0.2640 - val_accuracy: 0.9038 - 136ms/epoch - 4ms/step
Epoch 9/100


<keras.callbacks.History at 0x7b3a559a7c70>

the optimal hidden layer size is a part of teh fine-tunning of teh model

batch size: indicate how many observations are fed to the algorithm at once



in 93% of the cases our model has predicted whether a customer will convert again

##test the model

In [12]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [13]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.26. Test accuracy: 89.06%


the test accuracy is always lower than teh validation accuracy

we can make solid predictions about teh purchasing behaviour of 9 out of 10 people

##obtain the probability for a customer to convert

In [14]:
model.predict(test_inputs).round(2)



array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.15, 0.85],
       [0.2 , 0.8 ],
       [0.93, 0.07],
       [0.9 , 0.1 ],
       [0.97, 0.03],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.04, 0.96],
       [0.29, 0.71],
       [0.04, 0.96],
       [0.94, 0.06],
       [0.23, 0.77],
       [1.  , 0.  ],
       [0.04, 0.96],
       [0.92, 0.08],
       [0.95, 0.05],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.3 , 0.7 ],
       [0.05, 0.95],
       [0.  , 1.  ],
       [0.23, 0.77],
       [0.84, 0.16],
       [0.89, 0.11],
       [0.93, 0.07],
       [0.92, 0.08],
       [0.73, 0.27],
       [0.22, 0.78],
       [0.99, 0.01],
       [0.93, 0.07],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.66, 0.34],
       [0.  , 1.  ],
       [0.96, 0.04],
       [0.75, 0.25],
       [0.  , 1.  ],
       [0.3 , 0.7 ],
       [0.3 , 0.7 ],
       [0.99, 0.01],
       [0.2 ,

In [15]:
model.predict(test_inputs)[:,1].round(0)



array([0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1.,
       0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1.,
       1., 0., 0., 1., 1.

this method is for binary (0 and 1). if it has more tahn 2 outputs we can use argmax

In [16]:
np.argmax(model.predict(test_inputs),axis=1)



array([0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,

##save the model

In [17]:
model.save('audiobooks_model.h5')

#Predicting on new data

##load the scaler and the model

In [18]:
scaler_deep_learning =pickle.load(open('scaler_deep_learning.pickle','rb'))

In [19]:
model = tf.keras.models.load_model('audiobooks_model.h5')

##load the new data

In [21]:
raw_data = np.loadtxt('New_Audiobooks_Data.csv', delimiter=',')
new_data_inputs = raw_data[:,1:]

##predict the probability of a customer to convert

In [22]:
new_data_inputs_scaled = scaler_deep_learning.transform(new_data_inputs)

In [23]:
model.predict(new_data_inputs_scaled)[:,1].round(2)



array([0.  , 0.  , 0.05, 1.  , 0.  , 0.05, 0.05, 0.07, 0.04, 0.79, 0.  ,
       0.76, 0.97, 0.  , 0.06, 0.09, 0.84, 0.71, 0.82, 0.96, 1.  , 1.  ,
       1.  , 0.02, 0.  , 1.  , 0.29, 0.  , 1.  , 1.  ], dtype=float32)

In [24]:
np.argmax(model.predict(new_data_inputs_scaled),1)



array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1])