# Neural Networks with Tensorflow Basic Exercise

## Import Relevant Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

## Load the data

In [2]:
full_data = np.loadtxt("Audiobooks_data.csv", delimiter=',')

In [3]:
full_data

array([[9.9400e+02, 1.6200e+03, 1.6200e+03, ..., 5.0000e+00, 9.2000e+01,
        0.0000e+00],
       [1.1430e+03, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0590e+03, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 3.8800e+02,
        0.0000e+00],
       ...,
       [3.1134e+04, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.2832e+04, 1.6200e+03, 1.6200e+03, ..., 0.0000e+00, 9.0000e+01,
        0.0000e+00],
       [2.5100e+02, 1.6740e+03, 3.3480e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

This data is in the form of a matrix. The columns are as below 
1. Id
2. overall book length in minutes
3. average book length in minutes
4. Price
5. Average price
6. whether or not a review was given (1:yes, 0:no)
7. what was the review on a scale of 10
8. minutes listened
9. percent completed
10. support requests
11. last visited minus purchase date
12. targets

## Feature Selection

We will create a new column that records the number of books purchased using the second and the third column.

In [4]:
# create an empty array to store the number of books for each id
num_books_column = []

# take the total reading and the average and divide to get number of books
for total, avg in zip(full_data[:, 1], full_data[:, 2]):
    num_books = total/avg
    num_books_column = np.append(num_books_column, num_books)

# insert this as a column
inputs = np.insert(full_data, 3, num_books_column, axis=1)
inputs[:, 3]

array([1. , 1. , 1. , ..., 1. , 1. , 0.5])

Now, looking at the columns, I will use the newly created column, the price column, review column, percent completed, and the last visited minus the date of purchase since these columns seem logically most relevant towards predicting a customer's loyalty and interest.

In [5]:
inputs = full_data
inputs1 = inputs[:, 3]
inputs2 = inputs[:, 4]
inputs3 = inputs[:, 6]
inputs4 = inputs[:, 9]
inputs5 = inputs[:, 11]
output = inputs[:, -1]
columns_to_include = np.stack((inputs1, inputs2, inputs3, inputs4, inputs5, output), axis=1)
columns_to_include

array([[19.73, 19.73, 10.  ,  5.  ,  0.  ,  0.  ],
       [ 5.33,  5.33,  8.91,  0.  ,  0.  ,  0.  ],
       [ 5.33,  5.33,  8.91,  0.  ,  0.  ,  0.  ],
       ...,
       [ 6.14,  6.14,  8.91,  0.  ,  0.  ,  0.  ],
       [ 5.33,  5.33,  8.  ,  0.  ,  0.  ,  0.  ],
       [ 5.33, 10.67,  8.91,  0.  ,  1.  ,  1.  ]])

## Split the data into training and testing data

In [6]:
# shuffle the columns to make them randomized
np.random.shuffle(columns_to_include)

# keep the last 2000 samples for testing purpose
training_inputs_to_include = columns_to_include[:-2000, :-1]
testing_inputs_to_include = columns_to_include[-2000:, :-1]
training_targets = (columns_to_include[:-2000, -1]).astype(int)
testing_targets = (columns_to_include[-2000:, -1]).astype(int)

## Save the data as an .npz file (tensorflow friendly)

In [7]:
np.savez("Audiobooks_training", inputs=training_inputs_to_include, targets=training_targets)
np.savez("Audiobooks_testing", inputs=testing_inputs_to_include, targets=testing_targets)

## Load the data from this saved file

In [8]:
training_data = np.load("Audiobooks_training.npz")

## Create the Model

In [9]:
# number of inputs
input_size = 5
# number of outputs
output_size = 2
# hidden layer size 
hidden_layer_size = 150

# create the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='sigmoid'), # first hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='sigmoid'), # second hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='sigmoid'), # thord hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='sigmoid'), # fourth hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='sigmoid'), # fifth hidden layer
    
    tf.keras.layers.Dense(output_size, activation='softmax') # 
])

## Decide the loss function, the optimizer, and train the data

In [10]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(training_data['inputs'], training_data['targets'], epochs=100, verbose=2)

Epoch 1/100
378/378 - 1s - loss: 0.2334 - accuracy: 0.9127
Epoch 2/100
378/378 - 1s - loss: 0.0048 - accuracy: 0.9991
Epoch 3/100
378/378 - 1s - loss: 0.0054 - accuracy: 0.9989
Epoch 4/100
378/378 - 0s - loss: 0.0063 - accuracy: 0.9988
Epoch 5/100
378/378 - 0s - loss: 0.0016 - accuracy: 0.9997
Epoch 6/100
378/378 - 1s - loss: 7.8850e-04 - accuracy: 0.9998
Epoch 7/100
378/378 - 1s - loss: 0.0017 - accuracy: 0.9997
Epoch 8/100
378/378 - 0s - loss: 9.0888e-04 - accuracy: 0.9999
Epoch 9/100
378/378 - 1s - loss: 0.0066 - accuracy: 0.9988
Epoch 10/100
378/378 - 0s - loss: 0.0067 - accuracy: 0.9992
Epoch 11/100
378/378 - 1s - loss: 0.0019 - accuracy: 0.9998
Epoch 12/100
378/378 - 1s - loss: 8.7302e-05 - accuracy: 1.0000
Epoch 13/100
378/378 - 0s - loss: 4.4328e-05 - accuracy: 1.0000
Epoch 14/100
378/378 - 1s - loss: 2.8228e-05 - accuracy: 1.0000
Epoch 15/100
378/378 - 0s - loss: 1.9634e-05 - accuracy: 1.0000
Epoch 16/100
378/378 - 0s - loss: 1.4349e-05 - accuracy: 1.0000
Epoch 17/100
378/378 

<tensorflow.python.keras.callbacks.History at 0x1dc9c584688>

## Test the model and predict accuracy

In [11]:
test_data = np.load("Audiobooks_testing.npz")

In [12]:
raw_outputs = model.predict_on_batch(test_data['inputs'])

In [13]:
outputs = []
for arr in raw_outputs:
    if arr[0] > arr[1]:
        outputs = np.append(outputs, 0)
    else:
        outputs = np.append(outputs, 1)
outputs = outputs.astype(int)
validate_array = (outputs == test_data['targets'])
print(f"Accuracy: {(validate_array.sum()/len(validate_array)*100).round(2)}%")

Accuracy: 100.0%
