In [1]:
import numpy as np
from sklearn import preprocessing

# Read the whole comma-separated file into one NumPy array.
raw_csv_data = np.loadtxt(fname='Audiobooks_data.csv', delimiter=',')

In [2]:
# Keep every row but leave out the first column (presumably an identifier —
# TODO confirm) and the last column, which is read separately as the targets.
feature_columns = slice(1, -1)
unscaled_input = raw_csv_data[:, feature_columns]

In [3]:
# The final column of the loaded data holds the target for each row.
target_column = -1
targets_all = raw_csv_data[:, target_column]

In [4]:
# Number of samples before any balancing is applied.
targets_all.shape[0]

14084

In [5]:
# Summing the targets counts the positive class
# (assumes the labels are strictly 0/1 — TODO confirm).
num_one_targets = int(targets_all.sum())

In [6]:
# Walk the targets in file order while counting zeros. Once the zero count
# exceeds the number of ones, every further zero-target row is marked for
# removal, so at most as many zeros as ones are kept.
indexs_to_remove = []
num_zero_targets = 0

for row_index, target_value in enumerate(targets_all):
    if target_value != 0:
        continue  # only zero-target rows are candidates for removal
    num_zero_targets += 1
    if num_zero_targets > num_one_targets:
        indexs_to_remove.append(row_index)

In [7]:
# Drop the surplus zero-target rows from the features and the targets alike,
# using the same row indices so the two arrays stay aligned.
unscaled_inputs_equal_prior = np.delete(arr=unscaled_input, obj=indexs_to_remove, axis=0)
targets_equal_prior = np.delete(arr=targets_all, obj=indexs_to_remove, axis=0)

In [8]:
# Standardize the balanced features with scikit-learn's `scale` (library defaults).
# NOTE(review): this runs before the train/validation/test split, so the scaling
# statistics also include the held-out rows — confirm this is intended.
scaled_inputs = preprocessing.scale(X=unscaled_inputs_equal_prior)

In [9]:
# Shuffle the rows before splitting. The permutation is drawn from a dedicated,
# seeded Generator so that a fresh "Restart & Run All" reproduces the same
# ordering (and therefore the same subsets) instead of a different one each run.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

suffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# One permutation indexes both arrays, keeping features and targets aligned.
shuffled_inputs = scaled_inputs[suffled_indices]
shuffled_targets = targets_equal_prior[suffled_indices]

In [10]:
# Split sizes: 80% training and 10% validation (both truncated to integers);
# the test subset receives whatever rows remain.
sample_count = shuffled_inputs.shape[0]

train_sample_count = int(0.8 * sample_count)
validation_sample_count = int(0.1 * sample_count)
test_sample_count = int(sample_count - train_sample_count - validation_sample_count)

# Contiguous row ranges of the shuffled data for each subset.
validation_end = train_sample_count + validation_sample_count
train_rows = slice(None, train_sample_count)
validation_rows = slice(train_sample_count, validation_end)
test_rows = slice(validation_end, None)

# Apply each range to the features and the targets together.
train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

In [11]:
# Report the class balance of every subset: the number of ones, the number of
# zeros, and the share of ones as a percentage.
split_reports = [
    ("Training Dataset \t no. of ones: {0:.0f}, no. of zeros: {1:.0f}, prior: {2:.2f}%", train_targets, train_sample_count),
    ("Validation Dataset \t no. of ones: {0:.0f}, no. of zeros: {1:.0f}, prior: {2:.2f}%", validation_targets, validation_sample_count),
    ("Test Dataset \t\t no. of ones: {0:.0f}, no. of zeros: {1:.0f}, prior: {2:.2f}%", test_targets, test_sample_count),
]

for report_format, subset_targets, subset_size in split_reports:
    ones_in_subset = np.sum(subset_targets)
    print(report_format.format(ones_in_subset, subset_size - ones_in_subset, (ones_in_subset / subset_size) * 100))

Training Dataset 	 no. of ones: 1774, no. of zeros: 1805, prior: 49.57%
Validation Dataset 	 no. of ones: 225, no. of zeros: 222, prior: 50.34%
Test Dataset 		 no. of ones: 238, no. of zeros: 210, prior: 53.12%


In [12]:
# Write each subset to its own archive, storing the arrays under the keys
# 'inputs' and 'targets'.
subsets_to_save = [
    ('Audiobook_data_train', train_inputs, train_targets),
    ('Audiobook_data_validation', validation_inputs, validation_targets),
    ('Audiobook_data_test', test_inputs, test_targets),
]

for archive_name, subset_inputs, subset_targets in subsets_to_save:
    np.savez(archive_name, inputs=subset_inputs, targets=subset_targets)

In [13]:
import tensorflow as tf

In [14]:
# Reload the three .npz archives and cast features to float and targets to int.
#
# The builtin `float` / `int` are used instead of `np.float` / `np.int`: those
# names were plain aliases of the builtins, were deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where they raise AttributeError. Each archive is opened
# in a `with` block so its underlying file handle is closed after reading.
with np.load('Audiobook_data_train.npz') as npz:
    train_inputs, train_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

with np.load('Audiobook_data_validation.npz') as npz:
    validation_inputs, validation_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

with np.load('Audiobook_data_test.npz') as npz:
    test_inputs, test_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

In [21]:
# Network dimensions.
inputs = 10  # not referenced in this cell; presumably the feature count — TODO confirm
output_size = 2
hidden_layers_size = 50

# Three equally sized ReLU hidden layers followed by a softmax output layer.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layers_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layers_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layers_size, activation='relu'),

    tf.keras.layers.Dense(output_size, activation='softmax'),
])

# The sparse categorical loss takes integer class labels directly, matching
# the integer-cast targets, so no one-hot encoding is needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 100
EPOCHS = 50

# Stop when the monitored validation loss has not improved for 2 consecutive
# epochs. `restore_best_weights=True` rolls the model back to the best epoch's
# weights; without it the model keeps the weights of the last (worse) epoch and
# any later evaluation is run on that degraded model.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

# Left as the final expression so the cell still displays the returned History.
model.fit(train_inputs,
          train_targets,
          batch_size=batch_size,
          epochs=EPOCHS,
          callbacks=[early_stopping],
          validation_data=(validation_inputs, validation_targets),
          verbose=2)

Epoch 1/50
36/36 - 1s - loss: 0.5658 - accuracy: 0.6865 - val_loss: 0.4784 - val_accuracy: 0.7539
Epoch 2/50
36/36 - 0s - loss: 0.4325 - accuracy: 0.7728 - val_loss: 0.4212 - val_accuracy: 0.7785
Epoch 3/50
36/36 - 0s - loss: 0.3874 - accuracy: 0.7941 - val_loss: 0.3890 - val_accuracy: 0.8076
Epoch 4/50
36/36 - 0s - loss: 0.3659 - accuracy: 0.7991 - val_loss: 0.3777 - val_accuracy: 0.7808
Epoch 5/50
36/36 - 0s - loss: 0.3553 - accuracy: 0.8075 - val_loss: 0.3700 - val_accuracy: 0.7763
Epoch 6/50
36/36 - 0s - loss: 0.3468 - accuracy: 0.8100 - val_loss: 0.3604 - val_accuracy: 0.8188
Epoch 7/50
36/36 - 0s - loss: 0.3422 - accuracy: 0.8111 - val_loss: 0.3700 - val_accuracy: 0.7852
Epoch 8/50
36/36 - 0s - loss: 0.3367 - accuracy: 0.8117 - val_loss: 0.3661 - val_accuracy: 0.8121


<tensorflow.python.keras.callbacks.History at 0x1c8ecc2a9a0>

In [22]:
# Score the trained model on the test subset; the two returned values are
# unpacked as the loss and the accuracy metric the model was compiled with.
test_loss, test_accuracy = model.evaluate(x=test_inputs, y=test_targets)

