In [234]:
import numpy as np
from sklearn import preprocessing

# DATA PREPROCESSING

## Load the data

In [250]:
raw_data=np.loadtxt('Audiobooks_data.csv',delimiter=',')
unscaled_inputs=raw_data[:,1:-1]
targets_all=raw_data[:,-1]

## Shuffle the data

In [251]:
shuffled_indices=np.arange(unscaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_unscaled_inputs=unscaled_inputs[shuffled_indices]
shuffled_unscaled_targets=targets_all[shuffled_indices]

## Balance the dataset

In [252]:
number_ones=int(np.sum(shuffled_unscaled_targets))
zero_targets_counter=0
indices_to_remove=[]

for i in range(shuffled_unscaled_targets.shape[0]):
    if targets_all[i]==0:
        zero_targets_counter+=1
        if zero_targets_counter>number_ones:
            indices_to_remove.append(i)

In [253]:
unscaled_inputs_equal_priors=np.delete(shuffled_unscaled_inputs,indices_to_remove,axis=0)
targets_all_equal_priors=np.delete(shuffled_unscaled_targets,indices_to_remove,axis=0)

## Standardize the inputs

In [254]:
scaled_inputs=preprocessing.scale(unscaled_inputs_equal_priors)

## Shuffle the data

In [255]:
shuffled_indices=np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs=scaled_inputs[shuffled_indices]
shuffled_targets=targets_all_equal_priors[shuffled_indices]

## Splitting the data

In [256]:
samples_count=shuffled_inputs.shape[0]

train_samples_count=int(0.8*samples_count)
validation_samples_count=int(0.1*samples_count)
test_samples_count=int(0.1*samples_count)

train_inputs=shuffled_inputs[:train_samples_count]
train_targets=shuffled_targets[:train_samples_count]

validation_inputs=shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets=shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

test_inputs=shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets=shuffled_targets[train_samples_count+validation_samples_count:]

In [257]:
print(int(np.sum(train_targets)),train_samples_count)

557 3579


## Save the datasets as .npz

In [258]:
np.savez('Audiobook_data_train',inputs=train_inputs,targets=train_targets)
np.savez('Audiobook_data_validated',inputs=validation_inputs,targets=validation_targets)
np.savez('Audiobook_data_test',inputs=test_inputs,targets=test_targets)

# DEEP LEARNING

In [259]:
import numpy as np
import tensorflow as tf

In [260]:
npz=np.load('Audiobook_data_train.npz')
train_inputs=npz['inputs'].astype(float)
train_targets=npz['targets'].astype(int)

npz=np.load('Audiobook_data_validated.npz')
validated_inputs=npz['inputs'].astype(float)
validated_targets=npz['targets'].astype(int)

npz=np.load('Audiobook_data_test.npz')
test_inputs=npz['inputs'].astype(float)
test_targets=npz['targets'].astype(int)

In [261]:
input_size=10
output_size=2
hidden_layer_size=50

In [262]:
model=tf.keras.Sequential([
                                                tf.keras.layers.Dense(hidden_layer_size,activation='relu'),   
                                                tf.keras.layers.Dense(hidden_layer_size,activation='relu'),                                                
                                                tf.keras.layers.Dense(output_size,activation='softmax')
                                              ])

model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])


In [263]:
early_stopping=tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,train_targets,batch_size=100,epochs=100,callbacks=[early_stopping],validation_data=(validated_inputs,validated_targets),verbose=2)

Epoch 1/100
36/36 - 1s - loss: 0.5146 - sparse_categorical_accuracy: 0.7944 - val_loss: 0.4158 - val_sparse_categorical_accuracy: 0.8412 - 724ms/epoch - 20ms/step
Epoch 2/100
36/36 - 0s - loss: 0.3488 - sparse_categorical_accuracy: 0.8829 - val_loss: 0.3502 - val_sparse_categorical_accuracy: 0.8770 - 56ms/epoch - 2ms/step
Epoch 3/100
36/36 - 0s - loss: 0.3019 - sparse_categorical_accuracy: 0.8961 - val_loss: 0.3237 - val_sparse_categorical_accuracy: 0.8814 - 52ms/epoch - 1ms/step
Epoch 4/100
36/36 - 0s - loss: 0.2819 - sparse_categorical_accuracy: 0.9022 - val_loss: 0.3078 - val_sparse_categorical_accuracy: 0.8881 - 52ms/epoch - 1ms/step
Epoch 5/100
36/36 - 0s - loss: 0.2699 - sparse_categorical_accuracy: 0.9036 - val_loss: 0.2953 - val_sparse_categorical_accuracy: 0.8859 - 52ms/epoch - 1ms/step
Epoch 6/100
36/36 - 0s - loss: 0.2624 - sparse_categorical_accuracy: 0.9050 - val_loss: 0.2852 - val_sparse_categorical_accuracy: 0.8881 - 56ms/epoch - 2ms/step
Epoch 7/100
36/36 - 0s - loss: 0

<keras.callbacks.History at 0x1bf4516ac70>

# TESTING

In [264]:
model.evaluate(test_inputs,test_targets)



[0.23834264278411865, 0.9017857313156128]