In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing

### Extract Data from CSV 

In [21]:
# Parse the whole CSV into a single 2-D NumPy array.
raw_data_csv = np.loadtxt('Audiobooks.csv', delimiter=',')

# Column 0 is skipped, the last column holds the targets and everything in
# between is kept as the model inputs.
unscaled_input_data, targets_all = raw_data_csv[:, 1:-1], raw_data_csv[:, -1]

In [22]:
# Sanity check: inputs and targets must have the same number of rows.
print(unscaled_input_data.shape, targets_all.shape, sep='\n')

(14084, 10)
(14084,)


### Balance the Dataset

In [23]:
# Targets are 0/1, so their sum is the number of positive samples.
num_one_targets = int(np.sum(targets_all))

# Row positions of every zero-target sample, in file order.
zero_positions = [row for row in range(targets_all.shape[0]) if targets_all[row] == 0]
zero_target_counter = len(zero_positions)

# Keep only the first `num_one_targets` zeros; every later zero is dropped so
# both classes end up with the same number of samples.
indices_to_remove = zero_positions[num_one_targets:]

unscaled_input_equal_priors = np.delete(unscaled_input_data, indices_to_remove, axis=0)
target_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

In [24]:
# Shapes after balancing: the row count should be twice the number of positives.
print(unscaled_input_equal_priors.shape, target_equal_priors.shape, sep='\n')

(4474, 10)
(4474,)


### Standardize the Inputs

In [25]:
# Standardize the balanced inputs with sklearn's preprocessing.scale.
# NOTE(review): this runs on the full balanced dataset, before the
# train/validation/test split further down, so the scaling statistics are
# presumably computed from validation and test rows as well - confirm that
# this is intended.
scaled_inputs = preprocessing.scale(unscaled_input_equal_priors)

### Shuffle the Data

In [26]:
# Use a dedicated, seeded generator so the permutation (and therefore the
# train/validation/test split written to disk below) is identical on every
# Restart & Run All. A private RandomState leaves NumPy's global random state
# untouched.
shuffle_rng = np.random.RandomState(42)
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = target_equal_priors[shuffled_indices]

### Split dataset into Train, Validation, and Test

In [27]:
samples_count = shuffled_inputs.shape[0]

# 80% / 10% split; the test set takes whatever is left so that no sample is
# lost to integer truncation.
train_sample_count = int(0.8*samples_count)
validation_sample_count = int(0.1*samples_count)
test_sample_count = samples_count - train_sample_count - validation_sample_count

# Contiguous, non-overlapping row ranges for the three subsets.
validation_end = train_sample_count + validation_sample_count
train_rows = slice(None, train_sample_count)
validation_rows = slice(train_sample_count, validation_end)
test_rows = slice(validation_end, None)

train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

# Report positives, subset size and positive ratio for each subset to confirm
# that the classes stayed roughly balanced after the shuffle.
for subset_targets, subset_size in (
    (train_targets, train_sample_count),
    (validation_targets, validation_sample_count),
    (test_targets, test_sample_count),
):
    print(np.sum(subset_targets), subset_size, np.sum(subset_targets)/subset_size)

1795.0 3579 0.5015367421067337
219.0 447 0.4899328859060403
223.0 448 0.49776785714285715


### Save the Three Datasets as `.npz` Files

In [28]:
# Write each subset to its own archive with the arrays stored under the keys
# 'inputs' and 'targets'; np.savez appends the '.npz' extension.
for archive_name, subset_inputs, subset_targets in (
    ('Audio_data_train', train_inputs, train_targets),
    ('Audio_data_validation', validation_inputs, validation_targets),
    ('Audio_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=subset_inputs, targets=subset_targets)

### Create a Class that will Batch the Data

In [29]:
class Data_Reader():
    """Iterate over one saved dataset in fixed-size batches.

    The data is read from ``Audio_data_<dataset>.npz`` in the current working
    directory, which must contain the arrays ``inputs`` and ``targets``.

    Each iteration step yields ``(inputs_batch, targets_one_hot)`` where
    ``targets_one_hot`` has one row per sample and two columns (one per class).
    Only full batches are produced: if the number of samples is not a multiple
    of ``batch_size`` the trailing samples are skipped. After the last batch
    the reader rewinds itself, so the same instance can be looped over again
    (e.g. once per epoch).
    """

    def __init__(self, dataset, batch_size = None):
        """Load the arrays for ``dataset`` and set up batching.

        Args:
            dataset: Suffix of the archive name, e.g. 'train' for
                'Audio_data_train.npz'.
            batch_size: Samples per batch. ``None`` (default) puts the whole
                dataset into a single batch.

        Raises:
            ValueError: If ``batch_size`` is given but is smaller than 1.
        """
        # np.load returns an NpzFile that holds the archive open; the context
        # manager closes it as soon as the arrays have been copied out.
        with np.load('Audio_data_{0}.npz'.format(dataset)) as npz:
            # The np.float / np.int aliases were removed from NumPy; the
            # builtins are their exact equivalents.
            self.inputs = npz['inputs'].astype(float)
            self.targets = npz['targets'].astype(int)

        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            if batch_size < 1:
                raise ValueError('batch_size must be a positive integer, got {0}'.format(batch_size))
            self.batch_size = batch_size
        self.curr_batch = 0
        # Floor division: a trailing partial batch is never served.
        self.batch_count = self.inputs.shape[0] // self.batch_size

    def __next__(self):
        """Return the next ``(inputs, one-hot targets)`` batch.

        Raises:
            StopIteration: When all batches have been served; the reader is
                rewound first so that it can be iterated again.
        """
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()

        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch+1)*self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1

        # One-hot encode: row i gets a 1 in the column given by its class id.
        classes_num = 2
        targets_one_hot = np.zeros((targets_batch.shape[0], classes_num))
        targets_one_hot[range(targets_batch.shape[0]), targets_batch] = 1

        return inputs_batch, targets_one_hot

    def __iter__(self):
        """Return the reader itself; it is its own iterator."""
        return self
        
        

## Create the Machine Learning Model

In [30]:
# Network dimensions: 10 input features, 2 output classes and three hidden
# layers of 50 units each.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Start from an empty graph so re-running this cell does not fail on
# already-existing variable names.
tf.reset_default_graph()

# Placeholders fed with one batch at a time; targets are one-hot encoded.
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])

# Hidden layer 1
weights_1 = tf.get_variable("weights_1", [input_size,hidden_layer_size])
biases_1 = tf.get_variable("biases_1",[hidden_layer_size])

outputs_1 = tf.nn.relu(tf.matmul(inputs,weights_1)+biases_1)

# Hidden layer 2
weights_2 = tf.get_variable("weights_2", [hidden_layer_size,hidden_layer_size])
biases_2 = tf.get_variable("biases_2", [hidden_layer_size])

outputs_2 = tf.nn.relu(tf.matmul(outputs_1,weights_2)+biases_2)

# Hidden layer 3
weights_3 = tf.get_variable("weights_3", [hidden_layer_size, hidden_layer_size])
biases_3 = tf.get_variable("biases_3", [hidden_layer_size])

outputs_3 = tf.nn.relu(tf.matmul(outputs_2, weights_3)+biases_3)

# Output layer: raw logits, the softmax is applied inside the loss op.
weights_4 = tf.get_variable("weights_4", [hidden_layer_size, output_size])
biases_4 = tf.get_variable("biases_4", [output_size])


outputs = tf.matmul(outputs_3,weights_4)+biases_4

# Per-sample cross-entropy, averaged over the batch.
loss = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
mean_loss = tf.reduce_mean(loss)

optimize = tf.train.AdamOptimizer(learning_rate=0.0012).minimize(mean_loss)

# A prediction is correct when the largest logit sits at the one-hot position.
# tf.argmax is used on both sides; tf.arg_max is a deprecated alias of it.
out_equals_targets = tf.equal(tf.argmax(outputs,1), tf.argmax(targets,1))

accuracy = tf.reduce_mean(tf.cast(out_equals_targets,tf.float32))

# The session is left open on purpose: the evaluation cell further down keeps
# using `sess`.
sess = tf.InteractiveSession()

initializer = tf.global_variables_initializer()
sess.run(initializer)

batch_size = 100
max_epochs = 50

# Large sentinel so the first epoch can never trigger early stopping.
prev_validation_loss = 9999999.

train_data = Data_Reader('train', batch_size)
validation_data = Data_Reader('validation')

for epoch_counter in range(max_epochs):
    # --- training pass: one optimizer step per batch ---
    curr_epoch_loss = 0.

    for input_batch , target_batch in train_data:
        _, batch_loss = sess.run([optimize, mean_loss],
                                feed_dict={inputs:input_batch, targets: target_batch})
        curr_epoch_loss += batch_loss
    curr_epoch_loss /= train_data.batch_count

    # --- validation pass: forward only, no optimizer step ---
    # Metrics are averaged over all validation batches. Previously each batch
    # overwrote the result, which was only correct for a single batch; with
    # one full-size batch (the current setup) the values are unchanged.
    validation_loss = 0.
    validation_accuracy = 0.

    for input_batch , target_batch in validation_data:
        val_batch_loss, val_batch_accuracy = sess.run([mean_loss, accuracy],
                                feed_dict={inputs:input_batch, targets: target_batch})
        validation_loss += val_batch_loss
        validation_accuracy += val_batch_accuracy

    # Guard keeps the old "0 when there are no validation batches" result.
    if validation_data.batch_count:
        validation_loss /= validation_data.batch_count
        validation_accuracy /= validation_data.batch_count

    print('Epoch '+str(epoch_counter+1)+
         '. Training loss: '+ '{0:.3f}'.format(curr_epoch_loss)+
         '. Validation loss: '+ '{0:.3f}'.format(validation_loss)+
         '. Validation Accuracy: '+ '{0:.2f}'.format(validation_accuracy * 100.)+'%')

    # Early stopping: quit as soon as the validation loss goes up.
    if validation_loss > prev_validation_loss:
        break
    prev_validation_loss = validation_loss

print('End of Training.')



Epoch 1. Training loss: 0.617. Validation loss: 0.517. Validation Accuracy: 67.11%
Epoch 2. Training loss: 0.439. Validation loss: 0.398. Validation Accuracy: 79.64%
Epoch 3. Training loss: 0.383. Validation loss: 0.372. Validation Accuracy: 80.54%
Epoch 4. Training loss: 0.363. Validation loss: 0.367. Validation Accuracy: 81.43%
Epoch 5. Training loss: 0.353. Validation loss: 0.362. Validation Accuracy: 82.33%
Epoch 6. Training loss: 0.345. Validation loss: 0.357. Validation Accuracy: 82.33%
Epoch 7. Training loss: 0.340. Validation loss: 0.351. Validation Accuracy: 82.55%
Epoch 8. Training loss: 0.336. Validation loss: 0.346. Validation Accuracy: 83.00%
Epoch 9. Training loss: 0.333. Validation loss: 0.344. Validation Accuracy: 82.77%
Epoch 10. Training loss: 0.330. Validation loss: 0.342. Validation Accuracy: 81.88%
Epoch 11. Training loss: 0.328. Validation loss: 0.342. Validation Accuracy: 81.66%
Epoch 12. Training loss: 0.326. Validation loss: 0.342. Validation Accuracy: 82.10%
E

### Test our Model

In [31]:
# No batch size is passed, so the reader serves the whole test set as a
# single batch and the loop body runs once.
test_data = Data_Reader('test')

for test_input_batch, test_target_batch in test_data:
    test_accuracy = sess.run(
        [accuracy],
        feed_dict={inputs: test_input_batch, targets: test_target_batch},
    )

# sess.run was given a one-element fetch list, so it returns a one-element list.
test_accuracy_percent = 100. * test_accuracy[0]

print('Test Accuracy: ' + '{0:.2f}'.format(test_accuracy_percent)+'%')

Test Accuracy: 81.47%
