# Training NN models:

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from sklearn.cluster import MiniBatchKMeans
import joblib
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint
from DataGenerator import DataGenerator_Sup

Using TensorFlow backend.


Get a list of all files:

In [2]:
Ad_folder = '../Data/audio_ads' # positive samples location
Music_folder = '../Data/Music'  # negative samples (music) location
Podcast_folder = '../Data/pod'  # negative samples (podcasts) location


def list_audio_files(folder, extensions):
    '''Recursively list the files under `folder` whose name contains one of `extensions`.

    folder:     root directory to walk (sub-folders are included)
    extensions: iterable of strings such as '.mp3'; a file is kept when any
                of them occurs in its name (same substring rule the cell
                always used for ads and podcasts)

    Returns a list of paths. Each path is built from the directory the file
    was actually found in, so files located in sub-folders resolve correctly.
    '''
    found = []
    for root, _dirs, filenames in os.walk(folder):
        for filename in filenames:
            # `'.mp3' or '.au' in filename` would always be true; each
            # extension has to be tested against the name separately
            if any(ext in filename for ext in extensions):
                found.append(os.path.join(root, filename))
    return found


pos_files = list_audio_files(Ad_folder, ['.mp3'])
music_files = list_audio_files(Music_folder, ['.mp3', '.au'])
podcast_files = list_audio_files(Podcast_folder, ['.wav'])


How many audio files do we have?

In [3]:
# Pool both negative sources (music + podcasts) and count the files per class
neg_files = music_files + podcast_files
n_pos_files = len(pos_files)
n_neg_files = len(neg_files)

print('We have ' + str(n_pos_files) + ' positive examples')
print('We have ' + str(len(music_files)) + ' music examples')
print('We have ' + str(len(podcast_files)) + ' podcast examples')

# Fail with a readable message instead of a ZeroDivisionError further down
assert n_pos_files > 0, 'No positive files were found in ' + Ad_folder

music_duration = 30/60.0 # duration of files in minutes
podcast_duration = 12/60.0 # duration of files in minutes
ads_duration = 30/60.0 # average duration of ad files in minutes

# Total audio time per class, in minutes
pos_minutes = round(ads_duration*n_pos_files,2)
neg_minutes = round(music_duration*len(music_files) + podcast_duration*len(podcast_files),2)

# Ratio of negative to positive minutes, i.e. the fraction of positives to
# keep for balancing. Kept numeric so it can be used in arithmetic; it is
# converted to text only when printed.
pos_fraction = round(neg_minutes/pos_minutes,2)
print('--------------------------------')
print('In total, ' + str(pos_minutes) + ' minutes of positive and ' + str(neg_minutes) + ' minutes of negative')
print('A factor of ' + str(pos_fraction) + ' is applied on positives for balancing')

We have 2303 positive examples
We have 1023 music examples
We have 1567 podcast examples
--------------------------------
In total, 1151.5 minutes of positive and 824.9 minutes of negative
A factor of 0.72 is applied on positives for balancing


### Define utility functions:

In [4]:
def create_model(n_features):
    '''Build the (uncompiled) fully-connected binary classifier.

    n_features: length of the input feature vector

    Returns a Sequential model with two ReLU hidden layers (256 and 64 units)
    followed by a single sigmoid output unit. The caller compiles it.
    '''
    layers = [
        Dense(256, activation='relu', input_shape=(n_features,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

### Create data generators:

In [6]:
# Split configuration
pos_fraction = 0.72    # share of the positive files that is used at all
train_fraction = 0.75  # share of the data that goes to training

# Training budget of negative audio (minutes), divided evenly between the
# two negative sources
train_minutes = round(train_fraction*neg_minutes, 2)
train_music_minutes = train_minutes/2
train_podcast_minutes = train_minutes/2
print('Training time in minutes for positive/negative is {}'.format(train_minutes))
print('Out of this value, {} is music and {} is podcasts'.format(
    train_music_minutes, train_podcast_minutes))

# Convert the minute budgets into file counts using the per-file durations
n_train_music_files = int(train_music_minutes/music_duration)
n_train_podcast_files = int(train_podcast_minutes/podcast_duration)
n_train_pos_files = int(n_pos_files*pos_fraction*train_fraction)
n_test_pos_files = int(n_pos_files*pos_fraction*(1-train_fraction))

separator = '---------------------------------------------------------------'
print(separator)
print('This translates into {} files of music and {} files of podcasts for training'.format(
    n_train_music_files, n_train_podcast_files))
print('The rest of the negative files are used for testing')
print(separator)
print('{} poitive files are used for training'.format(n_train_pos_files))
print('{} poitive files are used for testing'.format(n_test_pos_files))

# The requested counts must not exceed what is available on disk
assert n_train_music_files <= len(music_files), 'There are not enough music files for that!'
assert n_train_podcast_files <= len(podcast_files), 'There are not enough podcast files for that!'

Training time in minutes for positive/negative is 618.67
Out of this value, 309.335 is music and 309.335 is podcasts
---------------------------------------------------------------
This translates into 618 files of music and 1546 files of podcasts for training
The rest of the negative files are used for testing
---------------------------------------------------------------
1243 poitive files are used for training
414 poitive files are used for testing


In [7]:
train_files = [] # a list of [path, label] pairs used for training
test_files = [] # a list of [path, label] pairs held out for testing

# Shuffle in place so the train/test split below is random.
# NOTE(review): no random seed is set, so the split differs between runs.
np.random.shuffle(pos_files)
np.random.shuffle(music_files)
np.random.shuffle(podcast_files)

# Collect a balanced list of files + add labels (1 = ad, 0 = music/podcast)
# Training list
for f in pos_files[:n_train_pos_files]:
    train_files.append([f,1])
for f in music_files[:n_train_music_files]:
    train_files.append([f,0])
for f in podcast_files[:n_train_podcast_files]:
    train_files.append([f,0])

# Test list: the files that follow the training slices. These must go into
# test_files; appending them to train_files would leak the held-out data into
# training and leave the test generator without any files.
for f in pos_files[n_train_pos_files:n_train_pos_files + n_test_pos_files]:
    test_files.append([f,1])
for f in music_files[n_train_music_files:]:
    test_files.append([f,0])
for f in podcast_files[n_train_podcast_files:]:
    test_files.append([f,0])

train_generator = DataGenerator_Sup(train_files, dataset='train')
test_generator = DataGenerator_Sup(test_files, dataset='test')

Check generators:

In [9]:
if True:  # flip to False to skip this sanity check
    # Fetch one batch (index 2) and inspect its shapes and class balance
    X, Y = train_generator[2]
    print(X.shape)
    print(Y.shape)
    pos_share = (Y == 1).sum(axis=0)/Y.shape[0]
    print('Positive example fraction in batch is ' + str(pos_share[0]))

(52, 1690)
(52, 1)
Positive example fraction in batch is 0.5192307692307693


### Train NN model:

Create and compile model:

In [None]:
num_features = 1690  # length of one feature vector fed to the network
filepath = 'models/weights_1690_256_64_1_with_pod.hdf5'  # checkpoint destination

# Build the network and configure it for binary classification
model = create_model(num_features)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callback that writes the model to `filepath` during training
checkpoint = ModelCheckpoint(filepath)

Train model:

In [None]:
# Train on batches from the training generator; the checkpoint callback is
# attached, and the returned History object is kept in H for plotting.
H = model.fit_generator(
    generator=train_generator,
    epochs=2,
    callbacks=[checkpoint],
)

In [None]:
plt.rcParams.update({'font.size': 16})

# Keras reports the 'accuracy' metric under the key 'acc' in older releases
# and under 'accuracy' from 2.3 on; use whichever one this History contains.
acc_key = 'acc' if 'acc' in H.history else 'accuracy'

# Two side-by-side panels: training loss (left) and training accuracy (right)
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))

ax_loss.plot(H.history['loss'], linewidth=3, color='b')
ax_loss.set_title('Loss')
ax_loss.set_xlabel('epoch')

ax_acc.plot(H.history[acc_key], linewidth=3, color='g')
ax_acc.set_title('Accuracy')
ax_acc.set_xlabel('epoch');

In [None]:
# Error files recorded by the training generator
# (the previous name `train_generatorerator` was a typo and raised NameError)
train_generator.err_files

### Check Accuracy:

In [None]:
# Recreate the same architecture so the saved weights can be loaded into it
num_features = 1690
eval_model = create_model(num_features)
# Load the checkpoint written by the training run of this notebook. The path
# must match the one given to ModelCheckpoint; the previous path
# 'models/weights_1690_256_64_1.hdf5' pointed at a different file.
eval_model.load_weights('models/weights_1690_256_64_1_with_pod.hdf5')
# Compile with the same loss/metrics as in training so that evaluation
# reports loss and accuracy
eval_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

What metrics do we use for evaluation?

In [None]:
# Labels of the values the evaluation call below reports
# (presumably loss followed by accuracy, given the compile settings — confirm)
eval_model.metrics_names

How does the model perform on the test set?

In [None]:
# Score the evaluation model on batches drawn from the test generator.
# NOTE(review): steps=5 limits the evaluation to five batches — confirm that
# this covers enough of the test set.
eval_model.evaluate_generator(
    generator=test_generator,
    steps=5,
)