# Training KMeans models:

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from sklearn.cluster import MiniBatchKMeans
import joblib
import keras
from DataGenerator import DataGenerator

Using TensorFlow backend.


Get a list of all files:

In [2]:
Ad_folder = '../Data/audio_ads' # audion files location

files = []
for r,d,f in os.walk(Ad_folder):
    for filename in f:
        if '.mp3' in filename:
            files.append(os.path.join(Ad_folder,filename))

Music_folder = '../Data/Music' # audion files location

Music_files = []
for r,d,f in os.walk(Music_folder):
    for filename in f:
        if '.mp3' or '.au' in filename:
            Music_files.append(os.path.join(Music_folder,filename))

How many mp3 files do we have?

In [3]:
print('We have ' + str(len(files)) + ' ad audios')
print('We have ' + str(len(Music_files)) + ' music audios')

We have 2303 ad audios
We have 1023 music audios


### Define utility functions:

In [4]:
def train_kmeans(X, n_clusters = 50):
    '''Takes a tarining batch X_train of shape (n_samples, n_features) 
       and trains a k-means model'''
    # normalize
    mu = np.mean(X, axis=0) 
    std = np.std(X, axis=0)
    X = (X-mu)/std
    # create and train model
    model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1000)
    model.partial_fit(X)
    
    return model,mu,std

In [5]:
def predict_kmeans(X, model, T = 38):
    '''Takes a list of feature vectors X, a model and mean and std values
       and returns a class 1 for ads, 0 for non ads based on threshold T'''
    mu = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    X = (np.vstack(X)-mu)/std # stack and normailze data
    cluster_distance = np.min(model.transform(X), axis=1) # compute the distance to nearest cluster
    avg_min_distance = np.mean(cluster_distance)
#     import pdb; pdb.set_trace()
    return avg_min_distance

### Create data generators:

In [6]:
train_size = 0.8

n_files = len(files)
n_train = int(np.floor(n_files*train_size))
train_files = files[:n_train]
test_files = files[n_train:]

train_generator = DataGenerator(filepath_list=train_files, shuffle=True, dataset='train')
test_generator = DataGenerator(filepath_list=test_files, shuffle=False, dataset='test')
Music_generator = DataGenerator(filepath_list=Music_files, shuffle=False, dataset='Music')

Check generators:

In [7]:
if 0:
    X = train_generator.__getitem__(0) # get item
    print(X.shape)

How many batches in one epoch?

In [8]:
n_batches = len(train_generator)

### Train k-means model:

In [None]:
n_epoch = 10
for e in range(n_epoch):
    for i in range(n_batches):
        X_train = train_generator.__getitem__(i) # get item
        model, mu, std = train_kmeans(X_train) # train model
    train_generator.on_epoch_end()
    joblib.dump(model, 'models/Kmeans_model_1.joblib') # save model after each batch    

Save trained model:

In [None]:
# joblib.dump(model, 'Kmeans_model.joblib')

In [None]:
# Error files:
train_generator.err_files

### Check Accuracy:

Load latest model:

In [7]:
model = joblib.load('models/Kmeans_model_1.joblib')

First on the positive group:

In [25]:
X_test = test_generator.__getitem__(3)
len(X_test)

92

In [26]:
predict_kmeans(X_test, model)

40.751273960665436

Next, on the negative group:

In [21]:
X_Music = Music_generator.__getitem__(1)
len(X_Music)

170

In [22]:
predict_kmeans(X_Music, model)

38.21812318229378