# Feature Extraction

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from sklearn.cluster import MiniBatchKMeans
import joblib
import keras

Using TensorFlow backend.


Get a list of all files:

In [2]:
Ad_folder = '../Data/audio_ads' # audion files location

files = []
for r,d,f in os.walk(Ad_folder):
    for filename in f:
        if '.mp3' in filename:
            files.append(os.path.join(Ad_folder,filename))

Music_folder = '../Data/Music' # audion files location

Music_files = []
for r,d,f in os.walk(Music_folder):
    for filename in f:
        if '.mp3' in filename:
            Music_files.append(os.path.join(Music_folder,filename))

How many mp3 files do we have?

In [3]:
len(files)

2307

### Define utility functions:

In [4]:
class DataGenerator(keras.utils.Sequence):
    """Generate data for Keras"""
    def __init__(self, filepath_list, batch_size=10, sample_duration = 3, dataset='train', shuffle=True):
        """
        * filepath_list: a list of paths for audio files
        * batch_size: batch sample of each iteration
        * sample_duration: standard sample duration in the data set
        * dataset: use to label the dataset of the current generator
        * shuffle: whether or not shuffle the dataframe before itering over it. Default True!
        """
        self.batch_size = batch_size
        self.dataset = dataset
        self.shuffle = shuffle
        self.files = filepath_list
        self.n_files = len(self.files)
        self.sr = 22050 # audio sampling rate
        self.n_mfcc = 13 # number of frequency coefficients to use
        self.d = sample_duration
        self.on_epoch_end()

    def __len__(self):
        """Denotes the number of batches per epoch"""
        num_batches = self.df.shape[0] // self.batch_size + (self.df.shape[0] % self.batch_size > 0)
        
        return num_batches

    def __getitem__(self, index):
        """Generate one batch of data"""
        start_index = index * self.batch_size
        end_index = (index+1) * self.batch_size
        if end_index > self.n_files:
            end_index = None
        batch_files = self.files[start_index:end_index]

        X = self.__data_generation(batch_files)

        return X

    def __data_generation(self, batch_files):
        """Generate a data batch"""
        X = []
        clip_list = self.load_clips(batch_files)
        np.random.shuffle(clip_list) # randomize clips (many of them come from the same file)
        for clip in clip_list:
            features = librosa.feature.mfcc(clip, sr=self.sr, n_mfcc=self.n_mfcc, dct_type=2)
            X.append(features.flatten())
            
        return np.vstack(X)
    
    def load_clips(self, filepath_list):
        '''Loads files in filepath_list, cuts them to clips of length
           d and returns a list of all the clips'''
        clip_list = []
        # load all files in filepath_list
        for f in filepath_list:
            i = 0 # keep track of clip number
            audio = librosa.core.load(f, offset = i*self.d, duration = self.d)[0]
            # add to data_list only clips in standard size
            while(len(audio) == self.sr*self.d):
                clip_list.append(audio)
                i = i+1
                audio = librosa.core.load(f, offset = i*self.d, duration = self.d)[0]

        return clip_list
    
    def on_epoch_end(self):
            """Update indexes after each epoch"""
#             if self.shuffle:
#                 self.files = np.random.shuffle(self.files)

In [5]:
def load_clips(filepath_list, d = 3, sr = 22050):
    '''Loads files in filepath_list, cuts them to clips of length
       d and returns a list of all the clips'''
    clip_list = []
    # load all files in filepath_list
    for f in filepath_list:
        i = 0 # keep track of clip number
        audio = librosa.core.load(f, offset = i*d, duration = d)[0]
        # add to data_list only clips in standard size
        while(len(audio) == sr*d):
            clip_list.append(audio)
            i = i+1
            audio = librosa.core.load(f, offset = i*d, duration = d)[0]
    
    return clip_list

In [6]:
def clips2features(clip_list, n_mfcc = 13, sr = 22050, train_size = 0.8):
    '''Takes a list of equal length clips with rate sr, 
       and returns feture vector with n_mfcc frequency coefficients'''
    feature_vectors = []
    X_train = []
    X_test = []
    n_clips = len(clip_list)
    n_train = int(np.floor(n_clips*train_size))
    np.random.shuffle(clip_list) # randomize data
    # extract feature vectors and append to feature_vectors list 
    for clip in clip_list:
        features = librosa.feature.mfcc(clip, sr=sr, n_mfcc=n_mfcc, dct_type=2)
        feature_vectors.append(features.flatten())
    # divide train and test
    X_train = feature_vectors[:n_train]
    X_test = feature_vectors[n_train:]
    
    return X_train, X_test

In [7]:
def train_kmeans(X, n_clusters = 10):
    '''Takes a tarining batch X_train of shape (n_samples, n_features) 
       and trains a k-means model'''
    # normalize
    mu = np.mean(X, axis=0) 
    std = np.std(X, axis=0)
    X = (X-mu)/std
    # create and train model
    model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1000)
    model.partial_fit(X)
    
    return model,mu,std

In [8]:
def predict_kmeans(X, model, mu, std, T = 44):
    '''Takes a list of feature vectors X, a model and mean and std values
       and returns a class 1 for ads, 0 for non ads based on threshold T'''
    X = (np.vstack(X)-mu)/std # stack and normailze data
    cluster_distance = np.min(model.transform(X), axis=1) # compute the distance to nearest cluster
#     import pdb; pdb.set_trace()
    return cluster_distance<T

### Create data generators:

In [9]:
train_size = 0.8

n_files = len(files)
n_train = int(np.floor(n_files*train_size))
train_files = files[:n_train]
test_files = files[n_train:]

train_generator = DataGenerator(filepath_list=train_files, shuffle=True, dataset='train')
test_generator = DataGenerator(filepath_list=test_files, shuffle=False, dataset='test')

Check generators:

In [None]:
if 0:
    X = train_generator.__getitem__(0) # get item
    X.shape

How many batches in one epoch?

In [10]:
n_batch = int(np.floor(train_generator.n_files/train_generator.batch_size))
n_batch

184

### Load and train k-means model:

In [11]:
for i in range(n_batch):
    X_train = train_generator.__getitem__(0) # get item
    model, mu, std = train_kmeans(X_train) # train model
    joblib.dump(model, 'Kmeans_model.joblib') # save model after each batch

Save trained model:

In [None]:
# joblib.dump(model, 'Kmeans_model.joblib')

### Check Accuracy:

First on the positive group:

In [13]:
X_test = test_generator.__getitem__(0)
len(X_test)

110

In [14]:
np.mean(predict_kmeans(X_test, model, mu, std))

0.24545454545454545

Next, on the negative group:

In [15]:
Music_generator = DataGenerator(filepath_list=Music_files, shuffle=False, dataset='Music')
X_Music = Music_generator.__getitem__(0)
len(X_Music)

145

In [16]:
np.mean(predict_kmeans(X_Music, model, mu, std))

0.9310344827586207

In [None]:
X_Music = clips2features(load_clips(Music_files[:3]), train_size=1)[0]
len(X_Music)

### Listen to data:

In [None]:
data = load_clips(files[0:2])

In [None]:
len(data)

In [None]:
ipd.Audio(data[20], rate = 22050)