# Feature Extraction

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from sklearn.cluster import MiniBatchKMeans
import joblib

Get a list of all files:

In [None]:
mp3_folder = '../Data/audio_ads' # audio files location

# Collect the path of every .mp3 file below mp3_folder.
files = []
for root, _dirs, filenames in os.walk(mp3_folder):
    for filename in filenames:
        # Check the extension itself; a substring test would also accept
        # names such as 'ad.mp3.bak'.
        if filename.lower().endswith('.mp3'):
            # Join with the directory currently being walked so that files
            # found in sub-folders get a path that actually exists.
            files.append(os.path.join(root, filename))

# os.walk yields entries in arbitrary order; sorting keeps the index-based
# slices of `files` used further down reproducible between runs.
files.sort()

How many mp3 files do we have?

In [None]:
# Number of paths currently held in `files`.
len(files)

### Define utility functions:

In [None]:
def load_clips(filepath_list, d = 3, sr = 22050):
    '''Load audio files and cut each one into consecutive clips of d seconds.

    Parameters
    ----------
    filepath_list : iterable of str
        Paths of the audio files to load.
    d : int or float
        Clip duration in seconds.
    sr : int
        Sampling rate in Hz; every file is resampled to this rate on load.

    Returns
    -------
    list of np.ndarray
        All full-length clips (each with round(sr*d) samples), in file order.
        A trailing remainder shorter than one clip is dropped.

    Raises
    ------
    ValueError
        If sr and d describe a clip shorter than one sample.
    '''
    clip_len = int(round(sr * d))
    if clip_len < 1:
        # A zero-length clip would otherwise never terminate the splitting.
        raise ValueError('sr*d must correspond to at least one sample')

    clip_list = []
    for f in filepath_list:
        # Decode the file once, at the requested rate, instead of re-opening
        # it for every clip. Passing sr explicitly keeps the clip length
        # consistent with the samples that are actually returned.
        audio = librosa.load(f, sr=sr)[0]
        n_full = len(audio) // clip_len
        # keep only clips of the standard size
        clip_list.extend(audio[k*clip_len:(k+1)*clip_len] for k in range(n_full))

    return clip_list

In [None]:
def clips2features(clip_list, n_mfcc = 13, sr = 22050, train_size = 0.8):
    '''Turn equal-length audio clips into shuffled MFCC feature vectors.

    Parameters
    ----------
    clip_list : list of np.ndarray
        Audio clips of equal length, sampled at rate sr.
    n_mfcc : int
        Number of MFCC coefficients computed per frame.
    sr : int
        Sampling rate of the clips in Hz.
    train_size : float
        Fraction of the clips placed in the training split.

    Returns
    -------
    (X_train, X_test) : tuple of lists
        Each element is the flattened MFCC matrix of one clip. The first
        floor(len(clip_list)*train_size) shuffled vectors form X_train and
        the rest form X_test.
    '''
    n_clips = len(clip_list)
    n_train = int(np.floor(n_clips*train_size))
    # Randomize through an index permutation so the caller's list is not
    # reordered as a side effect.
    order = np.random.permutation(n_clips)
    # The audio must be passed as y=...: recent librosa releases accept it
    # as a keyword argument only.
    feature_vectors = [
        librosa.feature.mfcc(y=clip_list[i], sr=sr, n_mfcc=n_mfcc, dct_type=2).flatten()
        for i in order
    ]
    # divide train and test
    return feature_vectors[:n_train], feature_vectors[n_train:]

In [None]:
def train_kmeans(X_train, n_clusters = 10):
    '''Standardize a list of feature vectors and train a k-means model on them.

    Parameters
    ----------
    X_train : list of 1-D np.ndarray
        Feature vectors of equal length.
    n_clusters : int
        Number of clusters of the MiniBatchKMeans model.

    Returns
    -------
    (model, mu, std)
        The trained MiniBatchKMeans model plus the per-feature mean and
        standard deviation used for normalization. Entries of std that were
        zero are returned as 1 so that (x - mu)/std is always finite.

    Raises
    ------
    ValueError
        If X_train is empty.
    '''
    if len(X_train) == 0:
        raise ValueError('X_train is empty: at least one feature vector is required')
    X = np.vstack(X_train) # stack vertically (#samples, #features)
    # normalize
    mu = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant feature has zero spread; dividing by it would fill the
    # column with NaN/inf, so leave such features merely centered.
    std[std == 0] = 1.0
    X = (X-mu)/std
    # create and train model
    model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1000)
    # NOTE(review): partial_fit treats all of X as one mini-batch, so
    # batch_size appears to have no effect here - confirm that fit() was
    # not intended.
    model.partial_fit(X)

    return model,mu,std

### Load and train k-means model:

In [None]:
# Extract features 100 files at a time (this bounds how much raw audio is
# held in memory) and accumulate them, then train once on everything.
# Training inside the loop would overwrite model/mu/std on every pass and
# keep only the result of the last batch.
batch_size = 100
X_train, X_test = [], []
for start in range(0, len(files), batch_size):
    batch_train, batch_test = clips2features(load_clips(files[start:start+batch_size]))
    X_train.extend(batch_train)
    X_test.extend(batch_test)

model, mu, std = train_kmeans(X_train)

In [None]:
# Persist the trained k-means model to 'Kmeans_model.joblib'.
# NOTE(review): the mu/std returned by train_kmeans are not saved here, yet
# other cells use them to normalize data before calling the model - confirm
# they are persisted elsewhere.
joblib.dump(model, 'Kmeans_model.joblib')

In [None]:
# Re-extract features from the first two files only (a small sample);
# this overwrites X_train and X_test.
X_train, X_test = clips2features(load_clips(files[:2]))
# Number of training vectors in the sample.
len(X_train)

In [None]:
# Train on the current X_train; this replaces any previously assigned
# model, mu and std.
model, mu, std = train_kmeans(X_train)

Inspect the cluster centers of the trained model:

In [None]:
# Shape of the learned cluster-center matrix.
model.cluster_centers_.shape

In [None]:
# Display the learned cluster centers themselves.
model.cluster_centers_

Test performance on unseen ads:

In [None]:
# Normalize the first two held-out ad vectors with the training mu/std.
X_t = (np.vstack(X_test[:2])-mu)/std
# transform() maps each sample to its distances from the cluster centers.
model.transform(X_t)

Load music samples (negatives):

In [None]:
Music_folder = '../Data/Music' # audio files location

# Collect the path of every .mp3 file below Music_folder.
Music_files = []
for root, _dirs, filenames in os.walk(Music_folder):
    for filename in filenames:
        # Check the extension itself; a substring test would also accept
        # names such as 'song.mp3.bak'.
        if filename.lower().endswith('.mp3'):
            # Join with the directory currently being walked so that files
            # found in sub-folders get a path that actually exists.
            Music_files.append(os.path.join(root, filename))

# os.walk yields entries in arbitrary order; sort for reproducible runs.
Music_files.sort()

In [None]:
# With train_size=1 every feature vector lands in the first returned list,
# so [0] holds the features of all music clips.
X_Music = clips2features(load_clips(Music_files), train_size=1)[0]
# Number of music feature vectors.
len(X_Music)

Test model performance on unseen music:

In [None]:
# Normalize the first two music vectors with the training mu/std.
X_n = (np.vstack(X_Music[:2])-mu)/std
# X_n is already a 2-D (samples x features) array, so it is passed to
# transform() directly; stacking it again would be a no-op.
model.transform(X_n)

In [None]:
# Raw audio clips (default clip length and sampling rate) of the first two
# ad files.
data = load_clips(files[0:2])

In [None]:
# Number of clips obtained from those files.
len(data)

In [None]:
# Play clip number 20; the rate matches load_clips' default sr of 22050.
# NOTE(review): raises IndexError when fewer than 21 clips were loaded.
ipd.Audio(data[20], rate = 22050)