# Feature Extraction

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from sklearn.cluster import KMeans

Get a list of all files:

In [2]:
mp3_folder = '../Data/audio_ads'  # audio files location

# Walk the folder tree and collect the path of every MP3 file. Each file name is
# joined with the directory os.walk is currently visiting (not the top-level
# folder), so files that live in sub-folders resolve to paths that exist.
files = []
for root, _dirs, filenames in os.walk(mp3_folder):
    for filename in filenames:
        # test the extension itself, so names that merely contain '.mp3' are skipped
        if filename.lower().endswith('.mp3'):
            files.append(os.path.join(root, filename))
# os.walk yields entries in arbitrary order; sort so slices such as files[:110]
# pick the same files on every run.
files.sort()

How many mp3 files do we have?

In [3]:
# Number of MP3 paths collected into `files`.
len(files)

2307

### Define utility functions:

In [4]:
def load_clips(filepath_list, d = 3, sr = 22050):
    '''Load audio files and cut each one into consecutive clips of d seconds.

    Args:
        filepath_list: iterable of paths to audio files.
        d: clip duration in seconds; sr*d must round to at least one sample.
        sr: sampling rate every file is resampled to when it is loaded.

    Returns:
        A list of arrays, each holding exactly round(sr*d) samples. The tail
        of a file that is too short to fill a whole clip is dropped.

    Raises:
        ValueError: if sr*d does not describe at least one sample.
    '''
    clip_len = int(round(sr * d))  # samples per clip
    if clip_len <= 0:
        # a zero-length clip would never terminate the slicing of a file
        raise ValueError('sr*d must be positive, got sr=%r and d=%r' % (sr, d))

    clip_list = []
    for f in filepath_list:
        # Decode each file once, at the requested rate, and slice it in memory.
        # Passing sr explicitly keeps the loaded rate consistent with clip_len;
        # relying on librosa's default rate would silently yield no clips for
        # any sr other than that default.
        audio = librosa.load(f, sr=sr)[0]
        n_full_clips = len(audio) // clip_len
        for i in range(n_full_clips):
            clip_list.append(audio[i*clip_len:(i+1)*clip_len])

    return clip_list

In [5]:
def clips2features(clip_list, n_mfcc = 13, sr = 22050, train_size = 0.8, random_state = None):
    '''Turn equal-length clips into flattened MFCC vectors and split them.

    The clips are visited in a random order and the first
    floor(len(clip_list)*train_size) feature vectors form the training set.
    The split is made per clip, so clips cut from the same file can end up
    on both sides of it.

    Args:
        clip_list: sequence of equal-length audio clips sampled at rate sr.
        n_mfcc: number of MFCC coefficients computed per frame.
        sr: sampling rate of the clips.
        train_size: fraction of the clips assigned to the training set.
        random_state: optional seed for the shuffle; None uses NumPy's
            global random state.

    Returns:
        (X_train, X_test): two lists of 1-D feature vectors.
    '''
    n_clips = len(clip_list)
    n_train = int(np.floor(n_clips*train_size))
    # Shuffle a list of indices rather than clip_list itself, so the caller's
    # list keeps its order.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(n_clips)
    # extract one flattened MFCC matrix per clip, in shuffled order
    feature_vectors = []
    for idx in order:
        # the signal is passed by keyword: newer librosa releases reject it positionally
        features = librosa.feature.mfcc(y=clip_list[idx], sr=sr, n_mfcc=n_mfcc, dct_type=2)
        feature_vectors.append(features.flatten())
    # divide train and test
    X_train = feature_vectors[:n_train]
    X_test = feature_vectors[n_train:]

    return X_train, X_test

In [32]:
def train_kmeans(X_train, n_clusters = 100, random_state = None):
    '''Standardize a list of feature vectors and fit a k-means model on them.

    Args:
        X_train: non-empty sequence of equal-length feature vectors.
        n_clusters: number of clusters to fit.
        random_state: optional seed forwarded to KMeans for repeatable fits.

    Returns:
        (model, mu, std): the fitted KMeans model plus the per-feature mean
        and scale used for normalization. New data must be transformed as
        (X - mu) / std before it is passed to the model.

    Raises:
        ValueError: if X_train is empty.
    '''
    if len(X_train) == 0:
        raise ValueError('X_train is empty; need at least one feature vector')
    X = np.vstack(X_train) # stack vertically (#samples, #features)
    # normalize
    mu = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant feature has zero spread; dividing by it would produce NaN/inf.
    # Use a scale of 1 there so the feature simply becomes 0 after centering,
    # and return that same scale so callers can normalize new data safely.
    std[std == 0] = 1.0
    X = (X-mu)/std
    # create and train model
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(X)

    return model,mu,std

### Load and train k-means model:

In [7]:
N_AD_FILES = 110  # number of ad files cut into clips for this experiment
# clips2features shuffles with NumPy's global random state when no seed is
# given, so seed it here to get the same train/test split on every run.
np.random.seed(0)
X_train, X_test = clips2features(load_clips(files[:N_AD_FILES]))
len(X_train)

859

In [33]:
# Fit k-means on the training vectors. mu and std are the normalization
# statistics returned with the model; they are reused below to scale new data.
model, mu, std = train_kmeans(X_train)

In [9]:
# Shape of the fitted centroids: one row per cluster.
# NOTE(review): the saved output shows 10 rows although train_kmeans defaults to
# n_clusters=100 — presumably left over from an earlier run; re-execute to confirm.
model.cluster_centers_.shape

(10, 1690)

Test performance on unseen ads:

In [34]:
# Scale the first two held-out vectors with the training statistics, then
# show their distance to every cluster centre.
ad_batch = np.vstack(X_test[:2])
X_t = (ad_batch - mu) / std
model.transform(X_t)

array([[80.53090468, 40.54711136, 52.58741733, 50.16121711, 56.47006932,
        68.84435732, 41.55130108, 49.31517079, 46.21020695, 44.42434862,
        64.52468175, 42.78968302, 45.55043281, 40.83980678, 44.16459565,
        72.9480642 , 58.75859626, 57.5317853 , 47.36817177, 39.39728575,
        55.09598456, 63.76115656, 46.39515206, 55.7892186 , 55.98650091,
        42.00257581, 60.68926519, 55.71863634, 42.85679997, 61.58421043,
        45.12394223, 58.09512382, 59.17163381, 52.63224123, 60.18193088,
        49.2689304 , 62.14131758, 60.00975938, 41.31073127, 53.26580044,
        46.87035554, 40.82545742, 64.31951857, 44.48699632, 47.55314113,
        66.75964575, 42.62382876, 41.42230213, 50.30348067, 53.47969981,
        68.39748796, 56.28960579, 44.20823699, 79.49228486, 61.07018167,
        67.04444223, 53.68837932, 42.9568288 , 54.40175159, 61.40513008,
        50.58325271, 56.47279949, 57.1112798 , 53.12897571, 58.84455267,
        48.181523  , 63.91576358, 58.08671884, 62.8

Load music samples (negatives):

In [13]:
Music_folder = '../Data/Music'  # audio files location

# Walk the folder tree and collect the path of every MP3 file. Each file name is
# joined with the directory os.walk is currently visiting (not the top-level
# folder), so files that live in sub-folders resolve to paths that exist.
Music_files = []
for root, _dirs, filenames in os.walk(Music_folder):
    for filename in filenames:
        # test the extension itself, so names that merely contain '.mp3' are skipped
        if filename.lower().endswith('.mp3'):
            Music_files.append(os.path.join(root, filename))
# os.walk yields entries in arbitrary order; sort for a reproducible file list.
Music_files.sort()

In [14]:
# train_size=1 places every clip in the first returned list, so no music
# clip is held out; only that list is kept.
music_clips = load_clips(Music_files)
X_Music = clips2features(music_clips, train_size=1)[0]
len(X_Music)

287

Test model performance on unseen music:

In [35]:
# Scale the first two music vectors with the training statistics, then show
# their distance to every cluster centre.
music_batch = np.vstack(X_Music[:2])
X_n = (music_batch - mu) / std
model.transform(X_n)

array([[70.81090267, 28.80918075, 44.49385676, 38.03492597, 43.83396361,
        57.2433692 , 32.03460256, 41.2550844 , 40.37259678, 36.01761123,
        56.99219876, 31.41292297, 45.96115588, 27.37115503, 35.09680603,
        62.90780558, 51.12377287, 50.76941471, 34.6424082 , 28.26810225,
        47.56678979, 54.17455897, 39.27687688, 50.15557896, 54.09416976,
        27.31895072, 50.99221248, 48.13273594, 33.96282486, 53.75906538,
        35.43247967, 54.22293607, 52.68654123, 44.36674475, 49.51456822,
        41.21489622, 55.19398775, 53.90020253, 33.67307661, 42.91038013,
        37.62601568, 28.52698261, 54.86042192, 26.47856216, 48.70965692,
        53.85052906, 31.03596446, 32.66804539, 48.401807  , 45.94155756,
        60.85826944, 48.10323273, 34.15038235, 68.86415089, 54.73230814,
        59.85215025, 47.2651244 , 30.38489835, 48.60186392, 52.20325818,
        40.45105611, 48.28066868, 48.74253611, 41.81015315, 53.53504803,
        41.16957246, 52.92733624, 49.33272503, 51.1

In [None]:
# Cut only the first two ad files into clips, for a quick listening check.
data = load_clips(files[:2])

In [None]:
# Number of clips obtained from the two files loaded into `data`.
len(data)

In [None]:
# Play one clip back at the rate load_clips uses by default.
# NOTE(review): index 20 only exists if `data` holds at least 21 clips.
ipd.Audio(data=data[20], rate=22050)