In [1]:
import fnmatch
import os
import random
import re
import threading
import pandas as pd
import time
import tqdm

import librosa
import numpy as np
import torch


import matplotlib.pyplot as plt

%matplotlib inline

FILE_PATTERN = r'*.wav'

# Load data

In [2]:
def randomize_files(files):
    '''Shuffle loaded files'''
    for file in files:
        file_index = random.randint(0, (len(files) - 1))
        yield files[file_index]


def find_files(directory, pattern='*.wav'):
    '''Recursively finds all files matching the pattern.'''
    files = []
    fnames = []
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, pattern):
            files.append(os.path.join(root, filename))
            fnames.append(filename)
    return files, fnames

def get_category(fname):
    '''Parse type from fnamepar'''
    return fname.split('/')[-1].split('_')[0]

def load_generic_audio(files, sample_rate, amount):
    '''Generator that yields audio waveforms from the directory.'''
    for it, filename in enumerate(files):
        if it == amount:
            break
        category_id = get_category(filename)
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        audio = audio.reshape(-1, 1)
        yield audio, filename, category_id


def trim_silence(audio, threshold, frame_length=512):
    '''Removes silence at the beginning and end of a sample.'''
    if audio.size < frame_length:
        frame_length = audio.size
    energy = librosa.feature.rmse(audio, frame_length=frame_length)
    frames = np.nonzero(energy > threshold)
    indices = librosa.core.frames_to_samples(frames)[1]

    # Note: indices can be an empty array, if the whole audio was silence.
    return audio[indices[0]:indices[-1]] if indices.size else audio[0:0]



class AudioReader(object):
    '''Generic background audio reader that preprocesses audio files
    and add tham in lists'''

    def __init__(self,
                 audio_dir,
                 sample_rate,
                 silence_threshold=None,
                 sample_size=None,
                 load_size=None):
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.sample_size = sample_size
        self.silence_threshold = silence_threshold
        self.load_size = load_size
        self.counter = 0
        self.time = time.time()

        # TODO Find a better way to check this.
        # Checking inside the AudioReader's thread makes it hard to terminate
        # the execution of the script, so we do it in the constructor for now.

        self.files, self.fnames = find_files(audio_dir)
        if not self.files:
            raise ValueError("No audio files found in '{}'.".format(audio_dir))
        self.pred_category = np.full(len(self.files), True)
        if load_size is not None:
            self.data = [0]*load_size
            self.id = [0]*load_size
        else:    
            self.data = [0]*len(self.files)
            self.id = [0]*len(self.files)
        # Determine the number of mutually-exclusive categories we will
        # accomodate in our embedding table.

    def read(self):
        #Read dataset
        
        iterator = load_generic_audio(self.files, self.sample_rate, self.load_size)
        for audio, filename, category_id in iterator:
            if self.silence_threshold is not None:
                # Remove silence
                audio = trim_silence(audio[:, 0], self.silence_threshold)
                audio = audio.reshape(-1)
                if audio.size == 0:
                    self.pred_category[self.counter]=False


            self.data[self.counter] = audio
            self.id[self.counter] = category_id
            self.counter += 1
            if self.counter % 400 == 0:
                print (time.time() - self.time,self.counter)
        return self

In [3]:
audio_dir = './data_v_7_stc/audio' 
sample_rate = 16000
silence_threshold = 7e-4
audio_reader = AudioReader(audio_dir,sr,silence_threshold=silence_threshold)
audio_reader.read()

22.02117681503296 400
43.68047094345093 800
65.40355324745178 1200
89.51941466331482 1600
113.04612159729004 2000
134.56798648834229 2400
154.92541027069092 2800
175.22799634933472 3200
196.10326719284058 3600
216.31541323661804 4000
236.01640033721924 4400
258.4560263156891 4800
280.1238281726837 5200
301.908460855484 5600
324.4076609611511 6000
347.73490715026855 6400
371.001829624176 6800
393.31889939308167 7200
415.85274863243103 7600
436.7358467578888 8000
459.60129475593567 8400
480.13412261009216 8800
500.8863036632538 9200
520.6947932243347 9600
544.0434896945953 10000
563.1334483623505 10400
583.7568407058716 10800
605.5180835723877 11200


<__main__.AudioReader at 0x7f037144ffd0>

In [14]:
meta = pd.read_csv('./data_v_7_stc/meta/meta.txt',sep = '\t', names=['file', 'q', 'w', 'e', 'label'])
meta.drop(columns= ['q','w','e'],inplace=True)
to = {'background':0,'bags':1,'door':2,'keyboard':3,'knocking_door':4,'ring':5,'speech':6,'tool':7}
fr = {0:'background',1:'bags',2:'door',3:'keyboard',4:'knocking_door',5:'ring',6:'speech',7:'tool'}
meta['label'] = meta['label'].map(to)

# Feature Extraction

In [4]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz

def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav'):
    features, labels = np.empty((0,193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
            ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
            features = np.vstack([features,ext_features])
            labels = np.append(labels, fn.split('/')[2].split('-')[1])
    return np.array(features), np.array(labels, dtype = np.int)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [41]:
n=len(audio_reader.data)
features = np.zeros((n,187))
start = time.time()
for i,arr in enumerate(audio_reader.data):
    if arr.size == 0:
        continue
    stft = np.abs(librosa.stft(arr))
    mfcc=np.mean(librosa.feature.mfcc(arr,sr,n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(arr, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    features[i] = np.hstack([mfccs,chroma,contrast,mel])
    if i%500 == 0:
        print (i, time.time()-start)
meta2 = meta.set_index('file')
files = [0]*len(audio_reader.files)
for i,f in enumerate(audio_reader.files):
    files[i]=f.split('/')[-1]
targets = np.array(meta2.loc[files]['label'])
X_train, y_train = features, targets

0 0.017755508422851562
500 14.90951132774353
1000 29.10416316986084
1500 43.933130979537964




2000 58.297215700149536
2500 73.46459031105042
3000 87.80545735359192
3500 104.1183774471283
4000 118.8462381362915
4500 132.37928175926208
5000 149.08286380767822
5500 164.07082200050354
6000 180.5038948059082
6500 195.36712551116943
7000 210.31628489494324
7500 227.9818766117096
8000 242.1499993801117
8500 257.6939375400543
9000 271.8154282569885
9500 287.50893211364746
10000 304.1990976333618
10500 318.4688255786896
11000 334.30134630203247


In [43]:
test_dir='./data_v_7_stc/test'
test_reader = AudioReader(test_dir,sr,silence_threshold=silence_threshold)
test_reader.read()
n=len(test_reader.data)
X_full = np.zeros((n,187))
start = time.time()
for i,arr in enumerate(test_reader.data):
    if arr.size == 0:
        continue
    stft = np.abs(librosa.stft(arr))
    mfcc=np.mean(librosa.feature.mfcc(arr,sr,n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(arr, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    #tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(arr), sr=sr).T,axis=0)
    X_full[i] = np.hstack([mfcc,chroma,contrast,mel])
    if i%300 == 0:
        print (i, time.time()-start)
to = {'background':0,'bags':1,'door':2,'keyboard':3,'knocking':4,'ring':5,'speech':6,'tool':7}
T = np.array(test_reader.id)
X_test, y_test =X_full[T != 'unknown'],  np.array([to[x] for x in T[T != 'unknown']])
X_unknown = X_full[T == 'unknown']

0.5809955596923828 400
0 0.03862881660461426
300 10.188040494918823
600 19.95690655708313


# Classification part

In [44]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [46]:
xgb =  XGBClassifier(n_estimators=1000, max_depth=4, random_state=17)
xgb.fit(X_train, y_train)
pred = xgb.predict_proba(X_test)
idx, score = pred.argmax(axis=1), pred.max(axis=1)
accuracy_score(y_test,idx)

0.9302325581395349

In [20]:
def write_pred(est, arr,files): 
    pred = est.predict_proba(arr)
    idx, score = pred.argmax(axis=1), pred.max(axis=1)
    lables = np.array([fr[x] for x in idx])
    res = np.column_stack((files,score,lables))
    print (res)
    np.savetxt('to_ret.txt',res,delimiter='\t',fmt="%s")
write_pred(xgb,X_full,test_reader.fnames)

[['background_0077.wav' '0.9954573' 'background']
 ['bags_0003.wav' '0.9998622' 'bags']
 ['door_0023.wav' '0.99931216' 'door']
 ...
 ['background_t_0009.wav' '0.9921795' 'background']
 ['knocking_door_t_0028.wav' '0.6664679' 'background']
 ['bags_t_0014.wav' '0.99633235' 'bags']]
