In [1]:
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import time
%matplotlib inline
# Start from the ggplot theme, then override typography for every figure
# drawn later in the notebook.
plt.style.use('ggplot')

_RC_OVERRIDES = (
    ('font.family', 'serif'),
    ('font.serif', 'Ubuntu'),
    ('font.monospace', 'Ubuntu Mono'),
    ('font.size', 12),
    ('axes.labelsize', 11),
    ('axes.labelweight', 'bold'),
    ('axes.titlesize', 14),
    ('xtick.labelsize', 10),
    ('ytick.labelsize', 10),
    ('legend.fontsize', 11),
    ('figure.titlesize', 13),
)
# Assign one key at a time so each value goes through rcParams' item setter,
# exactly as a plain `plt.rcParams[key] = value` statement would.
for _rc_key, _rc_value in _RC_OVERRIDES:
    plt.rcParams[_rc_key] = _rc_value

In [3]:
# Small offset added to mel power values before np.log so that zero-energy
# bins do not produce -inf. Written in canonical form: 10e-10 == 1e-9.
EPSILON = 1e-9
def extract_feature(file_name):
    """Compute time-averaged spectral features for one audio file.

    The file is loaded with librosa's default settings. Each feature matrix
    is transposed to (frames, coefficients) and averaged over the frame axis,
    so every returned vector has one entry per coefficient.

    Parameters
    ----------
    file_name : str
        Path of the audio file to load.

    Returns
    -------
    mfccs : np.ndarray
        Mean of 40 MFCCs over all frames.
    logMel : np.ndarray
        Mean over frames of log(mel power + EPSILON).
    mel : np.ndarray
        Mean over frames of the mel power spectrogram.
    """
    X, sample_rate = librosa.load(file_name)
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    # Pass the signal by keyword: newer librosa releases reject positional
    # audio arguments, and the keyword form also works on older ones.
    raw_mel = librosa.feature.melspectrogram(y=X, sr=sample_rate).T
    mel = np.mean(raw_mel, axis=0)
    # EPSILON keeps the logarithm finite for silent (zero-power) bins.
    logMel = np.mean(np.log(raw_mel + EPSILON), axis=0)
    # Chroma, spectral-contrast and tonnetz features were disabled in this
    # notebook, so the STFT that only they consumed is no longer computed.
    return mfccs, logMel, mel

def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav'):
    """Extract one feature row per audio file found under the given folders.

    Parameters
    ----------
    parent_dir : str
        Directory that contains the sub-directories to scan.
    sub_dirs : iterable of str
        Sub-directory names (relative to ``parent_dir``) to scan.
    file_ext : str
        Glob pattern selecting the audio files inside each sub-directory.

    Returns
    -------
    features : np.ndarray
        One row per file: the MFCC, log-mel and mel vectors from
        ``extract_feature`` concatenated. Shape (0, 296) when no file matches.
    labels : np.ndarray of int
        Class id taken from the second '-'-separated field of the file name
        (e.g. ``7061-6-0-0.wav`` -> 6).
    namePath : np.ndarray of bytes
        Path of the file behind each row.

    Raises
    ------
    ValueError, IndexError
        If a matched file name does not carry an integer in its second
        '-'-separated field.
    """
    # Width kept for the empty result so callers see the same shape the
    # original pre-allocated array had (40 MFCC + 128 log-mel + 128 mel).
    empty_width = 296
    rows, labels, name_paths = [], [], []
    for sub_dir in sub_dirs:
        pattern = os.path.join(parent_dir, sub_dir, file_ext)
        # sorted() makes the row order reproducible; glob order is arbitrary.
        for fn in sorted(glob.glob(pattern)):
            mfccs, logmel, mel = extract_feature(fn)
            rows.append(np.hstack([mfccs, logmel, mel]))
            # basename() instead of split('/') so Windows paths work too.
            labels.append(int(os.path.basename(fn).split('-')[1]))
            name_paths.append(fn)
    # Stack once at the end: growing arrays inside the loop copies all
    # previous rows on every iteration.
    features = np.vstack(rows) if rows else np.empty((0, empty_width))
    # The builtin int and np.bytes_ replace the removed np.int / np.string_ aliases.
    return features, np.array(labels, dtype=int), np.array(name_paths, dtype=np.bytes_)

def one_hot_encode(labels):
    """Turn a 1-D sequence of integer class ids into a one-hot matrix.

    Parameters
    ----------
    labels : array-like of int
        Class id of each sample. Values are cast to int.

    Returns
    -------
    np.ndarray
        Float matrix of shape (n_samples, n_classes) with a single 1 per row.
        ``n_classes`` is the larger of the number of distinct ids and
        ``max(labels) + 1``, so ids that skip values (e.g. [0, 3]) still fit.
        An empty input yields an array of shape (0, 0).
    """
    labels = np.asarray(labels, dtype=int)
    n_labels = len(labels)
    if n_labels == 0:
        return np.zeros((0, 0))
    # Counting distinct ids alone under-sizes the matrix when some class id
    # is absent from the data; the max() keeps the previous width whenever
    # that width was already sufficient.
    n_classes = max(len(np.unique(labels)), int(labels.max()) + 1)
    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), labels] = 1
    return encoded

def one_hot_decode(labels):
    """Recover class indices from a 2-D one-hot matrix.

    For every row, the result holds the index of the first column whose
    value equals 1; rows without any 1 decode to 0. The result is a float
    array with one entry per row.
    """
    row_count, class_count = np.shape(labels)
    decoded = np.zeros((row_count))
    for row in range(row_count):
        # next() stops at the first matching column and falls back to 0,
        # the value the zero-initialised output already holds.
        decoded[row] = next(
            (col for col in range(class_count) if labels[row, col] == 1), 0
        )
    return decoded

In [4]:
# Run the extractor on a single clip and record the length of each of the
# three returned vectors (MFCC, log-mel, mel).
x = extract_feature("../../data/UrbanSound8K/audio/fold1/7061-6-0-0.wav")
nfMfccs, nfLogMel, nfMel = (np.shape(vector)[0] for vector in x)

In [5]:
# Single-argument print(...) produces the same text on Python 2 and is valid
# Python 3, unlike the print statement.
print("Number of features")
print("* Mfcc: {0}".format(nfMfccs))
print("* logMel {0}".format(nfLogMel))
print("* Mel {0}".format(nfMel))

Number of features
* Mfcc: 40
* logMel 128
* Mel 128


In [7]:
# Extract features for every clip in the selected folds and time the run.
parent_dir = '../../data/UrbanSound8K/audio/'
sub_dirs = ['fold1']
# Single-argument print(...) behaves the same on Python 2 and runs on Python 3.
print("---Extracting features.... ---")
start_time = time.time()
features, labels, file_names = parse_audio_files(parent_dir,sub_dirs)
print("---Loading time: {0} seconds ---".format(time.time() - start_time))

---Extracting features.... ---
---Loading time: 192.633636951 seconds ---


In [8]:
# One-hot encode the integer class labels returned by parse_audio_files.
one_hot_labels = one_hot_encode(labels)

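### Shingling

Stack `window_length` consecutive log-mel frames into one long feature vector, moving the window start forward by `hop_length` frames each time.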

In [54]:
# Shingling parameters; extract_shingled_features reads both as globals.
window_length = 10  # number of consecutive feature rows stacked per shingle
hop_length = 5  # number of rows the window start advances between shingles

In [72]:
def shingle_features(feature, window_length, hop_length):
    """Flatten overlapping windows of rows into one sample per window.

    Parameters
    ----------
    feature : array-like, 2-D
        Matrix with one row per frame.
    window_length : int
        Number of consecutive rows concatenated into each output sample.
    hop_length : int
        Number of rows between the starts of successive windows.

    Returns
    -------
    np.ndarray
        Float array of shape (n_windows, n_columns * window_length), where
        n_windows = (n_rows - window_length) // hop_length + 1, or 0 when the
        input has fewer rows than one window. Row ``w`` is rows
        ``w*hop_length`` to ``w*hop_length + window_length - 1`` of
        ``feature`` laid end to end.

    Raises
    ------
    ValueError
        If ``window_length`` or ``hop_length`` is not positive, or if
        ``feature`` is not 2-D.
    """
    feature = np.asarray(feature)
    window_length = int(window_length)
    hop_length = int(hop_length)
    if window_length <= 0 or hop_length <= 0:
        raise ValueError("window_length and hop_length must be positive")
    nFrames, nFeatures = np.shape(feature)
    # Integer sliding-window count. The earlier expression
    # (nFrames - hop) / (window - hop) only matched this when
    # window_length == 2 * hop_length, and produced a float that cannot be
    # used as an array dimension or index.
    if nFrames < window_length:
        nMovingWindow = 0
    else:
        nMovingWindow = (nFrames - window_length) // hop_length + 1
    new_samples = np.zeros((nMovingWindow, nFeatures * window_length))
    for window in range(nMovingWindow):
        start = window * hop_length
        # A row-major flatten of the slice equals concatenating its rows.
        new_samples[window, :] = feature[start:start + window_length].reshape(-1)
    return new_samples

In [73]:
def extract_shingled_features(file_name):
    """Load an audio file and return its shingled log-mel spectrogram.

    The mel power spectrogram is transposed to (frames, mel bands), offset by
    EPSILON, passed through a natural log and then windowed by
    ``shingle_features`` using the module-level ``window_length`` and
    ``hop_length``.

    Parameters
    ----------
    file_name : str
        Path of the audio file to load.

    Returns
    -------
    np.ndarray
        One row per window, each row holding ``window_length`` consecutive
        log-mel frames laid end to end.
    """
    X, sample_rate = librosa.load(file_name)
    # Keyword `y=`: newer librosa releases reject a positional signal.
    raw_mel = librosa.feature.melspectrogram(y=X, sr=sample_rate).T
    # EPSILON keeps the logarithm finite for zero-power bins.
    logmel = np.log(raw_mel + EPSILON)
    shingling = shingle_features(logmel, window_length=window_length, hop_length=hop_length)
    return shingling

In [74]:
# Try the shingled extractor on one clip. The function is defined as
# extract_shingled_features; the previous spelling was an undefined name.
k = extract_shingled_features("../../data/UrbanSound8K/audio/fold1/7061-6-0-0.wav")

10
Start: 0.0. End: 10.0
Start: 5.0. End: 15.0
Start: 10.0. End: 20.0
Start: 15.0. End: 25.0
Start: 20.0. End: 30.0
Start: 25.0. End: 35.0
Start: 30.0. End: 40.0
Start: 35.0. End: 45.0
Start: 40.0. End: 50.0
Start: 45.0. End: 55.0
Start: 50.0. End: 60.0
Start: 55.0. End: 65.0
Start: 60.0. End: 70.0
Start: 65.0. End: 75.0
Start: 70.0. End: 80.0
Start: 75.0. End: 85.0
Start: 80.0. End: 90.0
Start: 85.0. End: 95.0




In [75]:
def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav', shingling=False):
    """Extract features for every audio file found under the given folders.

    This redefines the ``parse_audio_files`` declared earlier in the notebook
    and adds the optional ``shingling`` mode.

    Parameters
    ----------
    parent_dir : str
        Directory that contains the sub-directories to scan.
    sub_dirs : iterable of str
        Sub-directory names (relative to ``parent_dir``) to scan.
    file_ext : str
        Glob pattern selecting the audio files inside each sub-directory.
    shingling : bool
        False (default): one row per file, the MFCC, log-mel and mel vectors
        from ``extract_feature`` concatenated.
        True: one row per window returned by ``extract_shingled_features``;
        the file's label and path are repeated for each of its windows.

    Returns
    -------
    features : np.ndarray
        Stacked feature rows. When nothing matches, the shape is (0, 296)
        without shingling and (0, 0) with shingling.
    labels : np.ndarray of int
        Class id of each row, taken from the second '-'-separated field of
        the file name (e.g. ``7061-6-0-0.wav`` -> 6).
    namePath : np.ndarray of bytes
        Path of the source file of each row.

    Raises
    ------
    ValueError, IndexError
        If a matched file name does not carry an integer in its second
        '-'-separated field.
    """
    # 40 MFCC + 128 log-mel + 128 mel: width of the un-shingled empty result.
    empty_width = 296
    blocks, labels, name_paths = [], [], []
    for sub_dir in sub_dirs:
        pattern = os.path.join(parent_dir, sub_dir, file_ext)
        # sorted() makes the row order reproducible; glob order is arbitrary.
        for fn in sorted(glob.glob(pattern)):
            if shingling:
                # 2-D block: one row per window of this file.
                block = extract_shingled_features(fn)
            else:
                mfccs, logmel, mel = extract_feature(fn)
                # Promote the single feature vector to a 1-row block.
                block = np.hstack([mfccs, logmel, mel])[np.newaxis, :]
            # basename() instead of split('/') so Windows paths work too.
            class_id = int(os.path.basename(fn).split('-')[1])
            blocks.append(block)
            labels.extend([class_id] * len(block))
            name_paths.extend([fn] * len(block))
    if blocks:
        # Stack once: growing arrays inside the loop re-copies earlier rows.
        features = np.vstack(blocks)
    else:
        features = np.empty((0, 0 if shingling else empty_width))
    # The builtin int and np.bytes_ replace the removed np.int / np.string_ aliases.
    return features, np.array(labels, dtype=int), np.array(name_paths, dtype=np.bytes_)

(18, 1280)