In [106]:
import glob
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
%matplotlib inline
plt.style.use('ggplot')

In [107]:
# time interval for sampling is 0.5 second
TIME_INTERVAL = 0.5
# audio length is 60 seconds
AUDIO_LENGTH = 60.0

In [108]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2)

def extract_feature(file_name, time_interval=1, num_mfcc=40):
    X, sample_rate = librosa.load(file_name)
    mfccs = []
    window_size = round(sample_rate * time_interval)
    
    for (start,end) in windows(X, window_size):
        if len(X[start:end]) != window_size:
            continue
        mfcc = np.mean(librosa.feature.mfcc(y=X[start:end], sr=sample_rate, n_mfcc=num_mfcc).T,axis=0)
#         mfccs = np.append(mfccs, mfcc)
        
        mfccs.append(mfcc)
    
#     print(mfccs.shape)
    return mfccs

sound_class_table = {
    'air_conditioner' : 0,
    'car_horn' : 1,
    'children_playing' : 2,
    'dog_bark' : 3,
    'drilling' : 4,
    'engine_idling' : 5,
    'gun_shot' : 6,
    'jackhammer' : 7,
    'siren' : 8,
    'street_music' : 9
}

def extract_label(fn):

    label_file_path = fn.replace('wav', 'txt')

    with open(label_file_path) as fd:
        lines = fd.readlines()
        time_sections_with_label = list(map(lambda x: (float(x[0]), float(x[1]), x[2]), map(lambda x : x.split(), lines)))

    labels = []
    start_time_point = TIME_INTERVAL

    for t in np.arange(start_time_point, AUDIO_LENGTH, TIME_INTERVAL):
        curr_label = np.zeros(len(sound_class_table), dtype=np.int)
        for time_section in time_sections_with_label:
            if t < time_section[0] or t > time_section[1]:
                continue
            curr_label[sound_class_table[time_section[2]]] += 1

        labels.append(curr_label)

    return labels
    
def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav'):
    features, labels = [], []
    
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            
            before = datetime.now()
            
            mfccs = extract_feature(fn)
            features.append(mfccs)
            label = extract_label(fn)
            labels.append(label)
            
            after = datetime.now()
            
            time_took = (after - before).total_seconds() * 1000
            print('Soundscape %s took %d ms to generate' % (fn, time_took))
            
    return np.array(features), np.array(labels)

In [109]:
parent_dir = '.'

tr_sub_dirs = ['tmp']
tr_features,tr_labels = parse_audio_files(parent_dir,tr_sub_dirs)


# tr_labels = one_hot_encode(tr_labels)

Soundscape ./tmp/soundscape_6.wav took 3151 ms to generate
Soundscape ./tmp/soundscape_7.wav took 3084 ms to generate
Soundscape ./tmp/soundscape_5.wav took 3044 ms to generate
Soundscape ./tmp/soundscape_4.wav took 3039 ms to generate
Soundscape ./tmp/soundscape_0.wav took 3043 ms to generate
Soundscape ./tmp/soundscape_1.wav took 3040 ms to generate
Soundscape ./tmp/soundscape_3.wav took 3020 ms to generate
Soundscape ./tmp/soundscape_2.wav took 3084 ms to generate
Soundscape ./tmp/soundscape_9.wav took 3047 ms to generate
Soundscape ./tmp/soundscape_8.wav took 3045 ms to generate


In [110]:
print(tr_features.shape)
print(tr_labels.shape)

X_train = tr_features
y_train = tr_labels

X_val = X_train
y_val = y_train

(10, 119, 40)
(10, 119, 10)


In [89]:
from __future__ import print_function
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.engine.training import _slice_arrays
from keras.layers import (Activation, TimeDistributed,
                          Dense, RepeatVector, recurrent)
import numpy as np
from six.moves import range
from keras.datasets import mnist
from keras.layers import Dense
from keras import backend as K

Using TensorFlow backend.


In [93]:
def build_model(input_shape, output_shape, rnn_type, hidden_rnn_layers, hidden_rnn_size):
    model = Sequential()

    # the recurrent part is applied to the sequence
    model.add(rnn_type(hidden_rnn_size, input_shape=input_shape))
    
    model.add(RepeatVector(output_shape[1]))
    
    for _ in range(hidden_rnn_layers):
        model.add(rnn_type(hidden_rnn_size, return_sequences=True))

    # For each of step of the output sequence,
    model.add(TimeDistributed(Dense(output_shape[2])))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [98]:
model = build_model(X_train.shape[1:], y_train.shape[1:], recurrent.LSTM,
                    hidden_rnn_layers=1, hidden_rnn_size=128)

In [100]:
# Need at least 50000 soundscapes

for iteration in range(1, 100):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X_train, y_train, validation_data=(X_val, y_val),
              batch_size=2, epochs=3, verbose=2)
    
    ind = np.random.randint(0, len(X_val))
    
    rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]
    
    pred = model.predict_classes(rowX, verbose=0)
    
    print('acutal: ', rowy[0])
    print('pred: ', pred[0])


--------------------------------------------------
Iteration 1
Train on 10 samples, validate on 10 samples
Epoch 1/3
0s - loss: 2.1568 - acc: 0.2000 - val_loss: 2.1199 - val_acc: 0.2300
Epoch 2/3
0s - loss: 2.1199 - acc: 0.2600 - val_loss: 2.0798 - val_acc: 0.2600
Epoch 3/3
0s - loss: 2.0772 - acc: 0.2500 - val_loss: 2.0407 - val_acc: 0.2400
acutal:  [[0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]]
pred:  [0 4 4 4 4 4 4 5 5 5]

--------------------------------------------------
Iteration 2
Train on 10 samples, validate on 10 samples
Epoch 1/3
0s - loss: 2.0371 - acc: 0.2500 - val_loss: 2.0041 - val_acc: 0.2800
Epoch 2/3
0s - loss: 1.9944 - acc: 0.2800 - val_loss: 1.9629 - val_acc: 0.2600
Epoch 3/3
0s - loss: 1.9610 - acc: 0.2800 - val_loss: 1.9217 - val_acc: 0.2700
acutal:  [[1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0