In [125]:
seed = 1356

import numpy as np
np.random.seed(seed)
from tensorflow import set_random_seed
set_random_seed(seed)

import sklearn
import cv2
import random
import math
import os
import datetime

from itertools import chain
from collections import Counter
from sklearn.metrics import f1_score, confusion_matrix

import matplotlib.pyplot as plt

# constants
img_folder = 'audio'
img_name = ['_pressure.png', '_spec1.png', '_spec2.png', '_spec3.png', '.wav']

ismap = {0:'absence', 1:'cooking', 2:'dishwashing', 3:'eating', 4:'other', 5:'social activity', 6:'vacuum cleaning', 7:'watching tv', 8:'working'}

im_size = 64
im_size_flat = im_size * im_size
n_labels = 9
n_channels = 1 # grayscale
sd = np.sqrt(2) / np.sqrt(im_size_flat)

y_length = 160000
hop_length = 40000

In [2]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from keras.models import load_model

Using TensorFlow backend.


In [14]:
import librosa
import librosa.display

afs, labels, _ = zip(*[line.rstrip('\n').split('\t') for line in open('meta.txt').readlines()])

In [4]:
def get_ims(links):
    return np.array([(1 - cv2.imread(link, 0)/255).reshape((im_size, im_size, 1)) for link in links])
def get_wav(links):
    temp = []
    for link in links:
#         y, sr = librosa.load(link, sr = None, mono = True)
#         temp.append(librosa.feature.mfcc(y=y,sr=sr).T)
        temp.append(mfccs[link])
    return np.array(temp)
def get_wav2(links):
    temp = []
    for link in links:
#         y, sr = librosa.load(link, sr = None, mono = True)
#         temp.append(librosa.feature.mfcc(y=y,sr=sr).T)
        temp.append(mfccs2[link])
    return np.array(temp)

In [5]:
def data_loader(files, batch_size, file_type = 0):
    L = len(files)
    while True:
        batch_start = 0
        batch_end = batch_size
        
        while batch_start < L:
            lim = min(L, batch_end)
            if file_type == 0:
                X = get_ims([f[0] for f in files[batch_start:lim]])
                Y = np.array([f[1] for f in files[batch_start:lim]])
                yield (X, Y)
            elif file_type == 1:
                # wav files
                X = get_wav([f[0] for f in files[batch_start:lim]])
                Y = np.array([f[1] for f in files[batch_start:lim]])
                yield (X, Y)
            batch_start += batch_size
            batch_end += batch_size
            
def data_loader2(files, batch_size, file_type = 0):
    L = len(files)
    while True:
        batch_start = 0
        batch_end = batch_size
        
        while batch_start < L:
            lim = min(L, batch_end)
            if file_type == 0:
                X = get_ims(files[batch_start:lim])
                yield X
            elif file_type == 1:
                # wav files
                X = get_wav2(files[batch_start:lim])
                yield X
            batch_start += batch_size
            batch_end += batch_size

In [119]:
def pred(sr, ys, model1, model2, model3):        
    weights = [0.3, 0.4, 0.3]
    
#     model1 = load_model('models/model_2D_small_3.hdf5')
#     model2 = load_model('models/model_1D_norm.hdf5')
#     model3 = load_model('models/model_LSTM_norm.hdf5')
    
    preds = []
    
    for y in ys:
#         print("Generating MFCCs")
        y_feat_1d = get_1d_np(y, sr)
#         print("Generating 2D spectrogram")
        y_feat_2d = get_2d_np(y, sr)
        
        pred = weights[0] * model1.predict(np.array([y_feat_2d,]))
        pred = weights[1] * model2.predict(np.array([y_feat_1d,]))
        pred += weights[2] * model3.predict(np.array([y_feat_1d,]))
        
        preds.append(pred)
    return preds
        
#     afs = [line.rstrip('\n') for line in open('eval/eval/meta.txt').readlines()]
#     tlbatch = int(math.ceil(len(afs)/50))
    
#     model1_eval_data = ['eval/eval/src/' + af[6:-4] + '_spec3.png' for af in afs]
#     model23_eval_data = afs
    
#     preds = weights[0] * model1.predict_generator(data_loader2(model1_eval_data, 50, 0),
#                                                   steps = tlbatch,
#                                                   verbose = 1)
    
#     preds += weights[1] * model2.predict_generator(data_loader2(model23_eval_data, 50, 1),
#                                                    steps = tlbatch,
#                                                    verbose = 1)
    
#     preds += weights[2] * model3.predict_generator(data_loader2(model23_eval_data, 50, 1),
#                                                    steps = tlbatch,
#                                                    verbose = 1)
    
#     predsT = np.argmax(preds, axis = 1)
#     return predsT
#     res = {af:ismap[pred] for af,pred in zip(afs,preds)}
#     return res

In [44]:
# y, sr = librosa.load(y=y, sr=None, mono=True)
# y.shape = (160000, ) # 10sec

def get_1d_np(y, sr):
    return librosa.feature.mfcc(y = y, sr = sr).T
    
def get_2d_np(y, sr):
    fig = plt.figure()
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    fig.set_size_inches(1, 1)
    D = librosa.power_to_db(np.abs(librosa.stft(y))**2, ref=np.max)
    librosa.display.specshow(D, cmap='gray_r', y_axis='log')
    plt.savefig('tmp/tmp.png', dpi=64)
    plt.close()
    return np.array((1 - cv2.imread('tmp/tmp.png', 0)/255).reshape((im_size, im_size, 1)))

In [100]:
def make_audio(audio_file, hop_length = hop_length):
    y, sr = librosa.load(audio_file, sr=None, mono=True)
    if len(y) < y_length:
        return (sr, [np.pad(y, (0, y_length - len(y)), 'constant', constant_values = 0)])
    elif len(y) > y_length:
        final_start = len(y)//hop_length * hop_length
        l_final = y_length - (len(y) - final_start)
        b = [0] * l_final
        tmp = [(list(y)+list(b))[i:y_length+i] for i in range(0, len(y), hop_length)]
        return (sr, [np.array(ar) for ar in tmp if len(ar) == y_length])
    else:
        return (sr, [y])

In [55]:
model1 = load_model('models/model_2D_small_3.hdf5')
model2 = load_model('models/model_1D_norm.hdf5')
model3 = load_model('models/model_LSTM_norm.hdf5')

In [117]:
from collections import Counter

def avgpreds(preds):
    return [ismap[p] for p in np.argmax(np.array([sum(i) for i in zip(*preds)]), axis = 1)][0]
def indivpreds(preds):
    data = [ismap[p[0]] for p in np.argmax(preds, axis = 2)]
    return max(data, key = Counter(data).get)

In [124]:
import os

for fn in os.listdir("audio"):
    print(fn)
    sr, ys = make_audio("audio/" + fn)
    print(len(ys))
    preds = pred(sr, ys, model1, model2, model3)
    print(avgpreds(preds))
    print(indivpreds(preds))
    print()

cooking1.wav
1
cooking
cooking

cooking2.wav
56
social activity
social activity

cooking3.wav
74
absence
absence

cooking4_1.wav
20
social activity
social activity

dishwashing1.wav
23
other
other

dishwashing2_1.wav
29
social activity
social activity

dishwashing2_2.wav
27
social activity
social activity

dishwashing3_1.wav
11
social activity
absence

dishwashing3_2.wav
13
social activity
social activity

dishwashing4_1.wav
23
vacuum cleaning
vacuum cleaning

dishwashing4_2.wav
25
vacuum cleaning
vacuum cleaning

eating1_1.wav
13
social activity
social activity

eating2_1.wav
10
working
working

eating3_1.wav
13
other
dishwashing

eating4_1.wav
12
working
working

eating4_2.wav
14
working
working

vacuum1_1.wav
12
social activity
social activity

vacuum1_2.wav
13
other
other

vacuum1_3.wav
15
social activity
social activity

vacuum2.wav
22
vacuum cleaning
vacuum cleaning

vacuum3.wav
13
social activity
social activity

vacuum4_1.wav
15
social activity
social activity

vacuum4_2.wav
18

In [126]:
import os

for fn in os.listdir("unedited audio"):
    print(fn)
    sr, ys = make_audio("unedited audio/" + fn)
    print(len(ys))
    preds = pred(sr, ys, model1, model2, model3)
    print(avgpreds(preds))
    print(indivpreds(preds))
    print()

cooking1.wav
1
cooking
cooking

cooking2.wav
56
social activity
social activity

cooking3.wav
74


KeyboardInterrupt: 

<matplotlib.figure.Figure at 0xa9e96a6cf8>