In [1]:
from __future__ import print_function
from __future__ import division

import numpy as np
import tensorflow as tf
import argparse
import speech_recognition as sr
# from keras.layers import GlobalAveragePooling1D
from keras.models import load_model
from keras import backend as K
from keras.engine import InputSpec
from keras.engine.topology import Layer

from sklearn.utils import class_weight as clw
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, accuracy_score

import time
import os

#TODO: load the correct feature extraction function
from audio_features import extract_logmel as extract_feat
# import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:

class MeanPool(Layer):
    """Keras layer that averages a 3-D input over its time axis (axis 1).

    When a mask is supplied, masked timesteps are zeroed out and the sum is
    divided by the number of unmasked timesteps, so padding does not dilute
    the average. When no mask is supplied, a plain mean over axis 1 is
    returned. The mask is consumed here and not propagated further.
    """

    def __init__(self, **kwargs):
        super(MeanPool, self).__init__(**kwargs)
        # Accept an incoming mask from upstream layers.
        self.supports_masking = True
        # Inputs must be rank 3; the comments in call() treat them as
        # (batch, time, x_dim).
        self.input_spec = InputSpec(ndim=3)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the (optionally masked) mean of `x` over axis 1.

        # Arguments
            x: rank-3 input tensor.
            mask: optional tensor, (batch, time); entries that cast to 0 are
                excluded from the average.

        # Returns
            Tensor with the time axis removed.
        """
        if mask is None:
            # Without a mask every timestep counts. The original code fell
            # through to K.sum(mask, ...) here and failed on the None mask.
            return K.mean(x, axis=1)

        # mask (batch, time)
        mask = K.cast(mask, K.floatx())
        # mask (batch, x_dim, time)
        mask = K.repeat(mask, x.shape[-1])
        # mask (batch, time, x_dim)
        mask = K.permute_dimensions(mask, (0,2,1))
        x = x * mask
        # Sum of the mask along time == number of valid timesteps per feature.
        return K.sum(x, axis=1) / K.sum(mask, axis=1)

    def compute_output_shape(self, input_shape):
        # remove temporal dimension
        return (input_shape[0], input_shape[2])

In [3]:

class emoLSTM():
    """Wrapper around a pre-trained Keras model loaded from disk.

    Inputs to `evaluate` and `predict` are passed through `pad_sequences`
    before being handed to the underlying model.
    """

    def __init__(self, pre_trained):
        """Load the saved model found at `pre_trained`."""
        # MeanPool is a custom layer, so load_model has to be told about it.
        custom_layers = {'MeanPool': MeanPool}
        self.model = load_model(pre_trained, custom_objects=custom_layers)

    def evaluate(self, x_test, y_test, sample_weight=None):
        """Evaluate the model on padded `x_test` and print the results.

        Prints whatever `model.evaluate` returns, followed by the model's
        `metrics_names` so the values can be matched to their metric.
        Nothing is returned.
        """
        padded_inputs = pad_sequences(x_test)
        results = self.model.evaluate(
            x=padded_inputs,
            y=y_test,
            sample_weight=sample_weight,
        )
        # NOTE(review): an earlier commented-out format string labelled the
        # entries as loss / UA / WA -- confirm against metrics_names.
        print (results)
        print (self.model.metrics_names)

    def predict(self, x_test):
        """Return the model's predictions for the padded `x_test`."""
        padded_inputs = pad_sequences(x_test)
        return self.model.predict(padded_inputs)


In [4]:

class AudioRec(object):
    """Microphone recorder built on the `speech_recognition` package.

    Attributes are set in `__init__`:
      r   -- the `sr.Recognizer` used for calibration and capture
      src -- the `sr.Microphone` used as the audio source
    """

    def __init__(self):
        """Create the recognizer and microphone, then calibrate for noise."""
        self.r = sr.Recognizer()
        self.src = sr.Microphone()
        with self.src as mic:
            print("Calibrating microphone...")
            # Calibration is requested with duration=2.
            self.r.adjust_for_ambient_noise(mic, duration=2)

    def listen(self, save_path):
        """Capture one phrase from the microphone and write it to `save_path`.

        The file is opened in binary mode and overwritten with the WAV data
        of the captured audio.
        """
        with self.src as mic:
            print("Recording ...")
            # record for a maximum of 10s
            captured = self.r.listen(mic, phrase_time_limit=10)

        # write audio to a WAV file
        with open(save_path, "wb") as wav_file:
            wav_file.write(captured.get_wav_data())

In [5]:
def pad_sequences(mini_batch):
    """Zero-pad a batch of 2-D sequences to a common length and stack them.

    Each example is indexed as (seq_len, feat_len). Shorter examples are
    padded with trailing zero rows up to the longest `seq_len` in the batch.
    The input is not modified.

    Parameters
    ----------
    mini_batch : sequence of 2-D arrays, or a 3-D array
        Examples to pad. The feature width is taken from the first example.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_examples, max_len, feat_len) whose dtype is the
        promoted dtype of all examples.

    Raises
    ------
    ValueError
        If the batch is empty, or an example's feature width does not match
        the first example's.
    """
    # Work on a plain list of arrays. The previous np.copy(mini_batch) relied
    # on NumPy silently building an object array from ragged input, which
    # current NumPy rejects with a ValueError.
    examples = [np.asarray(example) for example in mini_batch]
    if not examples:
        raise ValueError("pad_sequences() needs at least one example")

    max_len = max(example.shape[0] for example in examples)
    feat_len = examples[0].shape[1]

    # Promote pairwise so the result dtype matches what stacking would give.
    out_dtype = examples[0].dtype
    for example in examples[1:]:
        out_dtype = np.promote_types(out_dtype, example.dtype)

    padded = np.zeros((len(examples), max_len, feat_len), dtype=out_dtype)
    for i, example in enumerate(examples):
        # Rows past the example's own length stay zero (the padding).
        padded[i, :example.shape[0], :] = example
    return padded

In [7]:
# Path of the saved model handed to emoLSTM (and from there to load_model).
weights_path = 'models/emorec_model_0703_035905.val-acc-0.5656.h5'
# True  -> keep recording from the microphone and classify every recording.
# False -> classify the existing file at `wav_path` once.
realtime_mode = True
# WAV file written by AudioRec.listen in realtime mode; read as-is otherwise.
wav_path = 'microphone-results.wav'

# previously tried 6 emotions
# labels = {0:'ang', 1:'hap', 2:'exc', 3:'sad', 4:'fru', 5:'neu'}
labels = {0:'ang', 1:'hap', 2:'sad', 3:'neu'}

enn = emoLSTM(weights_path)
enn.model.summary()

if realtime_mode:
    ar = AudioRec()
    while True:
        try:
            # Record to, and read back from, the single configured path
            # instead of repeating the filename literal.
            ar.listen(wav_path)
            features = extract_feat(wav_path)

            # extract_feat may hand back None; skip the prediction then.
            if features is not None:
                print ("Extracted features of shape: ", features.shape)
                # features[None,:] adds a leading batch axis of size 1.
                y_pred = enn.predict(features[None,:])
                pred_class = labels[np.argmax(y_pred, axis=1)[0]]
                print ("Predicted emotion is: ", pred_class)
            time.sleep(0.1)
        except KeyboardInterrupt:
            print ("Quitting realtime application..")
            break

else:
    # The original read args["audio"], but no `args` is ever defined in this
    # notebook (argparse is imported, no parser is built), so this branch
    # raised NameError. Use the configured `wav_path` instead.
    features = extract_feat(wav_path)
    if features is not None:
        print ("Extracted features of shape: ", features.shape)
        y_pred = enn.predict(features[None,:])
        pred_class = labels[np.argmax(y_pred, axis=1)[0]]
        print("Predicted emotion is: ", pred_class)
    else:
        # Same None guard as the realtime branch, but reported because there
        # is no next recording to fall back on.
        print ("No features could be extracted from: ", wav_path)