In [18]:
# Importing required libraries 
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling2D, Conv2D, MaxPooling2D, AveragePooling2D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import wave
import struct
import IPython.display as ipd  # To play sound in the notebook
import python_speech_features as ps
import seaborn as sns
import scipy

import pyaudio
import math
import datetime
from concurrent.futures.process import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import threading
import time
from multiprocessing import Process

In [68]:
class emotionAnalizer():
    def __init__(self):
        super().__init__()
        self.alive = True
        self.extractMellalive = False
        #inputAudioに必要な初期化
        self.CHUNK=1024
        self.RATE = 44100
        self.wave = np.empty((0,1024),int)
        self.audio = pyaudio.PyAudio()
        self.stream=self.audio.open(format = pyaudio.paInt16,
            channels = 1,
            rate = self.RATE,
            frames_per_buffer = self.CHUNK,
            input = True,
            output=True)
        #detectAudioに必要な初期化
        self.spaceleng = 10
        self.voiceleng = 10
        self.sokuonleng = 3
        self.winlen = 0.08
        self.winstep = 0.016
        self.nfilt = 40 #周波数の分解能
        self.waddr = 0
        self.ave = np.empty((1,), float) #diffで過去のデータを参照するなので1つ余分に作っておく。
        self.diff = np.empty((0,), float)
        self.silent = 0
        self.voice  = 0
        self.standby = -1
        self.isVoice = 0
        self.voiceOnWave = np.empty((0,2,2), int)
        #extractMellに必要な初期化
        self.pastVoiceOnWaveLeng = 0
        self.lastwaddr = 0
        self.features = np.zeros((0, 300, self.nfilt, 3))
        self.eps = 1e-5
        with open('mean_and_std.pkl', 'rb') as file:
          self.mean1,self.std1,self.mean2,self.std2,self.mean3,self.std3 = pickle.load(file)
        #emotionRecognitionに必要な初期化
        self.faddr = 0
        self.emotionResult = np.zeros((0,14))
        # loading json and model architecture 
        json_file = open('model_json.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        self.loaded_model = model_from_json(loaded_model_json)

        # load weights into new model
        self.loaded_model.load_weights("saved_models/Emotion_Model.h5")
        print("Loaded model from disk")

    def inputAudio(self):
            #print("inputAudio Start")
            while self.alive:
                ret = self.stream.read(self.CHUNK, exception_on_overflow = False)
                self.stream.write(ret)
                self.wave = np.append(self.wave, np.expand_dims(np.frombuffer(ret, dtype="int16"), 0), axis=0)
            #print("inputAudio End")
        
    def detectVoice(self):
        #print("detectVoice Start")
        #time.sleep(0.001)
        if self.wave.shape[0] > 0:
            self.waddr = self.wave.shape[0]-1 #wave変数の最新のデータは、その時のwaveの要素数から1引いたもの
            self.ave = np.append(self.ave, np.mean(np.abs(self.wave[self.waddr])))
            self.diff = np.append(self.diff, self.ave[-2]-self.ave[-1])
            #声の部分か判定する。
            if np.abs(self.diff[self.waddr]) > 100:
                self.silent = 0
                self.voice  = self.sokuonleng
            else:
                self.silent += 1
                self.voice  -= 1 if self.voice > 0 else 0
            #発話していないなら
            if self.isVoice == 0:
                if self.standby == -1:
                    #スタンバイでない時、音を検知したら、スタンバイする(その時の時間を覚えておく)。
                    if self.voice == self.sokuonleng:
                        self.standby = self.waddr - 1 #発声する瞬間も大事な特微量なので、現在のアドレスより1コマ過去のものから判定開始
                else:
                    if self.waddr-self.standby >= self.voiceleng:
                        #voiceleng以上発話していれば、isRecをアクティブに。
                        if self.voice != 0:
                            print("Record start{}".format(self.standby, self.waddr))
                            self.isVoice = 1
                            self.voiceOnWave = np.append(self.voiceOnWave, np.zeros((1,2,2), int),0)
                            self.voiceOnWave[-1,0,0] = self.standby
                        #voicelengより短い発話だったら、スタンバイ解除
                        elif self.voice == 0:
                            self.standby = -1
            #発話しているなら
            else:
                #もしも、一定時間以上静寂なら、isRecをディスアクティブに。
                if self.silent > self.spaceleng or self.alive == False:
                    #print("--->{}".format(self.waddr-self.spaceleng, self.waddr))
                    self.isVoice = 0
                    self.standby = -1
                    self.voiceOnWave[-1,1,0] = self.waddr - self.spaceleng + 3 #+3して発話後に若干余白をとる。
        #print("detectVoice End")
            
    def mellCepstrum(self, start, end=None):
        #Mellの特微量を生成
        if end != None:
            mel_spec = ps.logfbank(self.wave[start:end].reshape(-1), samplerate=self.RATE, winlen=self.winlen, winstep=self.winstep, nfilt=self.nfilt, nfft=int(self.winlen*self.RATE))
        else:
            mel_spec = ps.logfbank(self.wave[start].reshape(-1), samplerate=self.RATE, winlen=self.winlen, winstep=self.winstep, nfilt=self.nfilt, nfft=int(self.winlen*self.RATE))
        #deltaの特微量を生成
        delta1= ps.delta(mel_spec, 4)
        #delta-deltaの特微量を生成
        delta2 = ps.delta(delta1, 4)
        procd = np.empty((mel_spec.shape[0], self.nfilt, 3))
        procd[:,:,0] = mel_spec#(mel_spec - self.mean1)/(self.std1+self.eps) #mel_spec#
        procd[:,:,1] = delta1#(delta1 - self.mean2)/(self.std2+self.eps) #delta1#
        procd[:,:,2] = delta2#(delta2 - self.mean3)/(self.std3+self.eps) #delta2#
        return procd
    
    def mellshape(self, arrayelem=1):
        datalen=self.CHUNK/self.RATE
        shape=(datalen*arrayelem-self.winlen)/self.winstep+1 if arrayelem > 1 else 1
        return math.ceil(shape)
                
    def extractMell(self):
        #print("extractMell Start")
        #time.sleep(0.001)
        #voiceOnWave上に新しい声が登録されたら
        if self.voiceOnWave.shape[0] > self.pastVoiceOnWaveLeng:
            #isVoiceがアクティブになった直後なら
            if self.voiceOnWave[-1,0,0] + self.voiceleng == self.waddr: 
                self.lastwaddr = self.voiceOnWave[-1,0,0]
                self.voiceOnWave[-1,0,1] = self.features.shape[0]
                self.extractMellalive = True
            #メル周波数ケプストラムの変換後のフレームが300フレーム以上になったら
            if self.mellshape(self.waddr-self.lastwaddr) >= 300 and self.alive == True:
                tmp = self.mellCepstrum(self.lastwaddr, self.waddr)
                self.features = np.append(self.features, np.zeros((1, 300, self.nfilt, 3)),0) 
                self.features[-1] = tmp[0:300]
                self.lastwaddr = self.waddr
            #isVoiceがディアクティブになったら(レコーディングが終了したら)
            elif self.voiceOnWave[-1,1,0] > 0 or self.alive == False:
                #type1 この処理をした時だけ、必ずAngryとHappyが下がり、Sadが急上昇する。
                #tmp = self.mellCepstrum(self.lastwaddr, self.waddr)
                #self.features = np.append(self.features, np.zeros((1, 300, self.nfilt, 3)),0)
                #self.features[-1,:-tmp.shape[0]] = self.features[-2,tmp.shape[0]:]
                #self.features[-1,-tmp.shape[0]:] = tmp
                #type2 この処理をした時だけ、必ずAngryとHappyが下がり、Sadが急上昇する。
                tmp = self.mellCepstrum(self.lastwaddr, self.waddr)
                self.features = np.append(self.features, np.zeros((1, 300, self.nfilt, 3)),0)
                self.features[-1,:tmp.shape[0]] = tmp
                #type3 この処理をした時だけ、必ずAngryとHappyが下がり、Sadが急上昇する。
                #tmp = self.mellCepstrum(self.waddr-76, self.waddr)
                #self.features = np.append(self.features, np.zeros((1, 300, self.nfilt, 3)),0)
                #self.features[-1,:tmp.shape[0]] = tmp
                self.voiceOnWave[-1,1,1] = self.features.shape[0]
                self.pastVoiceOnWaveLeng = self.voiceOnWave.shape[0]
                self.extractMellalive = False
        #print("extractMell End")
    
    def inverselabel(self, value):
        tag = {'female_angry':0, 'female_disgust':1, 'female_fear':2, 'female_happy':3,
     'female_neutral':4, 'female_sad':5, 'female_surprise':6, 'male_angry':7,
     'male_disgust':8, 'male_fear':9, 'male_happy':10, 'male_neutral':11, 'male_sad':12,
     'male_surprise':13}
        return [k for k, v in tag.items() if v == value][0]
    
    def emotionRecognition(self):
        #print("emotionRecognition Start")
        #time.sleep(0.001)
        if self.faddr < self.features.shape[0]:
            preds = self.loaded_model.predict_step(np.expand_dims(self.features[self.faddr],axis=0))
            #print(preds.argmax(axis=1))
            self.emotionResult = np.append(self.emotionResult, preds, 0)
            print("{}".format(self.inverselabel(self.emotionResult[-1].argmax())))
            self.faddr +=1
        #print("emotionRecognition End")
        
    def terminate(self):
        self.stream.stop_stream()
        self.stream.close()
    
    def alpha(self):
        #print("alpha start")
        ret = self.stream.read(self.CHUNK, exception_on_overflow = False)
        self.stream.write(ret)
        self.wave = np.append(self.wave, np.expand_dims(np.frombuffer(ret, dtype="int16"), 0), axis=0)
        #print("alpha end")
    
    def bravo(self):
        #print("bravo start")
        time.sleep(0.001)
        #print("bravo end")
        
    def charlie(self):
        #print("charlie start")
        time.sleep(0.001)
        #print("charlie end")
    
    def delta(self):
        #print("delta start")
        time.sleep(0.001)
        #print("delta end")
    
    def run(self):
        """
        with ThreadPoolExecutor(max_workers=4) as executor:
            while self.alive:
                executor.submit(fn=self.alpha)
                executor.submit(fn=self.bravo)
                executor.submit(fn=self.charlie)
                executor.submit(fn=self.delta)
        """
        """
        with ThreadPoolExecutor(max_workers=8) as executor:
            executor.submit(fn=self.inputAudio())
        """
        iA = Process(target=self.inputAudio())
        iA.start()
        while self.alive or self.extractMellalive:
            print("T_T")
            self.detectVoice()
            self.extractMell()
            self.emotionRecognition()
        self.terminate()

In [75]:
eAn = emotionAnalizer()
try:
    eAn.run()
except KeyboardInterrupt:
    eAn.alive = False
    print("Realtime emotion recognition stoped.")

Loaded model from disk
Realtime emotion recognition stoped.


#### 保存用コマンド

In [98]:
dt_now = datetime.datetime.now()
os.makedirs("output", exist_ok=True)
os.makedirs("output/"+str(dt_now.isoformat()), exist_ok=True)

In [99]:
for no in eAn.voiceOnWave:
    Y=eAn.wave[no[0,0]:no[1,0]].reshape(-1)
    outd = struct.pack("h" * len(Y), *Y)
    filename = "output/"+str(dt_now.isoformat())+ "/trimAt" + str(no[0,0])+"-"+str(no[1,0]) + ".wav"
    # 書き出し
    with wave.open(filename, 'w') as ww:
        ww.setnchannels(1)
        ww.setsampwidth(2)
        ww.setframerate(44100)
        ww.writeframes(outd)

In [100]:
Y=eAn.wave.reshape(-1)
outd = struct.pack("h" * len(Y), *Y)
filename = "output/"+str(dt_now.isoformat())+ "/All.wav"
# 書き出し
with wave.open(filename, 'w') as ww:
    ww.setnchannels(1)
    ww.setsampwidth(2)
    ww.setframerate(44100)
    ww.writeframes(outd)

### test

In [41]:
def inputAudio():
        #print("inputAudio Start")
        alive = True
        CHUNK=1024
        RATE = 44100
        wave = np.empty((0,1024),int)
        audio = pyaudio.PyAudio()
        stream=audio.open(format = pyaudio.paInt16,
                    channels = 1,
                    rate = RATE,
                    frames_per_buffer = CHUNK,
                    input = True,
                    output=True)
        while alive:
            ret = stream.read(CHUNK, exception_on_overflow = False)
            stream.write(ret)
            wave = np.append(wave, np.expand_dims(np.frombuffer(ret, dtype="int16"), 0), axis=0)

In [42]:
iA = Process(target=inputAudio)
iA.start()

Process Process-12:
Traceback (most recent call last):
  File "/Users/naokitakatani/.pyenv/versions/3.7.9/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/naokitakatani/.pyenv/versions/3.7.9/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-41-a45729e88fac>", line 13, in inputAudio
    output=True)
  File "/Users/naokitakatani/Documents/2020_12_22_realtime_SERv3/env/lib/python3.7/site-packages/pyaudio.py", line 750, in open
    stream = Stream(self, *args, **kwargs)
  File "/Users/naokitakatani/Documents/2020_12_22_realtime_SERv3/env/lib/python3.7/site-packages/pyaudio.py", line 441, in __init__
    self._stream = pa.open(**arguments)
OSError: [Errno -9986] Internal PortAudio error


In [46]:
emotionResult = np.zeros((0,14))
# loading json and model architecture 
json_file = open('model_json.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")
with open('../2020_12_12_SERv3/extractMell.pkl', 'rb') as file:
  features, emolabel = pickle.load(file)

Loaded model from disk


In [73]:
def deqpredict():
    for index, data in enumerate(features):
        print(index)
        time.sleep(1)
        preds = loaded_model.predict_step(np.expand_dims(data,axis=0))

In [76]:
start = time.time()
dp = Process(target=deqpredict(), args=())
dp.start()
elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

0
1
2
3


KeyboardInterrupt: 