In [3]:
# Importing required libraries 
# Keras
from keras.models import model_from_json

# Other  
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import seaborn as sns
import os
import pickle
import wave
import IPython.display as ipd  # To play sound in the notebook
import python_speech_features as ps
import seaborn as sns
import scipy
import math
import time
from scipy import signal
import sys

In [4]:
def trimVoice(data, sokuonleng = 3, spaceleng = 10, voiceleng= 10, debug=False):
    """Cut ``data`` into the segments reported by ``detectVoice``.

    ``sokuonleng``, ``spaceleng``, ``voiceleng`` and ``debug`` are forwarded
    unchanged to ``detectVoice``.  Every row it returns is used as a
    ``data[row[0]:row[1]]`` slice.

    Returns
    -------
    list
        One slice of ``data`` per detected segment (empty list when nothing
        was detected).
    """
    boundaries = detectVoice(
        data,
        sokuonleng=sokuonleng,
        spaceleng=spaceleng,
        voiceleng=voiceleng,
        debug=debug,
    )
    return [data[bounds[0]:bounds[1]] for bounds in boundaries]

In [5]:
def detectVoice(data, sokuonleng = 3, spaceleng = 10, voiceleng= 10, debug=False):
    """Locate voiced segments in a 1-D waveform.

    The signal is z-score normalised, zero-padded to a multiple of 1024
    samples and cut into 1024-sample frames.  A frame is "active" when the
    mean absolute amplitude differs from that of the previous frame by more
    than 0.05 (the level before the first frame is taken to be 0).

    Parameters
    ----------
    data : array-like
        1-D sequence of samples.
    sokuonleng : int
        Value the ``voice`` counter is reset to on every active frame; it is
        decremented on each inactive frame, so activity is "held" for this
        many frames (presumably to bridge short pauses inside speech).
    spaceleng : int
        Number of consecutive inactive frames that ends an open segment, or
        cancels a pending (standby) start.
    voiceleng : int
        Number of frames that must elapse after the standby frame before the
        segment is confirmed.
    debug : bool
        Print a line when a segment is opened and when it is closed.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_segments, 2)`` holding
        ``[start_sample, end_sample]`` per segment.  ``end_sample`` includes
        a 5-frame margin and is not clamped, so it may exceed ``len(data)``
        (slicing tolerates this).

    Notes
    -----
    ``standby`` starts at 0 rather than -1, i.e. the detector is already in
    standby at frame 0.  If activity is confirmed before ``spaceleng``
    inactive frames have passed, the segment therefore starts at sample 0.
    """
    frame_len = 1024
    eps = 1e-5

    samples = np.asarray(data)
    zscore = (samples - np.mean(samples)) / (np.std(samples) + eps)
    # Pad with zeros so the signal splits into whole frames (a full extra
    # frame is appended when the length is already a multiple of frame_len).
    zscore = np.append(zscore, np.zeros(frame_len - zscore.shape[0] % frame_len), 0)
    frames = zscore.reshape(-1, frame_len)
    n_frames = frames.shape[0]

    # Mean absolute level of every frame, computed once instead of growing
    # an array with np.append inside the loop.
    levels = np.mean(np.abs(frames), axis=1)

    silent = 0        # consecutive inactive frames seen so far
    voice = 0         # hold counter, reset to sokuonleng on activity
    isVoice = False   # True while a segment is open
    standby = 0       # candidate start frame, -1 when not in standby
    segments = []     # [start_sample, end_sample] pairs
    # Explicit 0.0 instead of uninitialised memory, so the first frame's
    # decision is deterministic.
    prev_level = 0.0

    for waddr in range(n_frames):
        level = levels[waddr]
        active = np.abs(prev_level - level) > 0.05
        prev_level = level

        # Decide whether this frame belongs to a voiced part.
        if active:
            silent = 0
            voice = sokuonleng
        else:
            silent += 1
            if voice > 0:
                voice -= 1

        if not isVoice:
            if standby == -1:
                # Not in standby: on detected sound, enter standby and
                # remember when it happened.
                if voice == sokuonleng:
                    # The onset itself is an important feature, so start
                    # one frame before the current one.
                    standby = waddr - 1
            elif waddr - standby >= voiceleng:
                if voice != 0:
                    # Still voiced after voiceleng frames: open a segment.
                    if debug:
                        print("Record start{}".format(standby))
                    isVoice = True
                    segments.append([standby * frame_len, 0])
                else:
                    # Utterance shorter than voiceleng: leave standby.
                    standby = -1
            elif silent >= spaceleng:
                standby = -1
        elif silent >= spaceleng or waddr == n_frames - 1:
            # Silence lasted long enough (or the signal ended): close the
            # open segment.
            if debug:
                print("--->{}".format(waddr - spaceleng))
            isVoice = False
            standby = -1
            # +5 frames leaves a small margin after the utterance.
            segments[-1][1] = (waddr - silent + 5) * frame_len

    if isVoice:
        # The segment was confirmed on the final frame, so the loop never
        # got a chance to close it; close it with the same end formula.
        segments[-1][1] = (n_frames - 1 - silent + 5) * frame_len

    return np.array(segments, dtype=int).reshape(-1, 2)

In [6]:
def loadwav(path, mono=True):
    """Read a PCM WAV file into a numpy array.

    Parameters
    ----------
    path : str or file-like
        Passed to ``wave.open``.
    mono : bool
        When the file has more than one channel, average the channels into a
        single track (result is cast back to the file's integer sample type).
        When False, the per-channel samples are returned as a 2-D array.

    Returns
    -------
    wavedata : numpy.ndarray
        Shape ``(n_frames,)`` for mono output, ``(n_frames, nchannels)``
        for multi-channel output with ``mono=False``.
    samplerate : int
    samplewidth : int
        Bytes per sample as reported by the file.
    nchannels : int
        Channel count of the file (not of the returned array).

    Raises
    ------
    ValueError
        If the sample width is not 1, 2 or 4 bytes.
    """
    # WAV stores little-endian samples; 8-bit WAV is unsigned.
    datatype = {1: "u1", 2: "<i2", 4: "<i4"}

    # Context manager guarantees the file is closed even if reading fails.
    with wave.open(path, 'rb') as file:
        nchannels, samplewidth, samplerate, wav_length = file.getparams()[:4]
        str_data = file.readframes(wav_length)

    if samplewidth not in datatype:
        raise ValueError(
            "Unsupported sample width: {} bytes (supported: 1, 2, 4)".format(samplewidth)
        )

    # Decode with the dtype matching the file's sample width instead of
    # always assuming 16-bit.
    data = np.frombuffer(str_data, dtype=datatype[samplewidth])

    if nchannels > 1:
        # Samples are interleaved frame by frame: one row per frame, one
        # column per channel.  Copy so the result is writable.
        wavedata = data.reshape(-1, nchannels).copy()
        if mono:
            wavedata = np.mean(wavedata, axis=1).astype(data.dtype)
    else:
        wavedata = data
    return wavedata, samplerate, samplewidth, nchannels

In [7]:
def savewav(data, path, nchannels=1, framerate=44100):
    """Write samples to ``path`` as a 16-bit PCM WAV file.

    Parameters
    ----------
    data : array-like
        Samples to write.  Values are clipped to the int16 range and
        truncated to integers.  A 2-D ``(n_frames, nchannels)`` array is
        written frame by frame (interleaved).
    path : str or file-like
        Passed to ``wave.open``.
    nchannels : int
        Channel count stored in the header (default 1).
    framerate : int
        Sampling rate stored in the header (default 44100).

    The channel count and rate are explicit parameters so the function no
    longer depends on variables leaked from another notebook cell.
    """
    # Go through float64 so clipping works for any input dtype, then store
    # little-endian 16-bit samples as the WAV format requires.
    pcm = np.clip(np.asarray(data, dtype=np.float64), -32768, 32767).astype("<i2")
    with wave.open(path, 'wb') as ww:
        ww.setnchannels(nchannels)
        ww.setsampwidth(2)  # 16-bit samples, matching the conversion above
        ww.setframerate(framerate)
        ww.writeframes(pcm.tobytes())

In [8]:
def mellCepstrum(data, RATE, winlen=0.08, winstep=0.016, nfilt=40, start=None, end=None):
    """Compute normalised log mel filterbank features with delta and delta-delta.

    Parameters
    ----------
    data : array-like
        1-D waveform.  It is zero-padded to a multiple of 1024 samples
        before any slicing.
    RATE : int
        Sampling rate passed to ``ps.logfbank``.
    winlen, winstep : float
        Analysis window length and step passed to ``ps.logfbank``.
    nfilt : int
        Number of filterbank channels.
    start, end : int or None
        Optional sample range; the features are computed on
        ``data[start:end]``.  Either bound may be omitted.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, nfilt, 3)``; the last axis holds the log
        filterbank energies, their delta and their delta-delta, each
        standardised by its own global mean and standard deviation.
    """
    data = np.asarray(data)
    data = np.append(data, np.zeros(1024 - data.shape[0] % 1024), 0)
    eps = 1e-5

    # A slice handles every combination of start/end being None.  The
    # previous code indexed a single sample (data[point]) when only one
    # bound was supplied.
    segment = data[start:end]
    mel_spec = ps.logfbank(segment, samplerate=RATE, winlen=winlen, winstep=winstep,
                           nfilt=nfilt, nfft=int(winlen*RATE))
    # Delta features.
    delta1 = ps.delta(mel_spec, 4)
    # Delta-delta features.
    delta2 = ps.delta(delta1, 4)

    procd = np.empty((mel_spec.shape[0], nfilt, 3))
    for channel, feat in enumerate((mel_spec, delta1, delta2)):
        # eps keeps the division finite when a feature map is constant.
        procd[:, :, channel] = (feat - np.mean(feat)) / (np.std(feat) + eps)
    return procd

In [9]:
def inverselabel(value):
    """Translate a numeric class index into its label string.

    Indices 0-6 are the ``female_*`` classes and 7-13 the ``male_*``
    classes.  An index outside that range raises ``IndexError``.
    """
    names = (
        'female_angry', 'female_disgust', 'female_fear', 'female_happy',
        'female_neutral', 'female_sad', 'female_surprise',
        'male_angry', 'male_disgust', 'male_fear', 'male_happy',
        'male_neutral', 'male_sad', 'male_surprise',
    )
    for code, name in enumerate(names):
        if code == value:
            return name
    # No label carries this index.
    raise IndexError("list index out of range")

In [10]:
# Rebuild the network architecture from its JSON description.  The `with`
# block closes the file even if reading fails.
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Restore the trained weights into the rebuilt model.
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")

Loaded model from disk


In [15]:
# Load the metadata table (label, source dataset and file path of every clip)
# produced by the first part of this kernel.
# NOTE(review): this is an absolute, machine-specific path.
ref = pd.read_csv(
    filepath_or_buffer="/Users/naokitakatani/Documents/2020_12_12_SERv3/Data_path.csv",
)
ref.head()

Unnamed: 0,labels,source,path
0,male_sad,SAVEE,/Users/naokitakatani/Documents/datasets/SAVEE/...
1,male_sad,SAVEE,/Users/naokitakatani/Documents/datasets/SAVEE/...
2,male_neutral,SAVEE,/Users/naokitakatani/Documents/datasets/SAVEE/...
3,male_surprise,SAVEE,/Users/naokitakatani/Documents/datasets/SAVEE/...
4,male_neutral,SAVEE,/Users/naokitakatani/Documents/datasets/SAVEE/...


In [49]:
# Empty accumulators: one (300, 40, 3) feature slot will be appended per file.
features = np.zeros(shape=(0, 300, 40, 3), dtype=float)
emolabel = np.empty(shape=(0,), dtype=int)
setindex = np.empty(shape=(0,), dtype=int)

# Index of the next file the extraction loop should process.
start = 0

# 2nd-order elliptic high-pass: 1500 Hz cutoff at a 44100 Hz sampling rate,
# 1 dB pass-band ripple, 60 dB stop-band attenuation.
b, a = signal.iirfilter(
    N=2,
    Wn=1500,
    rp=1.0,
    rs=60,
    btype='highpass',
    ftype='ellip',
    output='ba',
    fs=44100,
)

In [65]:
# Fill `features` with one (300, 40, 3) block per file, using the unfiltered
# audio.  `start` is the index of the next file to process, so re-running this
# cell after an interruption resumes where the previous run stopped.
for index, (label, path) in enumerate(zip(ref.labels, ref.path)):
    if index != start:
        continue  # already handled by an earlier run of this cell

    if features.shape[0] == index:
        features = np.append(features, np.zeros((1,300,40,3), float), 0)
    elif features.shape[0] != index + 1:
        # A row for this file may already exist (interrupted run); any other
        # size means `features` and `start` are out of sync.
        raise RuntimeError(
            "features has {} rows but the next file index is {}; "
            "re-run the initialisation cell".format(features.shape[0], index)
        )

    print("No.{} path : {}".format(index, path))
    try:
        X, samplerate, samplewidth, nchannel = loadwav(path)
        trimdata = trimVoice(X, spaceleng = 30)
        if len(trimdata) == 0:
            # Nothing to analyse: keep the all-zero row instead of failing
            # with IndexError and blocking every later file.
            print("No voice segment detected; features left at zero for this file.")
        else:
            Y = trimdata[0]  # only the first detected segment is used
            heatmap_data = mellCepstrum(Y, samplerate)
            # Keep at most the first 300 frames; shorter clips stay zero-padded.
            end = min(heatmap_data.shape[0], 300)
            features[-1,:end] = heatmap_data[:end]
    except KeyboardInterrupt:
        # `start` is not advanced, so the next run retries this file.
        print("KeyboardInterrupt")
        raise
    start += 1

No.261 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_f15.wav
No.262 path : /Users/naokitakatani/Documents/datasets/SAVEE/DC_a14.wav
No.263 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_d03.wav
No.264 path : /Users/naokitakatani/Documents/datasets/SAVEE/JK_su06.wav
No.265 path : /Users/naokitakatani/Documents/datasets/SAVEE/JK_su12.wav
No.266 path : /Users/naokitakatani/Documents/datasets/SAVEE/JE_n23.wav
No.267 path : /Users/naokitakatani/Documents/datasets/SAVEE/JE_f02.wav
No.268 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_su14.wav
No.269 path : /Users/naokitakatani/Documents/datasets/SAVEE/JE_d14.wav
No.270 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_n08.wav
No.271 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_n20.wav
No.272 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_n18.wav
No.273 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_n24.wav
No.274 path : /Users/naokitakatani/Documents/datasets/SAVEE/KL_n30.wav
No.

KeyboardInterrupt: 

In [63]:
# Score every feature block with the loaded network, then reduce each row of
# class scores to the index of its largest value (the most probable emotion).
preds = loaded_model.predict(features, batch_size=16, verbose=1).argmax(axis=1)

 2/17 [==>...........................] - ETA: 2:30

KeyboardInterrupt: 

In [37]:
# Persist the unfiltered features together with their predicted class indices.
with open("nonFilterFeatures.pkl", mode="wb") as file:
    pickle.Pickler(file).dump((features, preds))

In [None]:
# Reset the accumulators before the high-pass-filtered extraction pass.
features = np.zeros(shape=(0, 300, 40, 3), dtype=float)
emolabel = np.empty(shape=(0,), dtype=int)
setindex = np.empty(shape=(0,), dtype=int)

# Index of the next file the extraction loop should process.
start = 0

# 2nd-order elliptic high-pass: 1500 Hz cutoff at a 44100 Hz sampling rate,
# 1 dB pass-band ripple, 60 dB stop-band attenuation.
b, a = signal.iirfilter(
    N=2,
    Wn=1500,
    rp=1.0,
    rs=60,
    btype='highpass',
    ftype='ellip',
    output='ba',
    fs=44100,
)

In [None]:
# Fill `features` with one (300, 40, 3) block per file, this time after
# running the trimmed audio through the high-pass filter (b, a).  `start` is
# the index of the next file to process, so re-running this cell after an
# interruption resumes where the previous run stopped.
for index, (label, path) in enumerate(zip(ref.labels, ref.path)):
    if index != start:
        continue  # already handled by an earlier run of this cell

    if features.shape[0] == index:
        features = np.append(features, np.zeros((1,300,40,3), float), 0)
    elif features.shape[0] != index + 1:
        # A row for this file may already exist (interrupted run); any other
        # size means `features` and `start` are out of sync.
        raise RuntimeError(
            "features has {} rows but the next file index is {}; "
            "re-run the initialisation cell".format(features.shape[0], index)
        )

    print("No.{} path : {}".format(index, path))
    try:
        X, samplerate, samplewidth, nchannel = loadwav(path)
        trimdata = trimVoice(X, spaceleng = 30)
        if len(trimdata) == 0:
            # Nothing to analyse: keep the all-zero row instead of failing
            # with IndexError and blocking every later file.
            print("No voice segment detected; features left at zero for this file.")
        else:
            # Only the first detected segment is used, high-pass filtered.
            Y = signal.lfilter(b, a, trimdata[0])
            heatmap_data = mellCepstrum(Y, samplerate)
            # Keep at most the first 300 frames; shorter clips stay zero-padded.
            end = min(heatmap_data.shape[0], 300)
            features[-1,:end] = heatmap_data[:end]
    except KeyboardInterrupt:
        # `start` is not advanced, so the next run retries this file.
        print("KeyboardInterrupt")
        raise
    start += 1

In [None]:
# Score every filtered feature block with the loaded network, then reduce each
# row of class scores to the index of its largest value (the most probable
# emotion).
preds = loaded_model.predict(features, batch_size=16, verbose=1).argmax(axis=1)

In [None]:
# Persist the high-pass-filtered features together with their predicted class
# indices.
with open("FilteredFeatures.pkl", mode="wb") as file:
    pickle.Pickler(file).dump((features, preds))