# IEMOCAPとEmo-DBを同時に使って学習する

- Emo-DB単体での学習は精度が出なかった
- IEMOCAPとEmo-DBを両方使った場合の精度を検証する

In [1]:
import sys
sys.path.append('../../lib/')
import mymodels
import NN_learning_tools as nnlt

In [2]:
import numpy as np
import tensorly as tl
import torch
import librosa
import librosa.display
import audeer
import wave
import os
import matplotlib.pyplot as plt
import pandas as pd
import audeer
import audformat
import soundfile as sf
import audiofile as af
%matplotlib inline

# Emo-DBのロード

In [3]:
#音声ファイルをロードするメソッド
def load_wave_data(audio_dir, file_name):
    file_path = os.path.join(audio_dir, file_name)
    x, fs = librosa.load(file_path, sr=16000)
    return x, fs

#wavファイルからメルペクトログラムを生成するメソッド
def calculate_melsp(x, n_fft=1024, hop_length=128):
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    log_stft = librosa.power_to_db(stft)
    melsp = librosa.feature.melspectrogram(S=log_stft,n_mels=128)
    return melsp

#wavファイルの波形を描画
def show_wave(x):
    plt.plot(x)
    plt.show()

#メルスペクトログラムの描画
def show_melsp(melsp, fs):
    librosa.display.specshow(melsp, sr=fs)
    plt.colorbar()
    plt.show()

In [3]:
#ファイルの情報を読みやすくするための関数
def parse_names(names, from_i, to_i, is_number=False, mapping=None):
    for name in names:
        key = name[from_i:to_i]
        if is_number:
            key = int(key)
        yield mapping[key] if mapping else key

In [None]:
Emo_DB_dir = '../../download/wav/'

In [4]:
files = sorted([os.path.join(wavdir, f) for f in os.listdir(wavdir)])

NameError: name 'wavdir' is not defined

In [None]:
names = [audeer.basename_wo_ext(f) for f in files]

In [21]:
male = audformat.define.Gender.MALE
female = audformat.define.Gender.FEMALE
language = audformat.utils.map_language('de')

In [22]:
speaker_mapping = {
    3: {'gender': male, 'age': 31, 'language': language},
    8: {'gender': female, 'age': 34, 'language': language},
    9: {'gender': female, 'age': 21, 'language': language},
    10: {'gender': male, 'age': 32, 'language': language},
    11: {'gender': male, 'age': 26, 'language': language},
    12: {'gender': male, 'age': 30, 'language': language},
    13: {'gender': female, 'age': 32, 'language': language},
    14: {'gender': female, 'age': 35, 'language': language},
    15: {'gender': male, 'age': 25, 'language': language},
    16: {'gender': female, 'age': 31, 'language': language},
}

In [27]:
emotion_mapping = {
    'W': 'anger',
    'L': 'boredom',
    'E': 'disgust',
    'A': 'fear',
    'F': 'happiness',
    'T': 'sadness',
    'N': 'neutral',
}

In [28]:
speakers = list(parse_names(names, from_i=0, to_i=2, is_number=True))
print(len(speakers))

535


In [29]:
emotions = list(parse_names(names, from_i=5, to_i=6, mapping=emotion_mapping))

In [74]:
a = [1 for _ in range(10)]

In [81]:
a[:len(a)]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [82]:
b = [2 for _ in range(3)]

In [83]:
b

[2, 2, 2]

In [84]:
a[:len(b)] = b

In [85]:
a

[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]

In [71]:
def wavload(path):
    data, samplerate = sf.read(path)
    return data, samplerate

def get_name(path):
    return path[len(wavdir):-len('.wav')]

In [86]:
def zero_pad(data, data_length):
    ret = [0 for _ in range(data_length)]
    ret[:len(data)] = data
    return ret

In [87]:
def segment_and_contain(path, std_len=50000):
    ret = []
    data, samplerate = wavload(path)
    label = get_name(path)
    print(f'label = {label}')
    n_segments = -(-len(data) // std_len)
    print(f'n_segments={n_segments}')
    data = zero_pad(data, n_segments*std_len)
    for i in range(n_segments):
        ret.append(data[std_len * i : std_len * (i + 1)])
    
    ret.append(label)
    return ret

In [88]:
dataset_prime = []
for path in files:
    ret = segment_and_contain(path, std_len=50000)
    dataset_prime.append(ret)

label = 03a01Fa
n_segments=1
label = 03a01Nc
n_segments=1
label = 03a01Wa
n_segments=1
label = 03a02Fc
n_segments=1
label = 03a02Nc
n_segments=1
label = 03a02Ta
n_segments=1
label = 03a02Wb
n_segments=1
label = 03a02Wc
n_segments=1
label = 03a04Ad
n_segments=1
label = 03a04Fd
n_segments=1
label = 03a04Lc
n_segments=1
label = 03a04Nc
n_segments=1
label = 03a04Ta
n_segments=1
label = 03a04Wc
n_segments=1
label = 03a05Aa
n_segments=1
label = 03a05Fc
n_segments=2
label = 03a05Nd
n_segments=2
label = 03a05Tc
n_segments=2
label = 03a05Wa
n_segments=2
label = 03a05Wb
n_segments=1
label = 03a07Fa
n_segments=1
label = 03a07Fb
n_segments=1
label = 03a07La
n_segments=1
label = 03a07Nc
n_segments=1
label = 03a07Wc
n_segments=1
label = 03b01Fa
n_segments=1
label = 03b01Lb
n_segments=1
label = 03b01Nb
n_segments=1
label = 03b01Td
n_segments=2
label = 03b01Wa
n_segments=1
label = 03b01Wc
n_segments=1
label = 03b02Aa
n_segments=1
label = 03b02La
n_segments=2
label = 03b02Na
n_segments=1
label = 03b02T

label = 13b01Ec
n_segments=1
label = 13b01Fc
n_segments=1
label = 13b01Ld
n_segments=1
label = 13b01Nc
n_segments=1
label = 13b01Wa
n_segments=1
label = 13b02Fb
n_segments=2
label = 13b02Lc
n_segments=2
label = 13b02Nb
n_segments=1
label = 13b02Wa
n_segments=1
label = 13b03Ac
n_segments=2
label = 13b03Ed
n_segments=2
label = 13b03Fd
n_segments=2
label = 13b03Lb
n_segments=2
label = 13b03Na
n_segments=2
label = 13b03Td
n_segments=2
label = 13b03Wc
n_segments=2
label = 13b09Ab
n_segments=1
label = 13b09Ec
n_segments=1
label = 13b09Fb
n_segments=1
label = 13b09Fc
n_segments=1
label = 13b09La
n_segments=1
label = 13b09Na
n_segments=1
label = 13b09Wa
n_segments=1
label = 13b10Ec
n_segments=1
label = 13b10Fa
n_segments=1
label = 13b10La
n_segments=1
label = 13b10Nc
n_segments=1
label = 13b10Wa
n_segments=1
label = 13b10Wc
n_segments=1
label = 14a01Aa
n_segments=1
label = 14a01Ac
n_segments=1
label = 14a01Ea
n_segments=2
label = 14a01Na
n_segments=1
label = 14a01Wa
n_segments=1
label = 14a01W

In [89]:
for data in dataset_prime:
    print(len(data[0]))

50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
50000
5000

In [73]:
len(dataset_prime[1][0])

25780

In [49]:
dataset_prime[0]

([0.00067138671875,
  0.003082275390625,
  0.002532958984375,
  0.000274658203125,
  0.000823974609375,
  0.002532958984375,
  0.001922607421875,
  0.001312255859375,
  0.00115966796875,
  0.001861572265625,
  0.001373291015625,
  0.00115966796875,
  0.0010986328125,
  0.001708984375,
  0.002044677734375,
  0.001434326171875,
  0.001220703125,
  0.00164794921875,
  0.0015869140625,
  0.001312255859375,
  0.000946044921875,
  0.000946044921875,
  0.001434326171875,
  0.0015869140625,
  0.001495361328125,
  0.000823974609375,
  0.000457763671875,
  0.001373291015625,
  0.001983642578125,
  0.0015869140625,
  0.000946044921875,
  0.00115966796875,
  0.00177001953125,
  0.001922607421875,
  0.000732421875,
  0.000885009765625,
  0.002197265625,
  0.002044677734375,
  0.000732421875,
  0.00018310546875,
  0.000823974609375,
  0.001373291015625,
  0.001007080078125,
  0.000885009765625,
  0.001220703125,
  0.001434326171875,
  0.001373291015625,
  0.00115966796875,
  0.0015869140625,
  0.001

In [90]:
len(dataset_prime)

535