# 支持向量机方法的语音情感识别实现以及评估

使用Emo_DB数据库,柏林工业大学录制.
分类如下:
```
letter	emotion (english)	letter	emotion (german)
A	anger	W	Ärger (Wut)
B	boredom	L	Langeweile
D	disgust	E	Ekel
F	anxiety/fear	A	Angst
H	happiness	F	Freude
S	sadness	T	Trauer
N = neutral version
```

In [126]:
import os
import pandas as pd
import librosa
import IPython.display as ipd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [127]:
EmoList = ['anger', 'boredom', 'disgust', 'anxiety/fear', 'happiness', 'sadness', 'neutarl']


def getEmotion(fileName):
    if 'A' in fileName:
        return 'fear'
    elif 'W' in fileName:
        return 'anger'
    elif 'L' in fileName:
        return 'boredom'
    elif 'F' in fileName:
        return 'happy'
    elif 'T' in fileName:
        return 'sad'
    elif 'E' in fileName:
        return 'disgust'
    elif 'N' in fileName:
        return 'neutral'

In [128]:
filePath = os.walk(r"D:\SpeechEmotionRecognition\dataset\Emo-DB\wav")
fileList = list([i for i in filePath][0])[2:][0]

In [129]:
EmoList = [getEmotion(i) for i in fileList]

In [130]:
os.chdir(r"D:\SpeechEmotionRecognition\dataset\Emo-DB\wav")

In [131]:
data, sampling_rate = librosa.load(fileList[42], duration=3, offset=0.5)
# data

In [132]:
sampling_rate

22050

In [133]:
def extract_mfcc(filename):
    data, sampling_rate = librosa.load(filename, duration=3, offset=0.5)
    mfccs = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=64)
    mfcc_scaled = np.mean(mfccs.T, axis=0)
    return mfcc_scaled

In [134]:
sequence = [extract_mfcc(i) for i in fileList]

In [135]:
sequence[42]

array([-2.5866388e+02,  1.0037382e+02, -4.3144783e+01,  4.4948952e+01,
       -9.4050846e+00,  1.4938450e+00, -1.1602770e+01,  9.4904429e-01,
       -2.0080004e+01, -1.4023942e+01, -2.2404554e+00, -1.2222795e+01,
       -5.8485401e-01, -1.5121354e+01,  4.7117958e+00, -2.2433913e+00,
       -5.7632809e+00, -2.1108053e+00, -4.4906902e+00, -1.5633948e+00,
       -2.2546792e+00,  2.1571879e+00, -6.4659443e+00, -2.2059062e+00,
       -5.9675655e+00, -3.8036745e+00, -1.7230661e+00,  1.0110145e-01,
        2.2422442e+00, -2.4449635e+00,  2.9775691e+00, -2.1144652e+00,
        1.4298250e-01,  1.5012416e-01,  1.8711941e-01,  2.0893056e+00,
        2.6053292e-01,  1.4545770e+00,  1.2673495e+00,  3.5544436e+00,
        1.7999094e+00,  5.3344727e+00,  4.9093075e+00,  3.3470671e+00,
        5.0946608e+00,  4.1163092e+00,  4.6195498e+00,  3.5717468e+00,
        2.6710844e+00,  1.1702039e+00,  1.4572349e+00,  2.3130095e+00,
        3.0296566e+00,  1.5378088e+00,  5.6293267e-01,  6.5076953e-01,
      

In [136]:
x = np.array(sequence)
x.shape

(535, 64)

In [137]:
def idx_y(type):
    dict = {
        'fear': 0,
        'anger': 1,
        'boredom': 2,
        'happy': 3,
        'sad': 4,
        'disgust': 5,
        'neutral': 6
    }
    return dict.get(type)


y = [idx_y(elem) for elem in EmoList]
y = np.array(y)
y.shape

(535,)

In [138]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

In [139]:
model = svm.SVC(kernel='rbf', C=1024,decision_function_shape='ovr')
model.fit(x_train, y_train)

In [140]:
model.score(x_train, y_train)

0.9919786096256684

In [141]:
def show_accuracy(a, b):
    acc = a == b
    return np.mean(acc)

In [142]:
yy = model.predict(x_test)
show_accuracy(yy, y_test)

0.7701863354037267

In [143]:
model.predict(x_train[:32])

array([6, 4, 1, 1, 6, 2, 5, 4, 4, 5, 1, 5, 4, 3, 0, 1, 6, 0, 5, 2, 6, 1,
       5, 6, 3, 1, 6, 2, 2, 6, 0, 2])

### 在取得MFCC特征之前进行划分，从直观上来判断模型的效果
手动测试，直觉上感觉90%的正确率可能有错误

In [144]:
# np.array(fileList)
np.array(y)

array([3, 6, 1, 3, 6, 4, 1, 1, 0, 3, 2, 6, 4, 1, 0, 3, 6, 4, 1, 1, 3, 3,
       2, 6, 1, 3, 2, 6, 4, 1, 1, 0, 2, 6, 4, 1, 6, 4, 1, 2, 6, 4, 1, 0,
       5, 6, 6, 1, 1, 0, 3, 2, 6, 1, 1, 0, 0, 3, 2, 6, 4, 1, 3, 2, 6, 4,
       1, 3, 2, 6, 4, 1, 3, 2, 6, 4, 4, 1, 0, 3, 3, 2, 6, 1, 3, 2, 6, 4,
       1, 3, 2, 6, 4, 1, 0, 3, 2, 6, 4, 1, 1, 0, 3, 2, 6, 4, 1, 5, 3, 6,
       1, 5, 5, 2, 1, 3, 2, 6, 1, 5, 2, 6, 4, 1, 1, 5, 6, 4, 1, 1, 5, 6,
       1, 6, 4, 1, 1, 5, 3, 3, 2, 6, 4, 1, 5, 6, 1, 0, 6, 1, 0, 6, 1, 0,
       3, 2, 6, 1, 3, 6, 1, 1, 0, 2, 4, 1, 0, 0, 2, 4, 1, 0, 5, 3, 2, 0,
       2, 6, 1, 2, 4, 1, 0, 2, 1, 3, 2, 1, 0, 0, 2, 6, 1, 5, 3, 2, 6, 4,
       1, 0, 3, 6, 1, 0, 3, 3, 2, 6, 4, 1, 0, 2, 4, 1, 0, 5, 3, 2, 6, 1,
       0, 3, 6, 4, 1, 3, 2, 6, 4, 1, 1, 0, 3, 2, 6, 4, 1, 0, 0, 2, 6, 4,
       1, 3, 2, 6, 1, 0, 5, 6, 1, 1, 1, 0, 2, 6, 4, 1, 0, 2, 1, 4, 1, 0,
       5, 3, 6, 1, 1, 1, 2, 4, 0, 4, 1, 0, 2, 1, 0, 5, 5, 3, 2, 6, 1, 0,
       5, 3, 2, 6, 4, 1, 0, 3, 2, 4, 1, 0, 5, 2, 6,

In [145]:
x_train, x_test, y_train, y_test = train_test_split(np.array(fileList), np.array(y), train_size=0.7)

In [146]:
x_train = [extract_mfcc(i) for i in x_train]

In [147]:
clf = svm.SVC(kernel='linear', C=64)
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [148]:
x_test = [extract_mfcc(i) for i in x_test]

In [149]:
res = model.predict(x_test)

In [150]:
res.shape

(161,)

In [151]:
show_accuracy(res, y_test)

0.9254658385093167

In [152]:
res[:32]

array([5, 3, 1, 4, 1, 1, 2, 0, 1, 4, 6, 6, 0, 3, 2, 4, 0, 5, 4, 3, 0, 4,
       4, 3, 5, 1, 5, 2, 2, 1, 1, 3])

In [153]:
y_test[:32]

array([5, 3, 1, 5, 1, 1, 2, 0, 1, 4, 6, 6, 0, 3, 2, 4, 0, 5, 4, 3, 0, 4,
       4, 3, 5, 1, 5, 2, 2, 1, 1, 1])

In [154]:
li = []
for i,j in zip(res,y_test):
    li.append(i==j)
print(sum(li)/len(li))

0.9254658385093167
