# 支持向量机方法的语音情感识别实现以及评估

使用Emo_DB数据库,柏林工业大学录制.
分类如下:
```
letter	emotion (english)	letter	emotion (german)
A	anger	W	Ärger (Wut)
B	boredom	L	Langeweile
D	disgust	E	Ekel
F	anxiety/fear	A	Angst
H	happiness	F	Freude
S	sadness	T	Trauer
N = neutral version
```

In [1]:
import os
import pandas as pd
import librosa
import IPython.display as ipd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
EmoList = ['anger', 'boredom', 'disgust', 'anxiety/fear', 'happiness', 'sadness', 'neutarl']


def getEmotion(fileName):
    if 'A' in fileName:
        return 'fear'
    elif 'W' in fileName:
        return 'anger'
    elif 'L' in fileName:
        return 'boredom'
    elif 'F' in fileName:
        return 'happy'
    elif 'T' in fileName:
        return 'sad'
    elif 'E' in fileName:
        return 'disgust'
    elif 'N' in fileName:
        return 'neutral'

In [3]:
filePath = os.walk(r"D:\SpeechEmotionRecognition\dataset\Emo-DB\wav")
fileList = list([i for i in filePath][0])[2:][0]
fileList

['03a01Fa.wav',
 '03a01Nc.wav',
 '03a01Wa.wav',
 '03a02Fc.wav',
 '03a02Nc.wav',
 '03a02Ta.wav',
 '03a02Wb.wav',
 '03a02Wc.wav',
 '03a04Ad.wav',
 '03a04Fd.wav',
 '03a04Lc.wav',
 '03a04Nc.wav',
 '03a04Ta.wav',
 '03a04Wc.wav',
 '03a05Aa.wav',
 '03a05Fc.wav',
 '03a05Nd.wav',
 '03a05Tc.wav',
 '03a05Wa.wav',
 '03a05Wb.wav',
 '03a07Fa.wav',
 '03a07Fb.wav',
 '03a07La.wav',
 '03a07Nc.wav',
 '03a07Wc.wav',
 '03b01Fa.wav',
 '03b01Lb.wav',
 '03b01Nb.wav',
 '03b01Td.wav',
 '03b01Wa.wav',
 '03b01Wc.wav',
 '03b02Aa.wav',
 '03b02La.wav',
 '03b02Na.wav',
 '03b02Tb.wav',
 '03b02Wb.wav',
 '03b03Nb.wav',
 '03b03Tc.wav',
 '03b03Wc.wav',
 '03b09La.wav',
 '03b09Nc.wav',
 '03b09Tc.wav',
 '03b09Wa.wav',
 '03b10Ab.wav',
 '03b10Ec.wav',
 '03b10Na.wav',
 '03b10Nc.wav',
 '03b10Wb.wav',
 '03b10Wc.wav',
 '08a01Ab.wav',
 '08a01Fd.wav',
 '08a01Lc.wav',
 '08a01Na.wav',
 '08a01Wa.wav',
 '08a01Wc.wav',
 '08a02Ab.wav',
 '08a02Ac.wav',
 '08a02Fe.wav',
 '08a02La.wav',
 '08a02Na.wav',
 '08a02Tb.wav',
 '08a02Wc.wav',
 '08a04F

In [4]:
EmoList = [getEmotion(i) for i in fileList]
EmoList

['happy',
 'neutral',
 'anger',
 'happy',
 'neutral',
 'sad',
 'anger',
 'anger',
 'fear',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'fear',
 'happy',
 'neutral',
 'sad',
 'anger',
 'anger',
 'happy',
 'happy',
 'boredom',
 'neutral',
 'anger',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'anger',
 'fear',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'neutral',
 'sad',
 'anger',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'fear',
 'disgust',
 'neutral',
 'neutral',
 'anger',
 'anger',
 'fear',
 'happy',
 'boredom',
 'neutral',
 'anger',
 'anger',
 'fear',
 'fear',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'sad',
 'anger',
 'fear',
 'happy',
 'happy',
 'boredom',
 'neutral',
 'anger',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'happy',
 'boredom',
 'neutral',
 'sad',
 'anger',
 'fear',
 'happy',
 'boredom',

In [5]:
os.chdir(r"D:\SpeechEmotionRecognition\dataset\Emo-DB\wav")

In [6]:
data, sampling_rate = librosa.load(fileList[42], duration=3, offset=0.5)
data

array([ 4.4411033e-02,  1.2456501e-02, -5.2903663e-02, ...,
       -4.4907378e-05, -5.5233475e-05,  0.0000000e+00], dtype=float32)

In [7]:
sampling_rate

22050

In [37]:
def extract_mfcc(filename):
    data, sampling_rate = librosa.load(filename, duration=3, offset=0.5)
    mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
    mfcc_scaled = np.mean(mfccs.T, axis=0)
    return mfcc_scaled

In [38]:
sequence = [extract_mfcc(i) for i in fileList]

  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
 -1.2293892e-06  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  4.0420236e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
 -6.6959990e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  5.9685142e-05  0.0000000e+00] as keyword args. From version

In [39]:
x = np.array(sequence)
x.shape

(535, 64)

In [40]:
def idx_y(type):
    dict = {
        'fear': 0,
        'anger': 1,
        'boredom': 2,
        'happy': 3,
        'sad': 4,
        'disgust': 5,
        'neutral': 6
    }
    return dict.get(type)


y = [idx_y(elem) for elem in EmoList]
y = np.array(y)
y.shape

(535,)

In [41]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.7)

In [42]:
model = svm.SVC(kernel='rbf',decision_function_shape='ovr',C=1024)
model.fit(x_train,y_train)

In [43]:
model.score(x_train,y_train)

0.9973262032085561

In [44]:
def show_accuracy(a,b):
    acc = a == b
    return np.mean(acc)

In [45]:
yy = model.predict(x_test)
# show_accuracy(yy,y_test)

0.782608695652174

In [46]:
yy

array([6, 5, 4, 1, 1, 6, 3, 3, 6, 1, 0, 1, 3, 0, 1, 1, 2, 0, 4, 0, 2, 2,
       2, 2, 1, 3, 1, 6, 6, 2, 4, 5, 4, 4, 4, 2, 1, 6, 4, 4, 1, 1, 1, 4,
       1, 2, 1, 3, 6, 6, 6, 4, 3, 4, 1, 2, 0, 6, 2, 4, 1, 1, 4, 3, 6, 0,
       0, 5, 3, 1, 5, 6, 1, 1, 2, 0, 1, 1, 1, 4, 1, 1, 1, 6, 4, 6, 0, 6,
       4, 3, 3, 1, 4, 5, 1, 4, 0, 6, 6, 4, 2, 3, 0, 1, 0, 4, 5, 0, 6, 5,
       5, 5, 1, 3, 6, 3, 4, 2, 2, 5, 3, 2, 0, 4, 0, 6, 2, 5, 2, 3, 6, 1,
       3, 1, 1, 3, 1, 6, 2, 6, 2, 1, 2, 1, 0, 5, 0, 0, 0, 3, 1, 1, 1, 2,
       6, 3, 2, 6, 0, 0, 3])

### 在取得MFCC特征之前进行划分，从直观上来判断模型的效果
手动测试，直觉上感觉90%的正确率可能有错误

In [48]:
# np.array(fileList)
np.array(y)

array([3, 6, 1, 3, 6, 4, 1, 1, 0, 3, 2, 6, 4, 1, 0, 3, 6, 4, 1, 1, 3, 3,
       2, 6, 1, 3, 2, 6, 4, 1, 1, 0, 2, 6, 4, 1, 6, 4, 1, 2, 6, 4, 1, 0,
       5, 6, 6, 1, 1, 0, 3, 2, 6, 1, 1, 0, 0, 3, 2, 6, 4, 1, 3, 2, 6, 4,
       1, 3, 2, 6, 4, 1, 3, 2, 6, 4, 4, 1, 0, 3, 3, 2, 6, 1, 3, 2, 6, 4,
       1, 3, 2, 6, 4, 1, 0, 3, 2, 6, 4, 1, 1, 0, 3, 2, 6, 4, 1, 5, 3, 6,
       1, 5, 5, 2, 1, 3, 2, 6, 1, 5, 2, 6, 4, 1, 1, 5, 6, 4, 1, 1, 5, 6,
       1, 6, 4, 1, 1, 5, 3, 3, 2, 6, 4, 1, 5, 6, 1, 0, 6, 1, 0, 6, 1, 0,
       3, 2, 6, 1, 3, 6, 1, 1, 0, 2, 4, 1, 0, 0, 2, 4, 1, 0, 5, 3, 2, 0,
       2, 6, 1, 2, 4, 1, 0, 2, 1, 3, 2, 1, 0, 0, 2, 6, 1, 5, 3, 2, 6, 4,
       1, 0, 3, 6, 1, 0, 3, 3, 2, 6, 4, 1, 0, 2, 4, 1, 0, 5, 3, 2, 6, 1,
       0, 3, 6, 4, 1, 3, 2, 6, 4, 1, 1, 0, 3, 2, 6, 4, 1, 0, 0, 2, 6, 4,
       1, 3, 2, 6, 1, 0, 5, 6, 1, 1, 1, 0, 2, 6, 4, 1, 0, 2, 1, 4, 1, 0,
       5, 3, 6, 1, 1, 1, 2, 4, 0, 4, 1, 0, 2, 1, 0, 5, 5, 3, 2, 6, 1, 0,
       5, 3, 2, 6, 4, 1, 0, 3, 2, 4, 1, 0, 5, 2, 6,

In [60]:
x_train,x_test,y_train,y_test = train_test_split(np.array(fileList),np.array(y),train_size=0.8)

In [61]:
x_train = [extract_mfcc(i) for i in x_train]

 -2.3720999e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
 -8.5607615e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
 -7.4387048e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.        ] as keyword args. From version 0.10 passing thes

In [62]:
clf = svm.SVC(kernel='linear',C=64)
clf.fit(x_train,y_train)
clf.score(x_train,y_train)

1.0

In [63]:
x_test = [extract_mfcc(i) for i in x_test]

 -5.0067196e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
 -0.19695708] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.02787619] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfccs = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=64)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will res

In [64]:
res = model.predict(x_test)

In [67]:
res.shape

(107,)

In [70]:
show_accuracy(res,y_test)

0.9158878504672897

In [74]:
res[37]
# y_test[37]

3