### Emotion Classification using word2vec and speech features

### First, use MeCab to tokenize the Japanese corpus, then use Google's word2vec tool to train a word2vec model.

In [1]:
from sklearn import svm, naive_bayes, ensemble, neural_network, metrics
from sklearn.model_selection import cross_val_predict, train_test_split
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

In [2]:
from gensim.models import word2vec

# Load the previously trained gensim word2vec model from a path relative to the notebook.
model_path = 'word2vec/word2vec.gensim.model'
word_vectors = word2vec.Word2Vec.load(model_path)

In [3]:
# Sanity check: display the words the model considers most similar to a known character name.
query_word = '江戸川コナン'
word_vectors.most_similar(query_word)

[('工藤新一', 0.8142108917236328),
 ('扮する', 0.8039429187774658),
 ('柳生十兵衛', 0.7999906539916992),
 ('怪盗キッド', 0.7948559522628784),
 ('名探偵', 0.7929961085319519),
 ('メインヒロイン', 0.7890465259552002),
 ('サイドストーリー', 0.7831361889839172),
 ('マペット', 0.7783994078636169),
 ('怪盗', 0.7781833410263062),
 ('中村主水', 0.7774820327758789)]

### The model appears to work well: "江戸川コナン" is the name of the main character of "Detective Conan", whose real name is "工藤新一", and that real name is returned as the most similar word. So the model captures the semantic relationship between the two names.

In [4]:
## process the transcripts
# Each transcript shares the name of its audio file, with 'wav' replaced by 'txt'.
trans_list = ['01_MAD_1.wav', '01_MMK_1.wav', '02_MTN.wav', '03_FTY.wav', '05_MYH.wav', '01_MAD_2.wav', '01_MMK_2.wav', '03_FMA.wav', '04_MNN.wav']
trans_list = [element.replace('wav','txt') for element in trans_list]

# count maps "<file stem>.<first comma field>" -> last comma field of the line,
# with braces and the newline removed.
count = {}
for file in trans_list:
    # The context manager closes each transcript as soon as it has been read.
    with open('trans/'+file) as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so comparing
            # against '' never matches; test the stripped text to skip blanks.
            if not line.strip():
                continue
            fields = line.split(',')
            # file.split('txt')[0] keeps the trailing dot, e.g. '01_MAD_1.'
            index = file.split('txt')[0] + fields[0]
            content = fields[-1]
            content = content.replace('{','').replace('}','').replace('\n','')
            count[index] = content

# Collect every instance id listed in instances.txt except the ids in
# not_intside, which are excluded here and again when the ARFF file is parsed.
ins_list = []
not_intside = ['03_FTY.215','04_MNN.777', '04_MNN.267','04_MNN.417','01_MMK_1.178']
with open('instances.txt') as inst:
    for line in inst:
        line = line.split('\n')[0]
        if line not in not_intside:
            ins_list.append(line)
len(ins_list)

4850

### Extract all sentences from the transcripts and store them in a dictionary keyed by instance id.

In [5]:
# Labels that are dropped from the data set.
# NOTE(review): ' )' presumably matches a non-data line that passes the
# comma/quote test below -- confirm against the ARFF file.
not_class = ['NEU','UNK','OTH',' )']
content = []   # instance ids of the retained rows
speech = []    # raw text after the quoted instance id (feature values + label)
labels = []    # emotion label of each retained row
with open('./emobase2010.arff') as f:
    for line in f:
        # Only lines containing both a comma and a quote are treated as data rows.
        if ',' in line and "'" in line:
            label = line.split(',')[-1].replace('\n','')
            name = line.split(',')[0].replace("'","")
            # Decide once whether the row is kept, so that content, speech and
            # labels always stay the same length and in the same order.
            # (Previously an excluded id with a valid label was still added to
            # speech/labels but not to content, misaligning the three lists.)
            if label in not_class or name in not_intside:
                continue
            labels.append(label)
            speech.append(line.split("',")[1])
            content.append(name)

### Remove the instances labelled NEU, UNK, or OTH, as well as those without any label.
### Extract the speech features from the ARFF file.

In [6]:
# Show the distinct emotion labels that remain after filtering.
label = {name for name in labels}
print(label)

{'SAD', 'JOY', 'FEA', 'DIS', 'ANT', 'SUR', 'ANG', 'ACC'}


In [7]:
# Look up the transcript text of every retained instance id, keeping the order of content.
sentences = [count[element] for element in content]

### Store the transcript sentences to a text file.
### Use Mecab software to tokenize sentences.

In [8]:
print(len(sentences))
# Write one sentence per line; the context manager closes the file even if a
# write raises part-way through.
with open('trans_text.txt','w') as f:
    for s in sentences:
        f.write(s+'\n')

2742


### Compute the element-wise sum of the word vectors in each sentence (the sum is not divided by the number of tokens, so it is not a true average).

In [9]:
# Build one 50-dimensional vector per tokenized sentence by summing the vectors
# of the tokens the model knows. The sum is NOT divided by the token count.
# NOTE(review): 50 is assumed to be the model's vector size -- confirm.
trans_vectors = []
with open('trans_tokenized.txt') as f:
    for line in f:
        tokens = line.split()
        temp = [0.0]*50
        for token in tokens:
            # Tokens missing from the vocabulary contribute nothing.
            if token in word_vectors:
                # Fetch the vector once per token rather than once per dimension.
                vec = word_vectors[token]
                temp = [temp[i]+vec[i] for i in range(len(temp))]
        # A line with no known tokens yields an all-zero vector.
        trans_vectors.append(temp)

### Extract speech features

In [10]:
# Convert each raw "v1,v2,...,vn,LABEL\n" string into a list of floats.
speech_features = []
for feature in speech:
    # Drop the trailing class label by cutting at the last comma. A fixed
    # [:-5] slice only works for a three-letter label followed by a newline
    # and would silently truncate the last value otherwise.
    values = feature.rsplit(',', 1)[0]
    speech_features.append([float(ele) for ele in values.split(',')])

### Concatenate two kinds of features together.

In [11]:
# Turn both feature lists into arrays and join them column-wise, so each row
# holds the speech features followed by the text features of one instance.
speech_features = np.asarray(speech_features)
text_features = np.asarray(trans_vectors)
new_features = np.concatenate([speech_features, text_features], axis=1)
for matrix in (speech_features, text_features, new_features):
    print(matrix.shape)

(2742, 1582)
(2742, 50)
(2742, 1632)


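### One-vs-rest binary classification using the speech features. The mean accuracy over the 8 classes is 87.5%. Note that 87.5% = 7/8 is exactly the mean accuracy a classifier that always predicts the negative class would obtain, so this figure should be checked against that baseline.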

In [12]:
#speech feature
# One-vs-rest evaluation: for every emotion, train a binary RBF SVM with
# cross-validated predictions and report the mean accuracy over all emotions.
label_set = set(labels)
score = 0.0
for label in label_set:
    # 1 for the current emotion, 0 for every other emotion
    new_labels = [ 1 if label == ele else 0 for ele in labels ]
    wclf = svm.SVC(kernel='rbf', class_weight='balanced')
    predicted = cross_val_predict(wclf, speech_features, new_labels)
    score += metrics.accuracy_score(new_labels, predicted)
# Divide by the actual number of classes rather than a hard-coded 8.
# NOTE(review): with K classes, always predicting 0 scores a mean accuracy of
# (K-1)/K (0.875 for K=8) -- check whether the SVM output is constant.
print(score/len(label_set))

0.875


### One-vs-rest binary classification using the text features. The mean accuracy is 62.6%.

In [13]:
#text feature
# Same one-vs-rest evaluation as for the speech features, using the sentence
# vectors; label_set is the set of distinct labels built in the previous cell.
score = 0.0
for label in label_set:
    # 1 for the current emotion, 0 for every other emotion
    new_labels = [ 1 if label == ele else 0 for ele in labels ]
    wclf = svm.SVC(kernel='rbf', class_weight='balanced')
    predicted = cross_val_predict(wclf, trans_vectors, new_labels)
    score += metrics.accuracy_score(new_labels, predicted)
# Divide by the actual number of classes rather than a hard-coded 8.
print(score/len(label_set))

0.626094091904


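### One-vs-rest binary classification using the concatenated features. The text features do not
### appear to improve the accuracy (the result is again 87.5%, i.e. 7/8 — the same caveat about the always-negative baseline applies).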

In [14]:
# One-vs-rest evaluation on the concatenated speech + text features.
label_set = set(labels)
score = 0.0
for label in label_set:
    # 1 for the current emotion, 0 for every other emotion
    new_labels = [ 1 if label == ele else 0 for ele in labels ]
    wclf = svm.SVC(kernel='rbf', class_weight='balanced')
    predicted = cross_val_predict(wclf, new_features, new_labels)
    score += metrics.accuracy_score(new_labels, predicted)
# Divide by the actual number of classes rather than a hard-coded 8.
# NOTE(review): with K classes, always predicting 0 scores a mean accuracy of
# (K-1)/K (0.875 for K=8) -- check whether the SVM output is constant.
print(score/len(label_set))

0.875


### Multi-class classification 

|Feature Type | Accuracy |
|---|---|
| Speech Feature| 24.5% |
| Text Feature  | 24.6% |
| Concatenated  | 24.8% |

### The text feature improves the result a little bit

In [None]:
### multi-class classification
from sklearn.preprocessing import LabelEncoder
# Replace the string emotion labels with integer class ids; le keeps the
# fitted mapping so the ids can be converted back later.
le = LabelEncoder()
labels = le.fit_transform(labels)

In [None]:
# Multi-class linear SVM on the concatenated speech + text features.
wclf = svm.SVC(class_weight='balanced', kernel='linear')
predicted = cross_val_predict(estimator=wclf, X=new_features, y=labels)
metrics.accuracy_score(y_true=labels, y_pred=predicted)

# recorded accuracy: 0.24762946754

In [None]:
# Multi-class linear SVM on the speech features only.
wclf = svm.SVC(class_weight='balanced', kernel='linear')
predicted = cross_val_predict(estimator=wclf, X=speech_features, y=labels)
metrics.accuracy_score(y_true=labels, y_pred=predicted)
# recorded accuracy: 0.245441283735

In [None]:
# Multi-class linear SVM on the text features only.
wclf = svm.SVC(class_weight='balanced', kernel='linear')
predicted = cross_val_predict(estimator=wclf, X=text_features, y=labels)
metrics.accuracy_score(y_true=labels, y_pred=predicted)
# recorded accuracy: 0.246170678337