In [1]:
#import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [2]:
#open the data
# Read the labelled test tweets from a UTF-8 encoded CSV into `data`.
# NOTE(review): the preview below shows three leftover 'Unnamed: *' columns
# (saved indexes from earlier exports) next to ID, Tweet and Emotions.
data = pd.read_csv('EmotionTest.csv', encoding = 'utf-8')
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Tweet,Emotions
0,0,0,2018-En-00866,"@RanaAyyub @rajnathsingh Օ՜, թաքնված վրեժ և զա...",anger
1,1,1,2018-En-02590,"Ես այս ամենն անում եմ, որպեսզի համոզվեմ, որ դո...",joy
2,2,2,2018-En-03361,"եթե ոչ, ապա #teamchristine bc այն ամենը, ինչ Տ...",anger
3,3,3,2018-En-03230,Դա #հիանալի սկիզբ է #սկսնակների համար՝ անցնելո...,joy
4,4,4,2018-En-01143,Իմ լավագույն ընկերներն առաջին անգամ ինձ հետ վա...,fear


In [3]:
# number of tweets per gold emotion label (shows how imbalanced the classes are)
data['Emotions'].value_counts()

joy         294
anger       133
sadness      78
fear         43
surprise      6
Name: Emotions, dtype: int64

In [4]:
#retrieve the input and target
# X: pickled text data, re-indexed from 0 so its index no longer carries old labels
# (presumably one token sequence per tweet - transform() iterates each entry item by item)
with open('EmotionTestText.pickle', 'rb') as handle:
    X = pickle.load(handle).reset_index(drop = True)
# y: gold emotion label of every tweet, taken from the CSV loaded above
y = data['Emotions']

In [5]:
#open the lexicon
# `scores` is the emotion lexicon used by transform(); it is stored as a pickle.
# NOTE(review): pickle.load executes arbitrary code - only load files from a trusted source.
with open('emotionScores.pickle', 'rb') as f:
    scores = pickle.load(f)

In [6]:
# preview the first lexicon rows (word, English gloss, sentiment and emotion score columns)
scores.head()

Unnamed: 0,HWN_Offset,SWN_Offset,Part_of_Speech,Armenian,English,Positive,Negative,Objective,Afraid,Amused,Angry,Annoyed,Dont_Care,Happy,Inspired,Sad
0,00014490-a,00014490,a,միանգամայն բավարար,"rich,plentiful,plenteous,copious,ample",0.125,0.0,0.875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00014490-a,00014490,a,լիառատ,"rich,plentiful,plenteous,copious,ample",0.125,0.0,0.875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"00024996-a,00818008-a,01640850-a,01687167-a,,,,","00024996,00818008,01640850,01687167,00821208,0...",a,նոր,"new,young,new,novel,new,fresh",0.171875,0.046875,0.78125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00025470-a,00025470,a,թթվային,acid,0.0,0.375,0.625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"00029933-a,",0002993300011160,a,ագահ,"prehensile,greedy,grasping,grabby,covetous,ava...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#function to transform the text into a vector of size 5
def transform(x, lexicon=None):
    """Turn each tokenised text into a 5-dimensional emotion vector.

    For every token that occurs in the lexicon's ``Armenian`` column, the
    ``Afraid``, ``Angry``, ``Happy``, ``Inspired`` and ``Sad`` scores are
    averaged over all lexicon rows listing that word. These per-token
    averages are then summed over the text.

    Parameters
    ----------
    x : pandas.Series (or any sequence) of token sequences
        One entry per text; each entry is iterated token by token.
    lexicon : pandas.DataFrame, optional
        Lexicon with an ``Armenian`` column and the five emotion columns.
        Defaults to the notebook-level ``scores`` frame.

    Returns
    -------
    list
        ``[score, count, total]``: ``score`` is a ``(len(x), 5)`` array whose
        rows follow the order of ``x`` and whose columns follow the emotion
        order above; ``count`` is the number of tokens found in the lexicon;
        ``total`` is the number of tokens seen.
    """
    if lexicon is None:
        # fall back to the lexicon loaded at notebook level
        lexicon = scores
    emotion_columns = ['Afraid', 'Angry', 'Happy', 'Inspired', 'Sad']

    # A word can appear on several lexicon rows. Average its scores once up
    # front so each token costs a dictionary lookup instead of a full scan
    # of the lexicon.
    word_means = lexicon.groupby('Armenian')[emotion_columns].mean()
    lookup = dict(zip(word_means.index, word_means.to_numpy(dtype=float)))

    total = 0
    count = 0
    score = np.zeros((len(x), len(emotion_columns)))
    # enumerate() yields the row position directly, whatever index x carries
    for row, tokens in enumerate(x):
        for token in tokens:
            total += 1
            word_scores = lookup.get(token)
            if word_scores is not None:
                count += 1
                # tokens missing from the lexicon contribute nothing
                score[row] += word_scores
    return [score, count, total]

In [8]:
#transform the input data
transformed = transform(X)
# wrap the score matrix in a frame that shares X's index and names each emotion column
text = pd.DataFrame(
    transformed[0],
    index = X.index,
    columns = ['Afraid', 'Angry', 'Happy', 'Inspired', 'Sad'],
)

In [9]:
# number of tweets in the test set
len(data)

554

In [10]:
#success rate
# share of tokens that were found in the lexicon, as a percentage
matched_tokens, seen_tokens = transformed[1], transformed[2]
print(matched_tokens / seen_tokens * 100)

59.17825020441537


In [11]:
#baseline model to evaluate the lexicon
# Predict, for every text, the emotion whose lexicon score is highest.
# Position in this list = position of the score column; value = dataset label.
label_by_position = ['fear', 'anger', 'joy', 'surprise', 'sadness']
emotion_scores = text[['Afraid', 'Angry', 'Happy', 'Inspired', 'Sad']]
score_rows = (list(emotion_scores.loc[i].values) for i in range(len(text)))
# list.index(max(...)) returns the first maximum, so ties go to the earliest
# column - an all-zero row is therefore labelled 'fear'
results = [label_by_position[row.index(max(row))] for row in score_rows]
results = pd.Series(results, index = y.index)

In [12]:
#check performance
acc = accuracy_score(y, results)
# average = None keeps one value per class instead of a single aggregate
f1, rec, pre = (
    metric(y, results, average = None)
    for metric in (f1_score, recall_score, precision_score)
)
for caption, value in [('Accuracy:', acc), ('F-measure:', f1), ('Recall:', rec), ('Precision:', pre)]:
    print(caption, value)

Accuracy: 0.14981949458483754
F-measure: [0.24731183 0.12543554 0.19941349 0.0877193  0.03333333]
Recall: [0.17293233 0.41860465 0.11564626 0.06410256 0.5       ]
Precision: [0.43396226 0.07377049 0.72340426 0.13888889 0.01724138]


In [13]:
# Rows are the true labels, columns the predicted ones. With no `labels` argument
# scikit-learn orders the classes by sorted label, here: anger, fear, joy, sadness, surprise.
confusion_matrix(y, results)

array([[ 23,  55,   3,  11,  41],
       [  8,  18,   2,   2,  13],
       [ 11, 131,  34,  18, 100],
       [ 11,  38,   7,   5,  17],
       [  0,   2,   1,   0,   3]], dtype=int64)