In [1]:
#import libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

In [2]:
#read the labelled test set (UTF-8 CSV) and preview its first rows
data = pd.read_csv(
    'SentimentTest_B.csv',
    encoding='utf-8',
)
data.head(5)

Unnamed: 0.1,Unnamed: 0,Column1,Column2,Column3,Column4,Column5
0,0,5.23065e+17,aaron rodgers,negative,"@Espngreeny Ես Ֆինսի երկրպագու եմ, ուրբաթ է, ի...",
1,1,5.22477e+17,aaron rodgers,positive,Ահարոն Ռոջերսը իրոք որսում է կիրակի երեկոյան կ...,
2,2,5.22512e+17,aaron rodgers,positive,Բրիտանացի Ահարոն Ռոջերսը կարող է լինել ամենահա...,
3,3,5.2252e+17,aaron rodgers,positive,Ինչն է Ահարոն Ռոջերսին այդքան լավը դարձնում վե...,
4,4,5.22678e+17,aaron rodgers,positive,Ահարոն Ռոջերսի վերջին դրայվը անհավանական էր: Բ...,


In [3]:
#class balance of the label column
data.loc[:, 'Column3'].value_counts()

positive    867
negative    260
Name: Column3, dtype: int64

In [4]:
#number of rows in the test set
data.shape[0]

1127

In [5]:
#load the pickled model input and take the target labels from the CSV frame
#NOTE(review): pickle.load can execute arbitrary code - only use it on files you trust
with open('SentimentTestTextB.pickle', mode='rb') as texts_file:
    X = pickle.load(texts_file)
y = data.loc[:, 'Column3']

In [6]:
#load the sentiment lexicon used by transform()
#NOTE(review): pickle.load can execute arbitrary code - only use it on files you trust
with open('sentimentScores_final.pickle', mode='rb') as lexicon_file:
    scores = pickle.load(lexicon_file)

In [7]:
#function to transform the text into a vector of size 3: [Positive, Negative, Objective]
def transform(x, lexicon=None):
    """Turn each document into a 3-vector of summed lexicon scores.

    Every token that occurs in the lexicon contributes the mean of the
    Positive / Negative / Objective values over all lexicon rows for that
    word; tokens missing from the lexicon contribute nothing.

    Parameters
    ----------
    x : pandas.Series
        One entry per document. Each entry is iterated token by token
        (presumably a list of word strings - confirm against the pickle).
    lexicon : pandas.DataFrame, optional
        Table with the columns 'Armenian', 'Positive', 'Negative' and
        'Objective'. Defaults to the notebook-level ``scores`` frame.

    Returns
    -------
    list
        ``[score, count, total]``: ``score`` is a ``(len(x), 3)`` array of
        per-document sums in the order Positive, Negative, Objective,
        ``count`` is the number of tokens found in the lexicon and ``total``
        is the number of tokens seen.
    """
    if lexicon is None:
        lexicon = scores

    #average duplicate lexicon entries once, up front, instead of scanning the
    #whole lexicon for every token; grouping by word also avoids mixing up
    #row positions and index labels when the lexicon has a non-default index
    word_means = lexicon.groupby('Armenian', sort=False)[['Positive', 'Negative', 'Objective']].mean()
    lookup = dict(zip(word_means.index, word_means.to_numpy(dtype=float)))

    score = np.zeros((len(x), 3))
    count = 0
    total = 0
    #enumerate gives the output row directly, so no per-document index search is needed
    for row, (_, tokens) in enumerate(x.items()):
        for token in tokens:
            total += 1
            vector = lookup.get(token)
            if vector is not None:
                count += 1
                score[row] += vector
    return [score, count, total]

In [8]:
#score every document with the lexicon and wrap the per-document sums in a labelled frame
transformed = transform(X)
text = pd.DataFrame(
    transformed[0],
    index=X.index,
    columns=['Positive', 'Negative', 'Objective'],
)

In [9]:
#lexicon coverage: percentage of tokens that were found in the lexicon
coverage = transformed[1] / transformed[2]
print(coverage * 100)

54.634035803308414


In [10]:
#change the target values to (0, 1) to reflect (negative, positive)
#vectorised on purpose: the old y[i] loop looked labels up by position, which
#only works while y carries a default 0..n-1 index
real = pd.Series(np.where(y == 'positive', 1.0, 0.0), index = y.index)

In [15]:
#baseline model to evaluate the lexicon
#predict 1 (positive) when Positive - Negative >= 0, else 0; ties, including
#documents with no lexicon hit at all, therefore count as positive
#computed on the raw arrays so rows are matched by position, exactly like the
#index = y.index assignment below, and so every row always gets a prediction
polarity = text['Positive'].to_numpy() - text['Negative'].to_numpy()
results = pd.Series(np.where(polarity >= 0, 1, 0), index = y.index)

In [16]:
#score the baseline predictions against the encoded targets and report each metric
acc, f1, rec, pre = (
    metric(real, results)
    for metric in (accuracy_score, f1_score, recall_score, precision_score)
)
for label, value in (
    ('Accuracy:', acc),
    ('F-measure:', f1),
    ('Recall:', rec),
    ('Precision:', pre),
):
    print(label, value)

Accuracy: 0.6983141082519965
F-measure: 0.8023255813953488
Recall: 0.7958477508650519
Precision: 0.8089097303634232


In [18]:
#confusion matrix: rows are the true classes (0, 1), columns the predicted classes
#confusion_matrix is imported with the other sklearn metrics in the first cell
print(confusion_matrix(real, results))

[[ 97 163]
 [177 690]]
