In [1]:
#import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [2]:
#get the model
model = TFBertForSequenceClassification.from_pretrained('./BERT/MonoBERTSentiment')
tokenizer = BertTokenizer.from_pretrained('./bert_model')
model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./BERT/MonoBERTSentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  135193344 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 135,194,882
Trainable params: 135,194,882
Non-trainable params: 0
_________________________________________________________________


In [3]:
#read the data
test = pd.read_csv('SentimentTest_B.csv', encoding = 'utf-8')

In [4]:
test.head()

Unnamed: 0.1,Unnamed: 0,Column1,Column2,Column3,Column4,Column5
0,0,5.23065e+17,aaron rodgers,negative,"@Espngreeny Ես Ֆինսի երկրպագու եմ, ուրբաթ է, ի...",
1,1,5.22477e+17,aaron rodgers,positive,Ահարոն Ռոջերսը իրոք որսում է կիրակի երեկոյան կ...,
2,2,5.22512e+17,aaron rodgers,positive,Բրիտանացի Ահարոն Ռոջերսը կարող է լինել ամենահա...,
3,3,5.2252e+17,aaron rodgers,positive,Ինչն է Ահարոն Ռոջերսին այդքան լավը դարձնում վե...,
4,4,5.22678e+17,aaron rodgers,positive,Ահարոն Ռոջերսի վերջին դրայվը անհավանական էր: Բ...,


In [5]:
#create a polarity column and change the targets accordingly
test['Polarity'] = np.nan
for i in range(len(test)):
    if test['Column3'][i] == 'positive':
        test['Polarity'][i] = 1
    else:
        test['Polarity'][i] = 0
test = test.drop('Column3', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Polarity'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Polarity'][i] = 1


In [6]:
#predict
pred_sentences = test['Column4']
tf_batch = tokenizer(list(pred_sentences), max_length = 128, padding = True, truncation = True, return_tensors = 'tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis = -1)
labels = [0, 1]
label = tf.argmax(tf_predictions, axis = 1)
label = label.numpy()
predictions = pd.Series(label, index = test.index)
print('Accuracy:', accuracy_score(test['Polarity'], predictions))
print('F-measure:', f1_score(test['Polarity'], predictions))
print('Recall:', recall_score(test['Polarity'], predictions))
print('Precision:', precision_score(test['Polarity'], predictions))

Accuracy: 0.8251996450754214
F-measure: 0.8928765633496465
Recall: 0.9469434832756632
Precision: 0.8446502057613169


In [7]:
from sklearn.metrics.cluster import contingency_matrix
contingency_matrix(test['Polarity'], predictions)

array([[109, 151],
       [ 46, 821]])

In [8]:
import pickle
with open('SentimentTestTextB.pickle', 'rb') as f:
    lemmatized_text = pickle.load(f)

In [9]:
tf_predictions

<tf.Tensor: shape=(1127, 2), dtype=float32, numpy=
array([[0.16520686, 0.8347932 ],
       [0.34829298, 0.65170705],
       [0.00347991, 0.99652004],
       ...,
       [0.00255701, 0.99744296],
       [0.00279735, 0.99720263],
       [0.00280853, 0.9971915 ]], dtype=float32)>

In [10]:
x = []

for i in range(len(tf_predictions)):
    r = np.array(tf_predictions[i])
    if r[0] > r[1]:
        x.append(r[0] / r[1])
    else:
        x.append(r[1] / r[0])

In [11]:
x

[5.0530176,
 1.8711461,
 286.36343,
 251.86086,
 4.3700995,
 2.4202604,
 3.0960605,
 25.958553,
 1.136978,
 376.84045,
 200.74364,
 86.10384,
 87.493065,
 379.78967,
 23.596546,
 63.622307,
 397.43622,
 373.0617,
 18.634434,
 1.5630426,
 384.63022,
 393.28107,
 335.02115,
 292.55377,
 200.72353,
 4.0374784,
 114.35618,
 416.56796,
 289.17255,
 50.854393,
 1.8390118,
 4.0374784,
 235.97147,
 75.83756,
 63.635895,
 200.91142,
 1.0106853,
 1.2047652,
 6.9620085,
 51.20554,
 207.38513,
 285.86902,
 151.14174,
 17.996532,
 10.835443,
 14.814455,
 9.848101,
 1.3518043,
 9.521088,
 32.84703,
 121.56479,
 7.991052,
 20.631853,
 28.41537,
 109.07422,
 8.438567,
 185.44261,
 26.221485,
 1.7437227,
 40.80354,
 329.42477,
 3.1386278,
 9.133355,
 4.632262,
 267.0907,
 5.906001,
 56.469322,
 5.241587,
 53.51742,
 156.39514,
 110.870316,
 305.02133,
 64.97438,
 315.32202,
 65.068665,
 1.7754667,
 331.96985,
 224.67549,
 216.15343,
 193.34114,
 285.4095,
 231.62283,
 278.08408,
 259.26282,
 194.48949,

In [12]:
possibility = pd.Series(x, index = test.index)

In [13]:
index = []
for i in range(len(predictions)):
    if predictions[i] != test['Polarity'][i]:
        index.append(i)

y = {'Text': test['Column4'].loc[index], 'Label': test['Polarity'].loc[index], 'Prediction': predictions[index], 'Possibility': possibility[index]}
summary = pd.DataFrame(y)
summary

Unnamed: 0,Text,Label,Prediction,Possibility
0,"@Espngreeny Ես Ֆինսի երկրպագու եմ, ուրբաթ է, ի...",0.0,1,5.053018
4,Ահարոն Ռոջերսի վերջին դրայվը անհավանական էր: Բ...,1.0,0,4.370100
6,Կարծես թե @Panthers Defense-ը կիրակի օրը A կստ...,0.0,1,3.096061
7,"@amyrobJAFEEL-ը հենց նոր տեղեկացրեց ինձ, որ Ահ...",0.0,1,25.958553
11,"Հոկտեմբերի 4-ն է։ Եկեք իրական լինենք, Ահարոն Ս...",0.0,1,86.103844
...,...,...,...,...
1047,Ուեյն նահանգը (@wscwildcats) շաբաթ օրը ժամը 13...,0.0,1,301.789337
1090,"գուշակեք, ով չի կարող վաղը գնալ WWAT ֆիլմը դիտ...",0.0,1,19.931259
1099,@mvdn777 լավ էդ ժամանակավորումը կատարյալ կլինե...,0.0,1,27.601208
1110,"Այսպիսով, Աշխատանքային ետ առաջատարի հետ Populu...",1.0,0,2.875626


In [14]:
def get_index(cont, thr):
    ind = []
    for i in range(len(cont)):
        if cont[i] < thr:
            ind.append(i)
    return ind

In [15]:
low_index = get_index(possibility, 14)

In [16]:
low_index

[0,
 1,
 4,
 5,
 6,
 8,
 19,
 25,
 30,
 31,
 36,
 37,
 38,
 44,
 46,
 47,
 48,
 51,
 55,
 58,
 61,
 62,
 63,
 65,
 67,
 75,
 86,
 89,
 95,
 99,
 100,
 108,
 110,
 111,
 121,
 123,
 128,
 129,
 147,
 151,
 161,
 164,
 167,
 172,
 181,
 182,
 183,
 190,
 191,
 199,
 210,
 213,
 223,
 226,
 230,
 233,
 237,
 244,
 245,
 246,
 247,
 249,
 295,
 296,
 301,
 308,
 309,
 325,
 328,
 340,
 341,
 352,
 356,
 357,
 359,
 362,
 372,
 373,
 374,
 379,
 386,
 389,
 395,
 399,
 431,
 434,
 435,
 437,
 439,
 441,
 455,
 456,
 458,
 463,
 466,
 470,
 472,
 474,
 475,
 476,
 498,
 509,
 510,
 514,
 515,
 521,
 522,
 523,
 524,
 525,
 526,
 527,
 528,
 530,
 531,
 532,
 538,
 546,
 551,
 553,
 557,
 558,
 564,
 570,
 574,
 579,
 582,
 583,
 585,
 587,
 588,
 589,
 591,
 594,
 596,
 600,
 601,
 603,
 608,
 609,
 610,
 611,
 612,
 617,
 618,
 620,
 622,
 624,
 625,
 629,
 630,
 632,
 633,
 635,
 637,
 638,
 641,
 645,
 646,
 647,
 662,
 669,
 676,
 679,
 690,
 693,
 694,
 702,
 707,
 709,
 720,
 747,
 753

In [17]:
len(test)

1127

In [18]:
len(low_index)

242

In [19]:
with open('positive_contribution.pickle', 'rb') as f:
    positive_contribution = pickle.load(f)
    
with open('negative_contribution.pickle', 'rb') as f:
    negative_contribution = pickle.load(f)
    
with open('positive_adverb.pickle', 'rb') as f:
    positive_adverb = pickle.load(f)
    
with open('negative_adverb.pickle', 'rb') as f:
    negative_adverb = pickle.load(f)

In [20]:
positive_contribution

Unnamed: 0,Word,Count,Contribution
0,միանգամայն բավարար,0,0.500000
1,լիառատ,0,0.500000
2,նոր,2263,1.000000
3,շահավետ,0,0.500000
4,նպաստավոր,0,0.500000
...,...,...,...
1475,օգտակար,8,0.999665
1476,օժանդակել,0,0.500000
1477,օջախ(տուն),0,0.500000
1478,օրորել,0,0.500000


In [21]:
negative_contribution

Unnamed: 0,Word,Count,Contribution
0,թթվային,0,0.500000
1,ակնոցավոր,0,0.500000
2,այլատյաց,2,0.880797
3,անօտարելի,0,0.500000
4,մեռած,0,0.500000
...,...,...,...
1376,օբյեկտ,3,0.952574
1377,օղակ,11,0.999983
1378,օրենականություն,0,0.500000
1379,օրհասական,0,0.500000


In [22]:
positive_adverb

['ամբողջությամբ',
 'անպայման',
 'խնդրեմ',
 'նույնիսկ',
 'դեռ',
 'դեռևս',
 'արդեն',
 'անկասկած',
 'շատ',
 'բարձրաձայն',
 'հատկապես',
 'պասսիվորեն',
 'հեշտորեն',
 'հեշտությամբ',
 'քննադատաբար',
 'քաղաքավարիորեն',
 'համեստորեն',
 'զգույշ',
 'քաոսայնորեն',
 'հերոսաբար',
 'սառնորեն',
 'ամբողջովին',
 'ամբողջովին ',
 'անխտիր',
 'ավելի լավ',
 'ավելի քիչ',
 'բոլորովին',
 'զարմանալիորեն',
 'էլ',
 'ինչքան էլ որ',
 'լիովին',
 'կատարելապես',
 'հաճախ',
 'համապատասխանաբար',
 'հենց',
 'հետեւաբար',
 'հոգեպես',
 'միանգամայն',
 'շուրջ',
 'որքան էլ որ',
 'պարզապես',
 'վճռականորեն',
 'տրամաբանորեն',
 'ուղակի']

In [23]:
negative_adverb

['ուղղակի',
 'երբեք',
 'չ-',
 'շուտով',
 'շուտ',
 'հազվադեպ',
 'իհարկե',
 'անշուշտ',
 'սովորաբար',
 'չափազանց',
 'սաստիկ',
 'քիչ',
 'սակավ',
 'ֆիզիկապես',
 'ֆիզիկորեն',
 'այժմ',
 'անմիջապես',
 'հիմա',
 'պատահաբար']

In [24]:
test.loc[low_index]

Unnamed: 0.1,Unnamed: 0,Column1,Column2,Column4,Column5,Polarity
0,0,5.230650e+17,aaron rodgers,"@Espngreeny Ես Ֆինսի երկրպագու եմ, ուրբաթ է, ի...",,0.0
1,1,5.224770e+17,aaron rodgers,Ահարոն Ռոջերսը իրոք որսում է կիրակի երեկոյան կ...,,1.0
4,4,5.226780e+17,aaron rodgers,Ահարոն Ռոջերսի վերջին դրայվը անհավանական էր: Բ...,,1.0
5,5,5.228680e+17,aaron rodgers,"Ահարոն Ռոջերս. լավն է 4-րդ քառորդում, բայց ընդ...",,1.0
6,6,5.229120e+17,aaron rodgers,Կարծես թե @Panthers Defense-ը կիրակի օրը A կստ...,,0.0
...,...,...,...,...,...,...
1077,1077,5.230960e+17,white house,"Ինչ էլ որ լինի այնտեղ, մի շարք մարդկանց համար,...",,1.0
1094,1094,5.208830e+17,wwat,"Վաղը դուրս կգա «Գողացեք իմ աղջկան», մյուս օրը՝...",,1.0
1105,1105,5.230620e+17,xmas,Քիչ-քիչ նոսրանում է ավստրալական կենսակերպը։ Կա...,,0.0
1110,1110,5.196010e+17,yougov,"Այսպիսով, Աշխատանքային ետ առաջատարի հետ Populu...",,1.0


In [25]:
pos_count = []
neg_count = []
data = lemmatized_text[low_index]

for i in low_index:
    words = data[i]
    pos = 0
    neg = 0
    for j in range(len(words)):
        if words[j] in positive_contribution['Word'].values:
            pos += 1
        elif words[j] in negative_contribution['Word'].values:
            neg += 1
    pos_count.append(pos)
    neg_count.append(neg)

In [26]:
pos_count

[1,
 2,
 1,
 2,
 2,
 1,
 2,
 3,
 0,
 3,
 4,
 1,
 5,
 1,
 4,
 2,
 4,
 2,
 1,
 2,
 4,
 2,
 1,
 2,
 2,
 0,
 3,
 1,
 0,
 1,
 1,
 2,
 2,
 3,
 1,
 3,
 4,
 5,
 6,
 4,
 2,
 3,
 5,
 6,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 3,
 2,
 2,
 2,
 4,
 4,
 0,
 1,
 1,
 1,
 3,
 2,
 2,
 2,
 2,
 6,
 1,
 4,
 3,
 2,
 4,
 3,
 1,
 3,
 1,
 4,
 2,
 1,
 5,
 1,
 2,
 0,
 2,
 2,
 3,
 1,
 0,
 2,
 4,
 3,
 5,
 2,
 1,
 1,
 1,
 1,
 1,
 0,
 6,
 2,
 2,
 5,
 1,
 3,
 0,
 2,
 3,
 0,
 3,
 2,
 5,
 3,
 5,
 4,
 2,
 3,
 0,
 3,
 0,
 3,
 3,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 3,
 2,
 2,
 2,
 1,
 1,
 4,
 4,
 2,
 4,
 0,
 1,
 5,
 2,
 0,
 1,
 1,
 3,
 1,
 5,
 2,
 2,
 1,
 3,
 1,
 1,
 6,
 3,
 0,
 0,
 2,
 1,
 3,
 4,
 7,
 3,
 3,
 2,
 2,
 2,
 5,
 2,
 2,
 4,
 6,
 5,
 5,
 2,
 1,
 5,
 2,
 3,
 5,
 2,
 2,
 5,
 5,
 2,
 2,
 7,
 4,
 0,
 4,
 1,
 0,
 1,
 2,
 1,
 1,
 4,
 0,
 1,
 7,
 0,
 3,
 1,
 1,
 1,
 5,
 2,
 0,
 1,
 4,
 1,
 1,
 2,
 2,
 1,
 4,
 2,
 1,
 6,
 2,
 2,
 1,
 0,
 0,
 5,
 3,
 4,
 2,
 3,
 4,
 4,
 1]

In [27]:
neg_count

[4,
 2,
 2,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 0,
 1,
 3,
 1,
 3,
 1,
 2,
 2,
 2,
 2,
 5,
 2,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 2,
 1,
 1,
 1,
 0,
 3,
 0,
 1,
 3,
 3,
 1,
 2,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 2,
 1,
 3,
 0,
 2,
 1,
 2,
 2,
 2,
 0,
 0,
 1,
 0,
 1,
 2,
 0,
 4,
 0,
 2,
 0,
 1,
 2,
 0,
 1,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 3,
 1,
 1,
 3,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 3,
 2,
 1,
 2,
 2,
 0,
 3,
 0,
 1,
 2,
 3,
 2,
 4,
 1,
 3,
 2,
 0,
 2,
 1,
 3,
 1,
 1,
 3,
 3,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 1,
 0,
 1,
 1,
 1,
 2,
 0,
 1,
 0,
 2,
 2,
 1,
 1,
 0,
 3,
 1,
 2,
 0,
 1,
 4,
 1,
 1,
 1,
 2,
 1,
 1,
 0,
 1,
 3,
 3,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 3,
 2,
 0,
 2,
 0,
 1,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 1,
 0,
 1,
 2,
 0,
 2,
 1,
 0,
 1,
 3,
 0,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 3,
 0,
 2,
 2,
 2,
 0,
 1,
 1,
 1,
 0,
 2,
 5,
 2,
 1,
 0,
 1,
 0,
 4,
 2,
 2,
 2,
 1,
 1,
 0,
 2,
 0,
 0,
 0,
 4,
 3,
 0,
 2]

In [28]:
def found_something(ind, words, window, adverb):
    a = ind - 1
    found_a = False
    count_a = 0
    while a >= 0 and found_a == False and count_a <= window:
        if words[a] in adverb:
            found_a = True
        a = a - 1
        count_a = count_a + 1
    b = ind + 1
    found_b = False
    count_b = 0
    while b < len(data[i]) and found_b == False and count_b <= window:
        if words[b] in adverb:
            found_b = True
        b = b + 1
        count_b = count_b + 1
    return found_a, found_b

In [29]:
positive_sentiment = []
negative_sentiment = []

for i in low_index:
    words = data[i]
    pos_words = []
    neg_words = []
    pos_scores = np.array([])
    neg_scores = np.array([])
    
    for j in range(len(words)):
        if words[j] in positive_contribution['Word'].values:
            pos_ind = np.where(positive_contribution['Word'] == words[j])[0]
            pos_cont = positive_contribution['Contribution'][pos_ind]
            pos_words.append([words[j], pos_cont])
        elif words[j] in negative_contribution['Word'].values:
            neg_ind = np.where(negative_contribution['Word'] == words[j])[0]
            neg_cont = negative_contribution['Contribution'][neg_ind]
            neg_words.append([words[j], neg_cont])
            
    for k in range(len(pos_words)):
        word = pos_words[k][0]
        c = pos_words[k][1]
        ind = data[i].index(word)
        pos_found = found_something(ind, words, 4, positive_adverb)
        if pos_found[0] == True or pos_found[1] == True:
            g = 2
        else:
            g = 1
        neg_found = found_something(ind, words, 3, negative_adverb)
        if neg_found[0] == True or neg_found[1] == True:
            f = -1
        else:
            f = 1
        pos_scores = np.append(pos_scores, g * f * c)
        
    for l in range(len(neg_words)):
        word = neg_words[l][0]
        c = neg_words[l][1]
        ind = data[i].index(word)
        pos_found = found_something(ind, words, 4, positive_adverb)
        if pos_found[0] == True or pos_found[1] == True:
            g = 2
        else:
            g = 1
        neg_found = found_something(ind, words, 3, negative_adverb)
        if neg_found[0] == True or neg_found[1] == True:
            f = -1
        else:
            f = 1
        neg_scores = np.append(neg_scores, g * f * c)
    
    positive_sentiment.append(np.sum(pos_scores))
    negative_sentiment.append(np.sum(neg_scores))

In [30]:
for i in range(len(low_index)):
    if positive_sentiment[i] > negative_sentiment[i]:
        predictions[low_index[i]] = 1
    else:
        predictions[low_index[i]] = 0

In [31]:
print('Accuracy:', accuracy_score(test['Polarity'], predictions))
print('F-measure:', f1_score(test['Polarity'], predictions))
print('Recall:', recall_score(test['Polarity'], predictions))
print('Precision:', precision_score(test['Polarity'], predictions))

Accuracy: 0.8278615794143744
F-measure: 0.895021645021645
Recall: 0.9538638985005767
Precision: 0.8430173292558614


In [32]:
contingency_matrix(test['Polarity'], predictions)

array([[106, 154],
       [ 40, 827]])