In this example we show how to apply our methodology to one of the considered datasets - the Anger dataset (it should be located in the 'data' folder).

**Dataset references:**
Semeval-2018 Task 1: Affect in Tweets. Saif M. Mohammad, Felipe Bravo-Marquez, Mohammad Salameh, and Svetlana Kiritchenko. In Proceedings of International Workshop on Semantic Evaluation (SemEval-2018), New Orleans, LA, USA, June 2018.

**Imports**

In [85]:
import numpy as np
import itertools
from scipy.stats import pearsonr

In [11]:
# these packages for embeddings can be imported here or in embeddings_and_lexicons.py (commented section)
# torchmoji 
from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_feature_encoding 
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
from transformers import AutoTokenizer, AutoModel, TFAutoModel
# roBERTa model 
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
# Word2Vec
from gensim.models import KeyedVectors
# SBERT
from sentence_transformers import SentenceTransformer
# BERT
from transformers import BertTokenizer, BertModel

In [2]:
import sys

# import our functions
sys.path.append('./code')
from preprocessing import *
from embeddings_and_lexicons import *
from wknn_eval import *

**Upload data and perform preprocessing**

In [3]:
# Paths to the SemEval-2018 Task 1 EI-oc (English, anger) training, development and gold test files
file_train = './data/SemEval2018-Task1-all-data/English_EI-oc/training/EI-oc-En-anger-train.txt'
file_dev = './data/SemEval2018-Task1-all-data/English_EI-oc/development/2018-EI-oc-En-anger-dev.txt'
file_test = './data/SemEval2018-Task1-all-data/English_EI-oc/test-gold/2018-EI-oc-En-anger-test-gold.txt'

# Column names and field separator passed to upload_datasets together with the three paths
columns = ['ID', 'Tweet', 'Affect Dimension', 'Intensity Class']
sep = '\t'

# upload_datasets returns four frames; eval_data is the one used for cross-validation below
# (the notebook refers to it as the train+dev evaluation data)
train_data, dev_data, eval_data, test_data = upload_datasets(file_train, file_dev, file_test, columns, sep)

In [4]:
# Preview the first three rows of the evaluation frame, including the columns added by preprocessing
eval_data.head(3)

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class,Cleaned_tweet,Cleaned_tweet_wt_stopwords,Class
0,2017-En-10264,@xandraaa5 @amayaallyn6 shut up hashtags are c...,anger,2: moderate amount of anger can be inferred,$MENTION$ $MENTION$ shut up hashtags are cool ...,$MENTION$ $MENTION$ shut hashtags cool offended,2
1,2017-En-10072,it makes me so fucking irate jesus. nobody is ...,anger,3: high amount of anger can be inferred,it makes me so fucking irate jesus. nobody is ...,makes fucking irate jesus. nobody calling ppl ...,3
2,2017-En-11383,Lol Adam the Bull with his fake outrage...,anger,1: low amount of anger can be inferred,Lol Adam the Bull with his fake outrage...,Lol Adam Bull fake outrage...,1


**Read lexicons**

In [94]:
# Read VAD lexicon (tab-separated; the preview shows Word / Valence / Arousal / Dominance columns)
# NOTE(review): `pd` is not imported in any visible cell - presumably it arrives through one of
# the star imports above; confirm, or import pandas explicitly.

vad_file = './lexica/NRC-VAD/NRC-VAD-Lexicon.txt'
vad = pd.read_csv(vad_file, sep="\t")
vad.head(3)

Unnamed: 0,Word,Valence,Arousal,Dominance
0,aaaaaaah,0.479,0.606,0.291
1,aaaah,0.52,0.636,0.282
2,aardvark,0.427,0.49,0.437


In [95]:
# EMOLEX lexicon
# The file has no header row; each line is read as a (word, emotion, flag) triple.

emolex_lexicon = pd.read_csv('./lexica/NRC-Emotion/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep="\t", header=None)
emolex_lexicon.columns = ["Word", "Emotion", "Availability"]

# Transform EMOLEX to DataFrame form: one row per word, one column per emotion.
# The emotion columns keep the names produced by the pivot instead of being relabelled
# positionally from a hard-coded list, which would silently mislabel them if the set or
# order of emotions in the file ever differed. Only the leftover 'Emotion' axis name is dropped.

emolex = (emolex_lexicon
          .pivot(columns='Emotion', values='Availability', index='Word')
          .reset_index()
          .rename_axis(None, axis=1))
# NOTE(review): the preview shows a row with an empty Word - possibly a lexicon entry such as
# "null" that read_csv parsed as a missing value; confirm and consider keep_default_na=False.
emolex.head(3)

Unnamed: 0,Word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,,0,0,0,0,0,0,0,0,0,0
1,aback,0,0,0,0,0,0,0,0,0,0
2,abacus,0,0,0,0,0,0,0,0,0,1


In [96]:
# Affect Intensity lexicon (tab-separated; the 'term', 'score' and 'AffectDimension' columns are used)

ai_lexicon = pd.read_csv('./lexica/NRC-Affect-Intensity/NRC-AffectIntensity-Lexicon.txt', sep="\t")

# Transform AI to DataFrame form: one row per term, one column per affect dimension.
# Column names come from the pivot itself (no positional relabelling from a hard-coded list);
# a term that has no score for a dimension gets 0.

ai = (ai_lexicon
      .pivot(columns='AffectDimension', values='score', index='term')
      .reset_index()
      .rename_axis(None, axis=1)
      .fillna(0))
ai.head(3)

Unnamed: 0,term,anger,fear,joy,sadness
0,TRUE,0.0,0.0,0.328,0.0
1,aaaaaaah,0.0,0.344,0.0,0.0
2,aaaah,0.0,0.234,0.0,0.0


In [99]:
# ANEW lexicon
# Load the ratings, expose the word under the 'Word' header used by the other lexicons,
# and discard the 'Word No.' and 'Word Frequency' columns.

anew = (pd.read_csv('./lexica/ANEW/all.csv')
        .rename(columns={'Description': 'Word'})
        .drop(columns=['Word No.', 'Word Frequency']))
anew.head(3)

Unnamed: 0,Word,Valence Mean,Valence SD,Arousal Mean,Arousal SD,Dominance Mean,Dominance SD
0,grin,7.4,1.87,5.27,2.64,6.0,1.86
1,honest,7.7,1.43,5.32,1.92,6.24,2.13
2,gripe,3.14,1.56,5.0,2.19,4.67,1.79


In [100]:
# Warriner lexicon (the first CSV column is used as the row index; the preview shows a
# 'Word' column followed by the rating columns)

warriner = pd.read_csv('./lexica/AffectiveNorms/Ratings_Warriner_et_al.csv', index_col=0)
warriner.head(3)

Unnamed: 0,Word,V.Mean.Sum,V.SD.Sum,V.Rat.Sum,A.Mean.Sum,A.SD.Sum,A.Rat.Sum,D.Mean.Sum,D.SD.Sum,D.Rat.Sum,...,A.Rat.L,A.Mean.H,A.SD.H,A.Rat.H,D.Mean.L,D.SD.L,D.Rat.L,D.Mean.H,D.SD.H,D.Rat.H
1,aardvark,6.26,2.21,19,2.41,1.4,22,4.27,1.75,15,...,11,2.55,1.29,11,4.12,1.64,8,4.43,1.99,7
2,abalone,5.3,1.59,20,2.65,1.9,20,4.95,1.79,22,...,12,2.38,1.92,8,5.55,2.21,11,4.36,1.03,11
3,abandon,2.84,1.54,19,3.73,2.43,22,3.32,2.5,22,...,11,3.82,2.14,11,2.77,2.09,13,4.11,2.93,9


In [101]:
# Save lexicons in the list
# Each entry is [lexicon frame, name of its word column, <int>, <int>, <int>]; the first three
# values are the ones passed to get_lexicon_scores below.
# NOTE(review): the third value matches the number of score columns in the previews
# (3 for VAD, 10 for EMOLEX, 4 for AI, 6 for ANEW) and the last two look like the lower/upper
# bound of the score scale - confirm against get_lexicon_scores / append_lexicon_scores.

lexicons_data = [[vad, 'Word', 3, 0, 1], [emolex, 'Word', 10, 0, 1], [ai, 'term', 4, 0, 1],
                 [anew, 'Word', 6, 0, 9], [warriner, 'Word', 63, 0, 1000]]

**Apply lexicons to the dataset**

In [106]:
# VAD lexicon vector per tweet: element-wise mean of the per-token lexicon scores, where tokens
# are obtained by splitting the tweet on single spaces.
# The column is addressed by its literal name 'Tweet': `Raw_tweet` is first assigned in a later
# cell, so referencing it here raises NameError on a fresh top-to-bottom run.
lex = lexicons_data[0]
eval_data['Vector_vad'] = eval_data['Tweet'].apply(
    lambda x: np.mean([get_lexicon_scores(i, lex[0], lex[1], lex[2]) for i in x.split(' ')], axis=0))

In [108]:
# EMOLEX lexicon vector per tweet: element-wise mean of the per-token lexicon scores, where
# tokens are obtained by splitting the tweet on single spaces.
# The column is addressed by its literal name 'Tweet': `Raw_tweet` is first assigned in a later
# cell, so referencing it here raises NameError on a fresh top-to-bottom run.
lex = lexicons_data[1]
eval_data['Vector_emolex'] = eval_data['Tweet'].apply(
    lambda x: np.mean([get_lexicon_scores(i, lex[0], lex[1], lex[2]) for i in x.split(' ')], axis=0))

In [109]:
# Affect Intensity lexicon vector per tweet: element-wise mean of the per-token lexicon scores,
# where tokens are obtained by splitting the tweet on single spaces.
# The column is addressed by its literal name 'Tweet': `Raw_tweet` is first assigned in a later
# cell, so referencing it here raises NameError on a fresh top-to-bottom run.
lex = lexicons_data[2]
eval_data['Vector_ai'] = eval_data['Tweet'].apply(
    lambda x: np.mean([get_lexicon_scores(i, lex[0], lex[1], lex[2]) for i in x.split(' ')], axis=0))

In [110]:
# ANEW lexicon vector per tweet: element-wise mean of the per-token lexicon scores, where tokens
# are obtained by splitting the tweet on single spaces.
# The column is addressed by its literal name 'Tweet': `Raw_tweet` is first assigned in a later
# cell, so referencing it here raises NameError on a fresh top-to-bottom run.
lex = lexicons_data[3]
eval_data['Vector_anew'] = eval_data['Tweet'].apply(
    lambda x: np.mean([get_lexicon_scores(i, lex[0], lex[1], lex[2]) for i in x.split(' ')], axis=0))

In [111]:
# Warriner lexicon vector per tweet: element-wise mean of the per-token lexicon scores, where
# tokens are obtained by splitting the tweet on single spaces.
# The column is addressed by its literal name 'Tweet': `Raw_tweet` is first assigned in a later
# cell, so referencing it here raises NameError on a fresh top-to-bottom run.
lex = lexicons_data[4]
eval_data['Vector_warriner'] = eval_data['Tweet'].apply(
    lambda x: np.mean([get_lexicon_scores(i, lex[0], lex[1], lex[2]) for i in x.split(' ')], axis=0))

In [142]:
# combination of all lexicons
# Each row's five lexicon vectors are flattened into one plain list, in the order
# vad, emolex, ai, anew, warriner.
lexicon_vector_columns = ['Vector_vad', 'Vector_emolex', 'Vector_ai', 'Vector_anew', 'Vector_warriner']
eval_data['Vector_all_lexicons'] = eval_data.apply(
    lambda row: [score for name in lexicon_vector_columns for score in row[name].tolist()], axis=1)

**Apply embedding methods on the evaluation (train+dev) data**

In [16]:
# Name of the column that holds the unprocessed tweet text; used as the first of the three
# preprocessing options in the embedding cells below
Raw_tweet = 'Tweet'

In [None]:
# roBERTa

# Tokenizer and TF model are both restored from the same pre-loaded local model directory
MODEL_path_roberta = "./model/twitter-roberta-base-emotion"
tokenizer_roberta = AutoTokenizer.from_pretrained(MODEL_path_roberta)
model_roberta = TFAutoModel.from_pretrained(MODEL_path_roberta)

# One embedding column per tweet preprocessing option (raw, cleaned, cleaned without stop words)
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    eval_data['Vector_roBERTa_'+column] = eval_data[column].apply(
        get_vector_roberta, args=(tokenizer_roberta, model_roberta))

In [None]:
# DeepMoji

# If dataset is big, it's better to split it: the tweets are encoded in at most 50 consecutive
# slices. The slice size is rounded up so that every row is covered (a rounded-down size leaves
# the last len % 50 rows - or all rows when there are fewer than 50 - without a vector).
n_rows = len(eval_data)
chunk_size = max(1, -(-n_rows // 50))  # ceil(n_rows / 50), never 0

for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    vectors = []
    for start in range(0, n_rows, chunk_size):
        # assumes get_vectors_deepmoji returns one vector per input tweet, in input order - confirm
        vectors.extend(get_vectors_deepmoji(eval_data[column].iloc[start:start + chunk_size]))
    # Assign the whole column at once; writing through eval_data[col].iloc[...] is chained
    # assignment and is not guaranteed to update the frame.
    eval_data['Vector_DeepMoji_'+column] = pd.Series(vectors, index=eval_data.index, dtype=object)

# If dataset is small, you can use this:
#for column in [Raw_tweet, 'Cleaned_tweet','Cleaned_tweet_wt_stopwords']:
#    eval_data['Vector_DeepMoji_'+column] = get_vectors_deepmoji(eval_data[column])

In [None]:
# BERT

# Pre-trained tokenizer (vocabulary) and model; the model is asked to return its hidden states
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# One embedding column per tweet preprocessing option (raw, cleaned, cleaned without stop words)
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    eval_data['Vector_BERT_'+column] = eval_data[column].apply(
        get_vector_bert, args=(tokenizer_bert, model_bert))

In [25]:
# sBERT

# Sentence-BERT model from the 'sentence_transformers' package
model_sbert = SentenceTransformer('distilbert-base-nli-mean-tokens')

# One embedding column per tweet preprocessing option (raw, cleaned, cleaned without stop words)
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    eval_data['Vector_sBERT_'+column] = eval_data[column].apply(get_vector_sbert, args=(model_sbert,))

In [27]:
# USE

# The large Universal Sentence Encoder model is fetched over HTTPS
# NOTE(review): `hub` is not imported in any visible cell - presumably it arrives through one of
# the star imports; confirm.
model_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

# One embedding column per tweet preprocessing option (raw, cleaned, cleaned without stop words)
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    eval_data['Vector_USE_'+column] = eval_data[column].apply(get_vector_use, args=(model_use,))

In [29]:
# Word2Vec

# Pre-loaded Word2Vec vectors, stored locally in binary word2vec format
w2v_path = './model/GoogleNews-vectors-negative300.bin'
model_w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

# One embedding column per tweet preprocessing option (raw, cleaned, cleaned without stop words)
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    eval_data['Vector_Word2Vec_'+column] = eval_data[column].apply(get_vector_w2v, args=(model_w2v,))

**Append lexicons to embedding vectors**

In [132]:
# For every embedding, combine its raw-tweet vector with each lexicon in turn via
# append_lexicon_scores; the suffixes are listed in the same order as the entries of lexicons_data.
lexicon_suffixes = ['vad', 'emolex', 'ai', 'anew', 'warriner']

for embedding in ['roBERTa', 'DeepMoji', 'BERT', 'sBERT', 'USE', 'Word2Vec']:
    base_column = "Vector_"+embedding+"_"+Raw_tweet
    for suffix, lexicon in zip(lexicon_suffixes, lexicons_data):
        eval_data["Vector_"+embedding+"_"+suffix] = eval_data.apply(
            lambda x: append_lexicon_scores(x['Tweet'], x[base_column], lexicon), axis=1)

**Apply cross-validation for wkNN to evaluate k**

In [31]:
# Number of cross-validation folds
K_fold = 5

# Candidate neighbourhood sizes: every odd k from 5 to 25
k_list = list(range(5, 26, 2))

In [33]:
# Cross-validated PCC of the roBERTa vectors for every candidate k,
# one score list per preprocessing option
pcc_raw_roBERTa, pcc_clean_roBERTa, pcc_wtstop_roBERTa = [], [], []
roBERTa_runs = [(pcc_raw_roBERTa, 'Vector_roBERTa_'+Raw_tweet),
                (pcc_clean_roBERTa, 'Vector_roBERTa_Cleaned_tweet'),
                (pcc_wtstop_roBERTa, 'Vector_roBERTa_Cleaned_tweet_wt_stopwords')]

for k in k_list:
    # for each k the options are evaluated in the order raw, cleaned, cleaned without stop words
    for scores, vector_column in roBERTa_runs:
        scores.append(cross_validation_ensemble_knn(eval_data, [vector_column], 'Class', K_fold, [k], 'labels', 'pcc'))

In [34]:
# For each preprocessing option, report the best cross-validated PCC and the first k reaching it
for scores in [pcc_raw_roBERTa, pcc_clean_roBERTa, pcc_wtstop_roBERTa]:
    best = max(scores)
    print('The highest PCC score: ', best, ' with k = ', k_list[scores.index(best)])

The highest PCC score:  0.6637482135603178  with k =  23
The highest PCC score:  0.6579449484161686  with k =  13
The highest PCC score:  0.6436266193028504  with k =  23


The best result for roBERTa embedding is PCC = 0.6637, for the raw tweets with k=23

In [72]:
# Cross-validated PCC of the DeepMoji vectors for every candidate k,
# one score list per preprocessing option
pcc_raw_DeepMoji, pcc_clean_DeepMoji, pcc_wtstop_DeepMoji = [], [], []
DeepMoji_runs = [(pcc_raw_DeepMoji, 'Vector_DeepMoji_'+Raw_tweet),
                 (pcc_clean_DeepMoji, 'Vector_DeepMoji_Cleaned_tweet'),
                 (pcc_wtstop_DeepMoji, 'Vector_DeepMoji_Cleaned_tweet_wt_stopwords')]

for k in k_list:
    # for each k the options are evaluated in the order raw, cleaned, cleaned without stop words
    for scores, vector_column in DeepMoji_runs:
        scores.append(cross_validation_ensemble_knn(eval_data, [vector_column], 'Class', K_fold, [k], 'labels', 'pcc'))

In [73]:
# For each preprocessing option, report the best cross-validated PCC and the first k reaching it
for scores in [pcc_raw_DeepMoji, pcc_clean_DeepMoji, pcc_wtstop_DeepMoji]:
    best = max(scores)
    print('The highest PCC score: ', best, ' with k = ', k_list[scores.index(best)])

The highest PCC score:  0.559513358149148  with k =  21
The highest PCC score:  0.572431579806779  with k =  25
The highest PCC score:  0.5300392319317812  with k =  17


The best result for DeepMoji embedding is PCC = 0.5724, for the cleaned tweets with k=25

In [74]:
# Cross-validated PCC of the BERT vectors for every candidate k,
# one score list per preprocessing option
pcc_raw_BERT, pcc_clean_BERT, pcc_wtstop_BERT = [], [], []
BERT_runs = [(pcc_raw_BERT, 'Vector_BERT_'+Raw_tweet),
             (pcc_clean_BERT, 'Vector_BERT_Cleaned_tweet'),
             (pcc_wtstop_BERT, 'Vector_BERT_Cleaned_tweet_wt_stopwords')]

for k in k_list:
    # for each k the options are evaluated in the order raw, cleaned, cleaned without stop words
    for scores, vector_column in BERT_runs:
        scores.append(cross_validation_ensemble_knn(eval_data, [vector_column], 'Class', K_fold, [k], 'labels', 'pcc'))

In [75]:
# For each preprocessing option, report the best cross-validated PCC and the first k reaching it
for scores in [pcc_raw_BERT, pcc_clean_BERT, pcc_wtstop_BERT]:
    best = max(scores)
    print('The highest PCC score: ', best, ' with k = ', k_list[scores.index(best)])

The highest PCC score:  0.4271145112625013  with k =  25
The highest PCC score:  0.41817036049973316  with k =  23
The highest PCC score:  0.4231505798100299  with k =  23


The best result for BERT embedding is PCC = 0.4271, for the raw tweets with k=25

In [76]:
# Cross-validated PCC of the sBERT vectors for every candidate k,
# one score list per preprocessing option
pcc_raw_sBERT, pcc_clean_sBERT, pcc_wtstop_sBERT = [], [], []
sBERT_runs = [(pcc_raw_sBERT, 'Vector_sBERT_'+Raw_tweet),
              (pcc_clean_sBERT, 'Vector_sBERT_Cleaned_tweet'),
              (pcc_wtstop_sBERT, 'Vector_sBERT_Cleaned_tweet_wt_stopwords')]

for k in k_list:
    # for each k the options are evaluated in the order raw, cleaned, cleaned without stop words
    for scores, vector_column in sBERT_runs:
        scores.append(cross_validation_ensemble_knn(eval_data, [vector_column], 'Class', K_fold, [k], 'labels', 'pcc'))

In [77]:
# For each preprocessing option, report the best cross-validated PCC and the first k reaching it
for scores in [pcc_raw_sBERT, pcc_clean_sBERT, pcc_wtstop_sBERT]:
    best = max(scores)
    print('The highest PCC score: ', best, ' with k = ', k_list[scores.index(best)])

The highest PCC score:  0.4921208438658003  with k =  19
The highest PCC score:  0.47359940406435647  with k =  17
The highest PCC score:  0.4815516848012857  with k =  19


The best result for sBERT embedding is PCC = 0.4921, for the raw tweets with k=19

In [78]:
# Cross-validated PCC of the USE vectors for every candidate k,
# one score list per preprocessing option
pcc_raw_USE, pcc_clean_USE, pcc_wtstop_USE = [], [], []
USE_runs = [(pcc_raw_USE, 'Vector_USE_'+Raw_tweet),
            (pcc_clean_USE, 'Vector_USE_Cleaned_tweet'),
            (pcc_wtstop_USE, 'Vector_USE_Cleaned_tweet_wt_stopwords')]

for k in k_list:
    # for each k the options are evaluated in the order raw, cleaned, cleaned without stop words
    for scores, vector_column in USE_runs:
        scores.append(cross_validation_ensemble_knn(eval_data, [vector_column], 'Class', K_fold, [k], 'labels', 'pcc'))

In [79]:
# For each preprocessing option, report the best cross-validated PCC and the first k reaching it
for scores in [pcc_raw_USE, pcc_clean_USE, pcc_wtstop_USE]:
    best = max(scores)
    print('The highest PCC score: ', best, ' with k = ', k_list[scores.index(best)])

The highest PCC score:  0.5041695261922098  with k =  21
The highest PCC score:  0.5141543858558131  with k =  25
The highest PCC score:  0.517907031254379  with k =  21


The best result for USE embedding is PCC = 0.5179, for the cleaned tweets without stop words with k=21

In [80]:
# Cross-validated PCC of the Word2Vec vectors for every candidate k,
# one score list per preprocessing option
pcc_raw_Word2Vec, pcc_clean_Word2Vec, pcc_wtstop_Word2Vec = [], [], []
Word2Vec_runs = [(pcc_raw_Word2Vec, 'Vector_Word2Vec_'+Raw_tweet),
                 (pcc_clean_Word2Vec, 'Vector_Word2Vec_Cleaned_tweet'),
                 (pcc_wtstop_Word2Vec, 'Vector_Word2Vec_Cleaned_tweet_wt_stopwords')]

for k in k_list:
    # for each k the options are evaluated in the order raw, cleaned, cleaned without stop words
    for scores, vector_column in Word2Vec_runs:
        scores.append(cross_validation_ensemble_knn(eval_data, [vector_column], 'Class', K_fold, [k], 'labels', 'pcc'))

In [81]:
# For each preprocessing option, report the best cross-validated PCC and the first k reaching it
for scores in [pcc_raw_Word2Vec, pcc_clean_Word2Vec, pcc_wtstop_Word2Vec]:
    best = max(scores)
    print('The highest PCC score: ', best, ' with k = ', k_list[scores.index(best)])

The highest PCC score:  0.2258041909958301  with k =  25
The highest PCC score:  0.3286663375433289  with k =  25
The highest PCC score:  0.3428537945713763  with k =  13


The best result for Word2Vec embedding is PCC = 0.3429, for the cleaned tweets without stop words with k=13

**Evaluate lexicon vectors with wkNN**

In [112]:
# Cross-validated PCC of the VAD lexicon vectors on their own, with k fixed at 23
cross_validation_ensemble_knn(eval_data, ['Vector_vad'], 'Class', K_fold, [23], 'labels', 'pcc')

0.1476203194946398

In [113]:
# Cross-validated PCC of the EMOLEX lexicon vectors on their own, with k fixed at 23
cross_validation_ensemble_knn(eval_data, ['Vector_emolex'], 'Class', K_fold, [23], 'labels', 'pcc')

0.126552439618361

In [114]:
# Cross-validated PCC of the Affect Intensity lexicon vectors on their own, with k fixed at 23
cross_validation_ensemble_knn(eval_data, ['Vector_ai'], 'Class', K_fold, [23], 'labels', 'pcc')

0.12462609469605984

In [115]:
# Cross-validated PCC of the ANEW lexicon vectors on their own, with k fixed at 23
cross_validation_ensemble_knn(eval_data, ['Vector_anew'], 'Class', K_fold, [23], 'labels', 'pcc')

0.09334916798515583

In [116]:
# Cross-validated PCC of the Warriner lexicon vectors on their own, with k fixed at 23
cross_validation_ensemble_knn(eval_data, ['Vector_warriner'], 'Class', K_fold, [23], 'labels', 'pcc')

0.06284205672907116

In [143]:
# Cross-validated PCC of the concatenated lexicon vectors on their own, with k fixed at 23
cross_validation_ensemble_knn(eval_data, ['Vector_all_lexicons'], 'Class', K_fold, [23], 'labels', 'pcc')

0.07168080496100634

In [117]:
# evaluate k for the best lexicon
# One cross-validated PCC per candidate k, in k_list order

vad_pcc = [cross_validation_ensemble_knn(eval_data, ['Vector_vad'], 'Class', K_fold, [n_neighbours], 'labels', 'pcc')
           for n_neighbours in k_list]

best_vad_pcc = max(vad_pcc)
print('The highest PCC score: ', best_vad_pcc, ' with k = ', k_list[vad_pcc.index(best_vad_pcc)])

The highest PCC score:  0.17568084042800533  with k =  25


**Trying to improve the best setup for each embedding vector by adding the lexicon scores**

In [133]:
# Each embedding is paired with the k that gave its best cross-validated PCC above.
# NOTE(review): the lexicon-augmented columns are built from the raw-tweet vectors, while for
# DeepMoji, USE and Word2Vec the k used here was selected on a cleaned variant - confirm intended.
best_k_per_embedding = [('roBERTa', 23), ('DeepMoji', 25), ('BERT', 25), ('sBERT', 19), ("USE", 21), ("Word2Vec", 13)]

for embedding_name, best_k in best_k_per_embedding:
    for lexicon_name in ['vad', 'emolex', 'ai', 'anew', 'warriner']:
        augmented_column = 'Vector_'+embedding_name+'_'+lexicon_name
        print(embedding_name)
        print(lexicon_name)
        print(cross_validation_ensemble_knn(eval_data, [augmented_column],
                                            'Class', K_fold, [best_k], 'labels', 'pcc'))
        print("\n")

roBERTa
vad
0.6551149880734646


roBERTa
emolex
0.662168758171656


roBERTa
ai
0.6575718740398746


roBERTa
anew
0.6614993743942349


roBERTa
warriner
0.6553673896886737


DeepMoji
vad
0.547434057943307


DeepMoji
emolex
0.5430539013777578


DeepMoji
ai
0.5519381664125546


DeepMoji
anew
0.5633770965541167


DeepMoji
warriner
0.5436415534152078


BERT
vad
0.4139455987667056


BERT
emolex
0.42246833906371617


BERT
ai
0.41516869905507026


BERT
anew
0.4173472930588189


BERT
warriner
0.4127843476346432


sBERT
vad
0.4666241941646674


sBERT
emolex
0.48005716969446083


sBERT
ai
0.47014700882669225


sBERT
anew
0.46705930697530706


sBERT
warriner
0.46894322973699015


USE
vad
0.5197846532915962


USE
emolex
0.4963473805188579


USE
ai
0.5202263728592667


USE
anew
0.5068671231675517


USE
warriner
0.5036098891517351


Word2Vec
vad
0.22998473593066882


Word2Vec
emolex
0.22468338251782569


Word2Vec
ai
0.2276347070325106


Word2Vec
anew
0.20225246364770114


Word2Vec
warriner
0.205382689

The best setup was improved only for the **USE** embedding: PCC = 0.5202, in combination with the **AI lexicon** (k=21)

**Ensembles**

In [83]:
# all 6 models
# Each embedding enters with the preprocessing option that scored best in the cross-validation above

ensemble_columns = ['Vector_roBERTa_'+Raw_tweet, 'Vector_DeepMoji_Cleaned_tweet', 'Vector_BERT_'+Raw_tweet,
                    'Vector_sBERT_'+Raw_tweet, 'Vector_USE_Cleaned_tweet_wt_stopwords',
                    'Vector_Word2Vec_Cleaned_tweet_wt_stopwords']
# One k per vector column, in the same order.
# NOTE(review): BERT gets k=35 here, whereas the cross-validation above selected k=25 and 35 is
# not in k_list - confirm this is intended (changing it requires re-running the cell).
ensemble_k = [23, 25, 35, 19, 21, 13]
cross_validation_ensemble_knn(eval_data, ensemble_columns, "Class", K_fold, ensemble_k, 'labels', 'pcc')

0.6530347043034405

In [134]:
# add the best lexicon vector
# The six embedding columns plus the VAD lexicon vector

ensemble_columns = ['Vector_roBERTa_'+Raw_tweet, 'Vector_DeepMoji_Cleaned_tweet', 'Vector_BERT_'+Raw_tweet,
                    'Vector_sBERT_'+Raw_tweet, 'Vector_USE_Cleaned_tweet_wt_stopwords',
                    'Vector_Word2Vec_Cleaned_tweet_wt_stopwords', 'Vector_vad']
# One k per vector column, in the same order.
# NOTE(review): BERT gets k=35 here, whereas the cross-validation above selected k=25 and 35 is
# not in k_list - confirm this is intended (changing it requires re-running the cell).
ensemble_k = [23, 25, 35, 19, 21, 13, 25]
cross_validation_ensemble_knn(eval_data, ensemble_columns, "Class", K_fold, ensemble_k, 'labels', 'pcc')

0.6353227386471721

In [144]:
# add all lexicons vectors + combination
# The six embedding columns, the five single-lexicon vectors and the concatenated lexicon vector

ensemble_columns = ['Vector_roBERTa_'+Raw_tweet, 'Vector_DeepMoji_Cleaned_tweet', 'Vector_BERT_'+Raw_tweet,
                    'Vector_sBERT_'+Raw_tweet, 'Vector_USE_Cleaned_tweet_wt_stopwords',
                    'Vector_Word2Vec_Cleaned_tweet_wt_stopwords', 'Vector_vad', 'Vector_emolex',
                    'Vector_ai', 'Vector_anew', 'Vector_warriner', 'Vector_all_lexicons']
# One k per vector column, in the same order.
# NOTE(review): BERT gets k=35 here, whereas the cross-validation above selected k=25 and 35 is
# not in k_list - confirm this is intended (changing it requires re-running the cell).
ensemble_k = [23, 25, 35, 19, 21, 13, 25, 23, 23, 23, 23, 23]
cross_validation_ensemble_knn(eval_data, ensemble_columns, "Class", K_fold, ensemble_k, 'labels', 'pcc')

0.5296172440623522

In [145]:
# with updated by lexicon USE
# Same six-model ensemble, but the USE column is the one extended with the AI lexicon

ensemble_columns = ['Vector_roBERTa_'+Raw_tweet, 'Vector_DeepMoji_Cleaned_tweet', 'Vector_BERT_'+Raw_tweet,
                    'Vector_sBERT_'+Raw_tweet, 'Vector_USE_ai', 'Vector_Word2Vec_Cleaned_tweet_wt_stopwords']
# One k per vector column, in the same order.
# NOTE(review): BERT gets k=35 here, whereas the cross-validation above selected k=25 and 35 is
# not in k_list - confirm this is intended (changing it requires re-running the cell).
ensemble_k = [23, 25, 35, 19, 21, 13]
cross_validation_ensemble_knn(eval_data, ensemble_columns, "Class", K_fold, ensemble_k, 'labels', 'pcc')

0.6461547385933937

In [146]:
# with updated by lexicon USE + the best lexicon
# Six-model ensemble with the AI-extended USE column, plus the VAD lexicon vector

ensemble_columns = ['Vector_roBERTa_'+Raw_tweet, 'Vector_DeepMoji_Cleaned_tweet', 'Vector_BERT_'+Raw_tweet,
                    'Vector_sBERT_'+Raw_tweet, 'Vector_USE_ai', 'Vector_Word2Vec_Cleaned_tweet_wt_stopwords',
                    'Vector_vad']
# One k per vector column, in the same order.
# NOTE(review): BERT gets k=35 here, whereas the cross-validation above selected k=25 and 35 is
# not in k_list - confirm this is intended (changing it requires re-running the cell).
ensemble_k = [23, 25, 35, 19, 21, 13, 25]
cross_validation_ensemble_knn(eval_data, ensemble_columns, "Class", K_fold, ensemble_k, 'labels', 'pcc')

0.6395912047460274

The best ensemble for anger is based on all embeddings (PCC=0.65), but it is still worse than the single roBERTa model, which is the best setup (PCC=0.66).

**Apply the best method on the test data**

In [152]:
# Apply embeddings on the test data
# Only the raw-tweet roBERTa vectors are needed, computed with the tokenizer and model loaded earlier

test_data['Vector_roBERTa_'+Raw_tweet] = test_data[Raw_tweet].apply(
    get_vector_roberta, args=(tokenizer_roberta, model_roberta))

In [153]:
# Calculate predicted labels
# eval_data and its 'Class' column serve as the labelled reference set; the raw-tweet roBERTa
# vectors are used with k=23, the best single setup found in the cross-validation above

test_labels = knn_ensemble_labels(eval_data, eval_data['Class'], test_data, ['Vector_roBERTa_'+Raw_tweet], [23])

In [154]:
# Pearson correlation between gold and predicted test classes ([0] keeps the coefficient only)
pearsonr(test_data['Class'].to_list(), test_labels)[0]

0.6737452388532557

**Explore explainability**

In [155]:
# Indexes of wrong predicted test instances
# These are 0-based row positions, suitable for the .iloc lookups in the cells below.
# Gold labels are taken positionally via to_list(): indexing test_data['Class'][i] is label-based
# and breaks (or picks the wrong row) whenever test_data's index is not 0..n-1.

true_labels = test_data['Class'].to_list()
error = [position
         for position, (true_label, predicted_label) in enumerate(zip(true_labels, test_labels))
         if true_label != predicted_label]

In [156]:
# Show the first ten misclassified test positions
print(error[:10])

[2, 5, 8, 10, 11, 16, 17, 18, 20, 22]


In [160]:
# Correct prediction sample: print the tweet text and its gold class

i = 0 # row position 0 does not appear in the error list printed above
print(test_data['Tweet'].iloc[i])
print(test_data['Class'].iloc[i])

@PageShhh1 I know you mean well but I'm offended. Prick.
2


In [165]:
# Explore its train neighbours and their classes
# Looks up 5 neighbours of the selected test vector among the eval_data roBERTa vectors; the
# displayed result is a pair (neighbour tweets, neighbour classes)

test_vector = test_data['Vector_roBERTa_'+Raw_tweet].iloc[i]
get_neigbours(test_vector, eval_data, 'Vector_roBERTa_'+Raw_tweet, 5, 'Tweet', 'Class')

(['@Jack_Septic_Eye Grass growing simulator is offended',
  "@healeyraine I'm offended, I actually am",
  'You offend me, @Tansorma',
  '@Idubbbz @LeafyIsHere  I am offended',
  "@NeoFundie @fitchest Ha. Right. I'm from San Jose, CA, and I was offended right there with you. Dave, go on a walk or something next time."],
 [1, 2, 2, 2, 3])

In [168]:
# Wrong prediction sample: print the tweet text and its gold class

i = error[1] # second entry of the error list, i.e. a misclassified test instance
print(test_data['Tweet'].iloc[i])
print(test_data['Class'].iloc[i])

@ajduey04303 We've been broken up a while, both moved on, she's got a kid, I don't hold any animosity towards her anymore...
0


In [169]:
# Explore its train neighbours and their classes
# Looks up 5 neighbours of the selected test vector among the eval_data roBERTa vectors; the
# displayed result is a pair (neighbour tweets, neighbour classes)

test_vector = test_data['Vector_roBERTa_'+Raw_tweet].iloc[i]
get_neigbours(test_vector, eval_data, 'Vector_roBERTa_'+Raw_tweet, 5, 'Tweet', 'Class')

(["I told myself i wouldn't talk about this but i need to bring it up. I'm slightly bitter about the tøp cover of cancer",
  '@Iucifaer you can go on what you usually do its just their own personal reason and not mean to offend anyone :(',
  "@StarklyDark 'I know you trusted me.' His words were soft as he ignored the anger and focused on the hurt beneath. 'I know I screwed up.' --",
  '@BoJackHorseman if your depressed and somebody calls you long faced will you still automatically take umbrage?',
  '@bassekraah. @sunilddesai @jyoti1013 @Archnahr . A2: #irritation and isolation #lifenabler'],
 [2, 3, 2, 0, 2])