**Imports**

In [3]:
import sys
import pandas as pd
import numpy as np
import itertools

from tqdm import tqdm
from scipy.stats import ttest_ind

sys.path.append('./code')
from preprocessing import *
from tweets_embedding import *
from frnn_owa_eval import *

**Upload data and perform preprocessing**

In [4]:
# Read datasets
file_train_path = '../data/hateval2019_en_train.csv'
file_dev_path = '../data/hateval2019_en_dev.csv'
file_test_path = '../data/hateval2019_en_test.csv'
Raw_tweet = 'text'
col = ['id', Raw_tweet, 'HS']
sep = ','

# Upload as a DataFrames (preprocessing is biult-in functions)
# If we have test dataset
train, dev, data, test = upload_datasets(file_train_path, file_dev_path, file_test_path, col, sep)

# If we have only train dataset - use this line instead
#data = transform_data(file_train_path, col, sep)

**Apply embedding methods on the train data**

In [None]:
# roBERTa

# preload components to save time
# path to the pre-loaded roBERTa model 
MODEL_path_roberta = r"..\model\twitter-roberta-base-hate"
# upload tokenizer and model
tokenizer_roberta = AutoTokenizer.from_pretrained(MODEL_path_roberta)
model_roberta = TFAutoModel.from_pretrained(MODEL_path_roberta)

# Apply on 3 tweet preprocessing options
for column in tqdm([Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']):
    data['Vector_roBERTa_'+column] = data[column].apply(lambda x: 
                                            get_vector_roberta(x, tokenizer_roberta, model_roberta))

In [7]:
# BERT

# preload components to save time
# load pre-trained model tokenizer (vocabulary)
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
# upload BERT model 
model_bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# Apply on 3 tweet preprocessing options
for column in tqdm([Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']):
    data['Vector_BERT_'+column] = data['Cleaned_tweet'].apply(lambda x: 
                                                      get_vector_bert(x, tokenizer_bert, model_bert))

In [8]:
# sBERT

# preload components to save time
# upload Sentence-BERT model from the 'sentence_transformers' package 
model_sbert = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Apply on 3 tweet preprocessing options
for column in tqdm([Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']):
    data['Vector_sBERT_'+column] = data[Raw_tweet].apply(lambda x: get_vector_sbert(x, model_sbert))

In [None]:
# DeepMoji
  
# If dataset is big, it's better to split it
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    data['Vector_DeepMoji_'+column] = None
    for i in range(20):
        ind = list(range(i*int(len(data[column])/20),(i+1)*int(len(data[column])/20)))
        data['Vector_DeepMoji_'+column].iloc[ind] = get_vectors_deepmoji(data[column].iloc[ind]) 

# If dataset is small, you can use this:
#for column in [Raw_tweet, 'Cleaned_tweet','Cleaned_tweet_wt_stopwords']:
    #data['Vector_DeepMoji_'+column] = get_vectors_deepmoji(data[column])

In [None]:
# USE 

# preload components to save time
# upload the big Universal Sentence Encoder model from HTTPS domain 
model_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

# Apply on 3 tweet preprocessing options
for column in [Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']:
    data['Vector_USE_'+column] = data['Cleaned_tweet'].apply(lambda x: get_vector_use(x, model_use))

In [11]:
# Word2Vec

# preload components to save time
# path to the pre-loaded Word2Vec model 
w2v_path = '../model/GoogleNews-vectors-negative300.bin'
# upload model
model_w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

# Apply on 3 tweet preprocessing options
for column in tqdm([Raw_tweet, 'Cleaned_tweet', 'Cleaned_tweet_wt_stopwords']):
    data['Vector_Word2Vec_'+column] = data[Raw_tweet].apply(lambda x: get_vector_w2v(x, model_w2v))

In [58]:
# Save embeddings in a file
data.to_csv('../data/hs_train_embeddings.csv')

# To upload saved embeddings 
#data = pd.read_csv('../data/hs_train_embeddings.csv', index_col=0)

**Apply cross-validation for OWA-FRNN to evaluate k**

In [None]:
# Evaluation for roBERTa

f1_raw_roBERTa = []
f1_clean_roBERTa = []
f1_wtstop_roBERTa = []
for k in tqdm([11, 13, 15, 17, 19, 21, 23, 25, 27, 29]):
    f1_raw_roBERTa.append(cross_validation_ensemble_owa(data, ['Vector_roBERTa_text'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_clean_roBERTa.append(cross_validation_ensemble_owa(data, ['Vector_roBERTa_Cleaned_tweet'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_wtstop_roBERTa.append(cross_validation_ensemble_owa(data, ['Vector_roBERTa_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))

In [37]:
#if 0.05 > p-value - arrays are different

print(ttest_ind(f1_raw_roBERTa, f1_clean_roBERTa))
print(ttest_ind(f1_clean_roBERTa, f1_wtstop_roBERTa))
print(ttest_ind(f1_raw_roBERTa, f1_wtstop_roBERTa))

Ttest_indResult(statistic=16.3037567870835, pvalue=3.171119831143775e-12)
Ttest_indResult(statistic=126.65411058202282, pvalue=5.181029547995751e-28)
Ttest_indResult(statistic=129.0724725673128, pvalue=3.687333754309249e-28)


In [None]:
# Evaluation for BERT

f1_raw_BERT = []
f1_clean_BERT = []
f1_wtstop_BERT = []
for k in tqdm([11, 13, 15, 17, 19, 21, 23, 25, 27, 29]):
    f1_raw_BERT.append(cross_validation_ensemble_owa(data, ['Vector_BERT_text'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_clean_BERT.append(cross_validation_ensemble_owa(data, ['Vector_BERT_Cleaned_tweet'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_wtstop_BERT.append(cross_validation_ensemble_owa(data, ['Vector_BERT_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))

In [42]:
print(ttest_ind(f1_raw_BERT, f1_clean_BERT))
print(ttest_ind(f1_clean_BERT, f1_wtstop_BERT))
print(ttest_ind(f1_raw_BERT, f1_wtstop_BERT))

Ttest_indResult(statistic=-16.416286874851846, pvalue=2.8226833111845612e-12)
Ttest_indResult(statistic=50.04418390655184, pvalue=8.928317165178167e-21)
Ttest_indResult(statistic=33.062355197035764, pvalue=1.4357237896271443e-17)


In [None]:
# Evaluation for sBERT

f1_raw_sBERT = []
f1_clean_sBERT = []
f1_wtstop_sBERT = []
for k in tqdm([11, 13, 15, 17, 19, 21, 23, 25, 27, 29]):
    f1_raw_sBERT.append(cross_validation_ensemble_owa(data, ['Vector_sBERT_text'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_clean_sBERT.append(cross_validation_ensemble_owa(data, ['Vector_sBERT_Cleaned_tweet'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_wtstop_sBERT.append(cross_validation_ensemble_owa(data, ['Vector_sBERT_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))

In [44]:
print(ttest_ind(f1_raw_sBERT, f1_clean_sBERT))
print(ttest_ind(f1_clean_sBERT, f1_wtstop_sBERT))
print(ttest_ind(f1_raw_sBERT, f1_wtstop_sBERT))

Ttest_indResult(statistic=3.4837515074149, pvalue=0.002650634653343464)
Ttest_indResult(statistic=22.53379083616941, pvalue=1.217166840178357e-14)
Ttest_indResult(statistic=22.470330376354124, pvalue=1.2784313992113216e-14)


In [None]:
# Evaluation for DeepMoji

f1_raw_DeepMoji = []
f1_clean_DeepMoji = []
f1_wtstop_DeepMoji = []
for k in tqdm([11, 13, 15, 17, 19, 21, 23, 25, 27, 29]):
    f1_raw_DeepMoji.append(cross_validation_ensemble_owa(data, ['Vector_DeepMoji_text'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_clean_DeepMoji.append(cross_validation_ensemble_owa(data, ['Vector_DeepMoji_Cleaned_tweet'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_wtstop_DeepMoji.append(cross_validation_ensemble_owa(data, ['Vector_DeepMoji_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))

In [46]:
print(ttest_ind(f1_raw_DeepMoji, f1_clean_DeepMoji))
print(ttest_ind(f1_clean_DeepMoji, f1_wtstop_DeepMoji))
print(ttest_ind(f1_raw_DeepMoji, f1_wtstop_DeepMoji))

Ttest_indResult(statistic=40.393241550449225, pvalue=4.087237312094933e-19)
Ttest_indResult(statistic=-21.899299312650374, pvalue=2.0005806130929803e-14)
Ttest_indResult(statistic=27.066850571796405, pvalue=4.917158474993862e-16)


In [None]:
# Evaluation for USE

f1_raw_USE = []
f1_clean_USE = []
f1_wtstop_USE = []
for k in tqdm([11, 13, 15, 17, 19, 21, 23, 25, 27, 29]):
    f1_raw_USE.append(cross_validation_ensemble_owa(data, ['Vector_USE_text'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_clean_USE.append(cross_validation_ensemble_owa(data, ['Vector_USE_Cleaned_tweet'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_wtstop_USE.append(cross_validation_ensemble_owa(data, ['Vector_USE_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))

In [48]:
print(ttest_ind(f1_raw_USE, f1_clean_USE))
print(ttest_ind(f1_clean_USE, f1_wtstop_USE))
print(ttest_ind(f1_raw_USE, f1_wtstop_USE))

Ttest_indResult(statistic=-12.555781559784352, pvalue=2.426472410269981e-10)
Ttest_indResult(statistic=10.024645089049283, pvalue=8.598847759513292e-09)
Ttest_indResult(statistic=-0.8566987347867746, pvalue=0.4028731773532028)


In [None]:
# Evaluation for Word2Vec

f1_raw_W2V = []
f1_clean_W2V = []
f1_wtstop_W2V = []
for k in tqdm([11, 13, 15, 17, 19, 21, 23, 25, 27, 29]):
    f1_raw_W2V.append(cross_validation_ensemble_owa(data, ['Vector_Word2Vec_text'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_clean_W2V.append(cross_validation_ensemble_owa(data, ['Vector_Word2Vec_Cleaned_tweet'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))
    f1_wtstop_W2V.append(cross_validation_ensemble_owa(data, ['Vector_Word2Vec_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, [k], additive(), additive(), 'labels', 'f1'))

In [50]:
print(ttest_ind(f1_raw_W2V, f1_clean_W2V))
print(ttest_ind(f1_clean_W2V, f1_wtstop_W2V))
print(ttest_ind(f1_raw_W2V, f1_wtstop_W2V))

Ttest_indResult(statistic=0.843432546225684, pvalue=0.4100554944124666)
Ttest_indResult(statistic=-33.024896297917174, pvalue=1.4648602590988157e-17)
Ttest_indResult(statistic=-30.150039658636025, pvalue=7.340637716543324e-17)


**Define the best preprocessing, the most efficient k, and F1 score that they provide**

In [54]:
ks = [11, 13, 15, 17, 19, 21, 23, 25, 27, 29]

for array in [f1_raw_roBERTa, f1_clean_BERT, f1_raw_sBERT, f1_raw_DeepMoji, f1_clean_USE, f1_wtstop_W2V]:
    print('The highest F1 score: ', max(array), ' with k = ', ks[array.index(max(array))])

The highest F1 score:  0.8769230027547492  with k =  25
The highest F1 score:  0.7172900711451752  with k =  15
The highest F1 score:  0.7063688555139183  with k =  15
The highest F1 score:  0.6223832079832569  with k =  19
The highest F1 score:  0.7037900857065156  with k =  13
The highest F1 score:  0.6700108681026531  with k =  13


**Ensemble of the best models**

In [2]:
# labels - it means we use the same weights for all models

cross_validation_ensemble_owa(data, ['Vector_roBERTa_text', 'Vector_BERT_Cleaned_tweet', 
                            'Vector_sBERT_text', 'Vector_DeepMoji_text', 'Vector_USE_Cleaned_tweet', 
                            'Vector_Word2Vec_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, 
                              [25, 15, 15, 19, 13, 13], 
                              additive(), additive(), 'labels', 'f1')

0.7500831390090896


In [75]:
# confident scores - we use CS as weight for models' outputs

cross_validation_ensemble_owa(data, ['Vector_roBERTa_text', 'Vector_BERT_Cleaned_tweet', 
                            'Vector_sBERT_text', 'Vector_DeepMoji_text', 'Vector_USE_Cleaned_tweet', 
                            'Vector_Word2Vec_Cleaned_tweet_wt_stopwords'], 'HS', K_fold, 
                              [25, 15, 15, 19, 13, 13], 
                              additive(), additive(), 'conf_scores', 'f1')       

0.8435949487244606


In [16]:
# tune alpha parameter for confidence scores approach

for alpha in np.arange(0.1, 1, 0.1):
    print(alpha)
    print(cross_validation_ensemble_owa(data, ['Vector_roBERTa_text', 
                            'Vector_BERT_Cleaned_tweet', 'Vector_sBERT_text', 
                            'Vector_DeepMoji_text', 'Vector_USE_Cleaned_tweet', 
                            'Vector_Word2Vec_Cleaned_tweet_wt_stopwords'], 'HS', 5, 
                              [25, 19, 15, 15, 13, 13], 
                              additive(), additive(), 'conf_scores', 'f1', alpha))

0.1
0.8116510478070585
0.2
0.8073159290890268
0.30000000000000004
0.8076055021832225
0.4
0.8106237180682454
0.5
0.8089246279070123
0.6
0.8079684318718336
0.7000000000000001
0.8092404184071709
0.8
0.8115789041556499
0.9
0.8103005890308277


The highest F1-score = 0.8116510478070585 was provided by alpha = 0.1.

**Define the best models setup with grid search**

In [None]:
vectors =  ['Vector_roBERTa_text', 'Vector_BERT_Cleaned_tweet', 'Vector_sBERT_text', 
            'Vector_DeepMoji_text', 'Vector_USE_Cleaned_tweet', 
              'Vector_Word2Vec_Cleaned_tweet_wt_stopwords']

size = [0, 1, 2, 3, 4, 5]
neighbours = [25, 19, 15, 15, 13, 13]
anger_names = []
anger_means = []

for L in range(len(vectors)+1):
    for subset in itertools.combinations(size, L):
        anger_names.append([i.split('_')[1] for i in [vectors[k] for k in subset]])
        res = cross_validation_ensemble_owa(data, [vectors[k] for k in subset], 'HS', K_fold, 
                                            [neighbours[k] for k in subset], 
                                      additive(), additive(), 'conf_scores', 'f1', 0.1)
        anger_means.append(res)

In [1]:
print('The highest F1 score: ', max(anger_means), ' with models:', 
      anger_names[anger_means.index(max(anger_means))])

The highest F1 score: 0.8765144484070471 with models: roBERTa


**Apply on the test data**

In [8]:
# Apply roBERTa on the test data

test['Vector_roBERTa_text'] = test['text'].apply(lambda x: get_vector_roberta(x, tokenizer_roberta, model_roberta))

In [9]:
# Calculate predicted labels

test_labels = test_ensemble_labels(data, data['HS'], test, ['Vector_roBERTa_text'], [25], additive(), additive())

In [10]:
p_hs, r_hs, f1_hs, support = precision_recall_fscore_support(test['HS'], test_labels, average = "macro")
print("Test F1-score for irony test data: ", f1_hs)

Test F1-score for irony test data:  0.5351496563781046


**Leaderboard:**

F1-score (averaged):
0.651
0.571
0.546
0.535
0.519

We got 0.5351 - it will be 4th or 5th place

**Explore explainability**

In [None]:
# Indexes of wrong predicted test instances

error = [i for i in range(len(test['HS'])) if test['HS'][i] != test_labels[i]]

In [148]:
# Correct sample

i = 304 # not in error list
print(hs_test['text'].iloc[i])
print(hs_test['HS'].iloc[i])

WAKE UP AMERICA. We cannot continue to allow illegal aliens to stay in County. They are a real and present danger to LEGAL AMERICAN CITIZENS. #BuildThatWall #EndCatchAndReleash #DefundSantuaryCities
1


In [175]:
# Explore its train neighbours and their classes

test_vector = hs_test['Vector_roBERTa'].iloc[i]
get_neigbours(test_vector, hs_train, 'Vector_roBERTa', 9, 'text', 'HS')

(['Migrants go Home. https://t.co/bUCcoB8480',
  '@SenatorLankford #NoDaca #NoAmnesty #NoAnchorBabies 2 illegals do not make a legal #BuildTheWall',
  'Illegal Criminals EVERYWHERE #BuildThatWall !!',
  '@KamalaHarris Again going on record that you care more for illegals than US Citizens. Theyre here illegally! #NoDACA #NoAmnesty',
  'Since 911 63,000 Americans have been killed by Illegal Aliens  #Trump #MAGA #RedNationRising  #Immigration’ #SaturdayMorning https://t.co/qqmOnUMtZh',
  '@SpeakerRyan #BuildThatWall and deport, deport, deport. We have a right to live in an illegal-less and safe country.',
  'Dr. Vliet on #FOXNews 7/12/2014 @JudgeJeanine Nothing has changed! #Illegals bring #USA diseases!#BuildThatWall ASAP.@DrLee4America',
  "@narendramodi Pl don't wait for SC intervention.All 126.90Crs Indians r with u. Pl deport all Rohingyas&amp;other illegal migrants.Big threat",
  'Rep. Biggs pitches plan to fund border wall by penalizing countries of illegal immigrants https://t.co/

In [151]:
# Sample of wrong classified instance

i = 321 # is in error list 
print(hs_test['text'].iloc[i])
print(hs_test['HS'].iloc[i])

The Last Refuge has a fantastic collection of reports on a business model that profits from illegal immigration. #UniParty #RobbingUsBlind #EndChainMigration#tcot #ccot #pjnet #qanon
1


In [152]:
# Explore its train neighbours and their classes

test_vector = hs_test['Vector_roBERTa'].iloc[i]
get_neigbours(test_vector, hs_train, 'Vector_roBERTa', 9, 'text', 'HS')

(['The Truth about #Immigration https://t.co/nKPVzuTB2M',
  'Forced migration history: https://t.co/mw7ApfnFQZ',
  '@john_tatnell @PimlicoPlumbers @RevMcCafferty @afneil Mass immigration for one',
  'Victor D. Hanson: The 4 Groups that Benefit from IllegalÂ\xa0Immigration https://t.co/kd5TuU0W1n https://t.co/ISmJOJd2S4',
  '@TOOEdit Bishop Schneider: mass migration a plan to undermine identity https://t.co/0rnRAL1Kp6',
  'NPR is trying to group legal and illegal immigration"Jeff Sessions is threatening immigrants rights in... https://t.co/cvnMVppGlO',
  'The Last Refuge has a fantastic collection of reports on a business model that profits from illegal immigration. #UniParty #RobbingUsBlind #EndChainMigration#tcot #ccot #pjnet #qanon',
  "Trump's slippery slope. #ImmigrantChildren #Resistance #NurembergTrials @realDonaldTrump @TheJusticeDept @StephenMillerAL @SecNielsen @PressSec @SenateGOP @HouseGOP https://t.co/oVBp1abcWp",
  'Italy agrees to accept migrant arrivals https://t.co/hq9w

**Data reference:**
    
*V. Basile, C. Bosco, E. Fersini, N. Debora, V. Patti, F. M. R. Pardo, P. Rosso, M. Sanguinetti, et al., Semeval-2019 task 5: Multilingual detection of hate speech against immigrants and women in twitter, in: 13th International Workshop on Semantic Evaluation, 2019, pp. 54–63*       