In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit
  Downloading streamlit-1.14.0-py2.py3-none-any.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting watchdog
  Downloading watchdog-2.1.9-py3-none-manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck>=0.1.dev5
  Downloading pydeck-0.8.0b4-py2.py3-none-any.whl (4.7 MB)
[2K   

In [3]:
# from google.colab import drive
# from google.colab import drive, files
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AutoTokenizer, AutoModelForMaskedLM

from transformers import BertTokenizerFast
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk import ngrams
from sklearn.metrics import f1_score, average_precision_score, recall_score, classification_report
from statistics import mean

from scipy.special import softmax

import seaborn as sns
import pandas as pd
import torch
import numpy as np

from copy import deepcopy

import nltk
nltk.download('punkt') 

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# mount file path to google drive (only when working on google collab)
# drive.mount('/content/drive')

## LOAD DATASETS AND LEXICONS

In [5]:
""" lexicon with n-grams between 1-4 contained in hate-labeled tweets
# https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/lexicons"""

#in google drive
# olid_train  = pd.read_csv('/content/drive/MyDrive/VU/SM/assignment4/data/olid-train-small.csv')
# lex_hb = pd.read_csv('/content/drive/MyDrive/VU/SM/assignment4/data/hatebase_dict_vua_format.csv',sep=";")
# lex_hb_ngrams = pd.read_csv('/content/drive/MyDrive/VU/SM/assignment4/data/refined_ngram_dict.csv')
# olid_test = pd.read_csv('/content/drive/MyDrive/VU/SM/assignment4/data/olid-test.csv') #google drive

#in kaggle
olid_train  = pd.read_csv('../input/olidtrainsmall/olid-train-small.csv')
olid_train  = olid_train.drop(columns=['id'])

hasoc_train = pd.read_csv('../input/hasoctrain/hasoc-train.csv')
hasoc_train = hasoc_train.drop(columns=['id'])

olid_test = pd.read_csv('../input/olid-test/olid-test.csv')
olid_test = olid_test.drop(columns=['id'])

lex_hb = pd.read_csv('../input/lexicons/hatebase_dict_vua_format.csv',sep=";")
lex_hb_ngrams = pd.read_csv('../input/lexicons/refined_ngram_dict.csv') #kaggle


In [6]:
experimentalSetup = {
    'in-domain': {
        'train_dataset' : deepcopy(olid_train),
        'test_dataset'  : deepcopy(olid_test),
        'meta_training_dataset' : None,
        'meta_model' : None,
        'hatebert_full' : None,
        'roberta_full'  : None,
        'name' : "IN-DOMAIN",
        'results' : {}
    },
    'cross-domain': {
        'train_dataset' : deepcopy(hasoc_train),
        'test_dataset'  : deepcopy(olid_test),
        'name' : 'CROSS-DOMAIN',
        'meta_training_dataset' : None,
        'meta_model' : None,
        'hatebert_full' : None,
        'roberta_full'  : None,        
        'results': {}
    }
}

## ADDITIONAL FEATURES FUNCTIONS

In [7]:
def get_token_count_in_lex(text_column,hatebase_lexicon):
  """
    Gets a list with the number of occurences for each tokenized text in hatebase lexicon
  """
  lex = hatebase_lexicon['Entry'].to_list()
  lex_occurrences = []
  for text in text_column:
    
    tokenize = word_tokenize(text)
    count = sum(token in lex for token in tokenize)
    
    lex_occurrences.append(count)    
    # print(f"tokenized: {tokenize}\n\t{count}")
  
  return lex_occurrences


In [8]:
def get_propHate_ngrams_lex(text_column,ngrams_hatebase_lexicon):
  """
   Returns lists containing 0 in case no ngram from tweet appears in lexicon, OR the max o the prop_hate between ngrams that appears
  """
  ngram_lex_prophate = [] 
  tweets = text_column

  # lowecase column
  tweets = tweets.str.lower() 

  for tweet in tweets:
    ngrams_array = [] #array with all tweet ngrams between 1-4
    tokenized = word_tokenize(tweet) #tokenize

    for n in range(1,5):
      ngrams_tuples = ngrams(tokenized, n) #get ngrams with n = 1-4
      
      for gram in ngrams_tuples:
        ngrams_array.append(' '.join(list(gram))) #filling ngrams_array

    # print(ngrams_array)
    match = ngrams_hatebase_lexicon[ngrams_hatebase_lexicon['ngram'].isin(ngrams_array)] #get rows from lexicon where any of the tweet's ngrams are present
    max_prop_hate = 0
    if len(match) > 0:
      # print(f"tweet: {tweet}")
      # print(f"ngram:\n{match}\n")
      max_prop_hate = match['prophate'].max() #in case more than one ngram appears in lex, get the max of the prop_hate
    
    ngram_lex_prophate.append(max_prop_hate)

  return ngram_lex_prophate
      

## TRANSFORMERS (LEVEL-0 MODELS) K-FOLD TRAINING, AND META-MODEL TRAINING DF CREATION

In [9]:
def create_metamodel_training_set(train_set,k=10):
    """
    Train level-0 models with K-fold cross validation
    Returns: dataset to train meta-model
    """

    # prepare cross validation
    rskf = StratifiedKFold(n_splits=k, shuffle=True)

    hatebert_pred = []
    roberta_pred = []

    index = []
    text = []
    gold = []

    token_len = []
    char_len  = []
    hb_lex_occurrences = []
    hb_lex_ngrams_prop = []

    for train_index, test_index in rskf.split(train_set['text'],train_set['labels']):
        train_df = train_set.iloc[train_index]
        test_df  = train_set.iloc[test_index]

        # hateBERT training and prediction
        hatebert_args = ClassificationArgs(num_train_epochs=1,overwrite_output_dir=True)
        hatebert_model = ClassificationModel('bert','GroNLP/hateBERT',args=hatebert_args)
        hatebert_model.train_model(train_df)
        hatebert_predictions, hatebert_probs = hatebert_model.predict(test_df.text.to_list())

        # roBERTa training and prediction
        roberta_args = ClassificationArgs(num_train_epochs=1,overwrite_output_dir=True)
        roberta_model = ClassificationModel('roberta','roberta-base',args=roberta_args)
        roberta_model.train_model(train_df)
        roberta_predictions, roberta_probs = roberta_model.predict(test_df.text.to_list())

        roberta_pred.extend(roberta_predictions)
        hatebert_pred.extend(hatebert_predictions)

        #building meta-model training set columns
        text.extend(test_df['text'])
        gold.extend(test_df['labels'])

        #additional features
        token_len.extend([len(word_tokenize(text)) for text in test_df['text']])
        char_len.extend([len(text) for text in test_df['text']])
        hb_lex_occurrences.extend(get_token_count_in_lex(test_df['text'],lex_hb))
        hb_lex_ngrams_prop.extend(get_propHate_ngrams_lex(test_df['text'],lex_hb_ngrams))

    output = pd.DataFrame(columns = ['text','hatebert_pred','roberta_pred','token_len','char_len','hb_lex_occurr','hb_lex_ngrams_prop','label'])
    output.text           = text
    output.hatebert_pred  = hatebert_pred
    output.roberta_pred   = roberta_pred
    output.token_len      = token_len
    output.char_len       = char_len
    output.hb_lex_occurr       = hb_lex_occurrences
    output.hb_lex_ngrams_prop  = hb_lex_ngrams_prop
    output.label          =  gold

    # output.to_csv('/content/drive/MyDrive/VU/SM/assignment4/data/metamodel_training_df_v2.csv',index=False) #google
    # output.to_csv('/kaggle/working/metamodel_training_df_v3',index=False)
    
    return output


## CREATION AND SAVING OF META MODEL TRAINING SETS

In [16]:
#selecting setup (in-domain or cross-domain)
indomain = experimentalSetup['in-domain']

#creation (OR LOADING) of metamodel training set
   # indomain['meta_training_dataset'] = create_metamodel_training_set(indomain['train_dataset'],10)
   # indomain['meta_training_dataset'].to_csv('/kaggle/working/meta_df_indomain_v6.csv',index=False)
indomain['meta_training_dataset'] = pd.read_csv('../input/metamodel-training-v6/meta_df_indomain_v6.csv') #kaggle

In [17]:
#selecting setup (in-domain or cross-domain)
crossdomain = experimentalSetup['cross-domain']

#creation (OR LOADING) of metamodel training set
# crossdomain['meta_training_dataset'] = create_metamodel_training_set(crossdomain['train_dataset'],10)
# crossdomain['meta_training_dataset'].to_csv('/kaggle/working/meta_df_crossdomain_v6.csv',index=False)
crossdomain['meta_training_dataset'] = pd.read_csv('../input/metamodel-training-v6/meta_df_crossdomain_v6.csv') #kaggle

## ------------------------------------------------------------------------------------------

## META-MODEL TRAINING

In [19]:
def get_trained_metamodel(meta_training_df):
    y_meta = meta_training_df['label']
    X_meta = meta_training_df.drop(columns=['label','text'])

    meta_model = GradientBoostingClassifier()
    meta_model.fit(X_meta,y_meta)
    
    return meta_model


In [20]:
indomain['meta_model'] = get_trained_metamodel(indomain['meta_training_dataset'])
crossdomain['meta_model'] = get_trained_metamodel(crossdomain['meta_training_dataset'])

In [None]:
# # ngram level tf-idf
# # tfidf_vect_ngram = TfidfVectorizer(analyzer='word',ngram_range=(1,4))
# tfidf_vect_ngram = TfidfVectorizer()

# X_asd = tfidf_vect_ngram.fit_transform(X_meta_indomain)

# X_asd


## PREPARING TEST DF FOR META-MODEL

____________________________________________________________________________________________________________________________________
**this is only to reset run and should be deleted after**

In [118]:
# indomain['roberta_full'] = None
# indomain['hatebert_full'] = None

________________________________________________

In [22]:
#train hatebert and roberta on training set (now on the full training df without stackfold) in order to make predictions and make the test set look like meta-training-df
def full_train_transformers(setup):
    
    hatebert_args = ClassificationArgs(num_train_epochs=1,overwrite_output_dir=True)
    roberta_args = ClassificationArgs(num_train_epochs=1,overwrite_output_dir=True)

    #indomain
    setup['hatebert_full'] = ClassificationModel('bert','GroNLP/hateBERT',args=hatebert_args)
    setup['roberta_full'] = ClassificationModel('roberta','roberta-base',args=roberta_args)
    setup['hatebert_full'].train_model(setup['train_dataset'])
    setup['roberta_full'].train_model(setup['train_dataset'])



In [23]:
#indomain
if indomain['hatebert_full'] == None and indomain['roberta_full'] == None:
    print("train")
    full_train_transformers(indomain)
    torch.save(indomain['hatebert_full'], '/kaggle/working/hatebert_full_indomain')
    torch.save(indomain['roberta_full'], '/kaggle/working/roberta_full_indomain')
else:
    print('load')
    indomain['hatebert_full'] = torch.load('./hatebert_full_indomain') #kaggle
    indomain['roberta_full']  = torch.load('./roberta_full_indomain') #kaggle
    

train


Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly

Downloading:   0%|          | 0.00/151 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/5852 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/732 [00:00<?, ?it/s]

  0%|          | 0/5852 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/732 [00:00<?, ?it/s]

In [28]:
#cross
if crossdomain['hatebert_full'] == None and crossdomain['roberta_full'] == None:
    print("train")
    full_train_transformers(crossdomain)
    torch.save(crossdomain['hatebert_full'], '/kaggle/working/hatebert_full_crossdomain')
    torch.save(crossdomain['roberta_full'], '/kaggle/working/roberta_full_crossdomain')
else:
    print('load')
    crossdomain['hatebert_full'] = torch.load('./hatebert_full_crossdomain') #kaggle
    crossdomain['roberta_full']  = torch.load('./roberta_full_crossdomain') #kaggle

train


Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly

  0%|          | 0/5852 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/732 [00:00<?, ?it/s]

  0%|          | 0/5852 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/732 [00:00<?, ?it/s]

In [None]:
#crossdomain
if crossdomain['hatebert_full'] == None and crossdomain['roberta_full'] == None:
    full_train_transformers(crossdomain)
    torch.save(crossdomain['hatebert_full'], '/kaggle/working/hatebert_full_crossdomain')
    torch.save(crossdomain['roberta_full'], '/kaggle/working/roberta_full_crossdomain')
else:
    print('load')
#     indomain['hatebert_full'] = torch.load('./hatebert_full_olid_train_df') #kaggle
#     indomain['roberta_full']  = torch.load('./hatebert_full_olid_train_df') #kaggle
    

In [31]:
crossdomain['test_dataset']

Unnamed: 0,text,labels
0,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1
1,"#ConstitutionDay is revered by Conservatives, ...",0
2,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0
3,#Watching #Boomer getting the news that she is...,0
4,#NoPasaran: Unity demo to oppose the far-right...,1
...,...,...
855,#DespicableDems lie again about rifles. Dem Di...,1
856,#MeetTheSpeakers 🙌 @USER will present in our e...,0
857,3 people just unfollowed me for talking about ...,1
858,#WednesdayWisdom Antifa calls the right fascis...,0


In [33]:
def transform_test_df(setup): 
    hatebert_preds, hatebert_probs = setup['hatebert_full'].predict(setup['test_dataset'].text.to_list())
    roberta_preds, roberta_probs   = setup['roberta_full'].predict(setup['test_dataset'].text.to_list())

    setup['test_dataset']['hatebert_pred'] = hatebert_preds
    setup['test_dataset']['roberta_pred']  = roberta_preds

    #additional features
    setup['test_dataset']['token_len'] = [len(word_tokenize(text)) for text in setup['test_dataset']['text']]
    setup['test_dataset']['char_len']  = [len(text) for text in setup['test_dataset']['text']]
    setup['test_dataset']['hb_lex_occurr'] = get_token_count_in_lex(setup['test_dataset']['text'],lex_hb)
    setup['test_dataset']['hb_lex_ngrams_prop'] = get_propHate_ngrams_lex(setup['test_dataset']['text'],lex_hb_ngrams)

    setup['test_dataset'] = setup['test_dataset'].drop(columns=['text'])
    

In [34]:
transform_test_df(indomain)
transform_test_df(crossdomain)


  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [None]:
transform_test_df(crossdomain)
crossdomain['test_dataset']

## META MODEL PREDICTION AND EVALUATION

In [35]:
#INDOMAIN
#get classification report of transformer models to then compare with ensemble metrics
print("####INDOMAIN!####")
print("REPORT HATEBERT: \n",classification_report(   indomain['test_dataset']['labels'].values,indomain['test_dataset']['hatebert_pred']))
print("\n\nREPORT ROBERTA: \n",classification_report(indomain['test_dataset']['labels'].values,indomain['test_dataset']['roberta_pred']))


X_meta_test = indomain['test_dataset'].drop(columns=['labels'])
meta_preds  = indomain['meta_model'].predict(X_meta_test)

print("REPORT META MODEL INDOMAIN: \n",classification_report(indomain['test_dataset']['labels'].values, meta_preds))


####INDOMAIN!####
REPORT HATEBERT: 
               precision    recall  f1-score   support

           0       0.87      0.90      0.89       620
           1       0.73      0.66      0.69       240

    accuracy                           0.84       860
   macro avg       0.80      0.78      0.79       860
weighted avg       0.83      0.84      0.83       860



REPORT ROBERTA: 
               precision    recall  f1-score   support

           0       0.89      0.88      0.88       620
           1       0.69      0.71      0.70       240

    accuracy                           0.83       860
   macro avg       0.79      0.79      0.79       860
weighted avg       0.83      0.83      0.83       860

REPORT META MODEL INDOMAIN: 
               precision    recall  f1-score   support

           0       0.88      0.90      0.89       620
           1       0.72      0.70      0.71       240

    accuracy                           0.84       860
   macro avg       0.80      0.80      0.

In [36]:
#CROSSDOMAIN
#get classification report of transformer models to then compare with ensemble metrics
print("####INDOMAIN!####")
print("REPORT HATEBERT: \n",classification_report(   crossdomain['test_dataset']['labels'].values,crossdomain['test_dataset']['hatebert_pred']))
print("\n\nREPORT ROBERTA: \n",classification_report(crossdomain['test_dataset']['labels'].values,crossdomain['test_dataset']['roberta_pred']))


X_meta_test = crossdomain['test_dataset'].drop(columns=['labels'])
meta_preds  = crossdomain['meta_model'].predict(X_meta_test)

print("REPORT META MODEL INDOMAIN: \n",classification_report(crossdomain['test_dataset']['labels'].values, meta_preds))

####INDOMAIN!####
REPORT HATEBERT: 
               precision    recall  f1-score   support

           0       0.81      0.91      0.86       620
           1       0.65      0.45      0.53       240

    accuracy                           0.78       860
   macro avg       0.73      0.68      0.69       860
weighted avg       0.77      0.78      0.76       860



REPORT ROBERTA: 
               precision    recall  f1-score   support

           0       0.75      0.89      0.82       620
           1       0.47      0.25      0.33       240

    accuracy                           0.71       860
   macro avg       0.61      0.57      0.57       860
weighted avg       0.68      0.71      0.68       860

REPORT META MODEL INDOMAIN: 
               precision    recall  f1-score   support

           0       0.79      0.88      0.84       620
           1       0.57      0.41      0.48       240

    accuracy                           0.75       860
   macro avg       0.68      0.65      0.