In [3]:
import torch
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Configuration handed to both classifiers below.
model_args = ClassificationArgs()
model_args.use_cuda = torch.cuda.is_available()  # run on CPU when no GPU is visible
model_args.overwrite_output_dir = True  # allow re-running over an existing output directory
model_args.manual_seed = 42  # fixed seed so that fine-tuning runs are repeatable

# NOTE(review): both models receive the very same `model_args` object, so they
# presumably also share its output directory -- confirm that the second training
# run replacing the first run's saved files is acceptable.

# Two separate classifiers built from the same pretrained `bert-base-cased`
# checkpoint (plain BERT, not HateBERT): one is fine-tuned on OLID, the other
# on HASOC.
olid_model = ClassificationModel(
    'bert',
    'bert-base-cased',
    args=model_args,
)

hasoc_model = ClassificationModel(
    'bert',
    'bert-base-cased',
    args=model_args,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [4]:
# Read the OLID training set, the HASOC training set and the OLID test set,
# in that order, into three DataFrames.
olid_train, hasoc_train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/olid-train-small.csv',
        'data/hasoc-train.csv',
        'data/olid-test.csv',
    )
)

In [5]:
# Fine-tune the first classifier on OLID, the same dataset family as the test
# set (in-domain).
# Option overrides are passed through `args`: extra keyword arguments to
# train_model are taken by simpletransformers as metric functions, not options,
# so a bare `overwrite_output_dir=True` keyword would not configure anything.
olid_model.train_model(olid_train, args={'overwrite_output_dir': True})

  0%|          | 0/5852 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/732 [00:00<?, ?it/s]



(732, 0.5448657332873735)

In [6]:
# Predict on the `text` column of the test set with the OLID-trained model;
# only the first returned value is kept, the second is discarded.
olid_predictions, _ = olid_model.predict(test['text'].tolist())

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [7]:
# Evaluate the OLID-trained model on the test set and unpack the three values
# it returns (result, model outputs, wrong predictions).
(
    olid_result,
    olid_model_outputs,
    olid_wrong_preds,
) = olid_model.eval_model(test)

  0%|          | 0/860 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/108 [00:00<?, ?it/s]

In [8]:
# Split every two-element row of the OLID evaluation outputs into its first
# ("neg") and second ("pos") value, one list per column.
# NOTE(review): the values previewed at the end of this notebook include
# negative numbers, so these look like raw scores rather than normalised
# probabilities -- confirm before interpreting them as probabilities.
olid_p_neg = [neg for neg, _pos in olid_model_outputs]
olid_p_pos = [pos for _neg, pos in olid_model_outputs]

In [9]:
# Fine-tune the second classifier on HASOC, a different dataset from the OLID
# test set (cross-domain).
# Option overrides are passed through `args`: extra keyword arguments to
# train_model are taken by simpletransformers as metric functions, not options,
# so a bare `overwrite_output_dir=True` keyword would not configure anything.
hasoc_model.train_model(hasoc_train, args={'overwrite_output_dir': True})

  0%|          | 0/5852 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/732 [00:00<?, ?it/s]

(732, 0.638805738563746)

In [10]:
# Predict on the `text` column of the test set with the HASOC-trained model;
# only the first returned value is kept, the second is discarded.
hasoc_predictions, _ = hasoc_model.predict(test['text'].tolist())

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [11]:
# Evaluate the HASOC-trained model on the test set and unpack the three values
# it returns (result, model outputs, wrong predictions).
(
    hasoc_result,
    hasoc_model_outputs,
    hasoc_wrong_preds,
) = hasoc_model.eval_model(test)

  0%|          | 0/860 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/108 [00:00<?, ?it/s]

In [12]:
# Split every two-element row of the HASOC evaluation outputs into its first
# ("neg") and second ("pos") value, one list per column.
# NOTE(review): the values previewed at the end of this notebook include
# negative numbers, so these look like raw scores rather than normalised
# probabilities -- confirm before interpreting them as probabilities.
hasoc_p_neg = [neg for neg, _pos in hasoc_model_outputs]
hasoc_p_pos = [pos for _neg, pos in hasoc_model_outputs]

In [13]:
# Make one independent copy of the test set per model, so each copy can hold
# that model's predictions for later use.
from copy import deepcopy

olid_final, hasoc_final = deepcopy(test), deepcopy(test)

In [14]:
# Attach each model's predictions and its two per-row output values as new
# columns on that model's copy of the test set.
olid_final = olid_final.assign(
    predictions=olid_predictions,
    p_neg=olid_p_neg,
    p_pos=olid_p_pos,
)

hasoc_final = hasoc_final.assign(
    predictions=hasoc_predictions,
    p_neg=hasoc_p_neg,
    p_pos=hasoc_p_pos,
)

In [15]:
# Save both annotated test sets as CSV files (without the DataFrame index)
# for later analysis.
for frame, csv_path in (
    (olid_final, 'in_domain_BERT_probs.csv'),
    (hasoc_final, 'cross_domain_BERT_probs.csv'),
):
    frame.to_csv(csv_path, index=False)

In [17]:
# Read the in-domain file back from disk and show its first five rows.
pd.read_csv('in_domain_BERT_probs.csv').head(n=5)

Unnamed: 0,id,text,labels,predictions,p_neg,p_pos
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1,0,0.668457,-0.57373
1,27014,"#ConstitutionDay is revered by Conservatives, ...",0,0,0.17041,-0.201904
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0,0,0.814453,-0.697266
3,13876,#Watching #Boomer getting the news that she is...,0,0,1.230469,-1.079102
4,60133,#NoPasaran: Unity demo to oppose the far-right...,1,0,1.141602,-1.085938


In [18]:
# Read the cross-domain file back from disk and show its first five rows.
pd.read_csv('cross_domain_BERT_probs.csv').head(n=5)

Unnamed: 0,id,text,labels,predictions,p_neg,p_pos
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1,0,-0.232666,-0.891602
1,27014,"#ConstitutionDay is revered by Conservatives, ...",0,0,-0.427002,-0.592285
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0,0,-0.103821,-1.03125
3,13876,#Watching #Boomer getting the news that she is...,0,0,0.054413,-0.995605
4,60133,#NoPasaran: Unity demo to oppose the far-right...,1,0,-0.06897,-1.102539
