# BERT Transfer Learning from Generic Hate Speech to COVID-19 specific Racist Hate Speech 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 78.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 89.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 66.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [3]:
# data_path_1 --> t-davidson/hate-speech-and-offensive-language 
# data_path_2 --> ENCASEH2020/hatespeech-twitter processed 
#covid_data --> covid hate speech data

data_path_1 = "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/data/tdavidson.csv"
data_path_2 = "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/data/encaseh2020.csv"
covid_data = "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/data/covid_hatespeech.tsv"

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, AutoTokenizer, RobertaTokenizer
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, RobertaForSequenceClassification
from sklearn.metrics import f1_score, classification_report
import numpy as np

#### Data 1 (t-davidson/hate-speech-and-offensive-language)

In [5]:
data1_pd = pd.read_csv(data_path_1, index_col=0)
data1_pd.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
#Original Labels
#0 - hate
#1 - offensive
#2  - neither
data1_pd['class'].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [7]:
#Converting labels to 0 (neutral) and 1 (hate)
data1_pd['class'] = data1_pd['class'].replace(0,1)
data1_pd['class'] = data1_pd['class'].replace(2,0)
data1_pd['class'].value_counts()

1    20620
0     4163
Name: class, dtype: int64

In [8]:
#Getting rid of all columns except class and tweet
data1_pd = data1_pd.drop(columns = ['count', 'hate_speech', 'offensive_language', 'neither'])
data1_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24783 entries, 0 to 25296
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   24783 non-null  int64 
 1   tweet   24783 non-null  object
dtypes: int64(1), object(1)
memory usage: 580.9+ KB


In [9]:
#Convert tweet and class to lists
text_list = data1_pd['tweet'].tolist()
label_list = data1_pd['class'].tolist()

In [10]:
#train, dev, test split of 80:20:20
train_texts, test_texts, train_labels, test_labels = train_test_split(text_list, label_list, test_size=.2, random_state=585)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.25, random_state=585)

In [19]:
#Intializing tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [20]:
#Create tokenized encodings
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [21]:
#Create dataset object
class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = HateDataset(train_encodings, train_labels)
val_dataset = HateDataset(val_encodings, val_labels)
test_dataset = HateDataset(test_encodings, test_labels)

In [22]:
model_1_path = "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta"

In [23]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

training_args = TrainingArguments(
    output_dir=model_1_path,          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=100,
    save_strategy = "no"
)


trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

trainer.train()
trainer.save_model(model_1_path)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Step,Training Loss
100,0.485
200,0.1918
300,0.1871
400,0.1791
500,0.1696
600,0.1888
700,0.1667
800,0.1342
900,0.1599
1000,0.1302




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta
Configuration saved in /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta/config.json
Model weights saved in /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta/pytorch_model.bin


In [24]:
model_1_trained = RobertaForSequenceClassification.from_pretrained(model_1_path)

loading configuration file /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta/pytorch_model.bin
All mod

In [25]:
# arguments for Trainer
test_args = TrainingArguments(
    output_dir = './output',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
    dataloader_drop_last = False    
)

# init trainer
trainer = Trainer(
              model = model_1_trained, #Using the recently trained model to make predictions
              args = test_args)

test_results = trainer.predict(test_dataset)
predictions = test_results.predictions.argmax(1)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 4957
  Batch size = 16


In [26]:
print("Micro:", f1_score(test_labels, predictions, average='micro'))
print("Macro:", f1_score(test_labels, predictions, average='macro'))

Micro: 0.9592495460964293
Macro: 0.9268497056788322


In [27]:
#0 Neutral
#1 Hate
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.91      0.85      0.88       860
           1       0.97      0.98      0.98      4097

    accuracy                           0.96      4957
   macro avg       0.94      0.91      0.93      4957
weighted avg       0.96      0.96      0.96      4957



#### Data 2 (ENCASEH2020/hatespeech-twitter)

In [28]:
data2_pd = pd.read_csv(data_path_2, sep='\t', usecols=['tweet', 'label', 'votes'], skiprows=0).dropna().drop('votes', axis=1)
data2_pd.head(5)

Unnamed: 0,tweet,label
1,RT @Papapishu: Man it would fucking rule if we...,abusive
2,It is time to draw close to Him &#128591;&#127...,normal
3,if you notice me start to act different or dis...,normal
4,"Forget unfollowers, I believe in growing. 7 ne...",normal
5,RT @Vitiligoprince: Hate Being sexually Frustr...,abusive


In [29]:
data2_pd.label.value_counts()

normal     53851
abusive    27150
spam       14029
hateful     4965
Name: label, dtype: int64

In [30]:
#Drop rows labelled 'spam' 
data2_pd = data2_pd[data2_pd.label != 'spam']
data2_pd.label.value_counts()

normal     53851
abusive    27150
hateful     4965
Name: label, dtype: int64

In [31]:
#Converting 'abusive' and 'hateful' to label 1
data2_pd['label'] = data2_pd['label'].replace('abusive', 1)
data2_pd['label'] = data2_pd['label'].replace('hateful', 1)

#Converting 'normal' to label 0
data2_pd['label'] = data2_pd['label'].replace('normal', 0)

data2_pd.label.value_counts()

0    53851
1    32115
Name: label, dtype: int64

In [32]:
data2_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85966 entries, 1 to 99995
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   85966 non-null  object
 1   label   85966 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.0+ MB


In [33]:
text_list = data2_pd['tweet'].tolist()
label_list = data2_pd['label'].tolist()

In [34]:
#train, dev, test split of 80:20:20
train_texts, test_texts, train_labels, test_labels = train_test_split(text_list, label_list, test_size=.2, random_state=585)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.25, random_state=585)

In [35]:
#Create tokenized encodings
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [36]:
train_dataset = HateDataset(train_encodings, train_labels)
val_dataset = HateDataset(val_encodings, val_labels)
test_dataset = HateDataset(test_encodings, test_labels)

In [37]:
model_2_path = "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta"

In [38]:
training_args = TrainingArguments(
    output_dir=model_2_path,          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=100,
    save_strategy = "no"
)


trainer = Trainer(
    model=model_1_trained, #using the model trained on dataset 1                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

trainer.train()
trainer.save_model(model_2_path)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 51579
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9672


Step,Training Loss
100,0.2559
200,0.2391
300,0.2347
400,0.2052
500,0.2437
600,0.2417
700,0.2392
800,0.2193
900,0.218
1000,0.2065




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta
Configuration saved in /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta/config.json
Model weights saved in /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta/pytorch_model.bin


In [39]:
model_2_trained = RobertaForSequenceClassification.from_pretrained(model_2_path)

loading configuration file /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file /content/drive/MyDrive/mds/585/COLX_58

In [40]:
test_args = TrainingArguments(
    output_dir = './output',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
    dataloader_drop_last = False    
)

trainer = Trainer(
              model = model_2_trained, #Using the recently trained model to make predictions
              args = test_args)

test_results = trainer.predict(test_dataset)
predictions = test_results.predictions.argmax(1)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 17194
  Batch size = 16


In [41]:
print("Micro:", f1_score(test_labels, predictions, average='micro'))
print("Macro:", f1_score(test_labels, predictions, average='macro'))

Micro: 0.9388158659997674
Macro: 0.9345823766185068


In [42]:
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95     10740
           1       0.92      0.91      0.92      6454

    accuracy                           0.94     17194
   macro avg       0.94      0.93      0.93     17194
weighted avg       0.94      0.94      0.94     17194



#### Fine tuning (COVID Hate)
TODO: This is the data on which the model will be finetuned

In [43]:
covid_pd = pd.read_csv(covid_data, sep='\t').dropna()
covid_pd.head(5)

Unnamed: 0,tweet,label
0,Coronavirus: UK advises British citizens to le...,0.0
1,DA – Overcrowded Tembisa hospital not suitable...,0.0
2,States scramble to carry out Trump’s coronavir...,0.0
3,LOL westerns whining about common flu taking l...,1.0
4,"Reading the news everyday, it's as if people a...",0.0


In [44]:
#Changing label from float64 to int64
covid_pd['label'] = covid_pd['label'].astype(int)
covid_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26759 entries, 0 to 26768
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   26759 non-null  object
 1   label   26759 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 627.2+ KB


In [45]:
covid_pd['label'].value_counts()

0    21155
2     3205
1     2399
Name: label, dtype: int64

In [46]:
#Changing labels such that 0 is hate, 1 is normal
covid_pd['label'] = covid_pd['label'].replace(0,5)
covid_pd['label'] = covid_pd['label'].replace(2,0)
covid_pd['label'] = covid_pd['label'].replace(1,0)
covid_pd['label'] = covid_pd['label'].replace(5,1)

In [47]:
#1 - Normal
#0 - hate
covid_pd.label.value_counts()

1    21155
0     5604
Name: label, dtype: int64

In [48]:
text_list = covid_pd['tweet'].tolist()
label_list = covid_pd['label'].tolist()

In [49]:
#Train test split. Since we're fine tuning on covid-specific hate speech, we want the best possible results with the least
# possible training data (keep TEST_RATIO as high possible)
TEST_RATIO = 0.90
train_texts, test_texts, train_labels, test_labels = train_test_split(text_list, label_list, test_size=TEST_RATIO, random_state=585)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2, random_state=585)

In [50]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [51]:
train_dataset = HateDataset(train_encodings, train_labels)
val_dataset = HateDataset(val_encodings, val_labels)
test_dataset = HateDataset(test_encodings, test_labels)

In [52]:
#Load the model pre-trained on the second dataset
model_2_trained = RobertaForSequenceClassification.from_pretrained(model_2_path)

loading configuration file /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_1/roberta",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file /content/drive/MyDrive/mds/585/COLX_58

In [53]:
model_final_path = "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/final/roberta"

In [54]:
training_args = TrainingArguments(
    output_dir=model_final_path,          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    save_strategy = "no"
)

trainer = Trainer(
    model=model_2_trained,                         # Here, the model is the one trained on the previous dataset
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

trainer.train()
trainer.save_model(model_final_path)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2140
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 402


Step,Training Loss
100,2.5612
200,0.5135
300,0.3887
400,0.3853




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/final/roberta
Configuration saved in /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/final/roberta/config.json
Model weights saved in /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/final/roberta/pytorch_model.bin


In [55]:
#Load saved final model
final_model = RobertaForSequenceClassification.from_pretrained(model_final_path)

loading configuration file /content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/final/roberta/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/mds/585/COLX_585-COVID-Racism/models/stage_2/roberta",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file /content/drive/MyDrive/mds/585/COLX_585-

In [56]:
test_args = TrainingArguments(
    output_dir = './output',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
    dataloader_drop_last = False    
)

# init trainer
trainer = Trainer(
              model = final_model, 
              args = test_args)

test_results = trainer.predict(test_dataset)
predictions = test_results.predictions.argmax(1)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 24084
  Batch size = 16


In [57]:
print("Micro:", f1_score(test_labels, predictions, average='micro'))
print("Macro:", f1_score(test_labels, predictions, average='macro'))

Micro: 0.8251536289652882
Macro: 0.7727301153452818


In [58]:
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.56      0.82      0.66      5054
           1       0.95      0.83      0.88     19030

    accuracy                           0.83     24084
   macro avg       0.75      0.82      0.77     24084
weighted avg       0.86      0.83      0.84     24084

