In [None]:
%pip install pandas
%pip install scikit-learn
%pip install huggingface_hub
%pip install datasets
%pip install transformers
%pip install torch

In [None]:
import pandas as pd
import numpy as np

# Pull both OLID splits from the Hugging Face Hub as DataFrames.
df_train = pd.read_csv("hf://datasets/christophsonntag/OLID/train.csv")
df_test = pd.read_csv("hf://datasets/christophsonntag/OLID/test.csv")

# Raw tweet text as NumPy arrays; `subtask_a` is binarised so that
# "OFF" maps to 1 and every other value maps to 0.
train_tweets = df_train["tweet"].to_numpy(copy=True)
train_labels = np.where(df_train["subtask_a"].eq("OFF"), 1, 0)

test_tweets = df_test["tweet"].to_numpy(copy=True)
test_labels = np.where(df_test["subtask_a"].eq("OFF"), 1, 0)

# Sanity check: tweets and labels of each split should have matching lengths.
print(
    train_tweets.shape,
    train_labels.shape,
    test_tweets.shape,
    test_labels.shape,
    sep="\n",
)

Creating tokenizer to turn testing and training tweets into tokens for the BERT model

In [None]:
from transformers import BertTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint that is fine-tuned later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits share the same settings: every tweet is padded or truncated to
# exactly 128 tokens and the result is returned as PyTorch tensors.
tokenizer_kwargs = dict(
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
train_encodings = tokenizer(list(train_tweets), **tokenizer_kwargs)
test_encodings = tokenizer(list(test_tweets), **tokenizer_kwargs)

Wrapping the tokenized tweets and their labels in a PyTorch `Dataset`, which is the form the Hugging Face `Trainer` expects for training and evaluation

In [4]:
import torch
from torch.utils.data import Dataset

class OLIDDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one class label per example.

    Args:
        encodings: Mapping from field name to an indexable batch; indexing a
            field with ``idx`` must yield that field's value for example ``idx``.
        labels: Integer class ids (list, NumPy array, or similar), one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Force int64 regardless of the input dtype: PyTorch's cross-entropy
        # loss rejects int32 class-index targets, and NumPy's default integer
        # (what `np.where(..., 1, 0)` produces) is 32-bit on some platforms.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return all encoding fields for example ``idx`` plus its label under ``"labels"``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Wrap each split's encodings and labels so they can be handed to the Trainer.
train_dataset, test_dataset = (
    OLIDDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

# Report how many examples each split contains.
print(
    f"Training set size: {len(train_dataset)}",
    f"Test set size: {len(test_dataset)}",
    sep="\n",
)

Training set size: 13240
Test set size: 860


Set up the pre-trained BERT model and create a trainer for it using our specific dataset.

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Binary classifier on top of pre-trained BERT. Both dropout probabilities are
# set to 0.2 here as extra regularisation.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    attention_probs_dropout_prob=0.2,
    hidden_dropout_prob=0.2,
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    report_to=[],  # no external experiment trackers
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=3e-6,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    # Evaluate once per epoch, keep the best checkpoint by validation loss and
    # restore it when training finishes (required by EarlyStoppingCallback).
    eval_strategy="epoch",
    save_strategy="best",
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

# Stop as soon as one evaluation fails to improve on the best eval_loss so far.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# NOTE(review): eval_dataset is the *test* split, so early stopping and
# best-checkpoint selection are driven by the same data the final metrics are
# reported on. Consider carving a validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping],
)

Train the model

In [6]:
# Fine-tune the model with the Trainer configured above; the returned
# TrainOutput is displayed as the cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.436491
2,0.576600,0.373159
3,0.465700,0.36982
4,0.444100,0.366061
5,0.426700,0.36492


TrainOutput(global_step=2070, training_loss=0.4762388763796304, metrics={'train_runtime': 478.2812, 'train_samples_per_second': 138.412, 'train_steps_per_second': 4.328, 'total_flos': 4354487966208000.0, 'train_loss': 0.4762388763796304, 'epoch': 5.0})

Code to check which checkpoint corresponds to the best model. This is the model we'll load and use.

In [7]:
import json
import os

def find_best_checkpoint(results_dir="./results"):
    """Find the checkpoint directory with the lowest recorded validation loss.

    Each sub-directory of ``results_dir`` that contains a ``trainer_state.json``
    is inspected. The most recent ``log_history`` entry carrying an
    ``eval_loss`` is used, so a trailing non-evaluation log record does not
    cause the checkpoint to be skipped.

    Args:
        results_dir: Directory holding the checkpoint sub-directories.

    Returns:
        ``(checkpoint_name, eval_loss)`` for the best checkpoint, or
        ``(None, float("inf"))`` when the directory is missing or no
        checkpoint has a recorded ``eval_loss``.
    """
    best_name, best_val = None, float("inf")
    if not os.path.isdir(results_dir):
        return best_name, best_val

    # Shorter names first, then alphabetical: gives a deterministic,
    # step-ordered scan for "checkpoint-<step>" names (os.listdir order is arbitrary).
    for name in sorted(os.listdir(results_dir), key=lambda n: (len(n), n)):
        state_path = os.path.join(results_dir, name, "trainer_state.json")
        if not os.path.isfile(state_path):
            continue

        with open(state_path, "r") as f:
            state = json.load(f)

        # Walk the history backwards to the latest entry that has an eval_loss.
        val_loss = next(
            (
                entry["eval_loss"]
                for entry in reversed(state.get("log_history", []))
                if entry.get("eval_loss") is not None
            ),
            None,
        )
        if val_loss is None:
            continue

        print(f"Checkpoint: {name}, Validation Loss: {val_loss}")

        # Strict comparison: on a tie the earlier (lower-step) checkpoint wins.
        if val_loss < best_val:
            best_name, best_val = name, val_loss

    return best_name, best_val


best_checkpoint, best_loss = find_best_checkpoint()

print(f"\nBest checkpoint: {best_checkpoint} with Validation Loss: {best_loss}")


Checkpoint: checkpoint-414, Validation Loss: 0.4364910125732422
Checkpoint: checkpoint-1242, Validation Loss: 0.3698195219039917
Checkpoint: checkpoint-828, Validation Loss: 0.3731590211391449
Checkpoint: checkpoint-1656, Validation Loss: 0.3660610318183899
Checkpoint: checkpoint-2070, Validation Loss: 0.36491966247558594

Best checkpoint: checkpoint-2070 with Validation Loss: 0.36491966247558594


Loading the best checkpoint and saving it, together with the tokenizer, as the final model

In [8]:
from transformers import BertForSequenceClassification

# Load the best model. The checkpoint path is taken from the trainer's own
# state instead of a hard-coded step number, so this cell keeps working when
# the dataset size, batch size or number of epochs changes.
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint is None:
    raise RuntimeError(
        "The trainer has not recorded a best checkpoint; run trainer.train() first."
    )
best_model = BertForSequenceClassification.from_pretrained(best_checkpoint)

# Save the best model together with its tokenizer so the directory is
# self-contained and can be reloaded with from_pretrained().
best_model.save_pretrained("./final_bert_model")
tokenizer.save_pretrained("./final_bert_model")

('./final_bert_model/tokenizer_config.json',
 './final_bert_model/special_tokens_map.json',
 './final_bert_model/vocab.txt',
 './final_bert_model/added_tokens.json')

Getting predictions using the model on the testing data

In [9]:
import torch
from sklearn.metrics import classification_report

# Run the trainer's model over the test split. With load_best_model_at_end=True
# this is the best checkpoint; the separately loaded `best_model` is not used here.
preds = trainer.predict(test_dataset)

# Take the highest-scoring class per example
# (presumably `predictions` holds the raw logits -- confirm).
scores = torch.tensor(preds.predictions)
pred_labels = scores.argmax(dim=1)

# Per-class precision / recall / F1 on the test labels.
print(classification_report(test_labels, pred_labels.numpy()))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90       620
           1       0.81      0.61      0.69       240

    accuracy                           0.85       860
   macro avg       0.83      0.78      0.80       860
weighted avg       0.85      0.85      0.84       860



Evaluate model performance using accuracy, precision, recall, and F1 score

In [10]:
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

In [15]:
# Compute the four headline metrics with the same (y_true, y_pred) pair.
# F1, precision and recall use sklearn's defaults, i.e. they score label 1.
accuracy, f1, precision, recall = (
    metric(test_labels, pred_labels)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(
    f"Accuracy:  {round(accuracy, 16)}",
    f"F1 Score:  {round(f1, 16)}",
    f"Precision: {round(precision, 16)}",
    f"Recall:    {round(recall, 16)}",
    sep="\n",
)

Accuracy:  0.85
F1 Score:  0.6935866983372921
Precision: 0.8066298342541437
Recall:    0.6083333333333333


Print confusion matrix for further exploration

In [33]:
from sklearn.metrics import confusion_matrix

In [34]:
# Rows are true labels, columns are predicted labels.
# NOTE(review): the output saved with this cell sums to 2,648 samples, not the
# 860 test tweets, so it comes from a different run. The metrics reported
# above imply [[585, 35], [94, 146]]; re-run this cell to refresh it.
print("Confusion Matrix:")
confusion_matrix(y_true=test_labels, y_pred=pred_labels)

Confusion Matrix:


array([[1727,   44],
       [ 731,  146]])