In [None]:
%pip install pandas
%pip install scikit-learn
%pip install matplotlib
%pip install fsspec
%pip install huggingface_hub
%pip install datasets
%pip install tqdm

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm

def _tweets_and_labels(frame):
    """Split an OLID frame into a tweet array and a 0/1 label array.

    A label is 1 where the ``subtask_a`` column equals 'OFF' and 0 otherwise.
    """
    tweets = np.array(frame['tweet'].values)
    labels = np.where(frame['subtask_a'].values == 'OFF', 1, 0)
    return tweets, labels


# Read both OLID splits from the Hugging Face Hub
df_train = pd.read_csv("hf://datasets/christophsonntag/OLID/train.csv")
df_test = pd.read_csv("hf://datasets/christophsonntag/OLID/test.csv")

# Pull the tweet text and binary labels out of each split
train_tweets, train_labels = _tweets_and_labels(df_train)
test_tweets, test_labels = _tweets_and_labels(df_test)

# Sanity check: tweets and labels of a split must have matching shapes
for split_array in (train_tweets, train_labels, test_tweets, test_labels):
    print(split_array.shape)

(13240,)
(13240,)
(860,)
(860,)


In [10]:
!pip install transformers datasets torch scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Create a tokenizer that turns the training and test tweets into tokens for the BERT model

In [32]:
from transformers import BertTokenizer

# Fetch the pre-trained tokenizer for bert-base-uncased
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are encoded with the same settings: pad and truncate to a
# fixed length of 128 and return PyTorch tensors.
encode_options = dict(
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Encode the training and test tweets
train_encodings = tokenizer(list(train_tweets), **encode_options)
test_encodings = tokenizer(list(test_tweets), **encode_options)

Convert the tokens and labels of the training and test sets into the form expected for training the BERT model (a PyTorch `Dataset` subclass)

In [33]:
import torch
from torch.utils.data import Dataset

class OLIDDataset(Dataset):
    """Map-style dataset pairing tokenizer output with classification labels.

    Args:
        encodings: Mapping from field name to an indexable holding one entry
            per example (in this notebook, the tokenizer output).
        labels: Sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Force int64 labels: torch.tensor() would otherwise inherit the
        # input dtype (e.g. int32 from a NumPy array), and class-index
        # targets for the classification loss must be Long.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a "labels" entry."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Wrap each tokenized split together with its labels
train_dataset = OLIDDataset(encodings=train_encodings, labels=train_labels)
test_dataset = OLIDDataset(encodings=test_encodings, labels=test_labels)

# Report how many examples each split contains
for split_name, split_dataset in (("Training", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} set size: {len(split_dataset)}")

Training set size: 13240
Test set size: 860


Set up the pre-trained BERT model and create a trainer for it using our specific dataset.

In [34]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Load BERT model with dropout regularization
# (dropout is set to 0.2 for both the hidden states and the attention probabilities)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # binary task: labels are 1 for 'OFF' and 0 otherwise
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="best",  # Save only the best model
    load_best_model_at_end=True,  # EarlyStoppingCallback needs this enabled
    metric_for_best_model="eval_loss",  # Ensure best model is based on validation loss
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    # Mixed precision needs a CUDA device; enabling it only when one is
    # available keeps this cell runnable on CPU-only machines while leaving
    # GPU behaviour unchanged.
    fp16=torch.cuda.is_available(),
    learning_rate=3e-6,
    weight_decay=0.01,
)

# NOTE(review): the test split is passed as eval_dataset, so it also drives
# early stopping and best-checkpoint selection. Metrics later reported on the
# same split are therefore optimistic; consider holding out part of the
# training data as a validation set instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Stop as soon as one evaluation fails to improve eval_loss
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the model

In [35]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.452166
2,0.576600,0.380874
3,0.466700,0.370522
4,0.444500,0.367556
5,0.429900,0.367392


TrainOutput(global_step=2070, training_loss=0.47742839297234724, metrics={'train_runtime': 119.0149, 'train_samples_per_second': 556.233, 'train_steps_per_second': 17.393, 'total_flos': 4354487966208000.0, 'train_loss': 0.47742839297234724, 'epoch': 5.0})

Code to check which checkpoint corresponds to the best model. This is the model we'll load and use.

In [39]:
import json
import os

def _checkpoint_sort_key(name):
    """Order 'checkpoint-<step>' names by numeric step; other names go last."""
    _, _, suffix = name.rpartition("-")
    if suffix.isdigit():
        return (0, int(suffix), name)
    return (1, 0, name)


def find_best_checkpoint(results_dir="./results", verbose=True):
    """Find the checkpoint folder whose final logged entry has the lowest eval_loss.

    Each sub-folder of ``results_dir`` containing a ``trainer_state.json`` is
    inspected; the ``eval_loss`` of the last ``log_history`` entry is used.
    Folders without that file, with an empty history, or whose last entry has
    no ``eval_loss`` are skipped.

    NOTE: every checkpoint folder on disk is scanned, including ones left
    behind by earlier training runs that wrote to the same directory.

    Args:
        results_dir: Directory that holds the checkpoint folders.
        verbose: When True, print the loss found for each checkpoint.

    Returns:
        Tuple ``(checkpoint_name, eval_loss)``; ``(None, float("inf"))`` when
        the directory is missing or no checkpoint has a usable loss.
    """
    best_name = None
    lowest_loss = float("inf")

    # A missing directory means nothing has been saved yet; report "none found".
    if not os.path.isdir(results_dir):
        return best_name, lowest_loss

    # Sorted for a deterministic, step-ordered listing (os.listdir order is arbitrary).
    for checkpoint in sorted(os.listdir(results_dir), key=_checkpoint_sort_key):
        state_path = os.path.join(results_dir, checkpoint, "trainer_state.json")
        if not os.path.isfile(state_path):
            continue

        with open(state_path, "r") as f:
            state = json.load(f)

        # Guard against an absent or empty history instead of indexing [-1] blindly.
        history = state.get("log_history") or []
        last_entry = history[-1] if history else {}
        val_loss = last_entry.get("eval_loss", None)  # Get last validation loss
        if val_loss is None:
            continue

        if verbose:
            print(f"Checkpoint: {checkpoint}, Validation Loss: {val_loss}")

        # Keep the checkpoint with the lowest validation loss
        if val_loss < lowest_loss:
            lowest_loss = val_loss
            best_name = checkpoint

    return best_name, lowest_loss


best_checkpoint, best_loss = find_best_checkpoint("./results")

print(f"\nBest checkpoint: {best_checkpoint} with Validation Loss: {best_loss}")


Checkpoint: checkpoint-2070, Validation Loss: 0.3673921823501587
Checkpoint: checkpoint-1656, Validation Loss: 0.36755576729774475
Checkpoint: checkpoint-828, Validation Loss: 0.38087400794029236
Checkpoint: checkpoint-4140, Validation Loss: 0.8781325221061707
Checkpoint: checkpoint-1242, Validation Loss: 0.37052223086357117
Checkpoint: checkpoint-3312, Validation Loss: 0.7189924120903015
Checkpoint: checkpoint-414, Validation Loss: 0.45216605067253113
Checkpoint: checkpoint-2484, Validation Loss: 0.5050203800201416

Best checkpoint: checkpoint-2070 with Validation Loss: 0.3673921823501587


Loading the best checkpoint and saving it, together with the tokenizer, as the final model

In [41]:
from transformers import BertForSequenceClassification

# Load the best model
# Ask the Trainer which checkpoint it recorded as best for this run instead of
# relying on a hard-coded step number, which goes stale whenever training is
# re-run. The literal path is kept only as a fallback for the case where the
# Trainer has no best checkpoint recorded.
best_checkpoint = trainer.state.best_model_checkpoint or "./results/checkpoint-2070"
best_model = BertForSequenceClassification.from_pretrained(best_checkpoint)

# Save the best model
# The tokenizer is written to the same folder so the directory can be loaded
# on its own later.
best_model.save_pretrained("./final_bert_model")
tokenizer.save_pretrained("./final_bert_model")

('./final_bert_model/tokenizer_config.json',
 './final_bert_model/special_tokens_map.json',
 './final_bert_model/vocab.txt',
 './final_bert_model/added_tokens.json')

Getting predictions using the model on the testing data

In [42]:
import torch
from sklearn.metrics import classification_report

# Run the trainer over the test split and pick, for every example,
# the class with the highest score
preds = trainer.predict(test_dataset)
class_scores = torch.tensor(preds.predictions)
pred_labels = class_scores.argmax(dim=1)

# Summarise precision / recall / F1 per class against the true labels
report = classification_report(test_labels, pred_labels.numpy())
print(report)

              precision    recall  f1-score   support

           0       0.86      0.94      0.90       620
           1       0.80      0.62      0.70       240

    accuracy                           0.85       860
   macro avg       0.83      0.78      0.80       860
weighted avg       0.85      0.85      0.84       860

