In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm

# Load the training dataset
df_train = pd.read_csv("hf://datasets/christophsonntag/OLID/train.csv")

# Load the test dataset
df_test = pd.read_csv("hf://datasets/christophsonntag/OLID/test.csv")

# Extract tweets and labels from both datasets
train_tweets = np.array(df_train['tweet'].values)
train_labels = np.where(df_train['subtask_a'].values == 'OFF', 1, 0)

test_tweets = np.array(df_test['tweet'].values)
test_labels = np.where(df_test['subtask_a'].values == 'OFF', 1, 0)

print(train_tweets.shape)
print(train_labels.shape)
print(test_tweets.shape)
print(test_labels.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(13240,)
(13240,)
(860,)
(860,)


### Creating tokenizer to turn testing and training tweets into tokens for the BERT model

In [2]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize training and test tweets
train_encodings = tokenizer(list(train_tweets), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_tweets), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Converting the tokens and labels of training and testing into the form the BERT model is expecting (which is a Dataset class)

In [3]:
import torch
from torch.utils.data import Dataset

class OLIDDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Convert tokenized data into datasets
train_dataset = OLIDDataset(train_encodings, train_labels)
test_dataset = OLIDDataset(test_encodings, test_labels)

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 13240
Test set size: 860


### Set up pre-trained BERT model and creater a trainer for it using our specific data set.

In [4]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Load BERT model with dropout regularization
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="best",  # Save only the best model
    load_best_model_at_end=True,  # Fix for EarlyStoppingCallback
    metric_for_best_model="eval_loss",  # Ensure best model is based on validation loss
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    fp16=True,
    learning_rate=3e-6,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfakalizak[0m ([33mfakalizak-michigan-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.446438
2,0.573900,0.387327
3,0.469500,0.371055
4,0.449100,0.36753
5,0.431700,0.367523


TrainOutput(global_step=2070, training_loss=0.4789929799987498, metrics={'train_runtime': 130.1147, 'train_samples_per_second': 508.782, 'train_steps_per_second': 15.909, 'total_flos': 4354487966208000.0, 'train_loss': 0.4789929799987498, 'epoch': 5.0})

### Running model on test data to generate predictions

In [6]:
import torch
from sklearn.metrics import classification_report

# Get predictions from BERT
preds = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), axis=1)

# Print classification report
print(classification_report(test_labels, pred_labels.numpy()))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90       620
           1       0.77      0.64      0.70       240

    accuracy                           0.85       860
   macro avg       0.82      0.78      0.80       860
weighted avg       0.84      0.85      0.84       860



## Using RoBERTa and DistilBERT models for prediction

In [7]:
from transformers import RobertaForSequenceClassification, DistilBertForSequenceClassification

roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2
)

distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    dropout=0.2
)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import RobertaTokenizer, DistilBertTokenizer

# Load pre-trained RoBERTa tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize tweets
roberta_train_encodings = roberta_tokenizer(
    list(train_tweets),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

roberta_test_encodings = roberta_tokenizer(
    list(test_tweets),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Load pre-trained DistilBERT tokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize tweets
distilbert_train_encodings = distilbert_tokenizer(
    list(train_tweets),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

distilbert_test_encodings = distilbert_tokenizer(
    list(test_tweets),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Convert tokenized data into datasets
roberta_train_dataset = OLIDDataset(roberta_train_encodings, train_labels)
roberta_test_dataset = OLIDDataset(roberta_test_encodings, test_labels)

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 13240
Test set size: 860


In [10]:
distilbert_train_dataset = OLIDDataset(distilbert_train_encodings, train_labels)
distilbert_test_dataset = OLIDDataset(distilbert_test_encodings, test_labels)

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 13240
Test set size: 860


### RoBERTa model training and testing

In [11]:
roberta_trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

roberta_trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.429378
2,0.597900,0.377497
3,0.473200,0.366133
4,0.445900,0.362051
5,0.436800,0.35952


TrainOutput(global_step=2070, training_loss=0.486303038297644, metrics={'train_runtime': 121.0673, 'train_samples_per_second': 546.803, 'train_steps_per_second': 17.098, 'total_flos': 4354487966208000.0, 'train_loss': 0.486303038297644, 'epoch': 5.0})

In [12]:
# Get predictions from RoBERTa
preds = roberta_trainer.predict(roberta_test_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), axis=1)

# Print classification report
print(classification_report(test_labels, pred_labels.numpy()))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       620
           1       0.73      0.65      0.69       240

    accuracy                           0.84       860
   macro avg       0.80      0.78      0.79       860
weighted avg       0.83      0.84      0.83       860



### DistilBERT model training and testing

In [13]:
distilbert_trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=distilbert_train_dataset,
    eval_dataset=distilbert_test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

distilbert_trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.417654
2,0.545100,0.391635
3,0.442900,0.38392
4,0.426900,0.37991
5,0.417100,0.379422


TrainOutput(global_step=2070, training_loss=0.4563758739526721, metrics={'train_runtime': 64.6884, 'train_samples_per_second': 1023.368, 'train_steps_per_second': 32.0, 'total_flos': 2192335447756800.0, 'train_loss': 0.4563758739526721, 'epoch': 5.0})

In [14]:
# Get predictions from DistilBERT
preds = distilbert_trainer.predict(distilbert_test_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), axis=1)

# Print classification report
print(classification_report(test_labels, pred_labels.numpy()))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       620
           1       0.82      0.60      0.70       240

    accuracy                           0.85       860
   macro avg       0.84      0.78      0.80       860
weighted avg       0.85      0.85      0.84       860



### Training the BERT model for subtask B

In [15]:
# Filter for OFF tweets only
off_train_df = df_train[df_train['subtask_a'] == 'OFF']
off_test_df = df_test[df_test['subtask_a'] == 'OFF']

# Get inputs and labels for subtask B
train_b_tweets = np.array(off_train_df['tweet'].values)
train_b_labels = np.where(off_train_df['subtask_b'].values == 'TIN', 1, 0)

test_b_tweets = np.array(off_test_df['tweet'].values)
test_b_labels = np.where(off_test_df['subtask_b'].values == 'TIN', 1, 0)

print(train_b_tweets.shape)
print(train_b_labels.shape)
print(test_b_tweets.shape)
print(test_b_labels.shape)

(4400,)
(4400,)
(240,)
(240,)


In [16]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize training and test tweets
train_b_encodings = tokenizer(list(train_b_tweets), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
test_b_encodings = tokenizer(list(test_b_tweets), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Convert tokenized data into datasets
train_b_dataset = OLIDDataset(train_b_encodings, train_b_labels)
test_b_dataset = OLIDDataset(test_b_encodings, test_b_labels)

print(f"Training set size: {len(train_b_dataset)}")
print(f"Test set size: {len(test_b_dataset)}")

Training set size: 4400
Test set size: 240


In [17]:
from transformers import Trainer
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

train_counts = np.bincount(train_b_labels)
class_weights = 1.0 / train_counts
class_weights = class_weights / class_weights.sum()
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)

Class weights: tensor([0.8809, 0.1191])


In [18]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2
)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_b_dataset,
    eval_dataset=test_b_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.63968
2,No log,0.607891
3,No log,0.570747
4,0.660100,0.552455
5,0.660100,0.551852


TrainOutput(global_step=690, training_loss=0.6523660853289176, metrics={'train_runtime': 47.8713, 'train_samples_per_second': 459.565, 'train_steps_per_second': 14.414, 'total_flos': 1447110804480000.0, 'train_loss': 0.6523660853289176, 'epoch': 5.0})

In [19]:
# Get predictions from BERT
preds = trainer.predict(test_b_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), axis=1)

# Print classification report
print(classification_report(test_b_labels, pred_labels.numpy()))

              precision    recall  f1-score   support

           0       0.50      0.63      0.56        27
           1       0.95      0.92      0.94       213

    accuracy                           0.89       240
   macro avg       0.73      0.77      0.75       240
weighted avg       0.90      0.89      0.89       240



### Training the BERT model for subtask C

In [20]:
# Filter for Subtask C (TIN tweets only)
train_c_df = df_train[(df_train["subtask_a"] == "OFF") & (df_train["subtask_b"] == "TIN")]
test_c_df = df_test[(df_test["subtask_a"] == "OFF") & (df_test["subtask_b"] == "TIN")]

# Extract tweets and labels (IND=0, GRP=1, OTH=2)
c_label_map = {"IND": 0, "GRP": 1, "OTH": 2}
train_c_tweets = train_c_df["tweet"].values
train_c_labels = train_c_df["subtask_c"].map(c_label_map).values

test_c_tweets = test_c_df["tweet"].values
test_c_labels = test_c_df["subtask_c"].map(c_label_map).values

# Tokenize
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_c_encodings = tokenizer(list(train_c_tweets), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
test_c_encodings = tokenizer(list(test_c_tweets), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Create datasets
train_c_dataset = OLIDDataset(train_c_encodings, train_c_labels)
test_c_dataset = OLIDDataset(test_c_encodings, test_c_labels)

# Print summary
print(f"Train C Size: {len(train_c_dataset)}")
print(f"Test C Size: {len(test_c_dataset)}")
print("Class Distribution (Train):", dict(pd.Series(train_c_labels).value_counts().sort_index()))
print("Class Distribution (Test):", dict(pd.Series(test_c_labels).value_counts().sort_index()))
print("Unique train labels:", np.unique(train_c_labels))
print("Unique test labels:", np.unique(test_c_labels))

Train C Size: 3876
Test C Size: 213
Class Distribution (Train): {0: np.int64(2407), 1: np.int64(1074), 2: np.int64(395)}
Class Distribution (Test): {0: np.int64(100), 1: np.int64(78), 2: np.int64(35)}
Unique train labels: [0 1 2]
Unique test labels: [0 1 2]


In [26]:
train_counts = np.bincount(train_c_labels)
class_weights = 1.0 / train_counts
class_weights = class_weights / class_weights.sum()  # normalize
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)

Class weights: tensor([0.1071, 0.2401, 0.6528])


In [27]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Load BERT model with dropout regularization
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2
)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_c_dataset,
    eval_dataset=test_c_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.094848
2,No log,1.062211
3,No log,1.035649
4,No log,1.020508
5,1.072100,1.013304


TrainOutput(global_step=610, training_loss=1.0612883020619877, metrics={'train_runtime': 42.3231, 'train_samples_per_second': 457.906, 'train_steps_per_second': 14.413, 'total_flos': 1274784508892160.0, 'train_loss': 1.0612883020619877, 'epoch': 5.0})

In [28]:
# Get predictions from BERT
preds = trainer.predict(test_c_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), axis=1)

# Print classification report
print(classification_report(test_c_labels, pred_labels.numpy()))

              precision    recall  f1-score   support

           0       0.75      0.42      0.54       100
           1       0.49      0.91      0.64        78
           2       0.17      0.06      0.09        35

    accuracy                           0.54       213
   macro avg       0.47      0.46      0.42       213
weighted avg       0.56      0.54      0.50       213

