# Bert baseline for POLAR

## Introduction

In this part of the starter notebook, we will take you through the process of all three Subtasks.

## Subtask 1 - Polarization detection

This is a binary classification to determine whether a post contains polarized content (Polarized or Not Polarized).

In [None]:
!unzip dev_phase.zip

## Imports

In [1]:
import transformers
print(transformers.__file__)


  from .autonotebook import tqdm as notebook_tqdm


/Users/tejo9855/Documents/Classes/Fall '25/NLP - Martin/Assignments/SemEval2026-task9/.venv/lib/python3.13/site-packages/transformers/__init__.py


In [2]:
from collections import Counter

In [3]:
from transformers import TrainingArguments

In [4]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [54]:
import wandb

# Disable wandb logging for this script
wandb.init(mode="disabled")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Data Import

The training data consists of a short text and binary labels

The data is structured as a CSV file with the following fields:
- id: a unique identifier for the sample
- text: a sentence or short text
- polarization:  1 text is polarized, 0 text is not polarized

The data is in all three subtask folders the same but only containing the labels for the specific task.

In [55]:
# Load the training and validation data for subtask 1

train = pd.read_csv('subtask1/train/eng.csv')
val = pd.read_csv('subtask1/train/eng.csv')

train.head()

Unnamed: 0,id,text,polarization
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0


In [75]:
test = pd.read_csv('subtask1/dev/eng.csv')

In [92]:
test

Unnamed: 0,id,text,polarization
0,en_f66ca14d60851371f9720aaf4ccd9b58,God is with Ukraine and Zelensky,
1,en_3a489aa7fed9726aa8d3d4fe74c57efb,"4 Dems, 2 Republicans Luzerne County Council s...",
2,en_95770ff547ea5e48b0be00f385986483,Abuse Survivor Recounts Her Struggles at YWCA ...,
3,en_2048ae6f9aa261c48e6d777bcc5b38bf,"After Rwanda, another deportation camp disaster",
4,en_07781aa88e61e7c0a996abd1e5ea3a20,Another plea in Trump election interference probe,
...,...,...,...
128,en_2fd4484b6bab80971a96b2100e20966a,Should run for GOP press. LOL,
129,en_0e11358da0c0e0cd8e0fdedf05a0cbba,"The shampoo will rinse down in the shower, and...",
130,en_0fb944d51bb376102a3ea6b65bafab6a,Tomorrow Mike Pence is in love with bussy,
131,en_d9253eaeb206934208a57786b688c316,Unknown gunmen have shot and killed an Israeli...,


In [56]:
len(train)

2676

In [57]:
Counter(val.polarization)

Counter({0: 1674, 1: 1002})

# Dataset
-  Create a pytorch class for handling data
-  Wrapping the raw texts and labels into a format that Huggingface’s Trainer can use for training and evaluation

In [58]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    self.labels=labels
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    text=self.texts[idx]
    label=self.labels[idx]
    encoding=self.tokenizer(text,truncation=True,padding=False,max_length=self.max_length,return_tensors='pt')

    # Ensure consistent tensor conversion for all items
    item = {key: encoding[key].squeeze() for key in encoding.keys()}
    item['labels'] = torch.tensor(label, dtype=torch.long)
    return item

In [82]:
class TestPolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_length=128, ids=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        return item

Now, we'll tokenize the text data and create the datasets using `bert-base-uncased` as the tokenizer.

In [59]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Create datasets
train_dataset = PolarizationDataset(train['text'].tolist(), train['polarization'].tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val['polarization'].tolist(), tokenizer)

In [83]:
test_dataset = TestPolarizationDataset(test['text'].tolist(), test['polarization'].tolist(), tokenizer)

In [61]:
set(train_dataset.labels)

{0, 1}

In [62]:
str_len_list = []
for text in train_dataset.texts:
    str_len_list.append(len(text))

In [63]:
max(str_len_list)

299

Next, we'll load the pre-trained `bert-base-uncased` model for sequence classification. Since this is a binary classification task (Polarized/Not Polarized), we set `num_labels=2`.

In [64]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now, we'll define the training arguments and the evaluation metric. We'll use macro F1 score for evaluation.

In [65]:
# Define metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
        output_dir=f"./",
        num_train_epochs=1,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=2,
        eval_strategy="epoch",
        save_strategy="no",
        # logging_steps=100,
        logging_strategy="no",
        disable_tqdm=False,
        no_cuda=True
    )




Finally, we'll initialize the `Trainer` and start training.

In [66]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

In [67]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.341643,0.84586


TrainOutput(global_step=669, training_loss=0.4925452262296805, metrics={'train_runtime': 92.8414, 'train_samples_per_second': 28.823, 'train_steps_per_second': 7.206, 'total_flos': 20533481693232.0, 'train_loss': 0.4925452262296805, 'epoch': 1.0})

In [68]:

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

Macro F1 score on validation set: 0.8458601714415668


In [84]:
predictions = trainer.predict(test_dataset)

In [97]:
predictions.predictions

array([[ 0.80841994, -0.95065427],
       [ 1.7726249 , -2.0752807 ],
       [ 1.5228413 , -2.074303  ],
       [ 1.274685  , -1.7090815 ],
       [ 1.65749   , -2.0647178 ],
       [ 1.5189959 , -1.9382255 ],
       [ 0.362134  , -0.801064  ],
       [ 1.8673475 , -2.3095307 ],
       [ 1.8860964 , -2.3123853 ],
       [ 0.12346455, -0.582104  ],
       [ 1.8749917 , -2.3235226 ],
       [ 1.9421335 , -2.4173486 ],
       [-0.8257258 ,  0.7105239 ],
       [ 1.7407446 , -2.257275  ],
       [ 1.8449645 , -2.2399752 ],
       [ 1.8535681 , -2.3629668 ],
       [ 1.8661692 , -2.4241593 ],
       [ 1.371798  , -1.848873  ],
       [ 1.6753466 , -2.1952906 ],
       [-0.9580275 ,  0.92904997],
       [ 1.8522745 , -2.3188694 ],
       [ 1.8405004 , -2.2848642 ],
       [ 1.4821299 , -1.8248116 ],
       [ 1.2656393 , -1.6960965 ],
       [ 1.6848361 , -2.184631  ],
       [ 1.1265047 , -1.3484718 ],
       [ 1.707003  , -2.1666794 ],
       [ 1.7333535 , -2.179791  ],
       [ 1.5168521 ,

In [98]:
pred_labels = np.argmax(predictions.predictions, axis=1)

In [100]:
pred_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0])

In [93]:
ids = test.id


In [94]:
len(ids)

133

In [95]:
df = pd.DataFrame({
    'id': ids,
    'polarization': pred_labels
})

In [96]:
df

Unnamed: 0,id,polarization
0,en_f66ca14d60851371f9720aaf4ccd9b58,0
1,en_3a489aa7fed9726aa8d3d4fe74c57efb,0
2,en_95770ff547ea5e48b0be00f385986483,0
3,en_2048ae6f9aa261c48e6d777bcc5b38bf,0
4,en_07781aa88e61e7c0a996abd1e5ea3a20,0
...,...,...
128,en_2fd4484b6bab80971a96b2100e20966a,0
129,en_0e11358da0c0e0cd8e0fdedf05a0cbba,1
130,en_0fb944d51bb376102a3ea6b65bafab6a,0
131,en_d9253eaeb206934208a57786b688c316,0


In [101]:
df.to_csv("polarization_predictions.csv", index=False)


# Subtask 2: Polarization Type Classification
Multi-label classification to identify the target of polarization as one of the following categories: Gender/Sexual, Political, Religious, Racial/Ethnic, or Other.
For this task we will load the data for subtask 2.

In [19]:
train = pd.read_csv('subtask2/train/eng.csv')
val = pd.read_csv('subtask2/train/eng.csv')
train.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0


In [20]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Ensure consistent tensor conversion for all items
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        # CHANGE THIS LINE: Use torch.float instead of torch.long for multi-label classification
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item


In [21]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create train and Test dataset for multilabel
train_dataset = PolarizationDataset(train['text'].tolist(), train[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)
dev_dataset = PolarizationDataset(val['text'].tolist(), val[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)


In [22]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5, problem_type="multi_label_classification") # 5 labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Compute macro F1 score
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False
)

In [24]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.2301,0.180045,0.239544
2,0.1743,0.124341,0.459029
3,0.1473,0.106562,0.508902




Macro F1 score on validation set for Subtask 2: 0.5089023985811894


# Subtask 3: Manifestation Identification
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



In [25]:
train = pd.read_csv('subtask3/train/eng.csv')
val = pd.read_csv('subtask3/train/eng.csv')

train.head()

Unnamed: 0,id,text,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0,0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0,0


In [26]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Ensure consistent tensor conversion for all items
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        # CHANGE THIS LINE: Use torch.float instead of torch.long for multi-label classification
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

In [27]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create train and Test dataset for multilabel
train_dataset = PolarizationDataset(train['text'].tolist(), train[['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']].values.tolist(), tokenizer)

In [28]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6, problem_type="multi_label_classification") # use 6 labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False
)

# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Compute macro F1 score
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

In [30]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 3: {eval_results['eval_f1_macro']}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.4085,0.351823,0.395926
2,0.3397,0.289207,0.495095
3,0.3141,0.26908,0.57429




Macro F1 score on validation set for Subtask 3: 0.5742903939114335
