# Bert baseline for POLAR

## Introduction

In this part of the starter notebook, we will take you through the process of all three Subtasks.

## Subtask 1 - Polarization detection

This is a binary classification to determine whether a post contains polarized content (Polarized or Not Polarized).

## Imports

In [1]:
import transformers
print(transformers.__file__)


  from .autonotebook import tqdm as notebook_tqdm


/Users/tejo9855/Documents/Classes/Fall '25/NLP - Martin/Assignments/SemEval2026-task9/.venv/lib/python3.13/site-packages/transformers/__init__.py


In [2]:
from collections import Counter

In [3]:
from transformers import TrainingArguments

In [4]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [5]:
import wandb

# Disable wandb logging for this script
wandb.init(mode="disabled")



## Data Import

The training data consists of a short text and binary labels

The data is structured as a CSV file with the following fields:
- id: a unique identifier for the sample
- text: a sentence or short text
- polarization:  1 text is polarized, 0 text is not polarized

The data is in all three subtask folders the same but only containing the labels for the specific task.

In [6]:
# Load the training and validation data for subtask 1

train = pd.read_csv('subtask1/train/eng.csv')
val = pd.read_csv('subtask1/train/eng.csv')

train.head()

Unnamed: 0,id,text,polarization
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0


In [7]:
train.to_excel('train_set.xlsx')

In [8]:
Counter(train['polarization'])

Counter({0: 1674, 1: 1002})

In [9]:
test = pd.read_csv('subtask1/dev/eng.csv')

In [10]:
test

Unnamed: 0,id,text,polarization
0,eng_f66ca14d60851371f9720aaf4ccd9b58,God is with Ukraine and Zelensky,
1,eng_3a489aa7fed9726aa8d3d4fe74c57efb,"4 Dems, 2 Republicans Luzerne County Council s...",
2,eng_95770ff547ea5e48b0be00f385986483,Abuse Survivor Recounts Her Struggles at YWCA ...,
3,eng_2048ae6f9aa261c48e6d777bcc5b38bf,"After Rwanda, another deportation camp disaster",
4,eng_07781aa88e61e7c0a996abd1e5ea3a20,Another plea in Trump election interference probe,
...,...,...,...
128,eng_2fd4484b6bab80971a96b2100e20966a,Should run for GOP press. LOL,
129,eng_0e11358da0c0e0cd8e0fdedf05a0cbba,"The shampoo will rinse down in the shower, and...",
130,eng_0fb944d51bb376102a3ea6b65bafab6a,Tomorrow Mike Pence is in love with bussy,
131,eng_d9253eaeb206934208a57786b688c316,Unknown gunmen have shot and killed an Israeli...,


In [11]:
len(train)

2676

In [12]:
Counter(val.polarization)

Counter({0: 1674, 1: 1002})

# Pre process Data

In [21]:
import re

In [52]:
patterns_to_mask = [
    # Political figures
    r"\bdonald\s+trump\b|\btrump(s)?\b",
    r"\bjoe\s+biden\b|\bbiden\b",
    r"\bbarack\s+obama\b|\bobama\b",
    r"\bhillary\s+clinton\b|\bclinton\b",
    r"\bnancy\s+pelosi\b|\bpelosi\b",
    r"\bchuck\s+schumer\b|\bschumer\b",
    r"\bmitch\s+mcconnell\b|\bmcconnell\b",
    r"\bkamala\s+harris\b|\bkamala\b",
    r"\bmike\s+pence\b|\bpence\b",
    r"\bron\s+desantis\b|\bdesantis\b",

    # Political parties & ideologies
    r"\bdemocrat(s)?\b",
    r"\brepublican(s)?\b",
    r"\bliberal(s)?\b",
    r"\bconservative(s)?\b",
    r"\bprogressive(s)?\b",
    r"\bleftist(s)?\b",
    r"\bright-?wing\b",

    # Countries & regions
    r"\bisrael\b|\bpalestine\b|\bgaza\b|\biran\b|\bukraine\b|\brussia\b|\bchina\b|\btaiwan\b",

    # Hot-button issues
    r"\babortion\b",
    r"\bgun(s)?\b",
    r"\bimmigration\b|\bborder\b",
    r"\bclimate\b",
    r"\bcovid\b|\bvaccine(s)?\b|\bmask mandate\b",

    # Movements & slogans
    r"\bblm\b|\bblack\s+lives\s+matter\b",
    r"\bmaga\b",
    r"\bantifa\b",
    r"\bwoke\b",
    r"\bcancel\s+culture\b",
    r"\bme\s+too\b",

    # Media outlets
    r"\bcnn\b",
    r"\bfox\b|\bfox\s+news\b",
    r"\bmsnbc\b",
    r"\bbreitbart\b",
    r"\bnytimes\b|\bnew\s+york\s+times\b|\bnyt\b",
    r"\bwashington\s+post\b",
    r"\btwitter\b|\bx\b",
    r"\bfacebook\b"
]


In [53]:
def mask_spurious_words(text):
    for pattern in patterns_to_mask:
        text = re.sub(pattern, "MASKED_TOKEN", text, flags=re.IGNORECASE)
    return text

In [62]:
train_masked = train
val_masked = val
test_masked = test

In [63]:
train_masked["text"] = train["text"].apply(mask_spurious_words)
val_masked["text"] = val["text"].apply(mask_spurious_words)
test_masked["text"] = test["text"].apply(mask_spurious_words)

In [58]:
val_masked.to_excel('val_masked.xlsx')

# Dataset
-  Create a pytorch class for handling data
-  Wrapping the raw texts and labels into a format that Huggingfaceâ€™s Trainer can use for training and evaluation

In [64]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    self.labels=labels
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    text=self.texts[idx]
    label=self.labels[idx]
    encoding=self.tokenizer(text,truncation=True,padding=False,max_length=self.max_length,return_tensors='pt')

    # Ensure consistent tensor conversion for all items
    item = {key: encoding[key].squeeze() for key in encoding.keys()}
    item['labels'] = torch.tensor(label, dtype=torch.long)
    # try:
    #   item['labels'] = int(label)
    # except Exception as e:
    #   print(f"Error {e} - {item}")
    #   item['labels'] = 0

    return item

In [65]:
class TestPolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_length=128, ids=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        return item

Now, we'll tokenize the text data and create the datasets using `bert-base-uncased` as the tokenizer.

In [90]:
model_name = "vinai/bertweet-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [91]:
# Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Create datasets
train_dataset = PolarizationDataset(train_masked['text'].tolist(), train_masked['polarization'].tolist(), tokenizer)
val_dataset = PolarizationDataset(val_masked['text'].tolist(), val_masked['polarization'].tolist(), tokenizer)

In [None]:
# test_dataset = TestPolarizationDataset(test_masked['text'].tolist(), test_masked['polarization'].tolist(), tokenizer)

In [92]:
test_dataset = TestPolarizationDataset(test['text'].tolist(), test['polarization'].tolist(), tokenizer)

In [68]:
set(train_dataset.labels)

{0, 1}

In [69]:
str_len_list = []
for text in train_dataset.texts:
    str_len_list.append(len(text))

In [70]:
max(str_len_list)

419

# Training

Next, we'll load the pre-trained `bert-base-uncased` model for sequence classification. Since this is a binary classification task (Polarized/Not Polarized), we set `num_labels=2`.

In [93]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now, we'll define the training arguments and the evaluation metric. We'll use macro F1 score for evaluation.

In [94]:
# Define metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
        output_dir=f"./",
        num_train_epochs=6,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=2,
        weight_decay = 0.01,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        eval_strategy="epoch",
        save_strategy="no",
        # logging_steps=100,
        logging_strategy="no",
        disable_tqdm=False,
        no_cuda=True
    )




Finally, we'll initialize the `Trainer` and start training.

In [95]:
from transformers import default_data_collator
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

In [96]:
# Train the model
trainer.train()

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.349184,0.853389
2,No log,0.230005,0.91638
3,No log,0.142977,0.95249
4,No log,0.093484,0.974525
5,No log,0.067044,0.985216
6,No log,0.065334,0.984423


TrainOutput(global_step=1008, training_loss=0.2637803213936942, metrics={'train_runtime': 738.2765, 'train_samples_per_second': 21.748, 'train_steps_per_second': 1.365, 'total_flos': 417458578210560.0, 'train_loss': 0.2637803213936942, 'epoch': 6.0})

In [104]:
trainer.save_model("./saved_model")
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/bpe.codes',
 './saved_model/added_tokens.json')

In [105]:
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [76]:
# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

Macro F1 score on validation set: 0.9532883053516021


In [27]:
train_predictions = trainer.predict(train_dataset)

In [78]:
test_predictions = trainer.predict(test_dataset)
test_predictions = np.argmax(test_predictions.predictions, axis=1)

In [79]:
test_predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0])

In [80]:
test['predicted_polarization'] = test_predictions

In [81]:
test

Unnamed: 0,id,text,polarization,predicted_polarization
0,eng_f66ca14d60851371f9720aaf4ccd9b58,God is with MASKED_TOKEN and Zelensky,,0
1,eng_3a489aa7fed9726aa8d3d4fe74c57efb,"4 Dems, 2 MASKED_TOKEN Luzerne County Council ...",,0
2,eng_95770ff547ea5e48b0be00f385986483,Abuse Survivor Recounts Her Struggles at YWCA ...,,0
3,eng_2048ae6f9aa261c48e6d777bcc5b38bf,"After Rwanda, another deportation camp disaster",,0
4,eng_07781aa88e61e7c0a996abd1e5ea3a20,Another plea in MASKED_TOKEN election interfer...,,0
...,...,...,...,...
128,eng_2fd4484b6bab80971a96b2100e20966a,Should run for GOP press. LOL,,0
129,eng_0e11358da0c0e0cd8e0fdedf05a0cbba,"The shampoo will rinse down in the shower, and...",,1
130,eng_0fb944d51bb376102a3ea6b65bafab6a,Tomorrow MASKED_TOKEN is in love with bussy,,0
131,eng_d9253eaeb206934208a57786b688c316,Unknown gunmen have shot and killed an Israeli...,,0


In [82]:
test.to_excel('test_set_predictions.xlsx')

In [50]:
train_predictions

PredictionOutput(predictions=array([[ 3.3171904 , -3.0713718 ],
       [ 3.1359115 , -2.7941902 ],
       [ 3.27824   , -2.9667137 ],
       ...,
       [-2.4472935 ,  2.34022   ],
       [-0.55915266,  0.7921563 ],
       [-2.5786695 ,  2.5507584 ]], shape=(2676, 2), dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 1], shape=(2676,)), metrics={'test_loss': 0.15930086374282837, 'test_f1_macro': 0.9585035859662725, 'test_runtime': 18.2843, 'test_samples_per_second': 146.355, 'test_steps_per_second': 73.177})

In [32]:
train['predicted_polarization'] = train_predictions[1]

In [33]:
train.head()

Unnamed: 0,id,text,polarization,predicted_polarization
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0


In [None]:
mismatched_rows = train[train['polarization'] != train['predicted_polarization']]

In [35]:
len(mismatched_rows)

0

In [97]:
predictions = trainer.predict(test_dataset)

In [98]:
pred_labels = np.argmax(predictions.predictions, axis=1)

In [99]:
pred_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0])

In [None]:
ids = test.id

In [101]:
len(ids)

133

In [102]:
df = pd.DataFrame({
    'id': ids,
    'polarization': pred_labels
})

In [103]:
df.to_csv("subtask_1/pred_eng.csv", index=False)

# Decoder Model Approach

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "google/gemma-2b"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

Fetching 2 files:   0%|          | 0/2 [01:39<?, ?it/s]
Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
prompt = "Classify this text as liberal or conservative: 'The government should provide universal healthcare.'"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)

In [None]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Subtask 2: Polarization Type Classification
Multi-label classification to identify the target of polarization as one of the following categories: Gender/Sexual, Political, Religious, Racial/Ethnic, or Other.
For this task we will load the data for subtask 2.

In [19]:
train = pd.read_csv('subtask2/train/eng.csv')
val = pd.read_csv('subtask2/train/eng.csv')
train.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0


In [20]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Ensure consistent tensor conversion for all items
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        # CHANGE THIS LINE: Use torch.float instead of torch.long for multi-label classification
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item


In [21]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create train and Test dataset for multilabel
train_dataset = PolarizationDataset(train['text'].tolist(), train[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)
dev_dataset = PolarizationDataset(val['text'].tolist(), val[['gender/sexual','political','religious','racial/ethnic','other']].values.tolist(), tokenizer)


In [22]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5, problem_type="multi_label_classification") # 5 labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Compute macro F1 score
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False
)

In [24]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.2301,0.180045,0.239544
2,0.1743,0.124341,0.459029
3,0.1473,0.106562,0.508902




Macro F1 score on validation set for Subtask 2: 0.5089023985811894


# Subtask 3: Manifestation Identification
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



In [25]:
train = pd.read_csv('subtask3/train/eng.csv')
val = pd.read_csv('subtask3/train/eng.csv')

train.head()

Unnamed: 0,id,text,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0,0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0,0


In [26]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Ensure consistent tensor conversion for all items
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        # CHANGE THIS LINE: Use torch.float instead of torch.long for multi-label classification
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

In [27]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create train and Test dataset for multilabel
train_dataset = PolarizationDataset(train['text'].tolist(), train[['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val['text'].tolist(), val[['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']].values.tolist(), tokenizer)

In [28]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6, problem_type="multi_label_classification") # use 6 labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False
)

# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Compute macro F1 score
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

In [30]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_multilabel,  # Use the new metrics function
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 3: {eval_results['eval_f1_macro']}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.4085,0.351823,0.395926
2,0.3397,0.289207,0.495095
3,0.3141,0.26908,0.57429




Macro F1 score on validation set for Subtask 3: 0.5742903939114335
