Link to our [SemEval task](https://propaganda.math.unipd.it/semeval2024task4/teampage.php?passcode=7a4c50dc60f44593a07529d2253593e9)

to do:
- import models
- finetune models
- evaluate models
- adjust test data so each item's labels are a list of applicable labels instead of a single "/"-separated string


# Setting up libraries and mounting Google Drive




In [4]:
# install dependencies and codebase
# transformers Hugging Face library: access to BERT / GPT
!pip install torch transformers datasets tqdm gdown==v4.6.3
!mkdir checkpoints

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting gdown==v4.6.3
  Downloading gdown-4.6.3-py3-none-any.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading gdown-4.6.3-py3-none-any.whl (14 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The SemEval dataset files can be uploaded to Drive so that both of us
# have access to the same copies from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing the Data

In [10]:
import json
from dataclasses import dataclass
from typing import List, Dict, Tuple, Union

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

creating Meme object and dataset

In [54]:
# one meme = its text plus (optionally) the labels attached to it
@dataclass
class Meme:
  """A single meme example: the text and its labels, if any."""

  text: str
  # None when the source record carries no "labels" key (e.g. test data)
  labels: Union[List[str], None]

  @staticmethod
  def from_dict(data: dict):
    """Build a Meme from a raw record.

    Reads the required "text" key and the optional "labels" key; any other
    keys in the record are ignored. Raises KeyError if "text" is missing.
    """
    return Meme(text=data["text"], labels=data.get("labels"))

# custom dataset for meme data, supporting multi-label classification
class MemeDataset(Dataset):
  """Dataset of Meme examples with optional multi-label encoding.

  The tokenizer is stored on the class rather than the instance because
  collate_fn is a static method handed to DataLoader. Every MemeDataset
  therefore shares the tokenizer of the most recently constructed instance.
  """

  def __init__(self, tokenizer, data: List[Dict], label_encoder=None):
    """Build the examples and, when an encoder is given, their label vectors.

    Args:
      tokenizer: callable used by collate_fn to tokenize a list of texts.
      data: raw records accepted by Meme.from_dict.
      label_encoder: optional object whose transform([labels]) returns one
        encoded row per label list (for example a fitted multi-label
        binarizer). When omitted, labels are left exactly as loaded.
    """
    MemeDataset.tokenizer = tokenizer
    self.examples = [Meme.from_dict(item) for item in data]
    self.label_encoder = label_encoder

    if label_encoder is not None:
      # Test `is not None` rather than truthiness so that an empty label list
      # is still passed to the encoder; only samples without any labels field
      # stay None.
      self.encoded_labels = [
          label_encoder.transform([item.labels])[0] if item.labels is not None else None
          for item in self.examples
      ]
    else:
      self.encoded_labels = None

  def __len__(self):
    """Return the number of examples."""
    return len(self.examples)

  def __getitem__(self, idx):
    """Return the example at idx, carrying encoded labels when available.

    With a label encoder, an integer index yields a Meme whose `labels` is
    the encoded vector as a list of floats, so collate_fn can stack it into
    a tensor. Without an encoder (or for an unlabelled sample, or a slice)
    the stored example(s) are returned unchanged.
    """
    example = self.examples[idx]
    encoded = None if self.encoded_labels is None else self.encoded_labels[idx]
    if encoded is None or isinstance(idx, slice):
      return example
    return Meme(text=example.text, labels=[float(value) for value in encoded])

  # batch processing
  @staticmethod
  def collate_fn(batched_samples: List["Meme"], max_length=512):
    """Tokenize a batch of samples and stack their labels.

    Args:
      batched_samples: samples exposing `.text` and `.labels`.
      max_length: truncation length forwarded to the tokenizer.

    Returns:
      Dict with "input_ids", "attention_mask" and "labels". "labels" is a
      float32 tensor with one row per sample, or None when no sample in the
      batch has labels.

    Raises:
      ValueError: if only some samples have labels (the label rows would no
        longer line up with input_ids), or if the labels cannot be converted
        to a numeric tensor (e.g. raw string labels that were never encoded).
    """
    batched_texts = [sample.text for sample in batched_samples]
    batched_labels = [sample.labels for sample in batched_samples]
    has_labels = [labels is not None for labels in batched_labels]

    text_encoding = MemeDataset.tokenizer(
        batched_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    if not any(has_labels):
      labels_tensor = None
    elif not all(has_labels):
      # Dropping the unlabelled rows would misalign labels with input_ids.
      raise ValueError("batch mixes labelled and unlabelled samples")
    else:
      try:
        labels_tensor = torch.tensor(batched_labels, dtype=torch.float32) # multi-label as float
      except (TypeError, ValueError) as err:
        raise ValueError(
            "labels must be equal-length numeric vectors; build MemeDataset "
            "with a fitted label_encoder so string labels are encoded"
        ) from err

    return {
        "input_ids": text_encoding["input_ids"],
        "attention_mask": text_encoding["attention_mask"],
        "labels": labels_tensor,
    }


In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
# Multi-label binarizer instance; it is created here without being fitted.
# NOTE(review): presumably meant to be fitted on the training labels and then
# passed as MemeDataset's `label_encoder` (which calls `.transform` on it) —
# confirm where that happens.
mlb = MultiLabelBinarizer()

In [57]:
def load_json(file_path):
  """Parse the JSON file at `file_path` and return the decoded object.

  The file is read as UTF-8 explicitly so that non-ASCII text is decoded
  the same way regardless of the platform's default encoding.
  """
  with open(file_path, 'r', encoding='utf-8') as f:
    return json.load(f)

# Google Drive locations of the dataset splits
train_file_path = '/content/drive/MyDrive/NLP_FINAL/train.json'
val_file_path = '/content/drive/MyDrive/NLP_FINAL/validation.json'
dev_unlabeled_file_path = '/content/drive/MyDrive/NLP_FINAL/dev_unlabeled.json'
test_file_path = '/content/drive/MyDrive/NLP_FINAL/dev_subtask1_en.json'

# parse every split up front, keyed by split name
raw_datasets = {
    "train": load_json(train_file_path),
    "validation": load_json(val_file_path),
    "dev": load_json(dev_unlabeled_file_path),
    "test": load_json(test_file_path),
}

# sanity check: show the first training record, its type, and its text
first_train_item = raw_datasets["train"][0]
print(first_train_item)
print(type(first_train_item))
print(first_train_item["text"])

{'id': '65635', 'text': 'THIS IS WHY YOU NEED\\n\\nA SHARPIE WITH YOU AT ALL TIMES', 'labels': ['Black-and-white Fallacy/Dictatorship'], 'link': 'https://www.facebook.com/photo/?fbid=4023552137722493&set=g.633131750534436'}
<class 'dict'>
THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ALL TIMES


In [58]:
from transformers import AutoTokenizer

# tokenizer from the pretrained "roberta-base" checkpoint
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# wrap each raw split in a MemeDataset (each one gets its own copy of the list)
# NOTE(review): no label_encoder is passed here, so labels stay as the raw
# string lists loaded from JSON — confirm this is intended before batching.
datasets = {}
for split_name, split_data in raw_datasets.items():
  datasets[split_name] = MemeDataset(tokenizer, list(split_data))

# validation batches: fixed order, 64 samples each, collated by MemeDataset
validation_dataloader = DataLoader(
    datasets['validation'],
    batch_size=64,
    shuffle=False,
    collate_fn=MemeDataset.collate_fn,
    num_workers=2,
)

# sanity check: first wrapped training example
print(datasets["train"][0])

Meme(text='THIS IS WHY YOU NEED\\n\\nA SHARPIE WITH YOU AT ALL TIMES', labels=['Black-and-white Fallacy/Dictatorship'])
