<a href="https://colab.research.google.com/github/maoo1/meme-propaganda-detector/blob/master/MEME_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[sem eval task](https://propaganda.math.unipd.it/semeval2024task4/teampage.php?passcode=7a4c50dc60f44593a07529d2253593e9)

to do:
- import models
- finetune models
- evaluate models

In [None]:
!git pull origin master

fatal: not a git repository (or any of the parent directories): .git



# Setting up libraries and mounting google drive




In [3]:
# install dependencies and codebase
# transformers Hugging Face library: access to BERT / GPT
!pip install torch transformers datasets tqdm gdown==v4.6.3
#!pip install evaluate
#!pip install sklearn_hierarchical_classification
!mkdir checkpoints

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting gdown==v4.6.3
  Downloading gdown-4.6.3-py3-none-any.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading gdown-4.6.3-py3-none-any.whl (14 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0

In [4]:
# mounting data and files to drive
# hypothetically can upload SemEval datasets to drive for easier access + we would both have access
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# HIERARCHY

# Preprocessing the Data

In [5]:
import json
from dataclasses import dataclass
from typing import List, Dict, Tuple, Union

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

creating Meme object and dataset

In [6]:
# represents single meme as text, label
@dataclass
class Meme:
  text: str
  labels: Union[List[str], None] # labels can be none for test data

  @staticmethod
  def from_dict(data: dict):
    text = data["text"]
    labels = data.get("labels")
    if len(labels) == 0:
        labels = ['None']
    return Meme(text=text, labels=labels)

# custom dataset for meme data, supporting multi-label classification
class MemeDataset(Dataset):

  def __init__(self, tokenizer, data: List[Dict], label_encoder):
    MemeDataset.tokenizer = tokenizer
    self.examples = [Meme.from_dict(item) for item in data]
    self.label_encoder = label_encoder

    if label_encoder:
      # encode labels if available

      self.encoded_labels = [label_encoder.fit_transform([item.labels]) for item in self.examples]
    else:
      self.encoded_labels = None

  def __len__(self):
    return len(self.examples)

  # def __getitem__(self, idx):
    # return self.examples[idx]
  def __getitem__(self, idx):
    example = self.examples[idx]
    labels = self.encoded_labels[idx] if self.encoded_labels else None
    return {"text": example.text, "labels": labels}

  # batch processing
  @staticmethod
  def collate_fn(batched_samples: List[Meme], max_length=512):

    batched_texts = [sample.text for sample in batched_samples]
    batched_labels = [sample.labels for sample in batched_samples if sample.labels is not None]

    text_encoding = MemeDataset.tokenizer(
        batched_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    if batched_labels:
      labels_tensor = torch.tensor(batched_labels, dtype=torch.float32) # multi-label as float
    else:
      labels_tensor = None

    return {
        "input_ids": text_encoding["input_ids"],
        "attention_mask": text_encoding["attention_mask"],
        "labels": labels_tensor,
    }

  def inverse_transform_labels(encoded_labels, label_encoder):

    if encoded_labels is not None:
      return label_encoder.inverse_transform(encoded_labels)
    return []

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer
class_labels = ['Appeal to authority', 'Appeal to fear/prejudice', 'Bandwagon',
                'Black-and-white Fallacy/Dictatorship', 'Causal Oversimplification',
                'Doubt', 'Exaggeration/Minimisation', 'Flag-waving',
                'Glittering generalities (Virtue)', 'Loaded Language',
                "Misrepresentation of Someone's Position (Straw Man)",
                'Name calling/Labeling', 'Obfuscation, Intentional vagueness, Confusion',
                'Presenting Irrelevant Data (Red Herring)', 'Reductio ad hitlerum',
                'Repetition', 'Slogans', 'Smears', 'Thought-terminating cliché',
                'Whataboutism', 'None']
mlb = MultiLabelBinarizer(classes=class_labels)

In [8]:
def load_json(file_path):
  with open(file_path, 'r') as f:
    return json.load(f)

# initializing datasets
def initialize_datasets(tokenizer, dataset_paths: Dict, label_encoder) -> dict:

  raw_datasets = {}

  # loading raw datasets from file paths
  for split_name, file_path in dataset_paths.items():
    raw_datasets[split_name] = load_json(file_path)

  # fit mlb with full hierarchy
  #all_labels = list(hierarchy.keys())
  #label_encoder = MultiLabelBinarizer(classes=all_labels)
  #label_encoder.fit([all_labels])
  #print(type(label_encoder))

  # convert raw datasets to MemeDataset objects
  split_datasets = {}
  for split_name, split_data in raw_datasets.items():
    split_datasets[split_name] = MemeDataset(tokenizer, list(split_data), label_encoder)

  return split_datasets

dataset_paths = {
  "train": '/content/drive/MyDrive/NLP_FINAL/train.json',
  "validation": '/content/drive/MyDrive/NLP_FINAL/validation.json',
  # "dev_unlabeled":  '/content/drive/MyDrive/NLP_FINAL/dev_unlabeled.json',
  "test":  '/content/drive/MyDrive/NLP_FINAL/dev_subtask1_en.json'
}

Loading the data

In [9]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

"""
datasets = {}
for split_name in raw_datasets.keys():
  split_data = list(raw_datasets[split_name])

  datasets[split_name] = MemeDataset(tokenizer, split_data)
"""
# loading datasets
datasets = initialize_datasets(tokenizer, dataset_paths, mlb)

length_train = len(datasets['train'])
length_val = len(datasets['validation'])
# length_dev_unlabaled = len(datasets['dev_unlabeled'])
length_test = len(datasets['test'])

print("num training examples:", length_train)
print("num validation examples:", length_val)
print("num test examples:", length_test)

#validation_dataloader = DataLoader(datasets['validation'],
#                                   batch_size=64,
#                                   shuffle=False,
#                                   collate_fn=MemeDataset.collate_fn,
#                                   num_workers=2)

for i in range(0, 6):
  print(datasets['train'][i]['text'])
  print(datasets['train'][i]['labels'])
  print(mlb.inverse_transform(datasets['train'][i]['labels']))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

num training examples: 7000
num validation examples: 500
num test examples: 1000
THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ALL TIMES
[[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[('Black-and-white Fallacy/Dictatorship',)]
GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND ANOOSHEH ASHOORI HAVE BEEN RELEASED\n\nAfter years of being unjustly detained in Iran, they are making their way safely back to the UK.
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
[('Glittering generalities (Virtue)', 'Loaded Language')]
PAING PHYO MIN IS FREE!
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
[('None',)]
Move your ships away!\n\noooook\n\nMove your ships away!\n\nNo, and I just added 10 more
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
[('None',)]
WHEN YOU'RE THE FBI, THEY LET YOU DO IT.
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]]
[('Thought-terminating cliché',)]
PUTIN'S SECRET CAMOUFLAGE ARMY
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
[('None',)]


# Checking if Preprocessing Data Correct

In [10]:
def check_label_encoding(dataset, mlb):
    """
    Checks if all items in the dataset have been properly encoded by MLB.
    """
    for i, item in enumerate(dataset.examples):
        if item.labels is not None:
            # Encode labels
            encoded = mlb.transform([item.labels])[0]
            # Convert to 2D numpy array for inverse_transform
            decoded = mlb.inverse_transform(np.array([encoded]))[0]

            # Check that original and decoded labels match
            assert set(item.labels) == set(decoded), \
                f"Mismatch in encoding for sample {i}.\nOriginal: {item.labels}\nDecoded: {decoded}"

    print(f"All labels in {len(dataset.examples)} examples are properly encoded!")

check_label_encoding(datasets['train'], mlb)
check_label_encoding(datasets['validation'], mlb)
check_label_encoding(datasets['test'], mlb)

All labels in 7000 examples are properly encoded!
All labels in 500 examples are properly encoded!
All labels in 1000 examples are properly encoded!


# TRAINING

In [11]:
train_dataloader = DataLoader(datasets['train'],
                               batch_size=64,
                               shuffle=True,
                               collate_fn=MemeDataset.collate_fn,
                               num_workers=2)

validation_dataloader = DataLoader(datasets['validation'],
                                   batch_size=64,
                                   shuffle=False,
                                   collate_fn=MemeDataset.collate_fn,
                                   num_workers=2)
# final eval only
test_dataloader = DataLoader(datasets['test'],
                              batch_size=64,
                              shuffle=False,
                              collate_fn=MemeDataset.collate_fn,
                              num_workers=2)

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import Optimizer, AdamW
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# models bertweet + roberta
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=20, torch_dtype="auto")

bertweet = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base")#.cuda()
training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")

def evaluate(predictions, data):
    accuracy = np.zeros(len(predictions))
    for index in predictions.keys():
        count_correct = 0
        i = 0
        for label in class_labels:
            print(label)
            print(mlb.inverse_transform(data[index]['labels']))
            print(mlb.inverse_transform(predictions[index]))
            if label in mlb.inverse_transform(data[index]['labels']) and label in mlb.inverse_transform(predictions[index]):
                count_correct += 1
            elif (label not in mlb.inverse_transform(data[index]['labels'])) and (label not in mlb.inverse_transform(predictions[index])):
                count_correct += 1
        accuracy[i] = count_correct / len(class_labels)
    return np.mean(accuracy)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert logits to binary predictions
    predictions = (logits > 0.5).astype(int)

    # convert predictions and labels to lists of labels for each example
    pred_labels = {
        idx: [mlb.inverse_transform(pred_row)]# for i, pred in enumerate(pred_row) if pred == 1]
        for idx, pred_row in enumerate(predictions)
    }
    return

print(evaluate({0:datasets['train'][1]['labels']}, datasets['train']))
'''trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    compute_metrics=compute_metrics,
)

trainer.train()'''

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Appeal to authority
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Appeal to fear/prejudice
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Bandwagon
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Black-and-white Fallacy/Dictatorship
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Causal Oversimplification
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Doubt
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Exaggeration/Minimisation
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Flag-waving
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]]
Glittering generalities (Virtue)
[('Black-and-white Fallacy/Dictatorship',)]
[[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0

"trainer = Trainer(\n    model=bertweet,\n    args=training_args,\n    train_dataset=datasets['train'],\n    eval_dataset=datasets['validation'],\n    compute_metrics=compute_metrics,\n)\n\ntrainer.train()"