In [1]:
class CFG:
    device = 'cuda'
    seed = 42
    
    lr = 2e-3
    num_warmup_steps = 100
    num_training_steps = 500000
    epochs = 10
    batch_size = 10
    n_dev = 1
    dev = 'gpu'
    
    checkpoint_dir = './checkpoints'
    log_dir = './logs'
    exp_name = f'clip_lr={lr}_bs={batch_size}_random_split'
    model_path = f'clip_lr={lr}_bs={batch_size}_random_split'
    validate_every_n = 400

# LIBS

In [2]:
!pip install lightning -q

In [3]:
import os
import gc
import json
from PIL import Image
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.optim import lr_scheduler, Adam, SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger

from transformers import AutoTokenizer, AutoModel, AdamW, CLIPProcessor, CLIPModel



In [4]:
torch.multiprocessing.set_start_method('spawn')

In [5]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available(): # для GPU отдельный seed
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# есть стохастические операции на GPU
# сделаем их детерминированными для воспроизводимости
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

# DATA

In [6]:
with open('/kaggle/input/semeval2024subtask1/subtask2a/train.json') as fp:
    train = json.load(fp)
with open('/kaggle/input/semeval2024subtask1/subtask2a/validation.json') as fp:
    valid = json.load(fp)
with open('/kaggle/input/semeval2024subtask1/dev_gold_labels/dev_gold_labels/dev_subtask2a_en.json') as fp:
    dev = json.load(fp)
    
train_df = pd.DataFrame(train)
valid_df = pd.DataFrame(valid)
dev_df = pd.DataFrame(dev)

In [7]:
train_df

Unnamed: 0,id,text,image,labels,link
0,63292,This is why we're free\n\nThis is why we're sa...,prop_meme_556.png,"[Causal Oversimplification, Transfer, Flag-wav...",https://www.facebook.com/SilentmajorityDJT/pho...
1,65635,THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ...,prop_meme_4839.png,"[Transfer, Black-and-white Fallacy/Dictatorshi...",https://www.facebook.com/photo/?fbid=402355213...
2,67927,GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND AN...,prop_meme_7653.png,"[Loaded Language, Glittering generalities (Vir...",https://www.facebook.com/amnesty/photos/531198...
3,68031,PAING PHYO MIN IS FREE!,prop_meme_7826.png,[Glittering generalities (Virtue)],https://www.facebook.com/amnesty/photos/427419...
4,77490,Move your ships away!\n\noooook\n\nMove your s...,prop_meme_18807.png,[Smears],https://www.facebook.com/rightpatriots/photos/...
...,...,...,...,...,...
6995,67360,If your doctor prescribes you medication witho...,prop_meme_8426.png,"[Loaded Language, Causal Oversimplification, T...",https://www.facebook.com/TheControversia/photo...
6996,70579,DEFENDS TRUMP. \nMADE ALLEGATIONS OF ELECTION ...,prop_meme_11322.png,"[Loaded Language, Smears, Whataboutism]",https://www.facebook.com/PatriotFetch/photos/p...
6997,70305,I'm having trouble selling our incredibly enor...,prop_meme_11030.png,"[Transfer, Smears]",https://www.facebook.com/PatriotFetch/photos/p...
6998,77769,I'm so happy we live in a world without slaver...,prop_meme_17424.png,[Whataboutism],https://www.facebook.com/communism101/photos/5...


In [8]:
labels_set = set()
for lst in train_df.labels.values:
    for l in lst:
        labels_set.add(l)
labels_set

{'Appeal to (Strong) Emotions',
 'Appeal to authority',
 'Appeal to fear/prejudice',
 'Bandwagon',
 'Black-and-white Fallacy/Dictatorship',
 'Causal Oversimplification',
 'Doubt',
 'Exaggeration/Minimisation',
 'Flag-waving',
 'Glittering generalities (Virtue)',
 'Loaded Language',
 "Misrepresentation of Someone's Position (Straw Man)",
 'Name calling/Labeling',
 'Obfuscation, Intentional vagueness, Confusion',
 'Presenting Irrelevant Data (Red Herring)',
 'Reductio ad hitlerum',
 'Repetition',
 'Slogans',
 'Smears',
 'Thought-terminating cliché',
 'Transfer',
 'Whataboutism'}

In [9]:
len(labels_set)

22

In [10]:
graph = {
    'Persuasion': ['Ethos', 'Pathos', 'Logos'],
    'Ethos': ['Transfer', 'Ad Hominem', 'Bandwagon', 'Appeal to authority', 'Glittering generalities (Virtue)'], 
    'Pathos': ['Appeal to (Strong) Emotions', 'Transfer', 'Exaggeration/Minimisation', 'Loaded Language', 'Flag-waving', 'Appeal to fear/prejudice'], 
    'Logos': ['Justification', 'Reasoning', 'Repetition', 'Obfuscation, Intentional vagueness, Confusion'],
    'Ad Hominem': ['Name calling/Labeling', 'Doubt', 'Smears', 'Reductio ad hitlerum', 'Whataboutism'],
    'Justification': ['Bandwagon', 'Appeal to authority', 'Flag-waving', 'Appeal to fear/prejudice', 'Slogans'],
    'Reasoning': ['Distraction', 'Simplification'],
    'Distraction': ["Misrepresentation of Someone's Position (Straw Man)", 'Presenting Irrelevant Data (Red Herring)', 'Whataboutism'], 
    'Simplification': ['Causal Oversimplification', 'Black-and-white Fallacy/Dictatorship', 'Thought-terminating cliché']
}



replace = {n: i for i, n in enumerate(set(graph.keys()) | labels_set)}
replace_back = {i: n for i, n in enumerate(set(graph.keys()) | labels_set)}
ancestors = {replace['Persuasion']: set([replace['Persuasion']])}

for a, lst in graph.items():
    a_ = replace[a]
    for b in lst:
        b_ = replace[b]
        ancestors[b_] = ancestors.get(b_, set()) | set([b_]) | ancestors[a_]

In [11]:
ancestors_matrix = torch.zeros((len(ancestors), len(ancestors)), dtype=torch.float32)
for a, lst in ancestors.items():
    ancestors_matrix[a, list(lst)] = 1.0

In [12]:
classes = len(replace)

In [13]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df, clip_model, processor, replace, replace_back, dir_path='/kaggle/input/semeval2024subtask1/train_images/train_images/'):
        self.df = df
        self.labels = df['labels']
        for idx in df.index:
            ls = self.labels.iloc[idx]
            ls_new = set()
            for n in ls:
                ls_new |= ancestors[replace[n]]
            ls_new = [replace_back[n] for n in ls_new]
            self.labels.iloc[idx] = ls_new
        self.texts = df['text']
        self.images = df['image']
        self.replace = replace
        self.replace_back = replace_back
        self.clip_model = clip_model
        self.processor = processor
        self.dir_path = dir_path

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        y = [self.replace[n] for n in self.labels.iloc[idx]]
        labels = torch.zeros(len(self.replace))
        labels[y] = 1
        return labels

    def get_batch_features(self, idx):
        # Fetch a batch of inputs
        text = self.texts.iloc[idx]
        image_path = self.images.iloc[idx]
        image = Image.open(self.dir_path + image_path)
        inputs = self.processor(text=[text], images=image, max_length=77, return_tensors="pt", truncation=True, padding='max_length').to(CFG.device)
        with torch.no_grad():
            x = self.clip_model(**inputs)
            out = torch.cat((x.text_embeds, x.image_embeds), dim=-1)[0]
        return out

    def __getitem__(self, idx):

        batch_features = self.get_batch_features(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_features, batch_y

In [14]:
class Collator:
    def __init__(self):
        pass

    def __call__(self, batch):
        tokens, labels = zip(*batch)
        
        return torch.stack(tokens), torch.stack(labels)

# MODEL

In [15]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.dp1 = nn.Dropout(0.1)
        self.dp2 = nn.Dropout(0.2)
        self.dp3 = nn.Dropout(0.3)
        self.dp4 = nn.Dropout(0.4)
        self.dp5 = nn.Dropout(0.5)
        self.clf = nn.Linear(1024, classes)
        
    def forward(self, x):
        x1 = self.dp1(x)
        x2 = self.dp2(x)
        x3 = self.dp3(x)
        x4 = self.dp4(x)
        x5 = self.dp5(x)
        x = (x1 + x2 + x3 + x4 + x5) / 5
        return self.clf(x)

# TRAIN

In [16]:
def get_prediction(y):
#     l0 = [replace['Persuasion']]
#     l1 = [replace['Ethos'], replace['Pathos'], replace['Logos']]
#     l2 = [replace['Ad Hominem'], replace['Justification'], replace['Reasoning']]
#     l3 = [replace['Distraction'], replace['Simplification']]
    
#     y[l0[0]] = (y[l0[0]] > 0).to(torch.float32)
    
#     y[l1[0]] = y[l0[0]] * (y[l1[0]] > 0).to(torch.float32)
#     y[l1[1]] = y[l0[0]] * (y[l1[1]] > 0).to(torch.float32)
#     y[l1[2]] = y[l0[0]] * (y[l1[2]] > 0).to(torch.float32)
    
#     y[l2[0]] = y[l1[0]] * (y[l2[0]] > 0).to(torch.float32)
#     y[l2[1]] = y[l1[2]] * (y[l2[1]] > 0).to(torch.float32)
#     y[l2[2]] = y[l1[2]] * (y[l2[2]] > 0).to(torch.float32)
    
#     y[l3[0]] = y[l2[2]] * (y[l3[0]] > 0).to(torch.float32)
#     y[l3[1]] = y[l2[2]] * (y[l3[1]] > 0).to(torch.float32)
    
#     for i, lst in graph.items():
#         i = replace[i]
#         for j in lst:
#             j = replace[j]
#             y[j] = y[i] * (y[j] > 0).to(torch.float32)
            
        
    return (y > 0).to(torch.float32)

In [17]:
def f_score(y_pred, y_true, beta=1):
    y_true_anc =  torch.clip(y_true @ ancestors_matrix, 0, 1)
    y_pred = get_prediction(y_pred)
    y_pred_anc = torch.clip(y_pred @ ancestors_matrix, 0, 1)
    
    
    tp = (y_true_anc * y_pred_anc).sum()
    hr = tp / y_true_anc.sum()
    hp = tp / y_pred_anc.sum()
    f1 = (1 + beta**2) * hr * hp / (hr + hp * (beta ** 2))
    return {'F1': f1, 'hR': hr, 'hP': hp}

In [18]:
def calc_metrics(y_hat, y, metrics_func=[]):
    metrics = {}
    for f in metrics_func:
        metrics.update(f(y_hat, y))
    return metrics

In [19]:
class LitModel(pl.LightningModule):

    def __init__(self, model, loss, optimizer, scheduler, metric_functions):
        super().__init__()
        self.model = model
        self.loss = loss
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.metric_functions = metric_functions
        self.train_y_out = []
        self.train_y = []
        self.val_y_out = []
        self.val_y = []
        self.history = []

    def calc_grad_norm(self):
        total_norm = 0
        parameters = [p for p in self.model.parameters() if p.grad is not None and p.requires_grad]
        for p in parameters:
            param_norm = p.grad.detach().data.norm(2)
            total_norm += param_norm.item() ** 2
        total_norm = total_norm ** 0.5
        return total_norm
    
    def _common_step(self, batch, batch_idx):
        (x, y) = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)
        return {
            'loss': loss,
            'y': y_hat
        }

    def training_step(self, batch, batch_idx):
        res = self._common_step(batch, batch_idx)
        loss = res['loss']
        self.log_dict(
            {'loss/Train': loss.detach().cpu().item()},
            on_step=True,
            on_epoch=False,
            prog_bar=True,
        )
        self.logger.log_metrics(
            {
                'loss/Train': loss.detach().cpu().item(), 
                'grad_norm': self.calc_grad_norm(),
                'lr': self.optimizer.param_groups[0]['lr']
            }, 
            self.global_step)
        return res

    def on_training_epoch_end(self):
        y_hat = torch.cat(self.train_y_out, dim=0)
        y = torch.cat(self.train_y, dim=0)
        
        metrics = calc_metrics(y_hat.detach().cpu(), y.detach().cpu(), self.metric_functions)
        self.log_dict(
            {n+'/Train': v for n, v in metrics.items()},
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        
        for name, val in metrics.items():
            self.logger.log_metrics({name + '/Train': val}, self.global_step)

        self.train_y_out.clear()
        self.train_y.clear()

    def validation_step(self, batch, batch_idx):
        res = self._common_step(batch, batch_idx)
        loss = res['loss']
        self.val_y_out.append(res['y'])
        self.val_y.append(batch[1])
        return res

    def on_validation_epoch_end(self):
        y_hat = torch.cat(self.val_y_out, dim=0)
        y = torch.cat(self.val_y, dim=0)

        metrics = calc_metrics(y_hat.detach().cpu(), y.detach().cpu(), self.metric_functions)
        self.log_dict(
            {n+'/Valid': v for n, v in metrics.items()},
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        self.history += [metrics.copy()]
        for name, val in metrics.items():
            self.logger.log_metrics({name + '/Valid': val}, self.global_step)

        self.val_y_out.clear()
        self.val_y.clear()
      
    def test_step(self, *args, **kwargs):
        self.validation_step(*args, **kwargs)
    
    def on_test_epoch_end(self):
        y_hat = torch.cat(self.val_y_out, dim=0)
        y = torch.cat(self.val_y, dim=0)

        metrics = calc_metrics(y_hat.detach().cpu(), y.detach().cpu(), self.metric_functions)
        self.log_dict(
            {n+'/Test': v for n, v in metrics.items()},
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        for name, val in metrics.items():
            self.logger.log_metrics({name + '/Test': val}, self.global_step)

        self.val_y_out.clear()
        self.val_y.clear()

    def configure_optimizers(self):
        if self.scheduler is None:
            return [self.optimizer], []
        return [self.optimizer], [self.scheduler]

In [20]:
model = Classifier()
loss = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=CFG.lr)
scheduler = None
collator = Collator()

In [21]:
checkpoint_callback = ModelCheckpoint(
    mode="max",
    filename=CFG.model_path,
    dirpath=CFG.checkpoint_dir,
    save_top_k=1, monitor="F1/Valid"
    )
logger = TensorBoardLogger(
    save_dir=CFG.log_dir,
    name=CFG.exp_name,
    )

In [22]:
from transformers import CLIPConfig

In [23]:
# config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")
# config.max_position_embeddings = 512
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# clip_model = CLIPModel(config)
clip_model.to(CFG.device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
train_ds = Dataset(train_df, clip_model, processor, replace, replace_back)
valid_ds = Dataset(valid_df, clip_model, processor, replace, replace_back, dir_path='/kaggle/input/semeval2024subtask1/validation_images/validation_images/')
dev_ds = Dataset(dev_df, clip_model, processor, replace, replace_back, dir_path='/kaggle/input/semeval2024subtask1/dev_images/dev_images/')

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [24]:
train_batch_size = CFG.batch_size
test_batch_size = CFG.batch_size
train_dataloader = DataLoader(
    torch.utils.data.ConcatDataset([train_ds, dev_ds]),
    batch_size=train_batch_size,
    shuffle=True,
    drop_last=False,
    collate_fn=collator,
#     num_workers=3
)
val_dataloader = DataLoader(
    valid_ds,
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collator,
#     num_workers=3
)

In [25]:
trainer = pl.Trainer(
    logger=logger,
    max_epochs=CFG.epochs,
    devices=CFG.n_dev, accelerator=CFG.dev,
    callbacks=[checkpoint_callback],
    val_check_interval=CFG.validate_every_n,
    num_sanity_val_steps=0
    )

clf_model = LitModel(model, loss, optimizer, scheduler, metric_functions=[f_score])

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [26]:
clf_model.load_state_dict(torch.load('/kaggle/input/semeval2024subtask1/clip_lr0.002_bs10_step2.ckpt')['state_dict'])

<All keys matched successfully>

In [27]:
trainer.fit(
    clf_model,
    train_dataloader,
    val_dataloader
    )

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.


In [28]:
clf_model.history

[{'F1': tensor(0.6256), 'hR': tensor(0.6155), 'hP': tensor(0.6361)},
 {'F1': tensor(0.6640), 'hR': tensor(0.5759), 'hP': tensor(0.7839)},
 {'F1': tensor(0.6730), 'hR': tensor(0.5808), 'hP': tensor(0.8001)},
 {'F1': tensor(0.6826), 'hR': tensor(0.5955), 'hP': tensor(0.7994)},
 {'F1': tensor(0.6892), 'hR': tensor(0.5961), 'hP': tensor(0.8166)},
 {'F1': tensor(0.6951), 'hR': tensor(0.6123), 'hP': tensor(0.8038)},
 {'F1': tensor(0.6990), 'hR': tensor(0.6161), 'hP': tensor(0.8078)},
 {'F1': tensor(0.6992), 'hR': tensor(0.6129), 'hP': tensor(0.8138)},
 {'F1': tensor(0.7004), 'hR': tensor(0.6175), 'hP': tensor(0.8091)},
 {'F1': tensor(0.7042), 'hR': tensor(0.6250), 'hP': tensor(0.8064)},
 {'F1': tensor(0.7059), 'hR': tensor(0.6273), 'hP': tensor(0.8070)},
 {'F1': tensor(0.7048), 'hR': tensor(0.6247), 'hP': tensor(0.8085)},
 {'F1': tensor(0.7071), 'hR': tensor(0.6291), 'hP': tensor(0.8071)},
 {'F1': tensor(0.7071), 'hR': tensor(0.6279), 'hP': tensor(0.8092)},
 {'F1': tensor(0.7088), 'hR': tens

In [29]:
# dev_df['labels'] = [[]] * len(dev_df) 
# # for i in dev_df.index:
# #     dev_df.loc[i, 'labels'] = list()

In [30]:
with open('/kaggle/input/semeval2024subtask1/test_data/test_data/english/en_subtask2a_test_unlabeled.json') as fp:
    dev = json.load(fp)
dev_df = pd.DataFrame(dev)

In [31]:
dev_df['labels'] = [[]] * len(dev_df)
dev_ds = Dataset(dev_df, clip_model, processor, replace, replace_back, dir_path='/kaggle/input/semeval2024subtask1/test_images/test_images/subtask1_2a/english/')
dev_dataloader = DataLoader(
    dev_ds,
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collator,
#     num_workers=3
)

In [32]:
dev_df

Unnamed: 0,id,text,image,labels
0,68254,Nicola Sturgeon\n\nWE'RE SCOTTISH GETUSOUTOFHE...,prop_meme_8883.png,[]
1,69640,I saw a movie once where only the police and m...,prop_meme_10818.png,[]
2,71251,Heaven has a Wall and strict immigration polic...,prop_meme_7048.png,[]
3,79369,Don't expect a broken government to fix itself.,prop_meme_15611.png,[]
4,69351,HOW MOST AMERICANS SEE THE DEBATE\n\nHOW FREE ...,prop_meme_10392.png,[]
...,...,...,...,...
1495,65323,Croats:Nikola tesla was a Croatian-American Sc...,prop_meme_1412.png,[]
1496,71081,"For the first time in history, you can say, \H...",prop_meme_10954.png,[]
1497,64216,\MY MESSAGE TO EVERYONE STRUGGLING RIGHT NOW I...,prop_meme_799.png,[]
1498,70426,CATHING MEXICANS!!!,prop_meme_10039.png,[]


In [33]:
from tqdm import tqdm

In [34]:
final_model = clf_model.model
final_model.eval()
final_model.to(CFG.device)
res = []
with torch.no_grad():
    for x, y in tqdm(dev_dataloader):
        x = x.to(CFG.device)
        res += [final_model(x)]

100%|██████████| 150/150 [01:35<00:00,  1.57it/s]


In [35]:
res = torch.cat(res, dim=0)

In [36]:
res = get_prediction(res)

In [37]:
for i in tqdm(range(len(res))):
    dev_df.loc[i, 'labels'].extend([replace_back[j.item()] for j in torch.where(res[i]>0.5)[0] if replace_back[j.item()]!='Persuasion'])

100%|██████████| 1500/1500 [00:00<00:00, 4169.58it/s]


In [38]:
# dict_res = dev_df.to_dict()

In [39]:
dev_df

Unnamed: 0,id,text,image,labels
0,68254,Nicola Sturgeon\n\nWE'RE SCOTTISH GETUSOUTOFHE...,prop_meme_8883.png,"[Transfer, Smears, Ad Hominem, Pathos, Ethos]"
1,69640,I saw a movie once where only the police and m...,prop_meme_10818.png,"[Ad Hominem, Pathos, Ethos, Logos]"
2,71251,Heaven has a Wall and strict immigration polic...,prop_meme_7048.png,"[Justification, Pathos, Ethos, Appeal to autho..."
3,79369,Don't expect a broken government to fix itself.,prop_meme_15611.png,"[Pathos, Ethos, Logos]"
4,69351,HOW MOST AMERICANS SEE THE DEBATE\n\nHOW FREE ...,prop_meme_10392.png,"[Transfer, Smears, Ad Hominem, Pathos, Ethos]"
...,...,...,...,...
1495,65323,Croats:Nikola tesla was a Croatian-American Sc...,prop_meme_1412.png,"[Ad Hominem, Pathos, Ethos]"
1496,71081,"For the first time in history, you can say, \H...",prop_meme_10954.png,"[Pathos, Ethos, Logos]"
1497,64216,\MY MESSAGE TO EVERYONE STRUGGLING RIGHT NOW I...,prop_meme_799.png,"[Smears, Ad Hominem, Justification, Ethos, App..."
1498,70426,CATHING MEXICANS!!!,prop_meme_10039.png,"[Smears, Ad Hominem, Ethos]"


In [40]:
dev_df = dev_df[['id', 'labels']]

In [41]:
with open('test_sub_task2a.json', 'w') as fp:
    json.dump([dev_df.iloc[i].to_dict() for i in range(len(dev_df))], fp)