In [1]:
class CFG:
    """Static configuration for the whole notebook (read as `CFG.<name>`)."""

    # --- reproducibility ---
    seed = 42

    # --- hardware ---
    device = 'cuda'   # torch device string used for tensors and models
    dev = 'gpu'       # passed to the Lightning Trainer as `accelerator`
    n_dev = 1         # passed to the Lightning Trainer as `devices`

    # --- optimisation ---
    lr = 2e-3
    batch_size = 8
    epochs = 10
    num_warmup_steps = 100
    num_training_steps = 500000
    validate_every_n = 60  # passed to the Trainer as `val_check_interval`

    # --- outputs ---
    checkpoint_dir = './checkpoints'
    log_dir = './logs'
    # Both names are built from the same template, so they are always equal.
    exp_name = f'clip_lr={lr}_bs={batch_size}_bin'
    model_path = f'clip_lr={lr}_bs={batch_size}_bin'

# LIBS

In [2]:
!pip install lightning -q

In [3]:
import os
import gc
import json
from PIL import Image
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.optim import lr_scheduler, Adam, SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger

from transformers import AutoTokenizer, AutoModel, AdamW, CLIPProcessor, CLIPModel

In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in `random`, NumPy's legacy global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: integer seed applied to all generators.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():  # CUDA generators are seeded separately
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# Some GPU operations are stochastic;
# make cuDNN deterministic for reproducibility.
# (The attribute was previously misspelled as `determinstic`, which only set an
# unused attribute and left the real flag at its default.)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# DATA

In [5]:
# Read each labelled split from JSON and wrap it in a DataFrame right away.
with open('/kaggle/input/semeval2024subtask1/subtask2b/train.json') as fp:
    train = json.load(fp)
train_df = pd.DataFrame(train)

with open('/kaggle/input/semeval2024subtask1/subtask2b/val.json') as fp:
    valid = json.load(fp)
valid_df = pd.DataFrame(valid)

with open('/kaggle/input/semeval2024subtask1/dev_gold_labels/dev_gold_labels/dev_subtask2b_en.json') as fp:
    dev = json.load(fp)
dev_df = pd.DataFrame(dev)

In [6]:
# Preview of the binary target: 1 where the label is 'propagandistic', else 0.
train_df['label'].eq('propagandistic').astype(np.int32)

0       1
1       1
2       0
3       1
4       1
       ..
1195    1
1196    1
1197    1
1198    1
1199    1
Name: label, Length: 1200, dtype: int32

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (encoder features, binary label) per row of `df`.

    Features are computed on the fly in `__getitem__`: the row's text and image
    are run through `processor` and `clip_model`, and two pooled outputs of the
    model are concatenated into a single float32 vector.

    NOTE: this class shadows the `Dataset` name imported from `torch.utils.data`
    in the import cell; the name is kept because later cells call `Dataset(...)`.

    Args:
        df: DataFrame with 'label', 'text' and 'image' columns.
        clip_model: encoder whose output exposes `qformer_outputs.pooler_output`
            and `vision_outputs.pooler_output`.
        processor: callable preparing text + image inputs for `clip_model`.
        dir_path: directory prefix joined (by string concatenation) with the
            file name stored in the 'image' column.
    """

    def __init__(self, df, clip_model, processor, dir_path='/kaggle/input/semeval2024subtask1/train_images/train_images/'):
        self.df = df
        # Binary target: 1 for 'propagandistic', 0 for every other value.
        self.labels = (df['label'] == 'propagandistic').astype(np.int32)
        self.texts = df['text']
        self.images = df['image']
        self.clip_model = clip_model
        self.processor = processor
        self.dir_path = dir_path

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label of row `idx` (positional) as a float32 tensor of shape (1,)."""
        # Positional access, consistent with `texts`/`images` below; label-based
        # `self.labels[idx]` fails whenever `df` does not carry a default 0..n-1 index.
        return torch.tensor([self.labels.iloc[idx]], dtype=torch.float32)

    def get_batch_features(self, idx):
        """Encode the text and image of row `idx` (positional) into one feature vector."""
        text = self.texts.iloc[idx]
        image_path = self.images.iloc[idx]
        # Context manager closes the underlying file once the processor has
        # read the pixel data.
        with Image.open(self.dir_path + image_path) as image:
            inputs = self.processor(text=[text], images=image, max_length=512, return_tensors="pt", truncation=True, padding='max_length')
        inputs = inputs.to(CFG.device, torch.float16)
        with torch.no_grad():  # the encoder is only used as a frozen feature extractor
            x = self.clip_model(**inputs)
            # Concatenate the two pooled outputs along the last dim; [0] drops
            # the leading dimension of the single-example batch.
            out = torch.cat((x.qformer_outputs.pooler_output, x.vision_outputs.pooler_output), dim=-1)[0]
        return out.to(torch.float32)

    def __getitem__(self, idx):
        """Return `(features, label)` for row `idx`."""
        batch_features = self.get_batch_features(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_features, batch_y

In [8]:
class Collator:
    """Collate function turning a list of `(features, label)` pairs into two stacked tensors."""

    def __call__(self, batch):
        """Stack features and labels of `batch` along a new leading dimension.

        Args:
            batch: sequence of `(features, label)` tensor pairs.

        Returns:
            Tuple `(features, labels)` of stacked tensors.
        """
        # zip(*batch) transposes the pairs into (all features, all labels).
        features, targets = map(torch.stack, zip(*batch))
        return features, targets

# MODEL

In [9]:
class Classifier(nn.Module):
    """Linear head applied to the average of five differently-dropped copies of the input.

    The input is passed through five dropout layers (p = 0.1 ... 0.5), the five
    results are averaged, and a single linear layer maps the 2176 input
    features to one logit.
    """

    def __init__(self):
        super().__init__()
        self.dp1 = nn.Dropout(0.1)
        self.dp2 = nn.Dropout(0.2)
        self.dp3 = nn.Dropout(0.3)
        self.dp4 = nn.Dropout(0.4)
        self.dp5 = nn.Dropout(0.5)
        self.clf = nn.Linear(2176, 1)

    def forward(self, x):
        """Return one logit per row of `x` (last dimension must be 2176)."""
        dropouts = (self.dp1, self.dp2, self.dp3, self.dp4, self.dp5)
        # Accumulate in the same left-to-right order dp1..dp5.
        pooled = dropouts[0](x)
        for layer in dropouts[1:]:
            pooled = pooled + layer(x)
        return self.clf(pooled / len(dropouts))

# TRAIN

In [10]:
def get_prediction(y):
    """Turn raw outputs into hard 0.0/1.0 predictions: 1.0 where `y > 0`, else 0.0 (float32)."""
    return torch.gt(y, 0).float()

In [11]:
from sklearn.metrics import f1_score

In [12]:
def f_score(y_pred, y_true, beta=1):
    """Macro-averaged F1 between thresholded predictions and targets.

    Args:
        y_pred: raw model outputs; thresholded with `get_prediction`.
        y_true: target tensor; only column 0 is used.
        beta: accepted but not used by this implementation.

    Returns:
        Dict with the single key 'F1'.
    """
    hard_pred = get_prediction(y_pred)
    target_col = y_true[:, 0].numpy()
    pred_col = hard_pred[:, 0].numpy()
    return {'F1': f1_score(target_col, pred_col, average='macro')}

In [13]:
def calc_metrics(y_hat, y, metrics_func=[]):
    """Merge the results of several metric callables into one dict.

    Args:
        y_hat: predictions handed to every metric callable.
        y: targets handed to every metric callable.
        metrics_func: iterable of callables `f(y_hat, y)` returning name/value
            pairs. The default list is only iterated, never mutated.

    Returns:
        Dict of all metric names to values; on a name clash the later callable wins.
    """
    return {
        name: value
        for metric in metrics_func
        for name, value in dict(metric(y_hat, y)).items()
    }

In [14]:
class LitModel(pl.LightningModule):
    """Lightning wrapper that trains `model` with `loss` and tracks stage-level metrics.

    Args:
        model: module mapping a feature batch to outputs accepted by `loss`.
        loss: callable `loss(y_hat, y)` returning a scalar tensor.
        optimizer: already-constructed optimizer over `model`'s parameters.
        scheduler: already-constructed LR scheduler, or None for none.
        metric_functions: callables `f(y_hat, y) -> dict` forwarded to `calc_metrics`.

    Attributes:
        history: list with one metrics dict per completed validation run.
    """

    def __init__(self, model, loss, optimizer, scheduler, metric_functions):
        super().__init__()
        self.model = model
        self.loss = loss
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.metric_functions = metric_functions
        # Per-batch outputs/targets buffered until the matching epoch-end hook.
        self.train_y_out = []
        self.train_y = []
        self.val_y_out = []
        self.val_y = []
        self.history = []

    def calc_grad_norm(self):
        """Return the global L2 norm of the gradients currently stored on the model."""
        total_norm = 0
        parameters = [p for p in self.model.parameters() if p.grad is not None and p.requires_grad]
        for p in parameters:
            param_norm = p.grad.detach().data.norm(2)
            total_norm += param_norm.item() ** 2
        total_norm = total_norm ** 0.5
        return total_norm

    def _common_step(self, batch, batch_idx):
        """Forward pass + loss shared by all stages; returns {'loss': ..., 'y': model output}."""
        (x, y) = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)
        return {
            'loss': loss,
            'y': y_hat
        }

    def _log_epoch_metrics(self, stage, outputs, targets):
        """Compute metrics over the buffered batches of `stage`, log them, and empty the buffers.

        Returns the metrics dict, or None when nothing was buffered.
        """
        if not outputs:
            # torch.cat raises on an empty list; there is nothing to report.
            return None
        y_hat = torch.cat(outputs, dim=0)
        y = torch.cat(targets, dim=0)

        metrics = calc_metrics(y_hat.detach().cpu(), y.detach().cpu(), self.metric_functions)
        self.log_dict(
            {n + '/' + stage: v for n, v in metrics.items()},
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        for name, val in metrics.items():
            self.logger.log_metrics({name + '/' + stage: val}, self.global_step)

        outputs.clear()
        targets.clear()
        return metrics

    def training_step(self, batch, batch_idx):
        """Run one training batch, log loss and learning rate, and buffer outputs for epoch metrics."""
        res = self._common_step(batch, batch_idx)
        loss_value = res['loss'].detach().cpu().item()
        # Buffer detached outputs so the epoch-end hook can compute train metrics
        # without keeping autograd graphs alive.
        self.train_y_out.append(res['y'].detach())
        self.train_y.append(batch[1])
        self.log_dict(
            {'loss/Train': loss_value},
            on_step=True,
            on_epoch=False,
            prog_bar=True,
        )
        self.logger.log_metrics(
            {
                'loss/Train': loss_value,
                'lr': self.optimizer.param_groups[0]['lr']
            },
            self.global_step)
        return res

    def on_before_optimizer_step(self, optimizer):
        """Log the gradient norm after backward, right before the optimizer update.

        Reading gradients inside `training_step` happens before backward for the
        current batch, so it would report the previous step's gradients.
        """
        self.logger.log_metrics({'grad_norm': self.calc_grad_norm()}, self.global_step)

    def on_train_epoch_end(self):
        """Lightning hook: compute and log train metrics over the finished epoch."""
        self._log_epoch_metrics('Train', self.train_y_out, self.train_y)

    # Previous (non-hook) name of the method above, kept so existing direct
    # calls keep working; Lightning itself only invokes `on_train_epoch_end`.
    on_training_epoch_end = on_train_epoch_end

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and buffer its outputs and targets."""
        res = self._common_step(batch, batch_idx)
        self.val_y_out.append(res['y'])
        self.val_y.append(batch[1])
        return res

    def on_validation_epoch_end(self):
        """Lightning hook: log validation metrics and append them to `history`."""
        metrics = self._log_epoch_metrics('Valid', self.val_y_out, self.val_y)
        if metrics is not None:
            self.history += [metrics.copy()]

    def test_step(self, *args, **kwargs):
        """Reuse the validation step (and its buffers) for testing."""
        self.validation_step(*args, **kwargs)

    def on_test_epoch_end(self):
        """Lightning hook: log test metrics from the shared validation buffers."""
        self._log_epoch_metrics('Test', self.val_y_out, self.val_y)

    def configure_optimizers(self):
        """Hand the pre-built optimizer (and scheduler, if any) to Lightning."""
        if self.scheduler is None:
            return [self.optimizer], []
        return [self.optimizer], [self.scheduler]

In [15]:
# Batch collation and (absent) LR schedule.
collator = Collator()
scheduler = None  # LitModel.configure_optimizers handles None as "no scheduler"

# Classifier head, its optimizer and the binary loss on raw logits.
model = Classifier()
optimizer = Adam(model.parameters(), lr=CFG.lr)
loss = nn.BCEWithLogitsLoss()

In [16]:
# TensorBoard event files go to <log_dir>/<exp_name>.
logger = TensorBoardLogger(save_dir=CFG.log_dir, name=CFG.exp_name)

# Keep only the single checkpoint with the highest "F1/Valid".
checkpoint_callback = ModelCheckpoint(
    dirpath=CFG.checkpoint_dir,
    filename=CFG.model_path,
    monitor="F1/Valid",
    mode="max",
    save_top_k=1,
)

In [17]:
from transformers import CLIPConfig, Blip2Processor, Blip2Model

In [18]:
# Load the BLIP-2 processor and model; the model weights are loaded in float16
# and moved to CFG.device. The variable keeps the name `clip_model` although it
# holds a Blip2Model, because later cells refer to it by that name.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
clip_model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
clip_model.to(CFG.device)
# The bare `1` makes the cell's last expression an integer rather than the value
# returned by `.to()` — presumably to keep the model repr out of the cell output.
1

2024-02-01 00:07:14.594333: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-01 00:07:14.594439: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-01 00:07:14.733423: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


1

In [19]:
# # config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")
# # config.max_position_embeddings = 512
# clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# # clip_model = CLIPModel(config)
# # clip_model.to(CFG.device)
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# One Dataset per labelled split; all three share the same encoder and processor
# and differ only in the frame and the image directory.
train_ds = Dataset(
    train_df,
    clip_model,
    processor,
    dir_path='/kaggle/input/semeval2024subtask1/subtask2b_images/train/',
)
valid_ds = Dataset(
    valid_df,
    clip_model,
    processor,
    dir_path='/kaggle/input/semeval2024subtask1/subtask2b_images/val/',
)
dev_ds = Dataset(
    dev_df,
    clip_model,
    processor,
    dir_path='/kaggle/input/semeval2024subtask1/subtask2b_images/dev/',
)

In [20]:
train_batch_size = test_batch_size = CFG.batch_size

# Training draws from the train and dev splits concatenated, in shuffled order.
# (`num_workers` is left at its default; a `num_workers=3` setting was commented out.)
train_dataloader = DataLoader(
    torch.utils.data.ConcatDataset([train_ds, dev_ds]),
    batch_size=train_batch_size,
    shuffle=True,
    drop_last=False,
    collate_fn=collator,
)

# Validation iterates the val split in its original order.
val_dataloader = DataLoader(
    valid_ds,
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collator,
)

In [21]:
# Lightning module around the classifier head; F1 is the only tracked metric.
clf_model = LitModel(model, loss, optimizer, scheduler, metric_functions=[f_score])

trainer = pl.Trainer(
    accelerator=CFG.dev,
    devices=CFG.n_dev,
    max_epochs=CFG.epochs,
    logger=logger,
    callbacks=[checkpoint_callback],
    val_check_interval=CFG.validate_every_n,
    num_sanity_val_steps=0,  # skip the pre-training sanity validation pass
)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [22]:
# clf_model.load_state_dict(torch.load('/kaggle/input/semeval2024subtask1/clip_lr0.002_bs10_step2.ckpt')['state_dict'])

In [23]:
# Train on `train_dataloader`, validating on `val_dataloader`.
trainer.fit(clf_model, train_dataloader, val_dataloader)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.


In [24]:
# Metrics dicts appended by LitModel.on_validation_epoch_end, one per validation run.
clf_model.history

[{'F1': 0.7222222222222221},
 {'F1': 0.730423620025674},
 {'F1': 0.731247881458525},
 {'F1': 0.7843335640875446},
 {'F1': 0.7250293772032903},
 {'F1': 0.7228165726512352},
 {'F1': 0.7397339502602661},
 {'F1': 0.7350863422291994},
 {'F1': 0.7412646000689962},
 {'F1': 0.7250293772032903},
 {'F1': 0.7267759562841531},
 {'F1': 0.7936507936507936},
 {'F1': 0.7458279845956354},
 {'F1': 0.7375207220482594},
 {'F1': 0.7395833333333333},
 {'F1': 0.7361676466154078},
 {'F1': 0.7354497354497354},
 {'F1': 0.7361676466154078},
 {'F1': 0.7457750230013074},
 {'F1': 0.7638515057640547},
 {'F1': 0.7498037676609105},
 {'F1': 0.7455540355677155},
 {'F1': 0.7748713550600344},
 {'F1': 0.7391871332319062},
 {'F1': 0.7740377123266324},
 {'F1': 0.7702205882352942},
 {'F1': 0.6960965287470173},
 {'F1': 0.7387929747748645},
 {'F1': 0.7188905547226387},
 {'F1': 0.731247881458525}]

In [25]:
# with open('/kaggle/input/semeval2024subtask1/dev_gold_labels/dev_gold_labels/dev_subtask2b_en.json') as fp:
#     dev = json.load(fp)

In [26]:
# dev_df = pd.DataFrame(dev)

In [27]:
# dev_df['labels'] = [[]] * len(dev_df) 
# # for i in dev_df.index:
# #     dev_df.loc[i, 'labels'] = list()

In [28]:
# dev_ds = Dataset(dev_df, clip_model, processor, dir_path='/kaggle/input/semeval2024subtask1/subtask2b_images/dev/')

In [29]:
# trainer.test(clf_model, dataloaders=[dev_dataloader])

In [30]:
# NOTE: `dev` / `dev_df` are rebound here to the unlabeled test file; from this
# cell on they no longer refer to the dev split loaded earlier.
with open('/kaggle/input/semeval2024subtask1/test_data/test_data/english/en_subtask2b_test_unlabeled.json') as fp:
    dev = json.load(fp)
    dev_df = pd.DataFrame(dev)

In [31]:
# Dataset.__init__ reads df['label'], so give the test frame a placeholder column
# of empty strings (they compare unequal to 'propagandistic', i.e. become 0).
dev_df['label'] = ['' for _ in range(len(dev_df))]

dev_ds = Dataset(
    dev_df,
    clip_model,
    processor,
    dir_path='/kaggle/input/semeval2024subtask1/test_images/test_images/subtask2b/english/',
)

# Unshuffled, so outputs line up with the row order of `dev_df`.
dev_dataloader = DataLoader(
    dev_ds,
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collator,
)

In [32]:
# Preview the test frame; 'label' holds the empty-string placeholders at this point.
dev_df

Unnamed: 0,id,image,text,label
0,56107,prop_meme_24920.png,I ain't no mind reader but I can\ndefinitely s...,
1,43907,prop_meme_24122.png,YOU ARE HAMAS! AND YOU ARE\nHAMAS! AND YOU ARE...,
2,45154,prop_meme_17843.png,RE\nENT\nDESCRIBE ME\nIN ONE WORD,
3,24510,prop_meme_2884.png,\LET'S BE CLEAR:\nIf a Black 17-year-old had c...,
4,44406,prop_meme_24621.png,DEMOCRAT\nEMOCR/\nDEMOCRAT\nmade with mematic\...,
...,...,...,...,...
595,46938,prop_meme_15450.png,"Ol\nWitty,\nEp. 187\nU.N.\nTERM\nLIMITS\nNO H\...",
596,43436,prop_meme_22924.png,"I DON'T USUALLY HAVE SEX\nBUT WHEN I DO,\n\FOR...",
597,45280,prop_meme_17969.png,THERE IS SOMETHING\nSERIOUSLY WRONG IN THIS\nC...,
598,44450,prop_meme_24665.png,IF MORE SANE PEOPLE WERE ARMED\nTHE\nCRAZY\n2N...,


In [33]:
from tqdm import tqdm

In [34]:
# Score the test loader with the classifier head in eval mode, without gradients.
# NOTE(review): this uses the weights currently held by `clf_model`; the
# checkpoint file written by ModelCheckpoint is not loaded in this cell.
final_model = clf_model.model.eval().to(CFG.device)
res = []
with torch.no_grad():
    # The second element of each batch is the placeholder label and is not used.
    for features, _placeholder_labels in tqdm(dev_dataloader):
        batch_out = final_model(features.to(CFG.device))
        res.append(batch_out)

100%|██████████| 75/75 [05:49<00:00,  4.66s/it]


In [35]:
# Join the per-batch outputs collected in `res` into a single tensor along dim 0.
# NOTE(review): `res` is rebound from a list to a tensor, so this cell is not
# idempotent — re-run the inference cell before running it again.
res = torch.cat(res, dim=0)

In [36]:
# Threshold the raw outputs at 0 (see get_prediction) into 0.0 / 1.0 predictions.
res = get_prediction(res)

In [37]:
# Map the 0/1 predictions in column 0 of `res` to label strings with a single
# column assignment instead of one `.loc` write per row. The assignment also
# fails loudly if the number of predictions differs from the number of rows.
dev_df['label'] = [
    'propagandistic' if flag > 0.5 else 'non_propagandistic'
    for flag in res[:, 0].tolist()
]

100%|██████████| 600/600 [00:00<00:00, 6342.42it/s]


In [38]:
# dict_res = dev_df.to_dict()

In [39]:
# Inspect the frame after the predicted labels were written into 'label'.
dev_df

Unnamed: 0,id,image,text,label
0,56107,prop_meme_24920.png,I ain't no mind reader but I can\ndefinitely s...,non_propagandistic
1,43907,prop_meme_24122.png,YOU ARE HAMAS! AND YOU ARE\nHAMAS! AND YOU ARE...,propagandistic
2,45154,prop_meme_17843.png,RE\nENT\nDESCRIBE ME\nIN ONE WORD,propagandistic
3,24510,prop_meme_2884.png,\LET'S BE CLEAR:\nIf a Black 17-year-old had c...,propagandistic
4,44406,prop_meme_24621.png,DEMOCRAT\nEMOCR/\nDEMOCRAT\nmade with mematic\...,non_propagandistic
...,...,...,...,...
595,46938,prop_meme_15450.png,"Ol\nWitty,\nEp. 187\nU.N.\nTERM\nLIMITS\nNO H\...",non_propagandistic
596,43436,prop_meme_22924.png,"I DON'T USUALLY HAVE SEX\nBUT WHEN I DO,\n\FOR...",propagandistic
597,45280,prop_meme_17969.png,THERE IS SOMETHING\nSERIOUSLY WRONG IN THIS\nC...,propagandistic
598,44450,prop_meme_24665.png,IF MORE SANE PEOPLE WERE ARMED\nTHE\nCRAZY\n2N...,propagandistic


In [40]:
# Keep only the 'id' and 'label' columns for the JSON file written in the next cell.
dev_df = dev_df[['id', 'label']]

In [41]:
# Write one JSON object per row of `dev_df`. `to_dict(orient='records')` builds
# the list of row dicts directly instead of materialising a Series per row.
with open('test_sub_task2b.json', 'w') as fp:
    json.dump(dev_df.to_dict(orient='records'), fp)