In [None]:
# Mount Google Drive inside the Colab VM so the project folder under MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/hateful_memes

/content/drive/MyDrive/hateful_memes


In [None]:
!ls

CLIP  dbert			      madgrad  Notes.gdoc	 results
data  HatefulMemesTransformers.ipynb  models   requirements.txt


In [None]:
#!pip install ftfy
#!pip install -r requirements.txt

### Imports

In [None]:
import json
import os
from collections import Counter
import random
import numpy as np

import torch
import torch.nn as nn
# import torchvision
# import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from madgrad.madgrad import MADGRAD

from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

import CLIP.clip.clip as clip
import pickle

from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

import copy

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    MMBTConfig,
    MMBTModel,
    MMBTForClassification,
    get_linear_schedule_with_warmup,
)

Define the `device` variable for future reference.

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

Load clip model and preprocessor

In [None]:
# Load the pretrained CLIP "RN50x4" model on `device` together with its matching
# image preprocessing transform (used later as the dataset transform).
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)


100%|███████████████████████████████████████| 402M/402M [00:18<00:00, 22.5MiB/s]


Freeze weights of CLIP feature encoder, as we will not finetune it.

In [None]:
# CLIP serves as a fixed feature extractor: switch off gradient tracking for
# every one of its parameters.
for clip_param in clip_model.parameters():
    clip_param.requires_grad_(False)

Initialize variables

In [None]:
num_image_embeds = 4  # image vectors per meme: the padded whole image plus three tiles
num_labels = 1  # single output logit (trained with BCEWithLogitsLoss below)
gradient_accumulation_steps = 20  # batches accumulated before each optimizer step
data_dir = './data'  # folder holding the *.jsonl files; image paths are resolved relative to it
max_seq_length = 80  # overall sequence budget; text gets this minus num_image_embeds minus 2
max_grad_norm = 0.5  # threshold passed to clip_grad_norm_ in the training loop
train_batch_size = 16
eval_batch_size = 16
image_encoder_size = 288  # side length in pixels of the square images fed to the CLIP encoder
image_features_size = 640  # length of each image feature vector returned by the encoder
num_train_epochs = 5

Create two helper functions that prepare an image for the CLIP encoder. `slice_image` resizes the image so that its shorter side matches the encoder input size and cuts three tiles along its longer side (by width or by height, depending on the aspect ratio of the image). `resize_pad_image` scales the whole image to fit and pads it to a square. After encoding we therefore get four vectors per meme: one vector for each tile and one vector for the whole padded image.

In [None]:
def slice_image(im, desired_size):
    '''
    Resize an image and cut three tiles along its longer side.

    The image is scaled so that its shorter side is ``desired_size`` pixels
    (aspect ratio preserved up to integer rounding). Three tiles spanning
    ``desired_size`` pixels along the longer side are then cut from the start,
    the centre and the end of that side; neighbouring tiles may overlap.

    Args:
        im: PIL image to slice.
        desired_size: Length in pixels of the shorter side after resizing and
            of each tile along the longer side.

    Returns:
        List of three PIL images ordered start, centre, end: left to right
        when the resized image is wider than tall, top to bottom otherwise.
    '''
    old_size = im.size

    ratio = float(desired_size) / min(old_size)
    # Floating-point error can make int(x * ratio) land on desired_size - 1 for
    # the shorter side; clamp so every side is at least desired_size and the
    # "end" tile below never starts at a negative index.
    new_size = tuple(max(desired_size, int(x * ratio)) for x in old_size)

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS names the same filter.
    im = im.resize(new_size, Image.LANCZOS)

    ar = np.array(im)  # rows = height, columns = width
    landscape = ar.shape[0] < ar.shape[1]
    length = ar.shape[1] if landscape else ar.shape[0]

    # The centre tile is anchored at its start so that it spans desired_size
    # pixels even when desired_size is odd.
    centre_start = length // 2 - desired_size // 2
    starts = (0, centre_start, length - desired_size)

    images = []
    for start in starts:
        stop = start + desired_size
        tile = ar[:, start:stop] if landscape else ar[start:stop, :]
        images.append(Image.fromarray(tile))

    return images
def resize_pad_image(im, desired_size):
    '''
    Resize an image to fit a square canvas and pad it to that square.

    The image is scaled so that its longer side is ``desired_size`` pixels and
    is then pasted, centred, onto a new ``desired_size`` x ``desired_size``
    RGB canvas.

    Args:
        im: PIL image to resize.
        desired_size: Side length in pixels of the returned square image.

    Returns:
        New square RGB PIL image containing the resized input.
    '''
    old_size = im.size

    ratio = float(desired_size) / max(old_size)
    # max(1, ...) keeps extremely elongated images from collapsing to a
    # zero-pixel side, which cannot be resized.
    new_size = tuple(max(1, int(x * ratio)) for x in old_size)

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS names the same filter.
    im = im.resize(new_size, Image.LANCZOS)

    # Create the square canvas and paste the resized image in its centre.
    new_im = Image.new("RGB", (desired_size, desired_size))
    offset = ((desired_size - new_size[0]) // 2,
              (desired_size - new_size[1]) // 2)
    new_im.paste(im, offset)

    return new_im

Define a module that extracts image features with CLIP.



In [None]:
class ClipEncoderMulti(nn.Module):
    """Image encoder that turns a group of image views into CLIP feature vectors.

    Args:
        num_embeds: Number of image views (and therefore feature vectors) per sample.
        num_features: Length of each feature vector returned by the CLIP image encoder.
    """

    def __init__(self, num_embeds, num_features=image_features_size):
        super().__init__()
        self.model = clip_model
        self.num_embeds = num_embeds
        self.num_features = num_features

    def forward(self, x):
        """Encode every view in ``x`` and regroup the features per sample.

        Args:
            x: Tensor whose last three dimensions are one image view
                (e.g. B x num_embeds x 3 x 288 x 288).

        Returns:
            Float tensor of shape (B, num_embeds, num_features).
        """
        # Flatten all leading dimensions into one batch axis. The per-view shape
        # is read from the input instead of being hard-coded to 3x288x288.
        out = self.model.encode_image(x.view(-1, *x.shape[-3:]))
        out = out.view(-1, self.num_embeds, self.num_features).float()
        return out  # B x num_embeds x num_features

Create the `JsonlDataset` class, which loads the texts and the preprocessed images. `collate_fn` groups samples from the dataset into the batch format required by our PyTorch model.

In [None]:
class JsonlDataset(Dataset):
    """Labelled memes described by a JSON-lines file.

    Every non-blank line of ``data_path`` is a JSON object with at least the
    keys ``"text"``, ``"img"`` (path relative to the file's folder) and
    ``"label"``.

    Args:
        data_path: Path of the ``.jsonl`` file.
        tokenizer: Object whose ``encode(text, add_special_tokens=True)``
            returns token ids including a leading and a trailing special token.
        transforms: Callable applied to every PIL image view.
        max_seq_length: Maximum number of text tokens kept per sample
            (special tokens excluded).
    """

    def __init__(self, data_path, tokenizer, transforms, max_seq_length):
        # The context manager closes the file handle; blank lines (e.g. a
        # trailing newline) are skipped instead of crashing json.loads.
        with open(data_path, encoding="utf-8") as jsonl_file:
            self.data = [json.loads(line) for line in jsonl_file if line.strip()]
        self.data_dir = os.path.dirname(data_path)
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids, stacked image views and label of one sample."""
        row = self.data[index]

        token_ids = torch.LongTensor(self.tokenizer.encode(row["text"], add_special_tokens=True))
        # The first/last ids are handed to the model as modal start/end tokens;
        # only the ids in between are treated as the sentence.
        start_token, sentence, end_token = token_ids[0], token_ids[1:-1], token_ids[-1]
        sentence = sentence[:self.max_seq_length]

        label = torch.FloatTensor([row["label"]])

        # convert() returns an independent image, so the file can be closed right away.
        with Image.open(os.path.join(self.data_dir, row["img"])) as raw_image:
            image = raw_image.convert("RGB")

        # Tiles and the padded whole image use the same configured encoder size.
        tiles = [np.array(self.transforms(im)) for im in slice_image(image, image_encoder_size)]
        whole = np.array(self.transforms(resize_pad_image(image, image_encoder_size)))

        # Order matters: the whole image first, then the three tiles.
        # NOTE(review): tensors are moved to `device` per sample, which presumably
        # prevents using DataLoader worker processes with a CUDA device - confirm.
        sliced_images = torch.from_numpy(np.array([whole] + tiles)).to(device)

        return {
            "image_start_token": start_token,
            "image_end_token": end_token,
            "sentence": sentence,
            "image": sliced_images,
            "label": label
        }

    def get_label_frequencies(self):
        """Return a Counter mapping each label value to its number of samples."""
        return Counter(row["label"] for row in self.data)

    def get_labels(self):
        """Return the labels of all samples in file order."""
        return [row["label"] for row in self.data]

def collate_fn(batch):
    """Merge dataset rows into padded batch tensors.

    Sentences are right-padded with zeros to the longest sentence in the
    batch; the mask holds 1 at real token positions and 0 at padding.

    Returns:
        Tuple ``(text, mask, images, start_tokens, end_tokens, labels)``.
    """
    sentences = [row["sentence"] for row in batch]
    longest = max(len(tokens) for tokens in sentences)

    text_tensor = torch.zeros(len(batch), longest, dtype=torch.long)
    mask_tensor = torch.zeros_like(text_tensor)
    for row_idx, tokens in enumerate(sentences):
        n_tokens = len(tokens)
        text_tensor[row_idx, :n_tokens] = tokens
        mask_tensor[row_idx, :n_tokens] = 1

    def stack_field(key):
        # One tensor per row -> one batch tensor with a new leading axis.
        return torch.stack([row[key] for row in batch])

    return (
        text_tensor,
        mask_tensor,
        stack_field("image"),
        stack_field("image_start_token"),
        stack_field("image_end_token"),
        stack_field("label"),
    )

Define the `load_examples` function, which loads the data described in a JSON-lines file into a `JsonlDataset` object.

In [None]:
def load_examples(tokenizer, evaluate=False):
    """Build the JsonlDataset for the training split or the dev-unseen split.

    Args:
        tokenizer: Tokenizer handed to the dataset.
        evaluate: When True load ``dev_unseen.jsonl``, otherwise ``train.jsonl``.

    Returns:
        JsonlDataset reading from ``data_dir`` with CLIP's ``preprocess`` as transform.
    """
    if evaluate:
        file_name = "dev_unseen.jsonl"
    else:
        file_name = "train.jsonl"

    # Text budget: the total sequence length minus the image slots and 2 more positions.
    text_budget = max_seq_length - num_image_embeds - 2
    return JsonlDataset(os.path.join(data_dir, file_name), tokenizer, preprocess, text_budget)

Create functions to load and save model weights.

In [None]:
def save_checkpoint(save_path, model, valid_loss):
    """Save the model weights together with a validation loss.

    Args:
        save_path: Destination file; ``None`` disables saving.
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        valid_loss: Value stored under ``'valid_loss'``.
    """
    if save_path is None:
        return

    # Create the target folder on demand so that a missing directory cannot
    # discard a checkpoint at the end of a long training run.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')
    
def load_checkpoint(load_path, model):
    """Load weights written by ``save_checkpoint`` into ``model``.

    Args:
        load_path: Checkpoint file; ``None`` makes the call a no-op.
        model: Module that receives the stored ``'model_state_dict'``.

    Returns:
        The ``'valid_loss'`` value stored in the checkpoint, or ``None`` when
        ``load_path`` is ``None``.
    """
    if load_path is None:
        return

    # map_location places the stored tensors on the notebook's current device.
    # NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

Needed functions and classes are created, so we can load our models.

In [None]:
# Text backbone: config and weights come from the same Hugging Face checkpoint.
# AutoModel loads the bare encoder, so the checkpoint's classifier weights are
# not used (see the warning printed below).
model_name = 'Hate-speech-CNERG/bert-base-uncased-hatexplain'
transformer_config = AutoConfig.from_pretrained(model_name) 
transformer = AutoModel.from_pretrained(model_name, config=transformer_config)
# Image branch: the CLIP encoder wrapper producing num_image_embeds vectors per sample.
img_encoder = ClipEncoderMulti(num_image_embeds)

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at Hate-speech-CNERG/bert-base-uncased-hatexplain were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenizer belonging to the same checkpoint as the text backbone.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Combine the text transformer and the CLIP image encoder into one MMBT
# classifier; modal_hidden_size is set to the length of the CLIP feature vectors.
config = MMBTConfig(transformer_config, num_labels=num_labels, modal_hidden_size=image_features_size)
model = MMBTForClassification(config, transformer, img_encoder)

In [None]:
# Move the whole model to the selected device; the trailing ';' suppresses the cell output.
model.to(device);

Load train and evaluation datasets and create dataloaders for these datasets.

In [None]:
# Training split and dev-unseen split.
train_dataset = load_examples(tokenizer, evaluate=False)
eval_dataset = load_examples(tokenizer, evaluate=True)

# Training batches are drawn in random order, evaluation batches in file order.
train_sampler = RandomSampler(train_dataset)
eval_sampler = SequentialSampler(eval_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    sampler=train_sampler,
    collate_fn=collate_fn,
)

eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=eval_batch_size,
    sampler=eval_sampler,
    collate_fn=collate_fn,
)

Define model training parameters, optimizer and loss.

In [None]:
# Optimizer, learning-rate schedule (linear warmup, then linear decay) and loss.

# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
weight_decay = 0.0005

decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    is_exempt = any(fragment in param_name for fragment in no_decay)
    (undecayed_params if is_exempt else decayed_params).append(param)

optimizer_grouped_parameters = [
    {"params": decayed_params, "weight_decay": weight_decay},
    {"params": undecayed_params, "weight_decay": 0.0},
]

# The schedule is counted in optimizer updates (one per full accumulation
# window), not in batches; the first tenth of the updates is warmup.
updates_per_epoch = len(train_dataloader) // gradient_accumulation_steps
t_total = updates_per_epoch * num_train_epochs
warmup_steps = t_total // 10

optimizer = MADGRAD(optimizer_grouped_parameters, lr=2e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, t_total)

criterion = nn.BCEWithLogitsLoss()

Define evaluation function that will take evaluation dataloader and calculate prediction AUC, F1 score and accuracy.

In [None]:
def evaluate(model, tokenizer, criterion, dataloader, tres = 0.5):
    """Run the model over ``dataloader`` and compute loss and classification metrics.

    Args:
        model: Model called with the MMBT keyword inputs built from each batch.
        tokenizer: Unused; kept so existing calls keep working.
        criterion: Loss applied to ``(logits, labels)``.
        dataloader: Yields 6-tuples ``(text, mask, images, start_tokens,
            end_tokens, labels)`` as produced by ``collate_fn``.
        tres: Probability threshold above which a sample is predicted positive.

    Returns:
        Dict with the mean batch ``"loss"``, ``"accuracy"``, ``"AUC"``,
        ``"micro_f1"`` and the raw ``"prediction"``, ``"labels"`` and ``"proba"`` arrays.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()

    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected in lists and concatenated once at the end,
    # instead of re-copying the growing arrays with np.append on every batch.
    proba_chunks = []
    label_chunks = []
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
                "return_dict": False
            }
            outputs = model(**inputs)
            logits = outputs[0]  # with return_dict=False the logits are the first tuple element
            eval_loss += criterion(logits, labels).mean().item()
            # The sigmoid is computed once; predictions are derived from it below.
            proba_chunks.append(torch.sigmoid(logits).detach().cpu().numpy())
        label_chunks.append(labels.detach().cpu().numpy())
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        raise ValueError("evaluate() received a dataloader without batches")

    proba = np.concatenate(proba_chunks, axis=0)
    preds = proba > tres
    out_label_ids = np.concatenate(label_chunks, axis=0)

    eval_loss = eval_loss / nb_eval_steps

    result = {
        "loss": eval_loss,
        "accuracy": accuracy_score(out_label_ids, preds),
        "AUC": roc_auc_score(out_label_ids, proba),
        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
        "prediction": preds,
        "labels": out_label_ids,
        "proba": proba
    }

    return result

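Finally, we can train our model. The `best_valid_auc` variable holds the minimum validation AUC required for checkpointing: whenever the model achieves a higher AUC on the validation data, we save that model and raise the threshold to the new best value. The model is also saved once after the final epoch.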

In [None]:
optimizer_step = 0
global_step = 0
train_step = 0
tr_loss, logging_loss = 0.0, 0.0
best_valid_auc = 0.75  # a "best" checkpoint is written only when validation AUC exceeds this
global_steps_list = []
train_loss_list = []
val_loss_list = []
val_acc_list = []
val_auc_list = []
# Evaluate seven times per epoch; max(1, ...) keeps the modulo below valid for
# dataloaders with fewer than seven batches.
eval_every = max(1, len(train_dataloader) // 7)
running_loss = 0.0
file_path = "models/"


def _checkpoint_path(val_result):
    """Build the checkpoint file name from the validation metrics."""
    # The metrics are pulled into locals first: writing val_result['AUC'] inside
    # a single-quoted f-string is a SyntaxError before Python 3.12.
    auc = val_result['AUC']
    loss = val_result['loss']
    acc = val_result['accuracy']
    # Four decimals match the checkpoint name loaded in the evaluation cell.
    file_name = (f"model-embs{num_image_embeds}-seq{max_seq_length}"
                 f"-auc{auc:.4f}-loss{loss:.4f}-acc{acc:.4f}.pt")
    return os.path.join(file_path, file_name)


model.zero_grad()

for i in range(num_train_epochs):
    print("Epoch", i+1, f"from {num_train_epochs}")
    for step, batch in enumerate(tqdm(train_dataloader)):
        model.train()  # evaluate() switches to eval mode, so re-enable training every step
        batch = tuple(t.to(device) for t in batch)
        labels = batch[5]
        inputs = {
            "input_ids": batch[0],
            "input_modal": batch[2],
            "attention_mask": batch[1],
            "modal_start_tokens": batch[3],
            "modal_end_tokens": batch[4],
            "return_dict": False
        }
        outputs = model(**inputs)
        logits = outputs[0]  # with return_dict=False the logits are the first tuple element
        loss = criterion(logits, labels)

        # Log the unscaled batch loss so the reported train loss is on the same
        # scale as the validation loss (the scaled value is only for backward()).
        running_loss += loss.item()

        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()
        global_step += 1

        # NOTE(review): batches left over after the last full accumulation window
        # of an epoch keep their gradients, which are added to the first window
        # of the next epoch (and are never applied after the final epoch).
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # update the learning-rate schedule

            optimizer_step += 1
            optimizer.zero_grad()

        if (step + 1) % eval_every == 0:

            average_train_loss = running_loss / eval_every
            train_loss_list.append(average_train_loss)
            global_steps_list.append(global_step)
            running_loss = 0.0

            val_result = evaluate(model, tokenizer, criterion, eval_dataloader)

            val_loss_list.append(val_result['loss'])
            val_acc_list.append(val_result['accuracy'])
            val_auc_list.append(val_result['AUC'])

            # Checkpoint whenever the validation AUC beats the best value so far.
            if val_result['AUC'] > best_valid_auc:
                best_valid_auc = val_result['AUC']
                val_loss = val_result['loss']
                val_acc = val_result['accuracy']
                model_path = _checkpoint_path(val_result)
                print("AUC improved, so saving this model")
                save_checkpoint(model_path, model, val_result['loss'])

            print("Train loss:", f"{average_train_loss:.4f}",
                  "Val loss:", f"{val_result['loss']:.4f}",
                  "Val acc:", f"{val_result['accuracy']:.4f}",
                  "AUC:", f"{val_result['AUC']:.4f}")
    print('\n')
    # Always keep the weights reached at the end of training, whatever their AUC.
    if i == num_train_epochs - 1:
        val_result = evaluate(model, tokenizer, criterion, eval_dataloader)
        val_loss = val_result['loss']
        val_acc = val_result['accuracy']
        model_path = _checkpoint_path(val_result)
        print("Final epoch finished so saving this model")
        save_checkpoint(model_path, model, val_result['loss'])

Epoch 1 from 5


  0%|          | 0/532 [00:00<?, ?it/s]

Train loss: 0.0329 Val loss: 0.6719 Val acc: 0.6259 AUC: 0.4617
Train loss: 0.0316 Val loss: 0.6542 Val acc: 0.6296 AUC: 0.5705
Train loss: 0.0292 Val loss: 0.6967 Val acc: 0.6278 AUC: 0.5820
Train loss: 0.0280 Val loss: 0.7405 Val acc: 0.6333 AUC: 0.5814
Train loss: 0.0287 Val loss: 0.7162 Val acc: 0.6278 AUC: 0.5849
Train loss: 0.0262 Val loss: 0.7236 Val acc: 0.6130 AUC: 0.5825
Train loss: 0.0262 Val loss: 0.7044 Val acc: 0.6167 AUC: 0.6000


Epoch 2 from 5


  0%|          | 0/532 [00:00<?, ?it/s]

Train loss: 0.0240 Val loss: 0.7704 Val acc: 0.6241 AUC: 0.6190
Train loss: 0.0239 Val loss: 0.6937 Val acc: 0.6315 AUC: 0.6366
Train loss: 0.0233 Val loss: 0.6906 Val acc: 0.6222 AUC: 0.6455
Train loss: 0.0241 Val loss: 0.7000 Val acc: 0.6537 AUC: 0.6556
Train loss: 0.0234 Val loss: 0.8013 Val acc: 0.6444 AUC: 0.6510
Train loss: 0.0229 Val loss: 0.7162 Val acc: 0.6204 AUC: 0.6438
Train loss: 0.0227 Val loss: 0.8424 Val acc: 0.6333 AUC: 0.6327


Epoch 3 from 5


  0%|          | 0/532 [00:00<?, ?it/s]

Train loss: 0.0211 Val loss: 0.6782 Val acc: 0.6019 AUC: 0.6335
Train loss: 0.0186 Val loss: 0.8005 Val acc: 0.6259 AUC: 0.6372
Train loss: 0.0191 Val loss: 0.8477 Val acc: 0.6463 AUC: 0.6482
Train loss: 0.0192 Val loss: 0.7766 Val acc: 0.6519 AUC: 0.6680
Train loss: 0.0205 Val loss: 0.7208 Val acc: 0.6611 AUC: 0.6771
Train loss: 0.0199 Val loss: 0.6890 Val acc: 0.6537 AUC: 0.6721
Train loss: 0.0188 Val loss: 0.7334 Val acc: 0.6537 AUC: 0.6741


Epoch 4 from 5


  0%|          | 0/532 [00:00<?, ?it/s]

Train loss: 0.0157 Val loss: 0.6956 Val acc: 0.6667 AUC: 0.6815
Train loss: 0.0134 Val loss: 0.9076 Val acc: 0.6704 AUC: 0.6893
Train loss: 0.0133 Val loss: 1.0316 Val acc: 0.6519 AUC: 0.6700
Train loss: 0.0163 Val loss: 0.8746 Val acc: 0.6648 AUC: 0.6820
Train loss: 0.0155 Val loss: 0.7897 Val acc: 0.6463 AUC: 0.6666
Train loss: 0.0163 Val loss: 0.7775 Val acc: 0.6759 AUC: 0.6805
Train loss: 0.0148 Val loss: 0.8655 Val acc: 0.6685 AUC: 0.6897


Epoch 5 from 5


  0%|          | 0/532 [00:00<?, ?it/s]

Train loss: 0.0122 Val loss: 0.8086 Val acc: 0.6704 AUC: 0.6926
Train loss: 0.0120 Val loss: 0.8789 Val acc: 0.6630 AUC: 0.6939
Train loss: 0.0100 Val loss: 0.8757 Val acc: 0.6704 AUC: 0.6986
Train loss: 0.0111 Val loss: 0.9267 Val acc: 0.6759 AUC: 0.6991
Train loss: 0.0106 Val loss: 0.8949 Val acc: 0.6704 AUC: 0.7052
Train loss: 0.0116 Val loss: 0.9364 Val acc: 0.6593 AUC: 0.7061
Train loss: 0.0088 Val loss: 0.8723 Val acc: 0.6778 AUC: 0.7063




For evaluation, just load the model from the checkpoint path.

In [None]:
# Restore the saved weights into `model`. load_checkpoint returns the validation
# loss stored in the file, which the notebook shows as this cell's output.
CHECKPOINT = "models/model-embs4-seq80-auc0.7063-loss0.8723-acc0.6778.pt"
load_checkpoint(CHECKPOINT, model)

Model loaded from <== models/model-embs4-seq80-auc0.7063-loss0.8723-acc0.6778.pt


0.8723

In [None]:
import pandas as pd

# num_labels and data_dir repeat the values already set in the configuration cell.
num_labels = 1
data_dir = './data'
test_batch_size = 16  # batch size of the test-set dataloader built at the end of this cell

class TestJsonlDataset(Dataset):
    """Unlabelled memes described by a JSON-lines file.

    Every non-blank line of ``data_path`` is a JSON object with at least the
    keys ``"id"``, ``"text"`` and ``"img"`` (path relative to the file's folder).

    Args:
        data_path: Path of the ``.jsonl`` file.
        tokenizer: Object whose ``encode(text, add_special_tokens=True)``
            returns token ids including a leading and a trailing special token.
        transforms: Callable applied to every PIL image view.
        max_seq_length: Maximum number of text tokens kept per sample
            (special tokens excluded).
    """

    def __init__(self, data_path, tokenizer, transforms, max_seq_length):
        # The context manager closes the file handle; blank lines (e.g. a
        # trailing newline) are skipped instead of crashing json.loads.
        with open(data_path, encoding="utf-8") as jsonl_file:
            self.data = [json.loads(line) for line in jsonl_file if line.strip()]
        self.data_dir = os.path.dirname(data_path)
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids, stacked image views and id of one sample."""
        row = self.data[index]

        token_ids = torch.LongTensor(self.tokenizer.encode(row["text"], add_special_tokens=True))
        start_token, sentence, end_token = token_ids[0], token_ids[1:-1], token_ids[-1]
        sentence = sentence[:self.max_seq_length]

        # Named sample_id to avoid shadowing the built-in id().
        sample_id = torch.LongTensor([int(row["id"])])

        try:
            # convert() returns an independent image, so the file can be closed right away.
            with Image.open(os.path.join(self.data_dir, row["img"])) as raw_image:
                image = raw_image.convert("RGB")
        except Exception:
            # Report which file is broken, then re-raise the real error instead
            # of failing later with an unrelated "image is not defined".
            print(row["img"])
            raise

        # Tiles and the padded whole image use the same configured encoder size.
        tiles = [np.array(self.transforms(im)) for im in slice_image(image, image_encoder_size)]
        whole = np.array(self.transforms(resize_pad_image(image, image_encoder_size)))

        # Order matters: the whole image first, then the three tiles.
        sliced_images = torch.from_numpy(np.array([whole] + tiles)).to(device)

        return {
            "image_start_token": start_token,
            "image_end_token": end_token,
            "sentence": sentence,
            "image": sliced_images,
            "id": sample_id,
        }

def final_collate_fn(batch):
    """Merge unlabelled dataset rows into padded batch tensors.

    Sentences are right-padded with zeros to the longest sentence in the
    batch; the mask holds 1 at real token positions and 0 at padding.

    Returns:
        Tuple ``(text, mask, images, start_tokens, end_tokens, ids)``.
    """
    sentences = [row["sentence"] for row in batch]
    longest = max(len(tokens) for tokens in sentences)

    text_tensor = torch.zeros(len(batch), longest, dtype=torch.long)
    mask_tensor = torch.zeros_like(text_tensor)
    for row_idx, tokens in enumerate(sentences):
        n_tokens = len(tokens)
        text_tensor[row_idx, :n_tokens] = tokens
        mask_tensor[row_idx, :n_tokens] = 1

    def stack_field(key):
        # One tensor per row -> one batch tensor with a new leading axis.
        return torch.stack([row[key] for row in batch])

    return (
        text_tensor,
        mask_tensor,
        stack_field("image"),
        stack_field("image_start_token"),
        stack_field("image_end_token"),
        stack_field("id"),
    )

def load_test_examples(test_file="test_seen.jsonl"):
    """Build a TestJsonlDataset for a JSON-lines file located in ``data_dir``.

    Args:
        test_file: File name inside ``data_dir``.

    Returns:
        TestJsonlDataset using the notebook's tokenizer and CLIP's ``preprocess``.
    """
    jsonl_path = os.path.join(data_dir, test_file)
    # Text budget: the total sequence length minus the image slots and 2 more positions.
    text_budget = max_seq_length - num_image_embeds - 2
    return TestJsonlDataset(jsonl_path, tokenizer, preprocess, text_budget)

def final_prediction(model, dataloader):
    """Predict every sample of an unlabelled dataloader.

    Args:
        model: Model called with the MMBT keyword inputs built from each batch.
        dataloader: Yields 6-tuples ``(text, mask, images, start_tokens,
            end_tokens, ids)`` as produced by the id-aware ``final_collate_fn``.

    Returns:
        Dict with ``"ids"``, boolean ``"preds"`` (probability > 0.5) and
        ``"probs"`` arrays; every value is ``None`` when the dataloader yields
        no batches.
    """
    model.eval()

    # Per-batch arrays are collected in lists and concatenated once at the end,
    # instead of re-copying the growing arrays with np.append on every batch.
    id_chunks = []
    proba_chunks = []
    for batch in tqdm(dataloader):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            ids = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
                "return_dict": False
            }
            outputs = model(**inputs)
            logits = outputs[0]
            # The sigmoid is computed once; predictions are derived from it below.
            proba_chunks.append(torch.sigmoid(logits).detach().cpu().numpy())
        id_chunks.append(ids.detach().cpu().numpy())

    if proba_chunks:
        all_ids = np.concatenate(id_chunks, axis=0)
        proba = np.concatenate(proba_chunks, axis=0)
        preds = proba > 0.5
    else:
        all_ids = preds = proba = None

    result = {
        "ids": all_ids,
        "preds": preds,
        "probs": proba,
    }

    return result

# Unlabelled test split, read in file order.
final_test = load_test_examples(test_file="test_unseen_with_ocr_2.jsonl")
final_test_sampler = SequentialSampler(final_test)

final_test_dataloader = DataLoader(
    final_test,
    batch_size=test_batch_size,
    sampler=final_test_sampler,
    collate_fn=final_collate_fn,
)

In [None]:
results = final_prediction(model, final_test_dataloader)

# Flatten every result array to one dimension so each becomes a DataFrame column.
for key in ('ids', 'preds', 'probs'):
    results[key] = results[key].reshape(-1)

# Column order and names of the output file: id, proba, label.
df = pd.DataFrame(results)[['ids', 'probs', 'preds']]
df.columns = ['id', 'proba', 'label']
df['label'] = df['label'].astype(int)  # boolean prediction -> 0/1

df.to_csv('results/test_unseen_with_ocr_2_prediction.csv', index=False, float_format='%.3f')

  0%|          | 0/112 [00:00<?, ?it/s]

Prediction for a single image

In [None]:
import pandas as pd

num_labels = 1  # repeats the value already set in the configuration cell
# NOTE: rebinds test_batch_size (16 in the test-set cell) for single-image inference.
test_batch_size = 1

class OneImage(Dataset):
    """Dataset holding exactly one meme, for ad-hoc prediction.

    Args:
        img_path: Path of the image file.
        img_text: Text of the meme.
        tokenizer: Object whose ``encode(text, add_special_tokens=True)``
            returns token ids including a leading and a trailing special token.
        transforms: Callable applied to every PIL image view.
        max_seq_length: Maximum number of text tokens kept (special tokens excluded).
    """

    def __init__(self, img_path, img_text, tokenizer, transforms, max_seq_length):
        self.data = [{"img": img_path, "text": img_text}]
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids and stacked image views of the sample."""
        row = self.data[index]

        token_ids = torch.LongTensor(self.tokenizer.encode(row["text"], add_special_tokens=True))
        start_token, sentence, end_token = token_ids[0], token_ids[1:-1], token_ids[-1]
        sentence = sentence[:self.max_seq_length]

        # convert() returns an independent image, so the file can be closed right away.
        with Image.open(row["img"]) as raw_image:
            image = raw_image.convert("RGB")

        # Tiles and the padded whole image use the same configured encoder size.
        tiles = [np.array(self.transforms(im)) for im in slice_image(image, image_encoder_size)]
        whole = np.array(self.transforms(resize_pad_image(image, image_encoder_size)))

        # Order matters: the whole image first, then the three tiles.
        sliced_images = torch.from_numpy(np.array([whole] + tiles)).to(device)

        return {
            "image_start_token": start_token,
            "image_end_token": end_token,
            "sentence": sentence,
            "image": sliced_images,
        }

def final_collate_fn(batch):
    """Merge unlabelled dataset rows into padded batch tensors.

    This definition rebinds the name ``final_collate_fn`` that the test-set
    cell already defined. To keep that cell working when it is re-run after
    this one, the id tensor is appended whenever the rows carry an ``"id"``
    key; rows without ids (``OneImage``) produce the 5-tuple.

    Returns:
        Tuple ``(text, mask, images, start_tokens, end_tokens)``, followed by
        ``ids`` when every row has an ``"id"`` entry.
    """
    lens = [len(row["sentence"]) for row in batch]
    bsz, max_seq_len = len(batch), max(lens)

    # Sentences are right-padded with zeros; the mask marks real tokens with 1.
    mask_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
    text_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)

    for i_batch, (input_row, length) in enumerate(zip(batch, lens)):
        text_tensor[i_batch, :length] = input_row["sentence"]
        mask_tensor[i_batch, :length] = 1

    img_tensor = torch.stack([row["image"] for row in batch])
    img_start_token = torch.stack([row["image_start_token"] for row in batch])
    img_end_token = torch.stack([row["image_end_token"] for row in batch])

    collated = [text_tensor, mask_tensor, img_tensor, img_start_token, img_end_token]
    if all("id" in row for row in batch):
        collated.append(torch.stack([row["id"] for row in batch]))

    return tuple(collated)

def get_prediction(model, dataloader):
    """Predict every sample of a dataloader that carries neither labels nor ids.

    Args:
        model: Model called with the MMBT keyword inputs built from each batch.
        dataloader: Yields tuples whose first five entries are ``(text, mask,
            images, start_tokens, end_tokens)``.

    Returns:
        Dict with boolean ``"preds"`` (probability > 0.5) and ``"probs"``
        arrays; both are ``None`` when the dataloader yields no batches.
    """
    model.eval()

    # Per-batch arrays are collected in a list and concatenated once at the end,
    # instead of re-copying the growing arrays with np.append on every batch.
    proba_chunks = []
    for batch in tqdm(dataloader):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
                "return_dict": False
            }
            outputs = model(**inputs)
            logits = outputs[0]
            # The sigmoid is computed once; predictions are derived from it below.
            proba_chunks.append(torch.sigmoid(logits).detach().cpu().numpy())

    if proba_chunks:
        proba = np.concatenate(proba_chunks, axis=0)
        preds = proba > 0.5
    else:
        preds = proba = None

    result = {
        "preds": preds,
        "probs": proba,
    }

    return result

# The one meme to classify: image file plus its text.
img_path = "./data/img/34067.png"
img_text = "international day against violence"

test = OneImage(
    img_path,
    img_text,
    tokenizer,
    preprocess,
    max_seq_length - num_image_embeds - 2,
)

# NOTE: final_test_sampler and final_test_dataloader are rebound here; from now
# on they refer to the single-image dataset, not to the test split.
final_test_sampler = SequentialSampler(test)
final_test_dataloader = DataLoader(
    test,
    batch_size=test_batch_size,
    sampler=final_test_sampler,
    collate_fn=final_collate_fn,
)

In [None]:
# Optionally restore a different checkpoint before predicting:
#load_checkpoint("models/first_result.pt", model)
# Run the model on the single-image dataloader built in the previous cell.
results = get_prediction(model, final_test_dataloader)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the boolean decision ('preds') and the sigmoid probability ('probs') for the image.
print(results)

{'preds': array([[False]]), 'probs': array([[0.01596578]], dtype=float32)}
