In [140]:
from typing import Dict, List, Type

import pandas as pd
import torch
from sklearn.decomposition import PCA
from torch import nn
from torch.nn import functional as F
from torchvision import transforms as T
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from torch.nn.functional import normalize as normalize_emb
from torchvision import models as tv_models
from datasets import load_dataset

In [146]:
DEVICE="cuda"

In [148]:
next(SentenceTransformer("all-MiniLM-L6-v2").parameters()).device

device(type='cpu')

In [152]:
def preprocess_data(dataset):
    # Transformations
    resize = T.Resize((224, 224))
    normalize = T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    pil_to_image = T.PILToTensor()
    # Get models used to preprocess features
    mini_lm = SentenceTransformer("all-MiniLM-L6-v2").to(DEVICE)
    resnet = tv_models.resnet50(pretrained=True).to(DEVICE)
    resnet.eval()
    resnet.fc = nn.Identity()
    # Preprocess
    preprocessed = []
    for sample in tqdm(
        iterable=dataset,
        total=dataset.num_rows,
        desc="Processing data",
    ):
        image = sample["image"]
        label = sample["label"]
        # There are 4 images in "L" format
        if sample["image"].mode == "L":
            continue
        image = pil_to_image(image).float().to(DEVICE)
        resized_img = resize(image)
        normalized_img = normalize(resized_img)
        for description in sample["description"].split("\n"):
            if not description:
                continue
            with torch.no_grad():
                # Added batch dim
                img_emb = resnet(normalized_img.unsqueeze(dim=0))
                text_emb = mini_lm.encode(
                    sentences=description,
                    convert_to_tensor=True,
                )
            preprocessed.append(
                {
                    "img_emb": normalize_emb(img_emb[0], dim=0), # Drop batch dim
                    "text_emb": normalize_emb(text_emb, dim=0),
                    "image": resized_img,
                    "text": description,
                    "label": sample["label"],
                }
            )
    return pd.DataFrame(preprocessed)

## Preprocess Cube

In [150]:
dataset = load_dataset("alkzar90/CC6204-Hackaton-Cub-Dataset")

Found cached dataset cc6204-hackaton-cub-dataset (/home/cicheck/.cache/huggingface/datasets/alkzar90___cc6204-hackaton-cub-dataset/default/0.0.0/de850c9086bff0dd6d6eab90f79346241178f65e1a016a50eec240ae9cdf2064)


  0%|          | 0/2 [00:00<?, ?it/s]

In [153]:
preprocessed_train = preprocess_data(dataset["train"])

Processing data:   1%|▉                                                                 | 88/5994 [00:11<13:17,  7.41it/s]


KeyboardInterrupt: 

In [131]:
preprocessed_train.head(2)

Unnamed: 0,img_emb,text_emb,image,text,label
0,"[tensor(0.0176), tensor(0.0005), tensor(0.0025...","[tensor(0.0832, device='cuda:0'), tensor(0.064...","[[[tensor(159.5414), tensor(161.8534), tensor(...",this bird is brown with a lighter brown crest.,0
1,"[tensor(0.0176), tensor(0.0005), tensor(0.0025...","[tensor(0.0553, device='cuda:0'), tensor(0.112...","[[[tensor(159.5414), tensor(161.8534), tensor(...","aquatic large bird with long hooked bill, whit...",0


In [132]:
print(preprocessed_train.shape)

(59900, 5)


In [138]:
preprocessed_train.to_pickle("data/cub/preprocessed_train.pkl")

In [135]:
preprocessed_test = preprocess_data(dataset["test"])

Processing data: 100%|████████████████████████████████████████████████████████████████| 5794/5794 [40:48<00:00,  2.37it/s]


In [139]:
preprocessed_test.to_pickle("data/cub/preprocessed_test.pkl")

## Preprocess Hatefull Meme

In [141]:
train = pd.read_json(path_or_buf="data/heatfull_meme/data/train.jsonl", lines=True)
test = pd.read_json(path_or_buf="data/heatfull_meme/data/test.jsonl", lines=True)

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."
...,...,...,...,...
8495,10423,img/10423.png,1,nobody wants to hang auschwitz me
8496,98203,img/98203.png,1,when god grants you a child after 20 years of ...
8497,36947,img/36947.png,1,gays on social media: equality! body positivit...
8498,16492,img/16492.png,1,having a bad day? you could be a siamese twin ...
