In [2]:
from typing import Dict, List, Type

import pandas as pd
import torch
from sklearn.decomposition import PCA
from torch import nn
from torch.nn import functional as F
from torchvision import transforms as T
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from torch.nn.functional import normalize as normalize_emb
from torchvision import models as tv_models
from datasets import load_dataset

In [3]:
DEVICE="cuda"

In [31]:
def preprocess_data(dataset):
    # Transformations
    resize = T.Resize((224, 224))
    normalize = T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    pil_to_image = T.PILToTensor()
    # Get models used to preprocess features
    mini_lm = SentenceTransformer("all-MiniLM-L6-v2").to(DEVICE)
    resnet = tv_models.resnet50(pretrained=True).to(DEVICE)
    resnet.eval()
    resnet.fc = nn.Identity()
    # Preprocess
    preprocessed = []
    for sample in tqdm(
        iterable=dataset,
        total=len(dataset),
        desc="Processing data",
    ):
        image = sample["image"]
        label = sample["label"]
        # There are 4 images in "L" format
        if sample["image"].mode == "L":
            continue
        image = pil_to_image(image).float().to(DEVICE)
        resized_img = resize(image)
        normalized_img = normalize(resized_img)
        for description in sample["description"].split("\n"):
            if not description:
                continue
            with torch.no_grad():
                # Added batch dim
                img_emb = resnet(normalized_img.unsqueeze(dim=0))
                text_emb = mini_lm.encode(
                    sentences=description,
                    convert_to_tensor=True,
                )
            preprocessed.append(
                {
                    "img_emb": normalize_emb(img_emb[0], dim=0), # Drop batch dim
                    "text_emb": normalize_emb(text_emb, dim=0),
                    "image_index": sample["img_index"],
                    "text": description,
                    "label": sample["label"],
                }
            )
    return pd.DataFrame(preprocessed)

## Preprocess Cube

In [None]:
!mkdir data/cub/

In [5]:
dataset = load_dataset("alkzar90/CC6204-Hackaton-Cub-Dataset")

Found cached dataset cc6204-hackaton-cub-dataset (/home/cicheck/.cache/huggingface/datasets/alkzar90___cc6204-hackaton-cub-dataset/default/0.0.0/de850c9086bff0dd6d6eab90f79346241178f65e1a016a50eec240ae9cdf2064)


  0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
# Orginal type is not mutable
dataset = {
    "train": list(dataset["train"]),
    "test": list(dataset["test"]),
}

In [27]:
for split in ["train", "test"]:
    for index, sample in enumerate(dataset[split]):
        sample["img_index"] = index

In [32]:
preprocessed_train = preprocess_data(dataset["train"])

Processing data: 100%|██████████████████████| 5994/5994 [11:03<00:00,  9.03it/s]


In [35]:
preprocessed_train.to_pickle("data/cub/preprocessed_train.pkl")

In [36]:
preprocessed_test = preprocess_data(dataset["test"])

Processing data: 100%|██████████████████████| 5794/5794 [11:01<00:00,  8.77it/s]


In [37]:
preprocessed_test.to_pickle("data/cub/preprocessed_test.pkl")

## Preprocess Hatefull Meme

In [None]:
!mkdir data/meme/

In [167]:
train = pd.read_json(path_or_buf="data/heatfull_meme/data/train.jsonl", lines=True)
test = pd.read_json(path_or_buf="data/heatfull_meme/data/test.jsonl", lines=True)

In [168]:
train = train.to_dict("records")
test = test.to_dict("records")

In [169]:
train[0]

{'id': 42953,
 'img': 'img/42953.png',
 'label': 0,
 'text': 'its their character not their color that matters'}

In [172]:
type(dataset["train"][0]["image"])

PIL.JpegImagePlugin.JpegImageFile

In [173]:
from PIL import Image

In [183]:
for sample in train:
    img = Image.open(f"data/heatfull_meme/data/{sample['img']}")
    sample["img"] = img
    img.close()

OSError: [Errno 24] Too many open files: 'data/heatfull_meme/data/<PIL.PngImagePlugin.PngImageFile image mode=RGB size=265x400 at 0x7F53405B56C0>'