In [16]:
import torch, random
from tqdm import tqdm
import sys
import re
import os

# Prefer the first CUDA device, but fall back to the CPU so the notebook still
# runs (slowly) on a machine without a GPU instead of failing at the first
# `.to(device)` call.
gpu = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(gpu)
if device.type == 'cuda':
    # Release cached allocator blocks left over from earlier runs in this kernel.
    torch.cuda.empty_cache()

In [2]:
from transformers import AutoImageProcessor, AutoModel

# Vision backbone: load the checkpoint's image pre-processor and the model
# itself, then move the model's weights onto the selected device.
vision_model_name = "facebook/dinov2-large"
processor = AutoImageProcessor.from_pretrained(vision_model_name)
model = AutoModel.from_pretrained(vision_model_name)
model = model.to(device)

In [7]:
from sentence_transformers import SentenceTransformer, models

# Sentence encoder used for the caption embeddings, placed on the same device
# as the vision model.
language_model_name = "all-roberta-large-v1"
language_model = SentenceTransformer(language_model_name)
language_model = language_model.to(device)

In [6]:
# Number of entries in the data directory whose name contains "clip".
sum("clip" in entry for entry in os.listdir("/shared/raiymbek/vlm_2/collection-data_2/"))

99

In [18]:
import numpy as np
# Distinct second-dimension sizes across every saved tensor whose file name
# contains "clip"; a single value means all files share one embedding width.
np.unique([
    torch.load(os.path.join("/shared/raiymbek/vlm_2/collection-data_2/", fname)).shape[1]
    for fname in os.listdir("/shared/raiymbek/vlm_2/collection-data_2/")
    if "clip" in fname
])

array([768])

In [17]:
# Spot-check the shape of one saved tensor.
torch.load("/shared/raiymbek/vlm_2/collection-data_2/00090_clip.pt").size()

torch.Size([8290, 768])

In [3]:
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

class Collection(Dataset):
    """Image/caption pairs for one part of the collection.

    Reads ``<root>/<part:05d>.parquet``, keeps only the rows whose ``status``
    column equals ``"success"``, and serves each remaining row as
    ``(image, caption)``, with the image loaded from
    ``<root>/<part:05d>/<key>.jpg``.
    """

    def __init__(self, part, root='/shared/raiymbek/vlm_2/collection-data_2'):
        """
        Args:
            part: Integer index of the part; zero-padded to five digits in paths.
            root: Directory that holds the ``.parquet`` metadata files and the
                per-part image folders. Defaults to the original hard-coded path.
        """
        self.part = part
        self.root = root
        filename = f'{self.root}/{self.part:05d}.parquet'
        df = pd.read_parquet(filename)
        self.df = df[df["status"] == "success"]

        # Plain lists give positional lookup that is independent of the
        # (filtered, hence non-contiguous) DataFrame index.
        self.key_list = list(self.df["key"])
        self.caption_list = list(self.df["caption"])

    def __len__(self):
        """Number of rows whose status is "success"."""
        return len(self.key_list)

    def __getitem__(self, idx):
        """Return ``(image, caption)`` for row ``idx``.

        The image is a 3-channel tensor produced by ``PILToTensor``; the caption
        is returned exactly as stored in the parquet file.
        """
        key = self.key_list[idx]
        caption = self.caption_list[idx]

        # Use a context manager so the file handle is closed, and convert to RGB
        # so greyscale, palette, RGBA and CMYK files all come out with three
        # channels (the original only handled the 1-channel case).
        with Image.open(f'{self.root}/{self.part:05d}/{key}.jpg') as pil_img:
            img = transforms.PILToTensor()(pil_img.convert("RGB"))
        return img, caption

In [11]:
from torch.utils.data import DataLoader
import pandas as pd

# Embed the captions of every part with the sentence-transformer and save the
# L2-normalised embeddings as `<part>_allroberta.pt`.
#
# Only captions are needed here, so they are read straight from
# `Collection.caption_list` in chunks of 512. Going through a DataLoader over the
# dataset would open and decode every JPEG just to throw the pixels away.
for part in range(0, 99):
    cap = Collection(part)
    captions = cap.caption_list
    text_representations = []
    for start in tqdm(range(0, len(captions), 512)):
        # `encode` is given a list of strings; its result is wrapped in a float tensor.
        text_representation = language_model.encode(captions[start:start + 512])
        text_representation = torch.Tensor(text_representation)
        # Scale each row to unit L2 norm.
        text_representation = text_representation / torch.norm(text_representation, dim = 1, keepdim = True)
        text_representations.append(text_representation)

    # Stack the chunks along the sample axis, preserving caption order.
    text_representations_tensor = torch.cat(text_representations, dim = 0)
    torch.save(text_representations_tensor, f'/shared/raiymbek/vlm_2/collection-data_2/{part:05d}_allroberta.pt')

100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [16:08<00:00, 60.50s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [07:53<00:00, 29.58s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [42:44<00:00, 160.29s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [33:04<00:00, 124.05s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [28:14<00:00, 105.88s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [39:40<00:00, 148.81s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [01:34<00:00,  5.88s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [01:21<00:00,  5.10s/it]
100%|███████████████████████████

In [71]:
from torch.utils.data import DataLoader
# Small loader over part 0, in dataset order, for interactive inspection.
cap = Collection(0)
cap_dataloader = DataLoader(dataset=cap, shuffle=False, batch_size=4)

In [94]:
# For every part, compute
#   * image features: mean over the token axis of the vision model's
#     last hidden state, and
#   * text features: L2-normalised CLIP text embeddings of the captions,
# then save both next to the data.
#
# NOTE: `clip_processor` and `clip_model` are created in a cell that appears
# further down in this notebook; that cell must be run before this one.
for part in range(0, 99):
    cap = Collection(part)
    cap_dataloader = DataLoader(cap, batch_size=512, shuffle=False)
    image_representations = []
    text_representations = []
    for batch in tqdm(cap_dataloader):
        images, captions = batch

        inputs = processor(images=images, return_tensors="pt")
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Average the hidden states over dim 1 and move the result off the GPU.
        image_representation = outputs.last_hidden_state.mean(dim=1).detach().cpu()
        image_representations.append(image_representation)

        inputs = clip_processor(text=captions, return_tensors="pt", padding=True, truncation = True)
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = clip_model.get_text_features(**inputs)
        text_representation = outputs
        text_representation = text_representation / torch.norm(text_representation, dim = 1, keepdim = True)
        # Keep accumulated features on the CPU so GPU memory is not held for the
        # whole part and the saved file can be loaded without a GPU.
        text_representations.append(text_representation.cpu())

    image_representations_tensor = torch.cat(image_representations, dim = 0)
    # These are image features from the vision model. They were previously
    # written to `<part>_allroberta.pt`, the same file the sentence-transformer
    # caption embeddings are saved to, so each run clobbered the other's output.
    torch.save(image_representations_tensor, f'/shared/raiymbek/vlm_2/collection-data_2/{part:05d}_dinov2.pt')
    text_representations_tensor = torch.cat(text_representations, dim = 0)
    torch.save(text_representations_tensor, f'/shared/raiymbek/vlm_2/collection-data_2/{part:05d}_clip.pt')

100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [06:24<00:00, 24.03s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:11<00:00, 19.46s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:12<00:00, 19.53s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:00<00:00, 18.80s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:06<00:00, 19.13s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:20<00:00, 20.03s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:19<00:00, 19.94s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [05:48<00:00, 21.76s/it]
100%|███████████████████████████

In [93]:
# Manual save of whatever is currently in `text_representations`.
# NOTE(review): `part` and `text_representations` are not defined in this cell;
# they are whatever a previously executed loop cell left in the kernel.
text_representations_tensor = torch.cat(text_representations, 0)
torch.save(
    text_representations_tensor,
    f'/shared/raiymbek/vlm_2/collection-data_2/{part:05d}_clip.pt',
)

In [47]:
from transformers import CLIPProcessor, CLIPModel

# CLIP checkpoint: the processor handles tokenisation/pre-processing, the model
# is moved onto the selected device after loading.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_model = clip_model.to(device)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [None]:
# Embed the captions of every part with the CLIP text encoder and save the
# L2-normalised embeddings as `<part>_clip.pt`.
#
# Only captions are needed, so they are taken directly from
# `Collection.caption_list` in chunks of 128 instead of iterating the image
# dataset, which would decode every JPEG for nothing.
for part in range(99):
    cap = Collection(part)
    captions = cap.caption_list
    text_representations = []
    for start in tqdm(range(0, len(captions), 128)):
        inputs = clip_processor(text=captions[start:start + 128], return_tensors="pt", padding=True, truncation = True)
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = clip_model.get_text_features(**inputs)
        text_representation = outputs
        # Scale each row to unit L2 norm.
        text_representation = text_representation / torch.norm(text_representation, dim = 1, keepdim = True)
        # Accumulate on the CPU so GPU memory is freed after every chunk.
        text_representations.append(text_representation.cpu())
    # Concatenate along the sample axis (dim 0). The previous `dim = 1` glued
    # chunks together along the feature axis, which fails whenever the last chunk
    # is shorter and otherwise produces one very wide row per position rather
    # than one row per caption.
    text_representations_tensor = torch.cat(text_representations, dim = 0)
    torch.save(text_representations_tensor, f'/shared/raiymbek/vlm_2/collection-data_2/{part:05d}_clip.pt')