# Эксперименты с эмбеддерами

In [None]:
# #!python -m pip install -U numpy --user
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --user

In [None]:
%pip install git+https://github.com/huggingface/transformers --user

In [1]:
import requests
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader, TensorDataset
from PIL import Image
from transformers import (
    AutoTokenizer, AutoModel,
    AutoImageProcessor, ResNetModel,
    CLIPProcessor, CLIPModel,
    AutoModelForCausalLM, BitsAndBytesConfig,
    AutoImageProcessor, SwinModel
)
from urllib.request import urlopen
from huggingface_hub import hf_hub_download
from datasets import load_dataset

from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics

device = "cuda"
torch_dtype = torch.float16
torch.set_default_device('cuda')

# Datasets

## CIFAR

In [2]:
# впервые download=True
cifar100 = CIFAR100('../data/cifar', download=False, train=False)
cifar_txt = [f'a photo of {cifar100.classes[img[1]]}' for img in cifar100]
cifar_img = [img[0] for img in cifar100]
cifar_labels = [obj[1] for obj in cifar100]

print(cifar100)
len(cifar100.classes)

Dataset CIFAR100
    Number of datapoints: 10000
    Root location: ../data/cifar
    Split: Test


100

## MSCOCO

In [3]:
mscoco = load_dataset('../data/coco', split="test")   # "clip-benchmark/wds_mscoco_captions2017"
#temp_jpg = mscoco['jpg']
mscoco_img = mscoco['jpg'] #[np.array(temp_jpg[i])[:, :, :3] for i in range(5000)]
mscoco_img = [img.convert('RGB') for img in mscoco_img]
mscoco_txt = mscoco['txt']
#ds.save_to_disk('../data/coco')

mscoco

Dataset({
    features: ['__key__', '__url__', 'jpg', 'txt'],
    num_rows: 5000
})

# Models

## CLIP Model
1. Входные эмбеддинги - выбраны
2. Последний слой - не выбраны, чуть хуже 1. и длиннее

In [42]:
def get_clip_embeddings(model_name: str, img_inputs, txt_inputs, batch: int = 1):
    """Получение CLIP эмбеддингов
    
    Args:
        name (str): название модели: "openai/clip-vit-large-patch14", "openai/clip-vit-base-patch32", "../models/CLIP-GmP-ViT-L-14"
    """
    # Проверка на len в img_inputs/txt_inputs
    model = CLIPModel.from_pretrained(model_name, device_map=device)
    processor = CLIPProcessor.from_pretrained(model_name, device_map=device)
    
    for i in range(0, len(txt_inputs), batch):
        try:
            inputs = processor(
                text=txt_inputs[i:i+batch],
                images=img_inputs[i:i+batch], 
                return_tensors="pt"
            )
            outputs = model(**inputs)
        except:
            continue
        
        # Проверка на len в img_inputs/txt_inputs
        if i==0:
            embeds = np.zeros((len(img_inputs), outputs.text_embeds.shape[1]+outputs.image_embeds.shape[1]))
        
        for k in range(0, batch):
            embeds[i+k] = \
                list(outputs.text_embeds[k].detach().cpu().numpy())+\
                list(outputs.image_embeds[k].detach().cpu().numpy())
        
        torch.cuda.empty_cache()
    
    return embeds

## Baseline: ResNet50 + E5_small

In [19]:
def get_resnet50_e5small_embeddings(img_inputs, txt_inputs):
    """Получение resnet50 и e5small эмбеддингов"""
    
    # Инициализация
    def average_pool(last_hidden_states: Tensor,
                     attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    
    resnet_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    resnet_model = ResNetModel.from_pretrained("microsoft/resnet-50")

    e5_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
    e5 = AutoModel.from_pretrained('intfloat/multilingual-e5-small')

    
    # Скоринг
    final_embeds = np.zeros((10000, 384+2048)) # задал константами, так как baseline

    for i in range(0, len(txt_inputs), 1):
        resent_inputs = resnet_processor(img_inputs[i], return_tensors="pt")
        e5_inputs = e5_tokenizer(txt_inputs[i], max_length=512, padding=True, truncation=True, return_tensors='pt')
        
        e5_outputs = e5(**e5_inputs)
        e5_embeddings = average_pool(e5_outputs.last_hidden_state, e5_inputs['attention_mask'])
        e5_embeddings = F.normalize(e5_embeddings, p=2, dim=1)
        
        with torch.no_grad():
            final_embeds[i] = \
                np.concatenate(
                    (e5_embeddings.detach().cpu().numpy(),
                     resnet_model(**resent_inputs).pooler_output.detach().cpu().numpy().reshape(1, 2048)),
                     axis=1
                )
        
        torch.cuda.empty_cache()
    
    return final_embeds

## Qwen2-VL-2B-Instruct

In [7]:
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, Qwen2VLModel
# from qwen_vl_utils import process_vision_info

# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# # default: Load the model on the available device(s)
# model = Qwen2VLModel.from_pretrained(
#     "../models/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="cuda"
# )

In [9]:
#processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=256*28*28, max_pixels=256*28*28)

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

In [7]:
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "image",
#                 "image": "../demo.jpeg",
#             },
#             {
#                 "type": "text",
#                 "text": "Describe this image."
#             },
#         ],
#     }
# ]

In [8]:
# model.eval()

In [8]:
# text = processor.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )
# image_inputs, video_inputs = process_vision_info(messages)
# inputs = processor(
#     text=[text],
#     images=image_inputs,
#     videos=video_inputs,
#     padding=True,
#     return_tensors="pt",
# )
# inputs = inputs.to("cuda")

# # Inference: Generation of the output
# generated_ids = model.generate(**inputs, max_new_tokens=128)
# generated_ids_trimmed = [
#     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
# ]
# output_text = processor.batch_decode(
#     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
# )
# print(output_text)

In [9]:
# output_embeddings = [
#     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
# ]

## Swin_transformer и e5-small

In [23]:
def get_swin_e5small_embeddings(img_inputs, txt_inputs):
    """Получение swin_transformer и e5small эмбеддингов"""
    
    # Инициализация
    def average_pool(last_hidden_states: Tensor,
                     attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    
    swin_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-patch4-window7-224-in22k")
    swin_model = SwinModel.from_pretrained("microsoft/swin-base-patch4-window7-224-in22k")

    e5_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
    e5 = AutoModel.from_pretrained('intfloat/multilingual-e5-small')

    # Скоринг
    final_embeds = np.zeros((10000, 384+1024)) # задал константами, так как baseline

    for i in range(0, len(txt_inputs), 1):
        swin_inputs = swin_processor(img_inputs[i], return_tensors="pt")
        e5_inputs = e5_tokenizer(txt_inputs[i], max_length=512, padding=True, truncation=True, return_tensors='pt')
        
        e5_outputs = e5(**e5_inputs)
        e5_embeddings = average_pool(e5_outputs.last_hidden_state, e5_inputs['attention_mask'])
        e5_embeddings = F.normalize(e5_embeddings, p=2, dim=1)
        
        with torch.no_grad():
            final_embeds[i] = \
                np.concatenate(
                    (e5_embeddings.detach().cpu().numpy(),
                     swin_model(**swin_inputs).pooler_output.detach().cpu().numpy()),
                     axis=1
                )
        
        torch.cuda.empty_cache()
    
    return final_embeds

# Кластеризация

In [10]:
def get_clustering_metrics(X, labels, target):
    return [
        # Внутренние меры
        metrics.silhouette_score(X, labels, metric='euclidean'),
        metrics.calinski_harabasz_score(X, labels),
        metrics.davies_bouldin_score(X, labels),
        
        # Внешние меры
        metrics.rand_score(target, labels) if target else 0,
        metrics.normalized_mutual_info_score(target, labels) if target else 0
    ]

def get_clustering_experiment(exp_name: str, img_inputs, txt_inputs,
                              preprocessor_name: str, data_name: str,
                              df_results: pd.DataFrame = None,
                              model_name: str = None, target = None):
    
    if data_name == "cifar":
        kmeans = KMeans(n_clusters=100)
    else:
        kmeans = KMeans(n_clusters=10)
    
    print('exp_name')
    print('preprocessor step')
    if preprocessor_name == "clip":
        embs = get_clip_embeddings(model_name, img_inputs, txt_inputs, batch=1)
    if preprocessor_name == "baseline":
        embs = get_resnet50_e5small_embeddings(img_inputs, txt_inputs)
    if preprocessor_name == "swin_e5":
        embs = get_swin_e5small_embeddings(img_inputs, txt_inputs)
    
    print('preprocessor kmeans step')
    kmeans.fit(embs)
    
    if df_results is None:
        df_results = pd.DataFrame(columns=['method-data', 
                                           'silhouette', 'calinski_harabasz', 'davies_bouldin_score',
                                           'rand_score', 'normalized_mutual_info_score'])
    
    print('calculate metrics step')
    df_results.loc[df_results.shape[0]] = [exp_name]+get_clustering_metrics(embs, kmeans.labels_, target)
    
    
    return df_results

In [47]:
exp_setup = {
    "clip_l-CIFAR": {
        "exp_name": "clip_l-CIFAR",
        "preprocessor_name": "clip",
        "data_name": "cifar",
        "model_name": "openai/clip-vit-large-patch14",
        "img_inputs": cifar_img.copy(),
        "txt_inputs": cifar_txt.copy(),
        "target": cifar_labels,
    },
    "clip_b-CIFAR": {
        "exp_name": "clip_b-CIFAR",
        "preprocessor_name": "clip",
        "data_name": "cifar",
        "model_name": "openai/clip-vit-base-patch32",
        "img_inputs": cifar_img.copy(),
        "txt_inputs": cifar_txt.copy(),
        "target": cifar_labels,
    },
    "baseline-CIFAR": {
        "exp_name": "baseline-CIFAR",
        "preprocessor_name": "baseline",
        "data_name": "cifar",
        "img_inputs": cifar_img.copy(),
        "txt_inputs": cifar_txt.copy(),
        "target": cifar_labels,
    },
    "swin_e5-CIFAR": {
        "exp_name": "swin_e5-CIFAR",
        "preprocessor_name": "swin_e5",
        "data_name": "cifar",
        "img_inputs": cifar_img.copy(),
        "txt_inputs": cifar_txt.copy(),
        "target": cifar_labels,
    },
    
    "clip_l-MSCOCO": {
        "exp_name": "clip_l-MSCOCO",
        "preprocessor_name": "clip",
        "model_name": "openai/clip-vit-large-patch14",
        "data_name": "mscoco",
        "img_inputs": mscoco_img.copy(),
        "txt_inputs": mscoco_txt.copy(),
        "target": None,
    },
    "clip_b-MSCOCO": {
        "exp_name": "clip_b-MSCOCO",
        "preprocessor_name": "clip",
        "model_name": "openai/clip-vit-base-patch32",
        "data_name": "mscoco",
        "img_inputs": mscoco_img.copy(),
        "txt_inputs": mscoco_txt.copy(),
        "target": None,
    },
    "baseline-MSCOCO": {
        "exp_name": "baseline-MSCOCO",
        "preprocessor_name": "baseline",
        "data_name": "mscoco",
        "img_inputs": mscoco_img.copy(),
        "txt_inputs": mscoco_txt.copy(),
        "target": None,
    },
    "swin_e5-MSCOCO": {
        "exp_name": "swin_e5-MSCOCO",
        "preprocessor_name": "swin_e5",
        "data_name": "mscoco",
        "img_inputs": mscoco_img.copy(),
        "txt_inputs": mscoco_txt.copy(),
        "target": None,
    },
}

In [46]:
results = None

In [48]:
for exp in exp_setup.keys():
    print(exp)
    results = get_clustering_experiment(**exp_setup[exp], df_results=results)

clip_l-CIFAR
exp_name
preprocessor step
preprocessor kmeans step
calculate metrics step
clip_b-CIFAR
exp_name
preprocessor step
preprocessor kmeans step
calculate metrics step
baseline-CIFAR
exp_name
preprocessor step
preprocessor kmeans step
calculate metrics step
swin_e5-CIFAR
exp_name
preprocessor step
preprocessor kmeans step
calculate metrics step
clip_l-MSCOCO
exp_name
preprocessor step


Token indices sequence length is longer than the specified maximum sequence length for this model (95 > 77). Running this sequence through the model will result in indexing errors


preprocessor kmeans step
calculate metrics step
clip_b-MSCOCO
exp_name
preprocessor step


Token indices sequence length is longer than the specified maximum sequence length for this model (95 > 77). Running this sequence through the model will result in indexing errors


preprocessor kmeans step
calculate metrics step
baseline-MSCOCO
exp_name
preprocessor step
preprocessor kmeans step
calculate metrics step
swin_e5-MSCOCO
exp_name
preprocessor step
preprocessor kmeans step
calculate metrics step


In [49]:
results

Unnamed: 0,method-data,silhouette,calinski_harabasz,davies_bouldin_score,f1_score,rand_score,normalized_mutual_info_score
0,clip_l-CIFAR,0.317251,191.928211,1.474333,0.007421,0.999256,0.993279
1,clip_b-CIFAR,0.255371,163.843154,1.747091,0.0,0.998281,0.984153
2,baseline-CIFAR,0.031173,48.950943,3.452677,0.004937,0.982901,0.533847
3,swin_e5-CIFAR,0.054847,41.406548,3.329345,0.001234,0.990701,0.804645
4,clip_l-MSCOCO,0.050994,106.564796,3.801835,0.0,0.0,0.0
5,clip_b-MSCOCO,0.063904,131.452354,3.271781,0.0,0.0,0.0
6,baseline-MSCOCO,0.442548,431.290599,3.677789,0.0,0.0,0.0
7,swin_e5-MSCOCO,0.348317,76.934665,3.258689,0.0,0.0,0.0
