In [None]:
# #!python -m pip install -U numpy --user
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --user

In [None]:
%pip install scikit-learn --user 

In [112]:
import requests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import torch

import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

device = "cuda"
torch_dtype = torch.float16
torch.set_default_device('cuda')

from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor, CLIPModel
from urllib.request import urlopen
import torch.nn as nn
from huggingface_hub import hf_hub_download

from transformers import AutoImageProcessor, ResNetModel
import torch
from datasets import load_dataset

from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader, TensorDataset

# Datasets

## CIFAR

In [41]:
# впервые download=True
cifar100 = CIFAR100('../data/cifar', download=True, train=False)
cifar_txt = [f'a photo of {cifar100.classes[img[1]]}' for img in cifar100]
cifar_img = [img[0] for img in cifar100]

print(cifar100)
len(cifar100.classes)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ../data/cifar\cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:14<00:00, 11489886.61it/s]


Extracting ../data/cifar\cifar-100-python.tar.gz to ../data/cifar
Dataset CIFAR100
    Number of datapoints: 10000
    Root location: ../data/cifar
    Split: Test


100

# Models

## CLIP Model + CIFAR
1. Входные эмбеддинги
2. Последний слой

In [42]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", device_map=device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", device_map=device)

In [None]:
# Размер эмбеддингов
# len(outputs.text_embeds[0]), len(outputs.image_embeds[0])
# (512, 512)

In [108]:
# 1 Batch - 1 GB - 1414MiB /   8192MiB 
cifar_inputs = np.zeros((10000, 1024))
cifar_outputs = np.zeros((10000, 512+768))

for i in range(0, len(cifar_txt), 1):
    inputs = processor(
        text=cifar_txt[i:i+1],
        images=cifar_img[i:i+1], 
        return_tensors="pt", 
        padding=True
    )
    outputs = model(**inputs)
    
    cifar_inputs[i] = \
        list(outputs.text_embeds[0].detach().cpu().numpy())+\
        list(outputs.image_embeds[0].detach().cpu().numpy())
        
    cifar_outputs[i] = \
        list(outputs.text_model_output.pooler_output[0].detach().cpu().numpy())+\
        list(outputs.vision_model_output.pooler_output[0].detach().cpu().numpy())
    
    torch.cuda.empty_cache()

In [73]:
np.save('../data/cifar/cifar_inputs.npy', cifar_inputs)
np.save('../data/cifar/cifar_outputs.npy', cifar_outputs)

## ResNet50 + E5_small + CIFAR

In [None]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [130]:
resnet_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
resnet_model = ResNetModel.from_pretrained("microsoft/resnet-50")

In [113]:
e5_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
e5 = AutoModel.from_pretrained('intfloat/multilingual-e5-small')

In [114]:
batch_dict = e5_tokenizer(cifar_txt[0], max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = e5(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)

In [131]:
resnet_e5_outputs = np.zeros((10000, 384+2048))

for i in range(0, len(cifar_img), 1):
    inputs = resnet_processor(cifar_img[i], return_tensors="pt")
    batch_dict = e5_tokenizer(cifar_txt[i], max_length=512, padding=True, truncation=True, return_tensors='pt')
    
    with torch.no_grad():
        resnet_e5_outputs[i] = \
            np.concatenate(
                (embeddings.detach().cpu().numpy(),
                model(**inputs).pooler_output.detach().cpu().numpy().reshape(1, 2048)),
                axis=1
            )
    
    torch.cuda.empty_cache()

In [132]:
np.save('../data/cifar/cifar_resnet_e5_outputs.npy', resnet_e5_outputs)

### Кластеризация

In [153]:
from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics

dbscan = DBSCAN()
kmeans = KMeans(n_clusters=100)

In [61]:
def get_clustering_metrics(X, labels):
    print('silhouette_score', metrics.silhouette_score(X, labels, metric='euclidean'))
    print('calinski_harabasz_score', metrics.calinski_harabasz_score(X, labels))
    print('davies_bouldin_score', metrics.davies_bouldin_score(X, labels))

In [142]:
with open('../data/cifar/cifar_clip_outputs.npy', 'rb') as f:
    cifar_outputs = np.load(f)
with open('../data/cifar/cifar_clip_inputs.npy', 'rb') as f:
    cifar_inputs = np.load(f)
with open('../data/cifar/cifar_resnet_e5_outputs.npy', 'rb') as f:
    resnet_e5_outputs = np.load(f)

In [148]:
kmeans.fit(cifar_inputs)
get_clustering_metrics(cifar_inputs, kmeans.labels_)

silhouette_score 0.2636079304302523
calinski_harabasz_score 167.83438706886284
davies_bouldin_score 1.6538392274912632


In [149]:
kmeans.fit(cifar_outputs)
get_clustering_metrics(cifar_outputs, kmeans.labels_)

silhouette_score 0.25672458816254984
calinski_harabasz_score 159.1281359320414
davies_bouldin_score 1.7534762280303813


In [150]:
kmeans.fit(resnet_e5_outputs)
get_clustering_metrics(resnet_e5_outputs, kmeans.labels_)

silhouette_score 0.03077189422010465
calinski_harabasz_score 48.866926465298775
davies_bouldin_score 3.4453984017374157
