In [None]:
# #!python -m pip install -U numpy --user
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --user

In [None]:
%pip install scikit-learn --user 

In [16]:
import requests
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader, TensorDataset
from PIL import Image
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor, ResNetModel, CLIPProcessor, CLIPModel
from urllib.request import urlopen
from huggingface_hub import hf_hub_download
from datasets import load_dataset

device = "cuda"
torch_dtype = torch.float16
torch.set_default_device('cuda')

# Datasets

## CIFAR

In [3]:
# впервые download=True
cifar100 = CIFAR100('../data/cifar', download=True, train=False)
cifar_txt = [f'a photo of {cifar100.classes[img[1]]}' for img in cifar100]
cifar_img = [img[0] for img in cifar100]

print(cifar100)
len(cifar100.classes)

Files already downloaded and verified
Dataset CIFAR100
    Number of datapoints: 10000
    Root location: ../data/cifar
    Split: Test


100

# Models

## CLIP Model + CIFAR
1. Входные эмбеддинги
2. Последний слой

In [4]:
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14", device_map=device) # openai/clip-vit-base-patch32
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", device_map=device) # openai/clip-vit-base-patch32

In [None]:
# Размер эмбеддингов
# len(outputs.text_embeds[0]), len(outputs.image_embeds[0])
# (512, 512)

In [8]:
# Base/ 1 Batch - 1 GB
# Large/ 1 Batch - 3 GB
cifar_inputs = np.zeros((10000, 768+768))

for i in range(0, len(cifar_txt), 1):
    inputs = processor(
        text=cifar_txt[i:i+1],
        images=cifar_img[i:i+1], 
        return_tensors="pt", 
        padding=True
    )
    outputs = model(**inputs)
    
    cifar_inputs[i] = \
        list(outputs.text_embeds[0].detach().cpu().numpy())+\
        list(outputs.image_embeds[0].detach().cpu().numpy())
    
    torch.cuda.empty_cache()

In [9]:
np.save('../data/cifar/cifar_clip_l_inputs.npy', cifar_inputs)
#np.save('../data/cifar/cifar_clip_l_outputs.npy', cifar_outputs)

## ResNet50 + E5_small + CIFAR

In [None]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [130]:
resnet_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
resnet_model = ResNetModel.from_pretrained("microsoft/resnet-50")

In [113]:
e5_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
e5 = AutoModel.from_pretrained('intfloat/multilingual-e5-small')

In [114]:
batch_dict = e5_tokenizer(cifar_txt[0], max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = e5(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)

In [131]:
resnet_e5_outputs = np.zeros((10000, 384+2048))

for i in range(0, len(cifar_img), 1):
    inputs = resnet_processor(cifar_img[i], return_tensors="pt")
    batch_dict = e5_tokenizer(cifar_txt[i], max_length=512, padding=True, truncation=True, return_tensors='pt')
    
    with torch.no_grad():
        resnet_e5_outputs[i] = \
            np.concatenate(
                (embeddings.detach().cpu().numpy(),
                model(**inputs).pooler_output.detach().cpu().numpy().reshape(1, 2048)),
                axis=1
            )
    
    torch.cuda.empty_cache()

In [132]:
np.save('../data/cifar/cifar_resnet_e5_outputs.npy', resnet_e5_outputs)

# Кластеризация

In [12]:
from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics

dbscan = DBSCAN()
kmeans = KMeans(n_clusters=100)

In [27]:
def get_clustering_metrics(X, labels):
    return [
        metrics.silhouette_score(X, labels, metric='euclidean'),
        metrics.calinski_harabasz_score(X, labels),
        metrics.davies_bouldin_score(X, labels)
    ]

In [18]:
with open('../data/cifar/cifar_clip_outputs.npy', 'rb') as f:
    clip_outputs = np.load(f)
with open('../data/cifar/cifar_clip_inputs.npy', 'rb') as f:
    clip_inputs = np.load(f)
with open('../data/cifar/cifar_resnet_e5_outputs.npy', 'rb') as f:
    resnet_e5_outputs = np.load(f)
with open('../data/cifar/cifar_clip_l_inputs.npy', 'rb') as f:
    clip_l_inputs = np.load(f)

In [30]:
results = pd.DataFrame(columns=['method-data', 'silhouette', 'calinski_harabasz', 'davies_bouldin_score'])

kmeans.fit(clip_outputs)
results.loc[results.shape[0]] = ['clip_pooler_layers-CIFAR']+get_clustering_metrics(clip_outputs, kmeans.labels_)

kmeans.fit(clip_inputs)
results.loc[results.shape[0]] = ['clip_embeddings-CIFAR']+get_clustering_metrics(clip_inputs, kmeans.labels_)

kmeans.fit(resnet_e5_outputs)
results.loc[results.shape[0]] = ['resnet_e5-CIFAR']+get_clustering_metrics(resnet_e5_outputs, kmeans.labels_)

kmeans.fit(clip_l_inputs)
results.loc[results.shape[0]] = ['clip_large_embeddings-CIFAR']+get_clustering_metrics(clip_l_inputs, kmeans.labels_)

results.sort_values('silhouette', ascending=False)

Unnamed: 0,method-data,silhouette,calinski_harabasz,davies_bouldin_score
3,clip_large_embeddings-CIFAR,0.310093,188.514256,1.526368
1,clip_embeddings-CIFAR,0.270955,171.867107,1.556349
0,clip_pooler_layers-CIFAR,0.266921,163.704176,1.638731
2,resnet_e5-CIFAR,0.030211,49.020138,3.464721
