# Step 3. Clustering images with CLIP and BERT vectors and computing metrics

In this step we cluster the images using their CLIP and BERT vectors and compute similarity metrics for pairs of images.

## 0. Preparation

In [1]:
#import necessary libraries
import pandas as pd
import json
import torch
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from collections import defaultdict
import os
from sklearn.metrics.pairwise import paired_distances
import matplotlib.pyplot as plt

## 1. Definition of classes and functions

### 1.1. Dataset class: handles all operations on the dataset

In [5]:
class Dataset:
    """Reader for JSON-lines files that store one image record per line."""

    def __init__(self):
        pass

    def json_file_opening(self, json_file, img, feature_1, feature_2,
                          dictionary=False, reshaping=False, tensor=False):
        """Extract an image id and two features from every record of a file.

        Args:
            json_file: path to a text file holding one JSON object per line.
            img: key of the image identifier inside each record.
            feature_1: key of the first feature inside each record.
            feature_2: key of the second feature inside each record.
            dictionary: if false, return three parallel lists; if true,
                return two dictionaries keyed by image identifier.
            reshaping: only used with ``dictionary``; store every feature as
                a numpy array reshaped to ``(1, -1)``.
            tensor: only used with ``dictionary`` and ignored when
                ``reshaping`` is set; store every feature as a
                ``torch.FloatTensor`` with an extra leading dimension.

        Returns:
            ``(img_list, feature_1_list, feature_2_list)`` when ``dictionary``
            is false, otherwise ``(dict_1, dict_2)`` mapping each image
            identifier to its first and second feature. If an identifier
            occurs more than once, the last record wins.

        Raises:
            KeyError: a record lacks one of the requested keys.
            json.JSONDecodeError: a non-blank line is not valid JSON.
        """
        img_list, feature_1_list, feature_2_list = [], [], []
        with open(json_file, 'r') as file:
            for line in file:
                # Blank lines carry no record; skipping them keeps a stray
                # empty line from aborting the whole load.
                if not line.strip():
                    continue
                # Parse every line exactly once and pick the three fields.
                record = json.loads(line)
                img_list.append(record[img])
                feature_1_list.append(record[feature_1])
                feature_2_list.append(record[feature_2])

        if not dictionary:
            return img_list, feature_1_list, feature_2_list

        # Choose the per-feature conversion once; reshaping takes precedence
        # over tensor, as in the original branch order.
        if reshaping:
            def convert(value):
                return np.array(value).reshape(1, -1)
        elif tensor:
            def convert(value):
                return torch.FloatTensor(value).unsqueeze(dim=0)
        else:
            def convert(value):
                return value

        dict_1_list, dict_2_list = {}, {}
        for name, first, second in zip(img_list, feature_1_list, feature_2_list):
            dict_1_list[name] = convert(first)
            dict_2_list[name] = convert(second)
        return dict_1_list, dict_2_list

### 1.2. Analytics class: handles all analytics

In [4]:
def to_1D(series):
    """Flatten an iterable of iterables into a single list, one level deep."""
    flattened = []
    for chunk in series:
        flattened.extend(chunk)
    return flattened

In [6]:
class Analytics:
    """Nearest-cluster lookup, candidate-pair generation and pair metrics."""

    def __init__(self, img_list=None):
        """Create the helper.

        Args:
            img_list: optional sequence of image identifiers; entry ``i``
                names row ``i`` of the distance matrix later passed to
                ``nearest_clusters``. It may also be assigned afterwards
                through the ``img_list`` attribute.
        """
        self.img_list = img_list

    def nearest_clusters(self, img_number, clusters_number, nearest, clusters,
                         nearest_clusters_number,
                         output_path='nearest_clusters.texts.json'):
        """Find, for every image, the clusters closest to its own cluster.

        For image ``i`` each cluster ``j`` is scored by
        ``abs(nearest[i][clusters[i]] - nearest[i][j])`` and the
        ``nearest_clusters_number`` clusters with the smallest score are kept
        in ascending score order (ties resolved by the lower cluster index).
        The own cluster scores 0 and therefore comes first.

        Args:
            img_number: number of images (rows of ``nearest``) to process.
            clusters_number: number of clusters (columns of ``nearest``).
            nearest: indexable as ``nearest[i][j]``, the distance of image
                ``i`` to cluster ``j``.
            clusters: indexable as ``clusters[i]``, the cluster assigned to
                image ``i``.
            nearest_clusters_number: how many clusters to keep per image.
            output_path: JSON-lines file that receives one
                ``{'img': ..., 'clusters': [...]}`` record per image. The
                file is overwritten on every call.

        Returns:
            dict mapping each image identifier to its list of cluster indices.

        Raises:
            AttributeError: ``img_list`` was never provided.
        """
        img_list = getattr(self, 'img_list', None)
        if img_list is None:
            raise AttributeError(
                "Analytics.img_list is not set; pass img_list to Analytics() "
                "or assign the attribute before calling nearest_clusters")

        nearest_clusters_dict = {}
        # Mode 'w' on the file that is actually written: the original
        # truncated a differently named file and appended here, so every
        # re-run accumulated duplicate records.
        with open(output_path, 'w') as near:
            for i in range(img_number):
                own_distance = nearest[i][clusters[i]]
                gaps = [abs(own_distance - nearest[i][j])
                        for j in range(clusters_number)]
                # Rank indices instead of looking sorted values up with
                # list.index, which returns the same cluster twice when two
                # scores are equal. sorted() is stable, so ties keep
                # ascending cluster order.
                ranked = sorted(range(clusters_number), key=gaps.__getitem__)
                closest = ranked[:nearest_clusters_number]
                json.dump({'img': img_list[i], 'clusters': closest}, near)
                near.write('\n')
                nearest_clusters_dict[img_list[i]] = closest
        return nearest_clusters_dict

    def nearest_clusters_images(self, nearest_cl_dict):
        """Turn per-image cluster lists into a table of candidate image pairs.

        Every image is paired with all images whose first-listed cluster is
        one of the clusters listed for that image; self-pairs are removed.

        Args:
            nearest_cl_dict: mapping of image identifier to a list of cluster
                indices, the first entry being the image's own cluster (as
                returned by ``nearest_clusters``).

        Returns:
            pandas.DataFrame with columns ``image`` and ``candidate``, one
            row per pair. The index is not reset after self-pairs are
            dropped.
        """
        # Group images by the first cluster in their list.
        members = defaultdict(list)
        for image, cluster_ids in nearest_cl_dict.items():
            if cluster_ids:
                members[cluster_ids[0]].append(image)

        candidates = {}
        for image, cluster_ids in nearest_cl_dict.items():
            # A listed cluster may have no primary members; treat it as
            # empty instead of failing on a None lookup.
            pooled = [member
                      for cluster_id in cluster_ids
                      for member in members.get(cluster_id, ())]
            # dict.fromkeys removes duplicates while keeping a deterministic
            # first-seen order (set() order varies between runs for strings).
            candidates[image] = list(dict.fromkeys(pooled))

        nearest_clusters_df = pd.DataFrame({
            'image': list(candidates.keys()),
            'candidate': list(candidates.values()),
        })
        nearest_clusters_df = nearest_clusters_df.explode('candidate', ignore_index=True)
        self_pairs = nearest_clusters_df['image'] == nearest_clusters_df['candidate']
        return nearest_clusters_df.drop(nearest_clusters_df.index[self_pairs])

    @staticmethod
    def _pair_metrics(first, second):
        """Return (cosine similarities, L2 distances) for aligned vector pairs."""
        cossim, l2 = [], []
        for left, right in zip(first, second):
            cossim.append(torch.nn.functional.cosine_similarity(left, right).item())
            l2.append(paired_distances(left, right, metric='l2')[0])
        return cossim, l2

    def metrics_count(self, dataframe):
        """Add cosine-similarity and L2-distance columns for each image pair.

        Args:
            dataframe: pandas.DataFrame with columns ``img_clip_1``,
                ``img_clip_2``, ``img_bert_1`` and ``img_bert_2``.
                NOTE(review): each cell is presumably a torch tensor of
                shape (1, dim), as the cosine call is followed by
                ``.item()`` - confirm against the caller.

        Returns:
            The same dataframe object, modified in place, with the new
            columns ``clip_cossim``, ``clip_l2``, ``bert_cossim`` and
            ``bert_l2``.
        """
        clip_cossim, clip_l2 = self._pair_metrics(
            list(dataframe['img_clip_1'].values),
            list(dataframe['img_clip_2'].values))
        bert_cossim, bert_l2 = self._pair_metrics(
            list(dataframe['img_bert_1'].values),
            list(dataframe['img_bert_2'].values))
        # Assign only after everything is computed so a failure above leaves
        # the dataframe untouched.
        dataframe['clip_cossim'] = clip_cossim
        dataframe['clip_l2'] = clip_l2
        dataframe['bert_cossim'] = bert_cossim
        dataframe['bert_l2'] = bert_l2
        return dataframe

## 3. Creating clusters of images

In [None]:
# Load the data in two forms: per-image tensor dictionaries and a DataFrame.
dataset = Dataset()
img_clip, img_bert = dataset.json_file_opening('clip_bert.json', 'img', 'clip', 'bert', 
                                               dictionary=True, reshaping=False, tensor=True)
# The file holds one JSON object per line (json_file_opening parses it line
# by line), so pandas must be told to read it as JSON lines; the default
# mode rejects such a file with "Trailing data".
clip_bert_data = pd.read_json('clip_bert.json', lines=True)

In [8]:
# Parameters for clustering: both dictionaries must describe the same number of images.
assert len(img_clip) == len(img_bert)
img_list = [name for name in img_clip]
img_number = len(img_list)
# number of KMeans clusters, and how many closest clusters to keep per image
clusters_number, nearest_clusters_number = 1000, 3

In [5]:
# Fit KMeans once and take both outputs from that single fit. Calling
# fit_predict and then fit_transform trains the model twice with different
# random initialisations, so the labels and the distance matrix would refer
# to different sets of centroids.
# NOTE(review): KMeans needs a numeric 2-D feature matrix - confirm that
# clip_bert_data holds only the vector components at this point.
kmeans = KMeans(n_clusters=clusters_number)
nearest = kmeans.fit_transform(clip_bert_data)
clusters = kmeans.labels_

## 4. Finding the nearest clusters

In [8]:
analytics = Analytics()
# nearest_clusters reads the image identifiers from analytics.img_list,
# which the constructor call above does not provide; supply them here.
analytics.img_list = img_list
# find the nearest clusters for every image
nearest_clusters = analytics.nearest_clusters(img_number, clusters_number, nearest, clusters, nearest_clusters_number)
# map clusters back to images and build a dataframe of image pairs
nearest_clusters_df = analytics.nearest_clusters_images(nearest_clusters)