# Similarity tests

The goal of this notebook is to explore in more depth a solution that:
* stores a dataset of images
* explores further similarity metrics
* explores dimensionality reduction

### Packages

In [1]:
import numpy as np

import cv2

import os
import random

In [2]:
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModel

import torchvision.transforms as T
import torch 

In [3]:
import mlflow
from  mlflow.tracking import MlflowClient

In [4]:
from tqdm.auto import tqdm
import shutil

In [3]:
# Print the installed version of each key library so the recorded results
# can be tied to a concrete environment.
import mlflow as mlf
print(f'mlflow {mlf.__version__}')
import torch as trc
print(f'torch {trc.__version__}')
import torchvision as trv
print(f'torchvision {trv.__version__}')
import datasets as dts
print(f'datasets {dts.__version__}')
import numpy as np
print(f'numpy {np.__version__}')

mlflow 2.1.1
torch 2.0.0+cpu
torchvision 0.15.1+cpu
datasets 2.12.0
numpy 1.21.5


### Configs

In [5]:
# Parent of the current working directory, normalised; used as the base for every path below.
my_local_path = os.path.normpath(os.sep.join([os.getcwd(), os.pardir]))

In [6]:
# Project name; reused to build the registered-model name and the artifact path.
proj_name = "ImageFinder"

In [7]:
# Registered-model name derived from the project name (presumably the MLflow registry entry).
registered_model_name = '{}_models'.format(proj_name)
# MLflow run URI from which the feature-extractor settings are read.
uri = 'runs:/1f30b69e008e42c795b551e3fb240884'
# Scratch folder for single-image queries. The trailing slash is required:
# file names are appended to this string directly.
tmp_path = f'{my_local_path}/research_env/tmp_image/'

### Transformations

<b>Loading model and embeddings</b>

In [8]:
# Feature-extractor settings (size, image_mean, image_std) stored as a JSON artifact of the run.
extractor_dict = mlflow.artifacts.load_dict(f'{uri}/extractor_dict.json')

In [9]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [10]:
# Load every image under <project>/dataset as an unlabeled image-folder dataset.
image_path = f'{my_local_path}/dataset'
dataset = load_dataset(
    "imagefolder",
    data_dir=image_path,
    drop_labels=True,
)

Resolving data files:   0%|          | 0/202 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/95 [00:00<?, ?it/s]

Found cached dataset imagefolder (C:/Users/Miguel/.cache/huggingface/datasets/imagefolder/default-9098521b0ec6eb33/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Data transformation chain.
# Data transformation chain applied to a query image before it is fed to the model.
transformation_chain = T.Compose(
    [
        # Resize, then take a center crop of the extractor's configured height.
        # NOTE: the scale factor is 224 / 224 == 1, so the resize target equals the
        # crop size; no 256-pixel resize happens here.
        # NOTE(review): presumably this must match the preprocessing used when the
        # stored embeddings were built -- confirm before changing the ratio.
        T.Resize(int((224 / 224) * extractor_dict["size"]["height"])),
        T.CenterCrop(extractor_dict["size"]["height"]),
        T.ToTensor(),
        # Normalize with the mean/std stored in the extractor settings.
        T.Normalize(mean=extractor_dict["image_mean"], std=extractor_dict["image_std"]),
    ]
)

In [12]:
# Download the logged model directory from MLflow to a local path.
# NOTE: this run id is different from the run referenced by `uri`.
model_local_path = mlflow.artifacts.download_artifacts(
    run_id='7a8aae8b4b91454fb177ad796c95cf5b',
    artifact_path='{}-run'.format(proj_name),
)

In [13]:
# Load the PyTorch model from the downloaded MLflow artifact directory.
model = mlflow.pytorch.load_model(model_local_path)
# Query tensors are moved to `device` before the forward pass, so the model has
# to live on the same device. eval() switches off training-only behaviour such
# as dropout, so the same image always yields the same embedding.
model = model.to(device).eval()

In [14]:
# Load the stored embedding table from the current working directory.
# map_location="cpu" keeps the table on the CPU whatever device it was saved
# from; the similarity scores are later converted with .numpy(), which needs
# CPU tensors.
# SECURITY: torch.load unpickles the file -- only load files you created or trust.
embeding_db = torch.load('embeding_db.pt', map_location='cpu')

In [15]:
# Split the table column-wise: the last column is the image id,
# everything before it is the embedding vector.
ids_loaded = embeding_db[:, -1]
embeddings_loaded = embeding_db[:, :-1]

In [16]:
# Build one unique identifier per stored embedding: "<row position>_<true id>".
# A comprehension is used so that no loop variable leaks into the notebook
# namespace (the previous loop variable was named `id` and shadowed the builtin
# for every later cell).
# NOTE(review): ids are read from the same tensor as the embeddings, so they are
# presumably stored as floats -- confirm they stay exactly representable.
candidate_ids = [
    f"{position}_{int(true_id)}"
    for position, true_id in enumerate(tqdm(ids_loaded.tolist()))
]

  0%|          | 0/190 [00:00<?, ?it/s]

In [17]:
def compute_scores(emb_one, emb_two):
    """Computes cosine similarity between two batches of vectors.

    Args:
        emb_one: Tensor of embeddings, one vector per row.
        emb_two: Tensor broadcastable against `emb_one` (for example a single
            query row).

    Returns:
        The similarities as plain Python floats in a list, one per row.
    """
    scores = torch.nn.functional.cosine_similarity(emb_one, emb_two)
    # .numpy() raises for tensors that require grad or live on an accelerator,
    # so detach from the graph and move to the CPU first. Both calls are no-ops
    # for plain CPU tensors.
    return scores.detach().cpu().numpy().tolist()


def fetch_similar(model, all_candidate_embeddings, image, top_k=5):
    """Return the `top_k` stored candidates most similar to the query `image`.

    Relies on the notebook-level `transformation_chain`, `device`,
    `candidate_ids` and `compute_scores`.

    Args:
        model: Model called with `pixel_values`; its output must expose
            `last_hidden_state`.
        all_candidate_embeddings: Embeddings of the stored candidates, in the
            same order as `candidate_ids`.
        image: Query image accepted by `transformation_chain`.
        top_k: Number of best matches to return.

    Returns:
        A tuple `(ids, ids_true, scores, similarity_mapping_sorted)`: the row
        positions, the true ids and the scores of the best matches, plus the
        full identifier -> score mapping ordered from most to least similar.
    """
    # Preprocess the query and add the leading batch axis.
    pixel_values = transformation_chain(image).unsqueeze(0).to(device)

    # Embed the query without tracking gradients; the embedding is the first
    # position of the last hidden state.
    with torch.no_grad():
        query_embeddings = model(pixel_values=pixel_values).last_hidden_state[:, 0].cpu()

    # Score the query against every candidate in one call and pair each
    # candidate identifier with its score.
    sim_scores = compute_scores(all_candidate_embeddings, query_embeddings)
    similarity_mapping = dict(zip(candidate_ids, sim_scores))

    # Rank by score, best first.
    ranked = sorted(similarity_mapping.items(), key=lambda pair: pair[1], reverse=True)
    similarity_mapping_sorted = dict(ranked)
    best = ranked[:top_k]

    # Identifiers look like "<row position>_<true id>".
    ids = [int(entry.split("_")[0]) for entry, _ in best]
    ids_true = [int(entry.split("_")[-1]) for entry, _ in best]
    scores = [score for _, score in best]

    return ids, ids_true, scores, similarity_mapping_sorted

In [18]:
# Query with a fixed sample from the test split. A random index could be drawn
# with np.random.choice(len(dataset["test"])); ids 72, 38 and 17 were noted as
# good examples.
test_idx = 38
test_sample = dataset["test"][test_idx]["image"]

sim_ids, sim_ids_true, sim_score, sim_map = fetch_similar(
    model, embeddings_loaded, test_sample
)
print(
    f'test id is: {test_idx}',
    f"top 5 ids are: {sim_ids}",
    f"top 5 actual ids are: {sim_ids_true}",
    f"top 5 scores are: {sim_score}",
    sep="\n",
)

test id is: 38
top 5 ids are: [6, 80, 44, 20, 111]
top 5 actual ids are: [423485, 106187, 579115, 117201, 195166]
top 5 scores are: [0.6933450088041885, 0.6407462925440754, 0.5778804254596666, 0.4826035706874567, 0.44418710175889053]


In [19]:
# Run the same search on an image read from disk: the file is copied into a
# scratch folder, loaded as a one-image dataset, queried, and the folder removed.
tmp_image_name = '303217440_f595f9b310_o.jpg'
test_id_path = my_local_path + '/dataset/test/' + tmp_image_name

# exist_ok replaces the separate exists() check and is safe on re-runs.
os.makedirs(tmp_path, exist_ok=True)

try:
    dst = tmp_path + tmp_image_name
    shutil.copyfile(test_id_path, dst)

    tmp_dataset = load_dataset("imagefolder", data_dir=tmp_path, drop_labels=True)

    tmp_image = tmp_dataset['train'][0]['image']

    sim_ids, sim_ids_true, sim_score, sim_map = fetch_similar(model, embeddings_loaded, tmp_image)
    # NOTE: `test_idx` is carried over from the earlier cell; it labels this
    # query correctly only if the file above is that same test sample.
    print(f'test id is: {test_idx}')
    print(f"top 5 ids are: {sim_ids}")
    print(f"top 5 actual ids are: {sim_ids_true}")
    print(f"top 5 scores are: {sim_score}")
finally:
    # Always remove the scratch folder, even when loading or querying fails.
    # A leftover image would otherwise be picked up by the next run and could
    # become row 0 of the temporary dataset.
    shutil.rmtree(tmp_path)

Downloading and preparing dataset imagefolder/default to C:/Users/Miguel/.cache/huggingface/datasets/imagefolder/default-c95b25725be68847/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to C:/Users/Miguel/.cache/huggingface/datasets/imagefolder/default-c95b25725be68847/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

test id is: 38
top 5 ids are: [6, 80, 44, 20, 111]
top 5 actual ids are: [423485, 106187, 579115, 117201, 195166]
top 5 scores are: [0.6933450088041885, 0.6407462925440754, 0.5778804254596666, 0.4826035706874567, 0.44418710175889053]


In [20]:
# Map each true image id of the best matches to its similarity score.
{true_id: score for true_id, score in zip(sim_ids_true, sim_score)}

{423485: 0.6933450088041885,
 106187: 0.6407462925440754,
 579115: 0.5778804254596666,
 117201: 0.4826035706874567,
 195166: 0.44418710175889053}