# Calculating the Similarity between images



In [13]:
import torch
from torch import nn
import torchvision
from torchvision import transforms as T
from torchvision.models import resnet50, ResNet50_Weights
from torch.utils.data import DataLoader
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [14]:
# Root directory of the project checkout. Set the CV_TOOLKIT_ROOT environment
# variable to point at your own copy; when it is unset the original hard-coded
# location is used, so existing setups behave exactly as before.
path_to_root = os.environ.get(
    "CV_TOOLKIT_ROOT",
    "/home/oem/Documents/coding/personal/computer_vision_toolkit",
)
# Work from the project root so the relative "notebooks/..." paths used later
# and the `src` package import below can be resolved from there.
os.chdir(path_to_root)
from src.dataset import CustomImageDataset

In [15]:
# Take the preprocessing transforms bundled with the pretrained ResNet-50
# weights instead of hand-building a pipeline.
weights = ResNet50_Weights.IMAGENET1K_V2
transform = weights.transforms()
dataset = CustomImageDataset(
    "notebooks/dataset_trunctated.csv",
    transform,
)

In [16]:
# shuffle must stay False: the model outputs are appended back to the dataframe
# afterwards, so the batches have to come out in dataset order.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=100,
    shuffle=False,
)

In [17]:
class ClassifierModel:
    """ResNet-50 with its last child module removed, used to embed images.

    The network is created with ``torchvision.models.resnet50`` and every child
    module except the final one is chained into an ``nn.Sequential``, so a
    forward pass returns the output of the second-to-last stage.
    """

    def __init__(self, weights):
        """Build the truncated network and switch it to evaluation mode.

        Args:
            weights: Forwarded unchanged as the ``weights`` argument of
                ``torchvision.models.resnet50``.
        """
        backbone = torchvision.models.resnet50(weights=weights)
        # Keep everything except the last child module (presumably the
        # classification head -- TODO confirm against torchvision).
        feature_layers = list(backbone.children())[:-1]
        self.model = nn.Sequential(*feature_layers)
        self.model.eval()

    def get_embedding(self, batch):
        """Run ``batch`` through the network and return the result as NumPy.

        Args:
            batch: Tensor whose first axis indexes the samples.

        Returns:
            numpy.ndarray with two axes: one row per sample in ``batch`` and
            all remaining output dimensions flattened into the second axis.
        """
        with torch.no_grad():
            features = self.model(batch)
            # Flatten everything after the batch axis. A bare ``squeeze()``
            # would also drop the batch axis when the batch holds one sample.
            features = torch.flatten(features, start_dim=1)

        # ``.cpu()`` returns the tensor itself when it already lives on the CPU
        # and makes the NumPy conversion valid for tensors on other devices.
        return features.cpu().numpy()

In [18]:
# Read the same CSV file that was handed to the dataset above, so the computed
# results can later be attached to its rows.
files_df = pd.read_csv(filepath_or_buffer="notebooks/dataset_trunctated.csv")
files_df.head(n=5)

Unnamed: 0,file_name,image_id,lesion_id,dx,dx_type,age,sex,localization,label
0,data/ham10000_images_part_2/ISIC_0031774.jpg,ISIC_0031774,HAM_0002275,melanocytic_Nevi,follow_up,45.0,female,lower extremity,0
1,data/ham10000_images_part_2/ISIC_0030527.jpg,ISIC_0030527,HAM_0006713,melanocytic_Nevi,follow_up,50.0,female,trunk,0
2,data/ham10000_images_part_2/ISIC_0033561.jpg,ISIC_0033561,HAM_0004708,melanocytic_Nevi,histo,45.0,male,trunk,0
3,data/ham10000_images_part_2/ISIC_0034041.jpg,ISIC_0034041,HAM_0005496,melanocytic_Nevi,histo,15.0,female,lower extremity,0
4,data/ham10000_images_part_2/ISIC_0031369.jpg,ISIC_0031369,HAM_0000531,melanoma,histo,85.0,male,face,1


In [19]:
# Build the embedder once, outside the loop: constructing ClassifierModel per
# batch would re-create the whole ResNet-50 on every iteration.
embedder = ClassifierModel(weights)

embedding_batches = []
for sample, _ in tqdm(dataloader):
    embedding_batches.append(embedder.get_embedding(sample))

# Stack the per-batch arrays vertically into a single array.
ls_tensor = np.vstack(embedding_batches)

100%|██████████| 10/10 [03:44<00:00, 22.47s/it]


Calculate the similarity score

In [20]:
# L2 norm of every row of ls_tensor, reshaped to a column vector.
norm = np.linalg.norm(ls_tensor, axis=1).reshape(-1, 1)
# Guard against an all-zero row: a zero norm would turn the division below
# into 0/0 = NaN. Rows with a non-zero norm are left untouched.
norm = np.maximum(norm, np.finfo(norm.dtype).eps)
# cos(i, j) = <x_i, x_j> / (|x_i| * |x_j|) for every pair of rows.
cos_sim_matrix = np.matmul(ls_tensor, ls_tensor.T) / np.matmul(norm, norm.T)
# Keep only the strict upper triangle (k=1) so each pair is counted once; the
# diagonal and everything below it are set to 0.
cos_sim_matrix = np.triu(cos_sim_matrix, k=1)
cos_sim_matrix

array([[0.        , 0.68474674, 0.56418365, ..., 0.65904576, 0.5718129 ,
        0.5503208 ],
       [0.        , 0.        , 0.38249406, ..., 0.692612  , 0.47265816,
        0.49114147],
       [0.        , 0.        , 0.        , ..., 0.42406806, 0.39262268,
        0.6326931 ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.5163081 ,
        0.616625  ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.49040404],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

Subset the matrix: flag every image that appears in at least one pair with a cosine similarity above 0.9

In [21]:
# An image is flagged with 1 when it takes part in at least one pair whose
# entry in cos_sim_matrix is above 0.9; every other image keeps the value 0.
ls_similar_img = np.zeros(len(files_df))
rows, cols = np.nonzero(cos_sim_matrix > 0.9)
# Index-array assignment marks both members of every matching pair at once.
ls_similar_img[rows] = 1
ls_similar_img[cols] = 1

Append the NumPy array of similarity flags to the existing dataframe

In [22]:
# Attach the flags to the dataframe as a new "similar" column, laid out on the
# frame's own index so each value lands on the row at the same position.
similar_flags = pd.Series(ls_similar_img, index=files_df.index)
files_df["similar"] = similar_flags
files_df.head()

Unnamed: 0,file_name,image_id,lesion_id,dx,dx_type,age,sex,localization,label,similar
0,data/ham10000_images_part_2/ISIC_0031774.jpg,ISIC_0031774,HAM_0002275,melanocytic_Nevi,follow_up,45.0,female,lower extremity,0,0.0
1,data/ham10000_images_part_2/ISIC_0030527.jpg,ISIC_0030527,HAM_0006713,melanocytic_Nevi,follow_up,50.0,female,trunk,0,1.0
2,data/ham10000_images_part_2/ISIC_0033561.jpg,ISIC_0033561,HAM_0004708,melanocytic_Nevi,histo,45.0,male,trunk,0,1.0
3,data/ham10000_images_part_2/ISIC_0034041.jpg,ISIC_0034041,HAM_0005496,melanocytic_Nevi,histo,15.0,female,lower extremity,0,1.0
4,data/ham10000_images_part_2/ISIC_0031369.jpg,ISIC_0031369,HAM_0000531,melanoma,histo,85.0,male,face,1,0.0
