In [151]:
import os
import random
from typing import Callable, Optional, Tuple, Any

from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torchvision.datasets.folder import default_loader


class ImageDataLoader:
    """Bundle a ``CustomImageFolder`` dataset with a ``DataLoader`` over it.

    Every image is resized to 224x224, converted to a tensor and normalized
    with mean 0.5 / std 0.5 on each of the three channels.

    Args:
        data_dir: Root directory handed to ``CustomImageFolder``.
        batch_size: Batch size of the internal ``DataLoader``. Defaults to 1,
            which is what ``DataLoader`` uses when no batch size is given.
        shuffle: Whether the internal ``DataLoader`` shuffles. Defaults to
            ``False``.

    Note:
        ``len(obj)`` is the number of samples in the dataset, whereas
        iterating yields batches; the two only coincide for ``batch_size=1``.
    """

    def __init__(self, data_dir, batch_size=1, shuffle=False):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        self.dataset = CustomImageFolder(
            self.data_dir, transform=self.transform, target_transform=self._get_class_name)

        self.dataloader = DataLoader(
            self.dataset, batch_size=batch_size, shuffle=shuffle)

    def __len__(self):
        """Return the number of samples in the underlying dataset."""
        return len(self.dataset)

    def __iter__(self):
        """Iterate over the batches produced by the internal ``DataLoader``."""
        return iter(self.dataloader)

    def _get_class_name(self, index):
        """Identity target transform: return the class index unchanged.

        Despite its name this does not look up a class name.
        """
        return index


class CustomImageFolder(ImageFolder):
    """``ImageFolder`` variant whose items also carry the file path and class name.

    Each item is the 4-tuple ``(sample, target, path, class_name)``.
    """

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
    ):
        """
        Args:
            root (str): Dataset root directory, forwarded to ``ImageFolder``.
            transform (callable, optional): Applied to every loaded sample.
            target_transform (callable, optional): Applied to every class index.
        """
        super().__init__(
            root=root,
            transform=transform,
            target_transform=target_transform,
        )

    def __getitem__(self, index: int) -> Tuple[Any, Any, str, str]:
        """
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target, path, class_name) where target is the
            class index after ``target_transform`` (if any), path is the
            sample's file path and class_name is ``self.classes`` looked up
            with the untransformed class index.
        """
        path, class_index = self.samples[index]

        # Look the name up with the raw index: after target_transform the
        # target may no longer be a valid index into self.classes.
        class_name = self.classes[class_index]

        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)

        target = class_index
        if self.target_transform is not None:
            target = self.target_transform(class_index)

        return sample, target, path, class_name



In [1]:
class TestClass:
    """Toy class with two instance attributes, used to inspect ``__dict__``."""

    def __init__(self) -> None:
        # Assigned left to right, so 'value' precedes 'value2' in __dict__.
        self.value, self.value2 = 'a', 4


a = TestClass()
# Show the instance's attribute dictionary.
vars(a)

{'value': 'a', 'value2': 4}

In [2]:
import torch.nn.functional as F
import torch 

# --- Target given as probabilities ---------------------------------------
kl_loss = torch.nn.KLDivLoss(reduction="batchmean")
# The criterion's input must be log-probabilities.
logits = torch.randn(3, 5, requires_grad=True)
input = F.log_softmax(logits, dim=1)
# Random batch of target distributions (normally these come from the dataset).
target_scores = torch.rand(3, 5)
target = F.softmax(target_scores, dim=1)
output = kl_loss(input, target)

# --- Target given as log-probabilities -----------------------------------
kl_loss = torch.nn.KLDivLoss(log_target=True, reduction="batchmean")
log_target_scores = torch.rand(3, 5)
log_target = F.log_softmax(log_target_scores, dim=1)
output = kl_loss(input, log_target)


In [4]:
# Inspect `target`; the recorded output below is a 3x5 tensor.
target

tensor([[0.2092, 0.2701, 0.1380, 0.2157, 0.1670],
        [0.2003, 0.1553, 0.1906, 0.2305, 0.2233],
        [0.2406, 0.2203, 0.2519, 0.1886, 0.0987]])

In [152]:
# Root folder of the training split.
# NOTE(review): machine-specific absolute path; the notebook only runs as-is on that machine.
data_dir = "C:/Users/Maods/Documents/Development/Mestrado/terumo/apps/renal-pathology-retrieval/data/02_data_split/train_data"
data_loader = ImageDataLoader(data_dir=data_dir)
# Separate, unshuffled loader over the same dataset that yields 100 items per batch.
dataloader = DataLoader(
    dataset=data_loader.dataset,
    shuffle=False,
    batch_size=100,
)

In [156]:
# Fetch the first dataset item to inspect what a single sample looks like.
data_loader.dataset[0]

(tensor([[[ 0.0196, -0.0431, -0.0824,  ..., -0.1529, -0.1059, -0.1373],
          [ 0.0510, -0.0510, -0.1451,  ..., -0.1294, -0.1451, -0.1451],
          [ 0.0510, -0.0824, -0.1608,  ..., -0.0980, -0.1451, -0.0588],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ...,  0.9059,  0.7882,  0.7804],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.8510,  0.7804,  0.8275],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.8431,  0.6941,  0.5608]],
 
         [[-0.4902, -0.5843, -0.4196,  ..., -0.6549, -0.6549, -0.6549],
          [-0.4980, -0.4902, -0.4667,  ..., -0.7098, -0.6863, -0.6235],
          [-0.5137, -0.4353, -0.4745,  ..., -0.6863, -0.6000, -0.4196],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ...,  0.5451,  0.4118,  0.4588],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.5686,  0.4510,  0.4902],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.6157,  0.4039,  0.2078]],
 
         [[-0.3725, -0.4039, -0.2157,  ..., -0.4431, -0.4039, -0.3804],
          [-0.3569, -0.2941,

In [43]:
# The dataset's `classes` attribute; later cells index it with class indices (`mask[y]`)
# and compare the result against class names.
mask = data_loader.dataset.classes

In [44]:

# Stored paths are expressed relative to the parent of `data_dir`, i.e. they
# read "<split folder>/<class folder>/<file>". Deriving the prefix from
# `data_dir` avoids depending on how many components the absolute path has.
dataset_parent = os.path.dirname(os.path.normpath(data_dir))

target = []   # class index of every sample
paths = []    # '/'-separated path of every sample, relative to dataset_parent
labels = []   # class name of every sample
for x, y, batch_paths, batch_labels in dataloader:
    target.extend(y.detach().cpu().numpy())
    paths.extend(
        # Backslashes are replaced so the result is '/'-separated on Windows too.
        os.path.relpath(sample_path, dataset_parent).replace('\\', '/')
        for sample_path in batch_paths
    )
    labels.extend(batch_labels)


In [57]:
# Number of collected labels.
len(labels)

12131

In [58]:
# Sanity check: the second '/'-separated component of each path must equal its label.
for path, label in zip(paths, labels):
    class_dir = path.split('/')[1]
    assert class_dir == label



In [59]:

# Sanity check: the second '/'-separated component of each path must equal
# the `mask` entry selected by the sample's target index.
for path, y in zip(paths, target):
    class_dir = path.split('/')[1]
    assert class_dir == mask[y]

In [60]:
# Sanity check: each label must equal the `mask` entry selected by its target index.
for label, y in zip(labels, target):
    expected_label = mask[y]
    assert label == expected_label

In [65]:
import pickle
import numpy as np

def load_embeddings(pickle_file_path):
    """Load a pickled embeddings dictionary from disk.

    Args:
        pickle_file_path: Path of a pickle file holding a dict with at least
            the keys ``"embedding"`` and ``"target"``.

    Returns:
        tuple: ``(data, labels, loaded_data_dict)`` where ``data`` is the
        ``"embedding"`` entry exactly as stored, ``labels`` is the
        ``"target"`` entry converted to a numpy array, and
        ``loaded_data_dict`` is the complete unpickled dictionary.

    Raises:
        KeyError: If ``"embedding"`` or ``"target"`` is missing.

    Warning:
        Unpickling can execute arbitrary code; only load trusted files.
    """
    with open(pickle_file_path, mode='rb') as handle:
        payload = pickle.load(handle)

    # Only the targets are turned into an array; embeddings are passed through.
    embeddings = payload["embedding"]
    targets = np.array(payload["target"])
    return embeddings, targets, payload

# Pickle file to load.
# NOTE(review): `path` and `labels` are also assigned by other cells of this
# notebook, so their meaning depends on which cell ran last.
path = '../data_output/embeddings/vgg16_4096_pretrained.pickle'
data, labels, result = load_embeddings(pickle_file_path=path)

In [68]:
# List the keys available in the loaded dictionary.
result.keys()

dict_keys(['model', 'embedding', 'target', 'paths', 'classes'])

In [85]:
# Sanity check: the first '/'-separated component of every stored path must
# equal the stored class name.
for path, label in zip(result['paths'], result['classes']):
    top_dir = path.split('/')[0]
    # A real message replaces the former `print(...)`, which evaluated to
    # None and left the AssertionError without any detail.
    assert top_dir == label, (
        f"path {path!r} starts with {top_dir!r} but its class is {label!r}"
    )

In [88]:
# Sanity check: the `mask` entry selected by each stored target must equal the stored class.
for y, label in zip(result['target'], result['classes']):
    expected_label = mask[y]
    assert expected_label == label

In [89]:
# Sanity check: the `mask` entry selected by each stored target must equal
# the first '/'-separated component of the stored path.
for y, path in zip(result['target'], result['paths']):
    top_dir = path.split('/')[0]
    assert mask[y] == top_dir

In [113]:
import pandas as pd
# Prefix that a later cell prepends to each entry of result['paths'] to build image paths.
# NOTE(review): machine-specific absolute path; consider making it configurable.
base_path = "C:/Users/Maods/Documents/Development/Mestrado/terumo/apps/renal-pathology-retrieval/data/01_raw/"





In [100]:
import umap
import numpy as np
import requests
from renumics import spotlight
import json

# One row per sample: class label, full image path and the stored relative path.
spotlight_columns = {
    "label": result['classes'],
    "image": [base_path + relative_path for relative_path in result['paths']],
    "path": result['paths'],
}

df = pd.DataFrame(spotlight_columns)

# Project the embeddings down to 3 components with UMAP.
# NOTE(review): no random_state is set, so the projection differs between runs.
reducer = umap.UMAP(n_components=3)
reduced_embedding = reducer.fit_transform(result['embedding'])

# Stored as plain Python lists; embx/emby expose the first two components.
reduced_rows = np.array(reduced_embedding).tolist()
df["embedding_reduced"] = reduced_rows
df["embx"] = [row[0] for row in reduced_rows]
df["emby"] = [row[1] for row in reduced_rows]

# Fetch the Spotlight layout definition. The timeout keeps the cell from
# hanging forever, and raise_for_status() stops an HTTP error page from
# being parsed as the layout.
layout_url = "https://raw.githubusercontent.com/Renumics/spotlight/main/playbook/rookie/embedding_layout.json"
response = requests.get(layout_url, timeout=30)
response.raise_for_status()
layout = spotlight.layout.nodes.Layout(**json.loads(response.text))
spotlight.show(
    df,
    dtype={"image": spotlight.Image, "embedding_reduced": spotlight.Embedding},
    layout=layout,
)

In [116]:
import torch
from transformers import CLIPModel, CLIPProcessor

In [117]:
# Identifier of the pretrained CLIP checkpoint to load.
model_name = "openai/clip-vit-base-patch32"
# Load the pretrained model (the recorded output shows files being downloaded).
model = CLIPModel.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [121]:
# Random tensor of shape (1, 3, 224, 224) used as a stand-in input for the model.
inputs = torch.rand((1,3,224,224))

In [125]:
# Image embeddings for `inputs`, rescaled to unit L2 norm along the last dimension.
raw_image_features = model.get_image_features(inputs)
feature_norms = raw_image_features.norm(dim=-1, keepdim=True)
image_features = raw_image_features / feature_norms


In [134]:
# Flatten everything after the first dimension into a single axis.
# NOTE(review): the recorded output is a torch.Size, which this expression on
# its own would not display; the output may be from an earlier version of the cell. Confirm.
image_features.view(image_features.size(0), -1)

torch.Size([1, 512])

In [138]:
from torch.utils.data import Dataset
import os

class TripletData(Dataset):
    """Dataset yielding random ``[anchor, positive, negative]`` image triplets.

    Images are read from ``path/<category>/`` where the category folders are
    named ``"1"`` .. ``str(cats)``. The first two images of a triplet come
    from the same category, the third from a different, randomly chosen one.

    Args:
        path: Root directory containing the numbered category folders.
        transforms: Callable applied to every loaded image.
        split: ``"train"`` or ``"valid"``; stored on the instance only.
        cats: Number of categories. Defaults to 6.
        loader: Callable mapping a file path to an image. Defaults to
            torchvision's ``default_loader``, which opens the file, converts
            the image to RGB and closes the file handle.

    Note:
        ``__getitem__`` raises ``ValueError`` if the positive category holds
        fewer than two files and ``IndexError`` if ``cats`` is 1 or the
        negative category folder is empty.
    """

    def __init__(self, path, transforms, split="train", cats=6, loader=None):
        self.path = path
        self.split = split    # train or valid
        self.cats = cats      # number of categories
        self.transforms = transforms
        # default_loader is only resolved when no custom loader is supplied.
        self.loader = loader if loader is not None else default_loader

    def __getitem__(self, idx):
        """Return ``[anchor, positive, negative]`` for index ``idx``.

        The positive category is ``idx % cats + 1``; the images themselves
        are drawn at random, so repeated calls with the same index differ.
        """
        # our positive class for the triplet
        positive_cat = str(idx % self.cats + 1)
        positive_dir = os.path.join(self.path, positive_cat)

        # choosing our pair of positive images (two distinct files)
        anchor_name, positive_name = random.sample(os.listdir(positive_dir), 2)

        # choosing a negative class and negative image
        negative_cats = [str(c + 1) for c in range(self.cats)]
        negative_cats.remove(positive_cat)
        negative_cat = random.choice(negative_cats)
        negative_dir = os.path.join(self.path, negative_cat)
        negative_name = random.choice(os.listdir(negative_dir))

        triplet_paths = (
            os.path.join(positive_dir, anchor_name),
            os.path.join(positive_dir, positive_name),
            os.path.join(negative_dir, negative_name),
        )
        return [self.transforms(self.loader(p)) for p in triplet_paths]

    # we'll put some value that we want since there can be far too many triplets possible
    # multiples of the number of images/ number of categories is a good choice
    def __len__(self):
        """Return the nominal epoch length: 8 triplets per category."""
        return self.cats * 8
    

# Transforms
# Per-channel statistics shared by the training and validation pipelines.
normalize_mean = (0.4914, 0.4822, 0.4465)
normalize_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: resize, random horizontal flip, tensor conversion, normalization.
train_steps = [
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(normalize_mean, normalize_std),
]
train_transforms = transforms.Compose(train_steps)

# Validation pipeline: the same steps without the random flip.
val_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(normalize_mean, normalize_std),
]
val_transforms = transforms.Compose(val_steps)


# Triplet datasets over the train and test folders.
train_data = TripletData(
    path='../data/02_data_split/train_data', transforms=train_transforms)
val_data = TripletData(
    path='../data/02_data_split/test_data', transforms=val_transforms)

In [146]:
import random

# Scratch re-run of the negative-category selection done in TripletData.__getitem__.
path = '../data/02_data_split/train_data'
split = 'train'    # train or valid
cats = 6       # number of categories
# Kept under its own name: rebinding `transforms` here would replace the
# torchvision `transforms` module and break any later `transforms.Compose(...)`.
triplet_transforms = train_transforms

idx = 0
# our positive class for the triplet
idx = str(idx % cats + 1)

# choosing a negative class: any category other than the positive one
negative_cats = [str(x + 1) for x in range(cats)]
negative_cats.remove(idx)
negative_cat = random.choice(negative_cats)


In [150]:
# Inspect the randomly chosen negative category.
negative_cat

'3'