In [1]:
# Root directory of the Kaggle dataset; every split path below is a sub-directory of it.
DATASET_ROOT = '/kaggle/input/unified-dataset-for-skin-cancer-classification/Unified_dataset'
TEST = DATASET_ROOT + '/test'
TRAIN = DATASET_ROOT + '/train'
VAL = DATASET_ROOT + '/val'

In [2]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np


# Device the network runs on, and the channel count of the buffer get_vec
# allocates for the captured layer output.
DEVICE = torch.device('cpu')
OUTPUT_SIZE = 2048

# Preprocessing transforms applied by get_vec in the order: scaler -> to_tensor -> normalize.
to_tensor = transforms.ToTensor()
scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Pretrained ResNeXt-50, moved to DEVICE and put in eval mode.
# Module.to() and Module.eval() both return the module, so the calls chain.
model = models.resnext50_32x4d(weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2).to(DEVICE).eval()

# Layer whose forward output is captured as the image embedding.
extraction_layer = model._modules.get('avgpool')

def get_vec(arg, model, extraction_layer):
    """Run one image through `model` and return the output of `extraction_layer`.

    The image is passed through the module-level `scaler`, `to_tensor` and
    `normalize` transforms, given a leading batch dimension and moved to
    DEVICE. A temporary forward hook on `extraction_layer` copies that
    layer's output into a preallocated buffer while the model runs.

    Parameters
    ----------
    arg
        Image accepted by the `scaler` transform.
    model
        Network to run the forward pass on.
    extraction_layer
        Module whose forward output is captured.

    Returns
    -------
    torch.Tensor
        Tensor of shape (1, OUTPUT_SIZE, 1, 1) holding the captured output.
    """
    image = normalize(to_tensor(scaler(arg))).unsqueeze(0).to(DEVICE)
    result = torch.zeros(1, OUTPUT_SIZE, 1, 1)

    def copy_data(m, i, o):
        # Forward-hook signature is (module, input, output); only the output is needed.
        result.copy_(o.data)

    hooked = extraction_layer.register_forward_hook(copy_data)
    try:
        with torch.no_grad():
            model(image)
    finally:
        # Always detach the hook, even if the forward pass raises; otherwise
        # stale hooks would accumulate on the shared layer across calls.
        hooked.remove()
    return result

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth" to /root/.cache/torch/hub/checkpoints/resnext50_32x4d-1a0047aa.pth
100%|██████████| 95.8M/95.8M [00:00<00:00, 152MB/s]


In [3]:
import arrow
import base64
import pandas as pd
from glob import iglob
from io import BytesIO
from os.path import basename
from os.path import isdir
from PIL import Image

THUMBNAIL_SIZE = (64, 64)

def embed(model, filename: str):
    """Return the embedding of the image file `filename` as a 1-D NumPy array.

    The file is opened with PIL, converted to RGB and passed to get_vec
    together with the module-level `extraction_layer`; the resulting tensor
    is converted to NumPy and reshaped to (OUTPUT_SIZE,).
    """
    with Image.open(fp=filename, mode='r') as image:
        rgb_image = image.convert('RGB')
        features = get_vec(arg=rgb_image, model=model, extraction_layer=extraction_layer)
        return features.numpy().reshape(OUTPUT_SIZE,)


# https://stackoverflow.com/a/952952
def flatten(arg):
    """Concatenate the items of every iterable in `arg` into a single flat list."""
    flat = []
    for group in arg:
        flat.extend(group)
    return flat

def png(filename: str) -> str:
    """Return the image at `filename` as a base64 PNG data URI.

    The image is resized to THUMBNAIL_SIZE, converted to RGB and encoded as
    PNG in memory; the bytes are base64-encoded and prefixed with the
    data-URI header.
    """
    buffer = BytesIO()
    with Image.open(fp=filename, mode='r') as image:
        # our images are pretty big; let's shrink the hover images to thumbnail size
        thumbnail = image.resize(size=THUMBNAIL_SIZE)
        thumbnail.convert('RGB').save(buffer, format='png')
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return 'data:image/png;base64,' + encoded

def get_picture_from_glob(arg: str, tag: str, limit: int = 100) -> list:
    """Build one record per image matched by a glob pattern, up to `limit` files.

    Parameters
    ----------
    arg
        Glob pattern selecting the image files.
    tag
        Label stored in the 'tag' field of every record.
    limit
        Maximum number of files to process. Defaults to 100, the cap that
        was previously hard-coded.

    Returns
    -------
    list
        A list of pandas Series with the fields 'tag', 'name' (file
        basename), 'value' (result of embed) and 'png' (result of png).

    Prints the number of records produced and the elapsed time.
    """
    # Local import keeps the function self-contained within its cell.
    from itertools import islice

    time_get = arrow.now()
    fields = ['tag', 'name', 'value', 'png']
    # islice stops the lazy glob after `limit` paths instead of listing the
    # whole directory and then discarding everything past the cap.
    result = [
        pd.Series(data=[tag, basename(input_file), embed(model=model, filename=input_file), png(filename=input_file)],
                  index=fields)
        for input_file in islice(iglob(pathname=arg), limit)
    ]
    print('encoded {} rows of {}  in {}'.format(len(result), tag, arrow.now() - time_get))
    return result

def _load_split(root: str):
    """Encode the images under `root`, one class per sub-directory.

    Returns a tuple `(globs, frame)`: `globs` maps each sub-directory name to
    the glob pattern for its files, and `frame` is a DataFrame holding the
    records get_picture_from_glob produced for every class.
    """
    globs = {basename(folder): folder + '/*.*' for folder in iglob(root + '/*') if isdir(folder)}
    records = flatten(arg=[get_picture_from_glob(arg=pattern, tag=tag) for tag, pattern in globs.items()])
    return globs, pd.DataFrame(data=records)


time_start = arrow.now()
train_dict, train_df = _load_split(TRAIN)
test_dict, test_df = _load_split(TEST)
val_dict, val_df = _load_split(VAL)
# Fold the validation rows into the training frame. ignore_index=True gives
# the combined frame a fresh 0..n-1 index; without it the two frames' row
# labels overlap and label-based lookups become ambiguous.
train_df = pd.concat(axis='index', objs=[train_df, val_df], ignore_index=True)
print('done in {}'.format(arrow.now() - time_start))

encoded 100 rows of mel  in 0:00:19.298577
encoded 100 rows of vasc  in 0:00:18.795816
encoded 100 rows of df  in 0:00:19.260866
encoded 100 rows of nv  in 0:00:18.977234
encoded 100 rows of bkl  in 0:00:18.724995
encoded 100 rows of akiec  in 0:00:19.030111
encoded 100 rows of bcc  in 0:00:19.253188
encoded 100 rows of mel  in 0:00:17.710416
encoded 41 rows of vasc  in 0:00:07.273223
encoded 52 rows of df  in 0:00:18.367366
encoded 100 rows of nv  in 0:00:17.170712
encoded 100 rows of bkl  in 0:00:17.341885
encoded 100 rows of akiec  in 0:00:17.103726
encoded 100 rows of bcc  in 0:00:17.350498
encoded 100 rows of mel  in 0:00:17.436402
encoded 87 rows of vasc  in 0:00:14.990867
encoded 71 rows of df  in 0:00:12.178350
encoded 100 rows of nv  in 0:00:17.243750
encoded 100 rows of bkl  in 0:00:17.456390
encoded 100 rows of akiec  in 0:00:17.204985
encoded 100 rows of bcc  in 0:00:17.335926
done in 0:05:59.676713


In [4]:
# Rows per class in the training frame, shown as a single wide row.
train_df['tag'].value_counts().to_frame().transpose()

tag,mel,nv,bkl,akiec,bcc,vasc,df
count,200,200,200,200,200,187,171


In [5]:
from sklearn.manifold import TSNE

def _new_tsne():
    """Create a t-SNE reducer with the settings shared by both projections."""
    return TSNE(random_state=2025, verbose=True, n_jobs=1, perplexity=20.0, init='pca')


def _feature_matrix(frame):
    """Stack the per-row vectors in `frame['value']` into one 2-D array.

    np.stack builds the matrix in a single call; the previous
    `.apply(func=pd.Series)` constructed a separate Series for every row.
    """
    return np.stack(frame['value'].tolist())


# Fit an independent 2-D projection for each frame and store it in 'x'/'y'.
train_reducer = _new_tsne()
train_df[['x', 'y']] = train_reducer.fit_transform(X=_feature_matrix(train_df))
test_reducer = _new_tsne()
test_df[['x', 'y']] = test_reducer.fit_transform(X=_feature_matrix(test_df))

[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 1358 samples in 0.007s...
[t-SNE] Computed neighbors for 1358 samples in 0.303s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1358
[t-SNE] Computed conditional probabilities for sample 1358 / 1358
[t-SNE] Mean sigma: 3.834972
[t-SNE] KL divergence after 250 iterations with early exaggeration: 72.631134
[t-SNE] KL divergence after 1000 iterations: 1.124246
[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 593 samples in 0.003s...
[t-SNE] Computed neighbors for 593 samples in 0.067s...
[t-SNE] Computed conditional probabilities for sample 593 / 593
[t-SNE] Mean sigma: 3.819822
[t-SNE] KL divergence after 250 iterations with early exaggeration: 63.414688
[t-SNE] KL divergence after 1000 iterations: 1.035268
