In [1]:
import arrow
import pandas as pd
from glob import glob
from pathlib import Path


GLOB = '/kaggle/input/doodle-dataset/doodle/*/*.png'

time_start = arrow.now()
df = pd.DataFrame(data=[item for item in glob(pathname=GLOB)], columns=['file'])
df['path'] = df['file'].apply(func=lambda x: Path(x).parents[0])
print('done in {}'.format(arrow.now() - time_start))

done in 0:01:54.757920


We have all the files in a DataFrame; let's pick the first file from each folder as the exemplar and build a smaller DataFrame with the file names of one example from each class.

In [2]:
from os.path import basename
files_df = df.groupby(by='path').first().reset_index()
files_df['tag'] = files_df['path'].apply(basename)
files_df.head()

Unnamed: 0,path,file,tag
0,/kaggle/input/doodle-dataset/doodle/The Eiffel...,/kaggle/input/doodle-dataset/doodle/The Eiffel...,The Eiffel Tower
1,/kaggle/input/doodle-dataset/doodle/The Great ...,/kaggle/input/doodle-dataset/doodle/The Great ...,The Great Wall of China
2,/kaggle/input/doodle-dataset/doodle/The Mona Lisa,/kaggle/input/doodle-dataset/doodle/The Mona L...,The Mona Lisa
3,/kaggle/input/doodle-dataset/doodle/airplane,/kaggle/input/doodle-dataset/doodle/airplane/6...,airplane
4,/kaggle/input/doodle-dataset/doodle/alarm clock,/kaggle/input/doodle-dataset/doodle/alarm cloc...,alarm clock


Now let's load up some code based on [img2vec](https://github.com/christiansafka/img2vec) that will let us get an embedding vector for each image using ResNet18.

In [3]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np


DEVICE = torch.device('cpu')
OUTPUT_SIZE = 512

model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
extraction_layer = model._modules.get('avgpool')
model.to(DEVICE)
model.eval()

scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.2, 0.2, 0.2])
to_tensor = transforms.ToTensor()

def get_vec(arg, model, extraction_layer):
    image = normalize(to_tensor(scaler(arg))).unsqueeze(0).to(DEVICE)
    result = torch.zeros(1, OUTPUT_SIZE, 1, 1)
    def copy_data(m, i, o):
        result.copy_(o.data)
    hooked = extraction_layer.register_forward_hook(copy_data)
    with torch.no_grad():
        model(image)
    hooked.remove()
    return result

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 130MB/s]


And let's apply that code to get embeddings for our example files.

In [4]:
from arrow import now
from PIL import Image

def embed(model, filename: str):
    with Image.open(fp=filename, mode='r') as image:
        return get_vec(arg=image.convert('RGB'), model=model, extraction_layer=extraction_layer).numpy().reshape(OUTPUT_SIZE,)


time_start = now()
files_df['vector'] = files_df['file'].apply(func=lambda x: embed(model=model, filename=x))
print('got embeddings in {}'.format(now() - time_start))

got embeddings in 0:00:21.495653


Now we can use the embedding vectors to locate our doodles in a two-dimensional space if we use dimension reduction.

In [5]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=1000)
files_df[['x', 'y']] = umap.fit_transform(X=files_df['vector'].apply(func=pd.Series))
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-08-12 15:37:34.992901: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-12 15:37:34.993048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-12 15:37:35.182147: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=1000, n_jobs=1, random_state=2024, verbose=True)
Mon Aug 12 15:37:48 2024 Construct fuzzy simplicial set
Mon Aug 12 15:37:48 2024 Finding Nearest Neighbors
Mon Aug 12 15:37:52 2024 Finished Nearest Neighbor Search
Mon Aug 12 15:37:56 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Mon Aug 12 15:37:59 2024 Finished embedding
done with UMAP in 0:00:11.263000


Now we can use these coordinates to lay out our doodle names in a relative space.

In [6]:
from plotly import express

express.scatter(data_frame=files_df, x='x', y='y', hover_name='tag', height=800, text='tag').update_traces(marker={'size': 1})

Some of these make sense without looking at the images: I'd expect a doodle of a mountain to look like a doodle of a triangle, wouldn't you? But then again a lot of them don't make much sense. Does a cruise ship look like a traffic light?

Let's try again but this time let's place the images at the coordinates using code adapted from [here](https://community.plotly.com/t/put-images-inside-bubbles/41364/2) so we can browse them visually.

In [7]:
from PIL import Image

fig = express.scatter(data_frame=files_df,x='x', y='y', hover_name='tag', height=800, template='plotly_dark',).update_traces(marker_color='rgba(0,0,0,0)')
for _, row in files_df.iterrows():
    fig.add_layout_image(
        dict(source=Image.open(row['file']), xref='x', yref='y', xanchor='center', yanchor='middle', x=row['x'],
            y=row['y'], sizex=0.33, sizey=0.33, layer='above', sizing='contain', opacity=0.6,)
    )
fig.show()

We can see immediately that our model has put round things (necklace and blueberry) together and square things (suitcase, the Great Wall of China) together, and if we pick a region to zoom in on we can see that the model has done reasonably well at grouping objects according to finer details, like for example being closed curves containing smaller closed curves. And in that way yes, the lights on a stop light look enough like the portals on our cruise ship that they are reasonable neighbors.