In [1]:
import pandas as pd

WORDS = {'traffic light', 'umbrella'}


DATA = '/kaggle/input/doodle-dataset/master_doodle_dataframe.csv'
df = pd.read_csv(filepath_or_buffer=DATA).drop(columns=['drawing', 'key_id', 'recognized', 'countrycode'])
df['file'] = df['image_path'].apply(lambda x: '/kaggle/input/doodle-dataset/doodle/{}'.format(x[5:]))
df.head()

Unnamed: 0,word,image_path,file
0,traffic light,data/traffic light/5613582005829632.png,/kaggle/input/doodle-dataset/doodle/traffic li...
1,traffic light,data/traffic light/5769631006457856.png,/kaggle/input/doodle-dataset/doodle/traffic li...
2,traffic light,data/traffic light/4999795544424448.png,/kaggle/input/doodle-dataset/doodle/traffic li...
3,traffic light,data/traffic light/4878417906368512.png,/kaggle/input/doodle-dataset/doodle/traffic li...
4,traffic light,data/traffic light/5572841187573760.png,/kaggle/input/doodle-dataset/doodle/traffic li...


In [2]:
df['word'].value_counts().head(n=5)

word
traffic light    3000
umbrella         3000
house            3000
trombone         3000
paper clip       3000
Name: count, dtype: int64

We have 340 classes x 3000 instances each. That's a lot of data.

In [3]:
from random import sample
words = sample(population=df['word'].unique().tolist(), k=2)
t_df = pd.concat([df[df['word'] == word].sample(frac=0.1, random_state=2024) for word in words])

In [4]:
t_df['word'].value_counts().to_dict()

{'circle': 300, 'cooler': 300}

In [5]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np


DEVICE = torch.device('cpu')
OUTPUT_SIZE = 512

model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
extraction_layer = model._modules.get('avgpool')
model.to(DEVICE)
model.eval()

scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.2, 0.2, 0.2])
to_tensor = transforms.ToTensor()

def get_vec(arg, model, extraction_layer):
    image = normalize(to_tensor(scaler(arg))).unsqueeze(0).to(DEVICE)
    result = torch.zeros(1, OUTPUT_SIZE, 1, 1)
    def copy_data(m, i, o):
        result.copy_(o.data)
    hooked = extraction_layer.register_forward_hook(copy_data)
    with torch.no_grad():
        model(image)
    hooked.remove()
    return result

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 115MB/s]


In [6]:
import arrow
from PIL import Image

def embed(model, filename: str):
    with Image.open(fp=filename, mode='r') as image:
        return get_vec(arg=image.convert('RGB'), model=model, extraction_layer=extraction_layer).numpy().reshape(OUTPUT_SIZE,)


time_start = arrow.now()
t_df['vector'] = t_df['file'].apply(func=lambda x: embed(model=model, filename=x))
print('got embeddings in {}'.format(arrow.now() - time_start))

got embeddings in 0:00:39.555254


In [7]:
from plotly import express
from sklearn.metrics.pairwise import cosine_similarity

express.imshow(img=cosine_similarity(X=t_df.sort_values(by='word')['vector'].apply(pd.Series)))

What does this tell us? It tells us that images in one class tend to be more similar to images in their class than to images in the other class. Let's make a scatter plot.

In [8]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=1000)
t_df[['x', 'y']] = umap.fit_transform(X=t_df['vector'].apply(func=pd.Series))
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-08-12 18:24:20.807897: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-12 18:24:20.808034: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-12 18:24:20.982110: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=1000, n_jobs=1, random_state=2024, verbose=True)
Mon Aug 12 18:24:33 2024 Construct fuzzy simplicial set
Mon Aug 12 18:24:34 2024 Finding Nearest Neighbors
Mon Aug 12 18:24:38 2024 Finished Nearest Neighbor Search
Mon Aug 12 18:24:42 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Mon Aug 12 18:24:46 2024 Finished embedding
done with UMAP in 0:00:13.121985


In [9]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=t_df, x='x', y='y', color='word')

Almost all the time we should easily be able to tell an umbrella from a traffic light.