We have a lot of data. Are all of our pictures in the same format? Yes. We know from the data card that we have 1020000 images and they are all PNGs. 

In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np


DEVICE = torch.device('cpu')
OUTPUT_SIZE = 512

model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
extraction_layer = model._modules.get('avgpool')
model.to(DEVICE)
model.eval()

scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.2, 0.2, 0.2])
to_tensor = transforms.ToTensor()

def get_vec(arg, model, extraction_layer):
    image = normalize(to_tensor(scaler(arg))).unsqueeze(0).to(DEVICE)
    result = torch.zeros(1, OUTPUT_SIZE, 1, 1)
    def copy_data(m, i, o):
        result.copy_(o.data)
    hooked = extraction_layer.register_forward_hook(copy_data)
    with torch.no_grad():
        model(image)
    hooked.remove()
    return result

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 134MB/s]


In [2]:
import base64
import pandas as pd
from arrow import now
from glob import glob
from io import BytesIO
from os.path import basename
from PIL import Image

DATA = '/kaggle/input/doodle-dataset/doodle'
STOP = 100
THUMBNAIL_SIZE = (128, 128)


def embed(model, filename: str):
    with Image.open(fp=filename, mode='r') as image:
        return get_vec(arg=image.convert('RGB'), model=model, extraction_layer=extraction_layer).numpy().reshape(OUTPUT_SIZE,)


# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def png(filename: str) -> str:
    with Image.open(fp=filename, mode='r') as image:
        buffer = BytesIO()
        # our images are pretty big; let's shrink the hover images to thumbnail size
        image.resize(size=THUMBNAIL_SIZE).convert('RGB').save(buffer, format='png')
        return 'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()

def get_picture_from_glob(arg: str, tag: str, stop: int) -> list:
    time_get = now()
    result = [pd.Series(data=[tag, basename(input_file), embed(model=model, filename=input_file), png(filename=input_file), ],
                        index=['tag', 'name', 'value', 'image'])
        for index, input_file in enumerate(glob(pathname=arg)) if index < stop and input_file.endswith('.png')]
    print('encoded {} rows of {}  in {}'.format(len(result), tag, now() - time_get))
    return result

time_start = now()
data_dict = {basename(folder) : folder + '/*.*' for folder in glob(DATA + '/*')}
df = pd.DataFrame(data=flatten(arg=[get_picture_from_glob(arg=value, tag=key, stop=STOP) for key, value in data_dict.items()]))
print('done in {}'.format(now() - time_start))

encoded 100 rows of fan  in 0:00:06.171371
encoded 100 rows of moon  in 0:00:06.593322
encoded 100 rows of microphone  in 0:00:06.148222
encoded 100 rows of calculator  in 0:00:06.090581
encoded 100 rows of van  in 0:00:06.095469
encoded 100 rows of spider  in 0:00:05.887411
encoded 100 rows of parrot  in 0:00:06.827812
encoded 100 rows of piano  in 0:00:06.167317
encoded 100 rows of scorpion  in 0:00:05.954906
encoded 100 rows of broccoli  in 0:00:05.986825
encoded 100 rows of sea turtle  in 0:00:06.006649
encoded 100 rows of envelope  in 0:00:06.520807
encoded 100 rows of mouth  in 0:00:06.136141
encoded 100 rows of birthday cake  in 0:00:05.909167
encoded 100 rows of beard  in 0:00:06.239911
encoded 100 rows of rake  in 0:00:06.317999
encoded 100 rows of motorbike  in 0:00:06.954711
encoded 100 rows of teddy-bear  in 0:00:06.518163
encoded 100 rows of cell phone  in 0:00:06.706752
encoded 100 rows of airplane  in 0:00:06.307238
encoded 100 rows of hedgehog  in 0:00:06.266299
encoded

In [3]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(func=pd.Series))
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-08-09 15:33:39.713064: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-09 15:33:39.713261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-09 15:33:39.878895: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Fri Aug  9 15:33:58 2024 Construct fuzzy simplicial set
Fri Aug  9 15:33:58 2024 Finding Nearest Neighbors
Fri Aug  9 15:33:58 2024 Building RP forest with 14 trees
Fri Aug  9 15:34:07 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	 6  /  15
	Stopping threshold met -- exiting after 6 iterations
Fri Aug  9 15:34:34 2024 Finished Nearest Neighbor Search
Fri Aug  9 15:34:40 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Fri Aug  9 15:35:09 2024 Finished embedding
done with UMAP in 0:01:15.910606


In [4]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show


# for performance reasons we need to take a sample of the data to visualize
TOP_N = min(len(df)-1, 1000)

output_notebook()
datasource = ColumnDataSource(df.sample(n=TOP_N, random_state=2024))
plot_figure = figure(title='UMAP projection: doodles', width=1000, height=800, tools=('pan, wheel_zoom, reset'))

plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>tag:</span>
        <span style='font-size: 18px'>@tag</span>
    </div>
</div>
"""))

plot_figure.scatter(x='x', y='y', source=datasource, line_alpha=0.6, fill_alpha=0.6, size=5, )
show(plot_figure)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df['value'].apply(pd.Series), df['tag'], test_size=0.2, random_state=2024, stratify=df['tag'])
model = LogisticRegression(max_iter=1000, tol=1e-4).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 1000 iterations
accuracy: 0.4943


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
