In [1]:
!pip install --quiet img2vec_pytorch
print('pip installed img2vec')

from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning) # quiet a plotly issue
filterwarnings(action='ignore', category=UserWarning) # quiet an img2vec issue

pip installed img2vec


In [2]:
from img2vec_pytorch import Img2Vec
from PIL import Image
from arrow import now
from glob import glob
import pandas as pd
from os.path import basename

# Length of each image embedding: passed to Img2Vec as layer_output_size and
# used by get_from_glob to reshape every vector to (SIZE,).
SIZE = 512
# Upper bound on the number of files encoded per folder (the `stop` argument
# handed to get_from_glob).
STOP = 100 
# Root folder of the dataset; every entry directly beneath it is treated as a
# class folder and searched for '*.png' files.
DATA_GLOB = '/kaggle/input/rock-paper-scissors-dataset'

# https://stackoverflow.com/a/952952
def flatten(arg):
    """Return one flat list holding the items of every inner iterable of ``arg``, in order."""
    flat = []
    for inner in arg:
        flat.extend(inner)
    return flat

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    """Encode at most ``stop`` images matching a glob pattern into embedding rows.

    Relies on the module-level ``img2vec`` encoder and the ``SIZE`` constant,
    both of which must be defined before this function is called.

    Args:
        arg: glob pattern selecting the image files to encode.
        tag: label stored in the ``tag`` field of every returned row.
        stop: maximum number of files to encode; a value <= 0 encodes nothing.

    Returns:
        A list of ``pd.Series`` rows indexed by ``tag``, ``name`` (the file's
        basename) and ``value`` (the embedding reshaped to ``(SIZE,)``).
    """
    time_get = now()
    result = []
    # glob() yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the encoded subset the same on every run.
    for index, input_file in enumerate(sorted(glob(pathname=arg))):
        if index >= stop:
            # Quota reached: no need to walk the remaining matches.
            break
        name = basename(input_file)
        with Image.open(fp=input_file, mode='r') as image:
            vector = img2vec.get_vec(image, tensor=True).numpy().reshape(SIZE,)
        result.append(pd.Series(data=[tag, name, vector], index=['tag', 'name', 'value']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result


# Shared encoder read by get_from_glob (CPU only, ResNet-18, SIZE-wide output).
img2vec = Img2Vec(model='resnet-18', layer='default', layer_output_size=SIZE, cuda=False)

time_start = now()

# Map every entry under DATA_GLOB to the glob pattern of the PNG files inside it.
files = dict(
    (basename(folder), folder + '/*.png')
    for folder in glob(DATA_GLOB + '/*')
)
# Encode each folder separately (capped at STOP files), then merge the rows into one frame.
data = [
    get_from_glob(arg=pattern, tag=label, stop=STOP)
    for label, pattern in files.items()
]
df = pd.DataFrame(data=flatten(arg=data))

print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 110MB/s]


encoded paper data 100 rows in 0:00:07.516042
encoded rock data 100 rows in 0:00:07.331164
encoded scissors data 100 rows in 0:00:07.598564
done in 0:00:22.494596


In [3]:
from arrow import now
from umap import UMAP

time_start = now()
# random_state is pinned, presumably so the 2-D layout is repeatable between runs.
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
# Expand the per-row embedding arrays into one numeric column per dimension.
umap_features = df['value'].apply(pd.Series)
# Project the embeddings down to two plotting coordinates.
umap_coords = pd.DataFrame(data=umap.fit_transform(X=umap_features), columns=['x', 'y'])
# Put the coordinates beside the original columns (concat aligns on the row index).
plot_df = pd.concat(objs=[df, umap_coords], axis=1)
print('done with UMAP in {}'.format(now() - time_start))

2024-02-24 14:39:34.358252: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 14:39:34.358419: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 14:39:34.532612: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sat Feb 24 14:39:50 2024 Construct fuzzy simplicial set
Sat Feb 24 14:39:51 2024 Finding Nearest Neighbors
Sat Feb 24 14:39:56 2024 Finished Nearest Neighbor Search
Sat Feb 24 14:40:00 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Feb 24 14:40:02 2024 Finished embedding
done with UMAP in 0:00:11.692296


In [4]:
from plotly.express import scatter
# One point per image at its UMAP coordinates, colored by tag; hovering shows the file name.
embedding_figure = scatter(
    data_frame=plot_df,
    x='x',
    y='y',
    color='tag',
    hover_name='name',
    height=900,
)
embedding_figure.show()