In [1]:
!pip install --quiet img2vec_pytorch
print('pip install img2vec complete')

pip install img2vec complete


In [2]:
import base64
import pandas as pd
from arrow import now
from glob import glob
from img2vec_pytorch import Img2Vec
from io import BytesIO
from os.path import basename
from PIL import Image

# we're going to use the updated dataset
GLOB = '/kaggle/input/mnist-greek-letters/Greek_Letters'
SIZE = 512
STOP = 10000
THUMBNAIL_SIZE = (128, 128)


def embed(model, filename: str):
    with Image.open(fp=filename, mode='r') as image:
        return model.get_vec(image.convert('RGB'), tensor=True).numpy().reshape(SIZE,)


# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def png(filename: str) -> str:
    with Image.open(fp=filename, mode='r') as image:
        buffer = BytesIO()
        # our images are pretty big; let's shrink the hover images to thumbnail size
        image.resize(size=THUMBNAIL_SIZE).save(buffer, format='png')
        return 'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()

def get_picture_from_glob(arg: str, tag: str, stop: int) -> list:
    time_get = now()
    result = [pd.Series(data=[tag, basename(input_file), embed(model=model, filename=input_file), png(filename=input_file), ],
                        index=['tag', 'name', 'value', 'image'])
        for index, input_file in enumerate(glob(pathname=arg)) if index < stop]
    print('encoded {} rows of {}  in {}'.format(len(result), tag, now() - time_get))
    return result

time_start = now()
model = Img2Vec(cuda=False, model='resnet-18')
data_dict = {basename(folder) : folder + '/*.*' for folder in glob(GLOB + '/*')}
df = pd.DataFrame(data=flatten(arg=[get_picture_from_glob(arg=value, tag=key, stop=STOP) for key, value in data_dict.items()]))
print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 113MB/s]


encoded 25 rows of theta  in 0:00:01.613128
encoded 25 rows of mu  in 0:00:01.380525
encoded 25 rows of epsilon  in 0:00:01.372130
encoded 25 rows of xi  in 0:00:01.359821
encoded 25 rows of omicron  in 0:00:01.395823
encoded 25 rows of beta  in 0:00:01.345766
encoded 25 rows of gamma  in 0:00:01.351804
encoded 25 rows of tau  in 0:00:01.378814
encoded 25 rows of alpha  in 0:00:01.385720
encoded 25 rows of nu  in 0:00:01.356610
encoded 25 rows of phi  in 0:00:01.360810
encoded 25 rows of chi  in 0:00:01.333785
encoded 25 rows of sigma  in 0:00:01.373029
encoded 25 rows of kappa  in 0:00:01.364753
encoded 25 rows of psi  in 0:00:01.338882
encoded 25 rows of zeta  in 0:00:01.425473
encoded 25 rows of omega  in 0:00:01.376579
encoded 25 rows of rho  in 0:00:01.357462
encoded 25 rows of pi  in 0:00:01.362429
encoded 25 rows of upsilon  in 0:00:01.345862
encoded 25 rows of iota  in 0:00:01.363098
encoded 25 rows of lambda  in 0:00:01.417682
encoded 25 rows of eta  in 0:00:01.955032
encoded 

In [3]:
from plotly import express
express.histogram(data_frame=df, x='tag')

We have only 25 examples of each letter, so we don't have much data to work with.

In [4]:
from arrow import now
from umap import UMAP

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(func=pd.Series))
print('done with UMAP in {}'.format(now() - time_start))

2024-07-01 13:56:16.084802: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 13:56:16.084965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 13:56:16.258280: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Mon Jul  1 13:56:28 2024 Construct fuzzy simplicial set
Mon Jul  1 13:56:28 2024 Finding Nearest Neighbors
Mon Jul  1 13:56:32 2024 Finished Nearest Neighbor Search
Mon Jul  1 13:56:36 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Mon Jul  1 13:56:38 2024 Finished embedding
done with UMAP in 0:00:10.752335


In [5]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show
from bokeh.palettes import TolRainbow
from bokeh.transform import factor_cmap

output_notebook()

datasource = ColumnDataSource(df)
mapper = factor_cmap(field_name = 'tag', palette=TolRainbow[23], factors=df['tag'].unique().tolist(), start=0, end=23)

plot_figure = figure(title='UMAP projection: Greek letters', width=1000, height=800, tools=('pan, wheel_zoom, reset'))

plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>tag:</span>
        <span style='font-size: 18px'>@tag</span>
    </div>
</div>
"""))

plot_figure.circle('x', 'y', source=datasource, line_alpha=0.6, fill_alpha=0.6, size=5, color=mapper)
show(plot_figure)




UMAP does a good job of building separate clusters for each letter. Let's build a little model.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df['value'].apply(pd.Series), df['tag'], test_size=0.2, random_state=2024, stratify=df['tag'])
model = LogisticRegression(max_iter=100000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 432 iterations
accuracy: 0.9917


Oh wow. Our simple model makes almost no errors. What mistake did it make?

In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

       alpha       1.00      1.00      1.00         5
        beta       1.00      1.00      1.00         5
         chi       1.00      1.00      1.00         5
       delta       1.00      1.00      1.00         5
     epsilon       1.00      1.00      1.00         5
         eta       1.00      1.00      1.00         5
       gamma       1.00      1.00      1.00         5
        iota       1.00      0.80      0.89         5
       kappa       0.83      1.00      0.91         5
      lambda       1.00      1.00      1.00         5
          mu       1.00      1.00      1.00         5
          nu       1.00      1.00      1.00         5
       omega       1.00      1.00      1.00         5
     omicron       1.00      1.00      1.00         5
         phi       1.00      1.00      1.00         5
          pi       1.00      1.00      1.00         5
         psi       1.00      1.00      1.00         5
         rho       1.00    

Our model confuses some kappas and iotas but is otherwise fine.