In [1]:
!pip install --upgrade --quiet pip
!pip install --quiet img2vec_pytorch
print('pip install/upgrade complete.')

pip install/upgrade complete.


We're going to use ResNet to get embeddings for each of our images and then use a model to classify the embeddings.

In [2]:
from img2vec_pytorch import Img2Vec

SIZE = 512
img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=SIZE)
print('built the img2vec/ResNet model.')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 118MB/s]


built the img2vec/ResNet model.


In [3]:
import base64
import pandas as pd

from glob import glob
from io import BytesIO
from os.path import basename

from arrow import now
from PIL import Image

STOP =  10000

# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    time_get = now()
    result = []
    for index, input_file in enumerate(glob(pathname=arg)):
        if index < stop:
            name = basename(input_file)
            with Image.open(fp=input_file, mode='r') as image:
                vector = img2vec.get_vec(image, tensor=True).numpy().reshape(SIZE,)
                buffer = BytesIO()
                size = (128, 128)
                image.resize(size=size).save(buffer, format='png')
                result.append(pd.Series(data=[tag, name, vector, 
                                              'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode(),
                                             ], index=['tag', 'name', 'value', 'image']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result

time_start = now()

files = {basename(folder) : folder + '/*.jpg' for folder in glob('/kaggle/input/american-sign-language/American sign language hand gestures/Alphabets' + '/*')}
data = [get_from_glob(arg=value, tag=key, stop=STOP) for key, value in files.items()]
df = pd.DataFrame(data=flatten(arg=data))

files = {basename(folder) : folder + '/*.jpg' for folder in glob('/kaggle/input/american-sign-language/American sign language hand gestures/Numbers' + '/*')}
data = [get_from_glob(arg=value, tag=key, stop=STOP) for key, value in files.items()]
df = pd.concat(objs=[df, pd.DataFrame(data=flatten(arg=data))])
print('done in {}'.format(now() - time_start))

encoded N data 287 rows in 0:00:21.699109
encoded R data 300 rows in 0:00:20.577023
encoded I data 305 rows in 0:00:22.123234
encoded F data 305 rows in 0:00:21.428020
encoded H data 300 rows in 0:00:21.332290
encoded E data 305 rows in 0:00:22.508568
encoded U data 305 rows in 0:00:21.116830
encoded M data 305 rows in 0:00:21.838132
encoded X data 300 rows in 0:00:21.579949
encoded K data 305 rows in 0:00:21.445862
encoded Q data 305 rows in 0:00:22.687725
encoded Y data 308 rows in 0:00:22.788795
encoded S data 305 rows in 0:00:23.066198
encoded G data 296 rows in 0:00:22.339912
encoded O data 305 rows in 0:00:22.248227
encoded T data 306 rows in 0:00:22.352272
encoded V data 305 rows in 0:00:21.648275
encoded P data 305 rows in 0:00:21.371874
encoded L data 297 rows in 0:00:22.457848
encoded W data 306 rows in 0:00:21.652845
encoded D data 305 rows in 0:00:21.644307
encoded 7 data 208 rows in 0:00:15.024635
encoded 2 data 204 rows in 0:00:14.222621
encoded 10 data 208 rows in 0:00:1

We have lots of classes and not an awful lot of data, so we expect simple models to perform poorly. Let's use dimension reduction to visualize.

In [4]:
from arrow import now
from umap import UMAP
from plotly import express

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=1000)
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(func=pd.Series))
express.scatter(data_frame=df, x='x', y='y', color='tag').show()
print('done with UMAP in {}'.format(now() - time_start))

2024-03-31 20:09:02.496016: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 20:09:02.496194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 20:09:02.676297: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=1000, n_jobs=1, random_state=2024, verbose=True)
Sun Mar 31 20:09:16 2024 Construct fuzzy simplicial set
Sun Mar 31 20:09:16 2024 Finding Nearest Neighbors
Sun Mar 31 20:09:16 2024 Building RP forest with 10 trees
Sun Mar 31 20:09:23 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	Stopping threshold met -- exiting after 5 iterations
Sun Mar 31 20:09:44 2024 Finished Nearest Neighbor Search
Sun Mar 31 20:09:48 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Sun Mar 31 20:10:19 2024 Finished embedding


  sf: grouped.get_group(s if len(s) > 1 else s[0])


done with UMAP in 0:01:05.405914


So we do see some clustering within classes but not much separation among classes, Let's build a simple model.

In [5]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['value'].apply(func=pd.Series), df['tag'], test_size=0.20, random_state=2024, stratify=df['tag'])

time_start = arrow.now()
model = LogisticRegression(max_iter=10000, tol=1e-12).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))
print('model done in {}'.format(now() - time_start))

model fit in 3838 iterations took 0:04:26.039132
accuracy: 0.8609
model done in 0:04:26.128778


Honestly this is surprisingly good; let's look at the classification report.

In [6]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96        39
           1       1.00      0.97      0.99        40
          10       0.98      1.00      0.99        42
           2       0.88      0.88      0.88        41
           3       1.00      1.00      1.00        40
           4       0.97      0.90      0.94        41
           5       0.98      1.00      0.99        42
           6       0.93      0.95      0.94        39
           7       0.95      1.00      0.98        42
           8       1.00      0.95      0.98        43
           9       0.97      1.00      0.99        38
           D       0.81      0.84      0.82        61
           E       0.84      0.92      0.88        61
           F       1.00      0.92      0.96        61
           G       0.75      0.78      0.77        59
           H       0.82      0.75      0.78        60
           I       0.97      0.93      0.95        61
           K       0.91    