In [1]:
!pip install --quiet img2vec_pytorch
print('pip install img2vec complete.')

pip install img2vec complete.


In [2]:
import base64
import pandas as pd
from arrow import now
from glob import glob
from img2vec_pytorch import Img2Vec
from io import BytesIO
from os.path import basename
from PIL import Image

# we're going to use the updated dataset
GLOB = '/kaggle/input/arabic-characters/Arabic Character Dataset/Train Arabic'
SIZE = 512
STOP = 10000
THUMBNAIL_SIZE = (128, 128)


def embed(model, filename: str):
    with Image.open(fp=filename, mode='r') as image:
        return model.get_vec(image.convert('RGB'), tensor=True).numpy().reshape(SIZE,)


# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def png(filename: str) -> str:
    with Image.open(fp=filename, mode='r') as image:
        buffer = BytesIO()
        # our images are pretty big; let's shrink the hover images to thumbnail size
        image.resize(size=THUMBNAIL_SIZE).save(buffer, format='png')
        return 'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()

def get_picture_from_glob(arg: str, tag: str, stop: int) -> list:
    time_get = now()
    result = [pd.Series(data=[tag, basename(input_file), embed(model=model, filename=input_file), png(filename=input_file), ],
                        index=['tag', 'name', 'value', 'image'])
        for index, input_file in enumerate(glob(pathname=arg)) if index < stop]
    print('encoded {} rows of {}  in {}'.format(len(result), tag, now() - time_get))
    return result

time_start = now()
model = Img2Vec(cuda=False, model='resnet-18')
data_dict = {basename(folder) : folder + '/*.*' for folder in glob(GLOB + '/*')}
df = pd.DataFrame(data=flatten(arg=[get_picture_from_glob(arg=value, tag=key, stop=STOP) for key, value in data_dict.items()]))
print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 109MB/s]


encoded 480 rows of 7  in 0:00:26.816496
encoded 480 rows of 17  in 0:00:26.652571
encoded 480 rows of 19  in 0:00:26.318920
encoded 480 rows of 22  in 0:00:25.754817
encoded 480 rows of 2  in 0:00:26.530206
encoded 480 rows of 23  in 0:00:26.500480
encoded 480 rows of 10  in 0:00:26.216479
encoded 480 rows of 5  in 0:00:26.398057
encoded 480 rows of 20  in 0:00:26.407004
encoded 480 rows of 27  in 0:00:25.992916
encoded 480 rows of 25  in 0:00:26.604306
encoded 480 rows of 8  in 0:00:26.347428
encoded 480 rows of 12  in 0:00:26.447057
encoded 480 rows of 18  in 0:00:26.586254
encoded 480 rows of 28  in 0:00:26.480344
encoded 480 rows of 16  in 0:00:26.186368
encoded 480 rows of 13  in 0:00:26.810574
encoded 480 rows of 26  in 0:00:26.771056
encoded 480 rows of 15  in 0:00:26.951178
encoded 480 rows of 3  in 0:00:26.845569
encoded 480 rows of 1  in 0:00:26.703604
encoded 480 rows of 14  in 0:00:27.353244
encoded 480 rows of 4  in 0:00:27.802333
encoded 480 rows of 9  in 0:00:28.172578


In [3]:
TEST = '/kaggle/input/arabic-characters/Arabic Character Dataset/Test Arabic'
test_dict = {basename(folder) : folder + '/*.*' for folder in glob(TEST + '/*')}
test_df = pd.DataFrame(data=flatten(arg=[get_picture_from_glob(arg=value, tag=key, stop=STOP) for key, value in test_dict.items()]))
test_df.head()

encoded 120 rows of 7  in 0:00:06.879087
encoded 120 rows of 17  in 0:00:06.903123
encoded 120 rows of 19  in 0:00:06.894561
encoded 120 rows of 22  in 0:00:06.744403
encoded 120 rows of 2  in 0:00:07.460175
encoded 120 rows of 23  in 0:00:06.776095
encoded 120 rows of 10  in 0:00:06.753351
encoded 120 rows of 5  in 0:00:06.767718
encoded 120 rows of 20  in 0:00:06.850445
encoded 120 rows of 27  in 0:00:07.381003
encoded 120 rows of 25  in 0:00:06.900708
encoded 120 rows of 8  in 0:00:06.995161
encoded 120 rows of 12  in 0:00:06.928689
encoded 120 rows of 18  in 0:00:07.456976
encoded 120 rows of 28  in 0:00:06.940977
encoded 120 rows of 16  in 0:00:06.909794
encoded 120 rows of 13  in 0:00:06.945143
encoded 120 rows of 26  in 0:00:07.004203
encoded 120 rows of 15  in 0:00:07.591181
encoded 120 rows of 3  in 0:00:07.014548
encoded 120 rows of 1  in 0:00:06.993563
encoded 120 rows of 14  in 0:00:06.970157
encoded 120 rows of 4  in 0:00:07.511895
encoded 120 rows of 9  in 0:00:06.842469


Unnamed: 0,tag,name,value,image
0,7,id_1973_label_7.png,"[0.06012968, 0.0, 1.7098964, 0.8764554, 0.0639...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
1,7,id_2589_label_7.png,"[0.1791957, 0.011476658, 1.5836203, 1.0415391,...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
2,7,id_2534_label_7.png,"[0.19525072, 0.0, 1.5380439, 0.74947923, 0.065...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
3,7,id_797_label_7.png,"[0.18151568, 0.030502632, 2.007767, 0.79192895...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
4,7,id_686_label_7.png,"[0.029153293, 0.01863813, 1.6742319, 0.0672259...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."


In [4]:
from plotly import express
express.histogram(data_frame=df, x='tag')

In [5]:
from arrow import now
from umap import UMAP

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(func=pd.Series))
print('done with UMAP in {}'.format(now() - time_start))

2024-03-20 17:01:44.069484: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 17:01:44.069653: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 17:01:44.236494: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Wed Mar 20 17:01:58 2024 Construct fuzzy simplicial set
Wed Mar 20 17:01:58 2024 Finding Nearest Neighbors
Wed Mar 20 17:01:58 2024 Building RP forest with 11 trees
Wed Mar 20 17:02:05 2024 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	Stopping threshold met -- exiting after 6 iterations
Wed Mar 20 17:02:26 2024 Finished Nearest Neighbor Search
Wed Mar 20 17:02:30 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Wed Mar 20 17:02:41 2024 Finished embedding
done with UMAP in 0:00:45.025562


In [6]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show
from bokeh.palettes import Turbo256
from bokeh.transform import linear_cmap

output_notebook()

datasource = ColumnDataSource(df.sample(n=1000, random_state=2024))
mapper = linear_cmap(field_name = 'tag', palette=Turbo256, low=1, high=27,)

plot_figure = figure(title='UMAP projection: Arabic characters', width=1000, height=800, tools=('pan, wheel_zoom, reset'))

plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>tag:</span>
        <span style='font-size: 18px'>@tag</span>
    </div>
</div>
"""))

plot_figure.circle('x', 'y', source=datasource, line_alpha=0.6, fill_alpha=0.6, size=5, color=mapper)
show(plot_figure)

In [7]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

time_start = arrow.now()
model = LogisticRegression(max_iter=100000, tol=1e-7).fit(X=df['value'].apply(func=pd.Series), y=df['tag'])
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=test_df['tag'], y_pred=model.predict(X=test_df['value'].apply(func=pd.Series)))))
print('model done in {}'.format(now() - time_start))

model fit in 2745 iterations took 0:04:31.630024
accuracy: 0.8557
model done in 0:04:32.119841


In [8]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=test_df['tag'], y_pred=model.predict(X=test_df['value'].apply(func=pd.Series)))

array([[115,   2,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   1],
       [  1, 107,   5,   0,   0,   0,   0,   0,   0,   0,   0,   1,   0,
          0,   0,   2,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0,
          3,   0],
       [  1,   8, 101,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0,   0,   1,
          0,   7],
       [  0,   0,   0, 101,   0,   8,   2,   0,   0,   0,   0,   0,   1,
          0,   1,   0,   0,   1,   1,   0,   2,   2,   0,   1,   0,   0,
          0,   0],
       [  0,   0,   0,   0, 110,   0,   2,   0,   0,   0,   0,   0,   1,
          2,   0,   0,   0,   0,   0,   1,   2,   0,   2,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   7,   0, 108,   2,   0,   0,   0,   0,   1,   0,
          0,   0,   1,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  