In [1]:
!pip install --upgrade --quiet pip
!pip install --quiet img2vec_pytorch

In [2]:
import base64
import pandas as pd

from glob import glob
from io import BytesIO
from os.path import basename

from arrow import now
from img2vec_pytorch import Img2Vec
from PIL import Image

# Length of each embedding vector: passed to Img2Vec as layer_output_size and
# used to reshape the output of get_vec.
SIZE = 512
# Upper bound on the number of files processed per folder (the `stop` argument
# of get_from_glob).
STOP =  7500
# Dataset root directory. Despite the name this is a plain directory path, not
# a glob pattern: each entry directly under it is searched for '*.jpg' files
# and its basename becomes the tag.
DATA_GLOB = '/kaggle/input/road-classification/Road Classification'

# https://stackoverflow.com/a/952952
def flatten(arg):
    """Concatenate the items of every iterable in ``arg`` into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    merged = []
    for inner in arg:
        merged.extend(inner)
    return merged

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    """Embed and thumbnail up to ``stop`` images matching a glob pattern.

    Every selected file is opened with PIL, embedded with the module-level
    ``img2vec`` model, and downscaled to half its size as a base64 PNG data URI.

    Args:
        arg: Glob pattern selecting the image files.
        tag: Label stored in the ``tag`` field of every returned row.
        stop: Maximum number of files to process; values <= 0 yield no rows.

    Returns:
        A list of ``pd.Series`` with index ``tag``, ``name`` (file basename),
        ``value`` (embedding reshaped to ``(SIZE,)``) and ``image`` (data URI).
    """
    time_get = now()
    result = []
    # glob() order depends on the file system, so sort to make the subset kept
    # by `stop` reproducible, and slice so nothing past the limit is visited.
    # max(..., 0) keeps the original "non-positive stop -> no rows" behaviour.
    selected = sorted(glob(pathname=arg))[:max(stop, 0)]
    for input_file in selected:
        name = basename(input_file)
        with Image.open(fp=input_file, mode='r') as image:
            # NOTE(review): the embedding model presumably expects 3-channel
            # input; converting makes grayscale/CMYK JPEGs usable — confirm.
            vector = img2vec.get_vec(image.convert('RGB'), tensor=True).numpy().reshape(SIZE,)
            width, height = image.size
            # Half-size thumbnail; clamp to 1 px so a 1-pixel side never
            # produces a zero-sized resize.
            size = (max(1, width // 2), max(1, height // 2))
            with BytesIO() as buffer:
                image.resize(size=size).save(buffer, format='png')
                encoded = base64.b64encode(buffer.getvalue()).decode()
        result.append(pd.Series(data=[tag, name, vector,
                                      'data:image/png;base64,' + encoded,
                                     ], index=['tag', 'name', 'value', 'image']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result

# CPU-only ResNet-18 feature extractor; get_from_glob reads this global.
img2vec = Img2Vec(
    model='resnet-18',
    layer='default',
    layer_output_size=SIZE,
    cuda=False,
)

time_start = now()

# One glob pattern per entry under the dataset root, keyed by the entry's
# basename, which doubles as the class tag.
files = {
    basename(class_dir): class_dir + '/*.jpg'
    for class_dir in glob(DATA_GLOB + '/*')
}
# Encode each class separately, then merge the per-class row lists into a
# single frame.
data = [
    get_from_glob(arg=pattern, tag=label, stop=STOP)
    for label, pattern in files.items()
]
df = pd.DataFrame(data=flatten(arg=data))

print('done in {}'.format(now() - time_start))


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 115MB/s]


encoded Broken data 110 rows in 0:00:40.990205
encoded Not Broken data 94 rows in 0:00:28.947798
done in 0:01:09.968301


In [3]:
from plotly import express
# Pie chart of row counts per tag, to check how balanced the classes are.
express.pie(data_frame=df, names='tag', color='tag')

We don't have much data, but our classes are nearly balanced. Let's use dimensionality reduction to visualize the data before we build a model.

In [4]:
from arrow import now
from umap import UMAP

# Restart the timer for the projection step.
time_start = now()

# UMAP with a fixed random_state, running on a single job.
umap = UMAP(
    n_epochs=201,
    n_jobs=1,
    low_memory=False,
    random_state=2024,
    verbose=True,
)
# Spread every stored embedding over one column per component, fit UMAP on
# that table, and keep the two output coordinates as the x / y columns of df.
df[['x', 'y']] = umap.fit_transform(
    X=df['value'].apply(func=pd.Series),
)
print('done with UMAP in {}'.format(now() - time_start))

2024-03-22 13:52:47.162449: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 13:52:47.162604: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 13:52:47.336790: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Fri Mar 22 13:52:59 2024 Construct fuzzy simplicial set
Fri Mar 22 13:52:59 2024 Finding Nearest Neighbors
Fri Mar 22 13:53:03 2024 Finished Nearest Neighbor Search
Fri Mar 22 13:53:06 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Fri Mar 22 13:53:08 2024 Finished embedding
done with UMAP in 0:00:09.215550
