In [1]:
!pip install --upgrade --quiet pip
!pip install --quiet img2vec_pytorch

In [2]:
import base64
import pandas as pd

from glob import glob
from io import BytesIO
from os.path import basename

from arrow import now
from img2vec_pytorch import Img2Vec
from PIL import Image

SIZE = 512
STOP =  7500
DATA_GLOB = '/kaggle/input/road-classification/Road Classification'

# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    time_get = now()
    result = []
    for index, input_file in enumerate(glob(pathname=arg)):
        if index < stop:
            name = basename(input_file)
            with Image.open(fp=input_file, mode='r') as image:
                vector = img2vec.get_vec(image, tensor=True).numpy().reshape(SIZE,)
                buffer = BytesIO()
                size = (128, 128)
                image.resize(size=size).save(buffer, format='png')
                result.append(pd.Series(data=[tag, name, vector, 
                                              'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode(),
                                             ], index=['tag', 'name', 'value', 'image']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result

img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=SIZE)

time_start = now()

files = {basename(folder) : folder + '/*.jpg' for folder in glob(DATA_GLOB + '/*')}
data = [get_from_glob(arg=value, tag=key, stop=STOP) for key, value in files.items()]
df = pd.DataFrame(data=flatten(arg=data))
    
print('done in {}'.format(now() - time_start))


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 104MB/s]


encoded Broken data 110 rows in 0:00:13.115653
encoded Not Broken data 94 rows in 0:00:10.562127
done in 0:00:23.711114


In [3]:
from plotly import express
express.pie(data_frame=df, names='tag', color='tag')

We don't have much data but our classes are nearly balanced. Let's use dimension reduction to visualize our data before we build a model.

In [4]:
from arrow import now
from umap import UMAP

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
df[['x', 'y']] = umap.fit_transform(X=df['value'].apply(func=pd.Series))
print('done with UMAP in {}'.format(now() - time_start))

2024-03-22 14:16:41.188514: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 14:16:41.188670: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 14:16:41.361732: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Fri Mar 22 14:16:53 2024 Construct fuzzy simplicial set
Fri Mar 22 14:16:54 2024 Finding Nearest Neighbors
Fri Mar 22 14:16:58 2024 Finished Nearest Neighbor Search
Fri Mar 22 14:17:02 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Fri Mar 22 14:17:04 2024 Finished embedding
done with UMAP in 0:00:10.307791


In [5]:
express.scatter(data_frame=df, x='x', y='y', color='tag')





This is encouraging: we can imagine putting a line through the plot that puts the majority of the broken instances on one side and the not-broken instances on the other. Let's visualize again and include our thumbnail images we built above.

In [6]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show
from bokeh.palettes import Set1_3
from bokeh.transform import factor_cmap

output_notebook()

datasource = ColumnDataSource(df)
mapper = factor_cmap(field_name = 'tag', palette=Set1_3, factors=['Not Broken', 'Broken'], start=0, end=3)

plot_figure = figure(title='UMAP projection: roads', width=1000, height=800, tools=('pan, wheel_zoom, reset'))

plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>tag:</span>
        <span style='font-size: 18px'>@tag</span>
    </div>
</div>
"""))

plot_figure.circle('x', 'y', source=datasource, line_alpha=0.6, fill_alpha=0.6, size=5, color=mapper)
show(plot_figure)

Let's build a model. Our classes are somewhat unbalanced, so we need to stratify our split according to the target variable.

In [7]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['value'].apply(func=pd.Series), df['tag'], test_size=0.25, random_state=2024, stratify=df['tag'])

time_start = arrow.now()
model = LogisticRegression(max_iter=100000, tol=1e-12).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))
print('model done in {}'.format(now() - time_start))

model fit in 113 iterations took 0:00:00.085061
accuracy: 0.9020
model done in 0:00:00.117471


This doesn't look bad. Let's look at the classification report.

In [8]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

      Broken       0.93      0.89      0.91        28
  Not Broken       0.88      0.91      0.89        23

    accuracy                           0.90        51
   macro avg       0.90      0.90      0.90        51
weighted avg       0.90      0.90      0.90        51



Our model does about equally well on either class, which is good news.