In [1]:
# what kind of files do we have?
from glob import glob
from pathlib import Path
from collections import Counter

# Pattern matching every file that sits one directory level below the dataset root.
GLOB = '/kaggle/input/pistachio-image-dataset/Pistachio_Image_Dataset/Pistachio_Image_Dataset/*/*'

# Tally how often each file extension occurs among the matched paths.
suffixes = Counter()
suffixes.update(Path(found_file).suffix for found_file in glob(pathname=GLOB))

print(suffixes)

Counter({'.jpg': 2148})


In [2]:
import pandas as pd
from pathlib import Path

# One record per image file: `tag` is the name of the directory that contains
# the file (used as the class label) and `name` is the full path to the file.
# NOTE(review): glob() does not guarantee an ordering, so the seeded sample
# below is only repeatable as long as the filesystem lists files the same way.
data = [
    {'tag': Path(input_file).parent.name, 'name': input_file}
    for input_file in glob(pathname=GLOB)
]
# Name the columns explicitly so they exist even when nothing matched the pattern.
df = pd.DataFrame(data=data, columns=['tag', 'name'])
# Work on a fixed-seed subsample of at most 500 images; capping at len(df)
# keeps sample() from raising when fewer files are available.
df = df.sample(n=min(500, len(df)), random_state=2024)

In [3]:
# sanity check: expect (500, 2) -- the 500 sampled rows with the `tag` and `name` columns
df.shape

(500, 2)

In [4]:
import warnings
from plotly import express

# Suppress FutureWarning messages so they do not clutter the chart output.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# One row per tag with its number of occurrences in the sample; both charts
# below read the same `tag` / `count` columns.
tag_counts = df['tag'].value_counts().to_frame().reset_index()

# Pie chart of the class balance: slices show the raw count, hover shows label and percentage.
pie_chart = express.pie(data_frame=tag_counts, names='tag', values='count')
pie_chart.update_traces(hoverinfo='label+percent', textinfo='value')
pie_chart.show()

# The same counts as a bar-style histogram, one bar per tag.
count_chart = express.histogram(data_frame=tag_counts, x='tag', y='count')
count_chart.show()

In [5]:
import arrow
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

time_start = arrow.now()
device = torch.device('cpu')
# Pretrained ResNet-34 used as a fixed feature extractor; eval() puts layers
# such as batch norm into inference behaviour.
model = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1).to(device=device)
model.eval()

# Preprocessing: resize to 224x224, convert to a tensor, then normalize each
# channel with the ImageNet mean / std that the pretrained weights expect.
scaler = transforms.Resize(size=(224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()


def load_model_input(filename):
    """Open one image file and return it as a normalized 3x224x224 tensor.

    The file is opened in a context manager so it is closed as soon as the
    tensor has been built; the full-size PIL image is never kept around.
    """
    with Image.open(fp=filename, mode='r') as image:
        return normalize(to_tensor(scaler(image.convert('RGB'))))


model_input = [load_model_input(item) for item in df['name'].tolist()]
images = torch.stack(model_input).to(device)
# Inference only: disabling autograd avoids building a graph for the whole
# batch, which saves memory and makes detach() unnecessary.
with torch.no_grad():
    # One vector per image: the network's final-layer output (the classifier
    # head is left in place), stored as a numpy array in the `value` column.
    df['value'] = [item.cpu().numpy() for item in model(images)]

# let's do some cleanup before we proceed
del model_input
del images
print('load/embed done in {}'.format(arrow.now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:00<00:00, 127MB/s]


load/embed done in 0:01:00.565546


In [6]:
import arrow
from umap import UMAP

time_start = arrow.now()

# Reducer configured with a fixed seed and a single job; verbose=True makes
# it log its progress while fitting.
umap = UMAP(n_epochs=500, low_memory=False, n_jobs=1, verbose=True, random_state=2024)

# Expand the per-image vectors in `value` into a table with one column per
# component, then project them down to the two plot coordinates.
umap_input = df['value'].apply(pd.Series)
umap_xy = umap.fit_transform(X=umap_input)
df[['x', 'y']] = umap_xy

elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-07-31 12:53:56.796927: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 12:53:56.797035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 12:53:56.928134: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Wed Jul 31 12:54:07 2024 Construct fuzzy simplicial set
Wed Jul 31 12:54:07 2024 Finding Nearest Neighbors
Wed Jul 31 12:54:10 2024 Finished Nearest Neighbor Search
Wed Jul 31 12:54:14 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jul 31 12:54:16 2024 Finished embedding
done with UMAP in 0:00:09.464737


In [7]:
import base64
from arrow import now
from io import BytesIO
from PIL import Image

# Width and height, in pixels, of the images embedded in the hover tooltips.
THUMBNAIL_SIZE = (128, 128)

def png(filename: str) -> str:
    """Return the image stored at `filename` as a PNG thumbnail data URI.

    The image is converted to RGB, resized to exactly THUMBNAIL_SIZE (the
    aspect ratio is not preserved), encoded as PNG in memory and returned as
    a 'data:image/png;base64,...' string that can be used directly as the
    `src` of an HTML <img> tag.

    Raises whatever PIL raises when the file cannot be opened or decoded.
    """
    with Image.open(fp=filename, mode='r') as image:
        # our images are pretty big; let's shrink the hover images to thumbnail size.
        # Converting to RGB first mirrors how the images are loaded for the
        # model and avoids save() failing on modes PNG cannot store (e.g. CMYK).
        thumbnail = image.convert('RGB').resize(size=THUMBNAIL_SIZE)
    buffer = BytesIO()
    thumbnail.save(buffer, format='png')
    return 'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()


time_start = now()
# Encode one thumbnail data URI per image path and keep it next to the row it belongs to.
thumbnails = df['name'].map(png)
df['png'] = thumbnails
elapsed = now() - time_start
print('done in {}'.format(elapsed))

done in 0:00:04.419069


In [8]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show
from bokeh.palettes import TolRainbow
from bokeh.transform import factor_cmap

output_notebook()

# Only hand the plot the columns it actually reads (coordinates, label and
# thumbnail); the per-image feature vectors and file paths would otherwise be
# serialized into the notebook output for no benefit.
datasource = ColumnDataSource(df[['x', 'y', 'tag', 'png']])
# The TolRainbow palettes start at three colors, so request at least that many.
factor_count = max(df['tag'].nunique(), 3)
# Map each distinct tag to one palette color. The start/end arguments of
# factor_cmap only slice multi-level factors, so they are not needed for
# plain string tags.
mapper = factor_cmap(field_name='tag', palette=TolRainbow[factor_count], factors=df['tag'].unique().tolist())

plot_figure = figure(title='UMAP projection: pistachio images', width=1000, height=800, tools=('pan, wheel_zoom, reset'))

# Hover tooltip: the thumbnail data URI from the `png` column next to the tag.
plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@png' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>tag:</span>
        <span style='font-size: 18px'>@tag</span>
    </div>
</div>
"""))

# scatter() draws circle markers by default and is the non-deprecated way to
# plot screen-sized points (circle() with `size` is deprecated in recent Bokeh).
plot_figure.scatter('x', 'y', source=datasource, line_alpha=0.6, fill_alpha=0.6, size=5, color=mapper)
show(plot_figure)



In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature table (one column per component of the `value` vectors) and the labels to predict.
X_all = df['value'].apply(pd.Series)
y_all = df['tag']

# Hold out 20% of the sample for testing, stratified so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=2024,
    stratify=df['tag'],
)

# Fit a logistic regression with a very tight tolerance and a generous iteration budget.
model = LogisticRegression(max_iter=1000, tol=1e-12)
model.fit(X_train, y_train)
iterations_used = model.n_iter_[0]
print('model fit in {} iterations'.format(iterations_used))

test_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print('accuracy: {:5.4f}'.format(test_accuracy))

model fit in 418 iterations
accuracy: 0.9800


In [10]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the fitted classifier on the held-out split.
y_pred = model.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

                   precision    recall  f1-score   support

Kirmizi_Pistachio       0.98      0.98      0.98        59
  Siirt_Pistachio       0.98      0.98      0.98        41

         accuracy                           0.98       100
        macro avg       0.98      0.98      0.98       100
     weighted avg       0.98      0.98      0.98       100

