In [1]:
!pip install --quiet img2vec_pytorch
print('pip installed img2vec')

pip installed img2vec


In [2]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning) # quiet a plotly issue
filterwarnings(action='ignore', category=UserWarning) # quiet an img2vec issue

Let's load up our training data; this will take about an hour.

In [3]:
from img2vec_pytorch import Img2Vec
from PIL import Image
from arrow import now
from glob import glob
import pandas as pd
from os.path import basename

img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=512)


# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def get_from_glob(arg: str, tag: str) -> list:
    time_get = now()
    result = []
    for input_file in glob(pathname=arg):
        name = basename(input_file)
        try:
            with Image.open(fp=input_file, mode='r') as image:
                vector = img2vec.get_vec(image, tensor=True).numpy().reshape(512,)
                result.append(pd.Series(data=[tag, name, vector], index=['tag', 'name', 'value']))
        except RuntimeError:
            # we only have a few failures so we're just going to discard them
            print('runtime failure: {}'.format(tag, name))
            pass
    print('encoded {} data in {}'.format(tag, now() - time_get))
    return result

time_start = now()
train = {
    'arborio'   : '/kaggle/input/abhi-ruk/Train/Arborio/*.jpg',
    'basmati'   : '/kaggle/input/abhi-ruk/Train/Basmati/*.jpg',
    'ipsala'    : '/kaggle/input/abhi-ruk/Train/Ipsala/*.jpg',
    'jasmine'   : '/kaggle/input/abhi-ruk/Train/Jasmine/*.jpg',
    'karacadag' : '/kaggle/input/abhi-ruk/Train/Karacadag/*.jpg',
}
train_data = [get_from_glob(arg=value, tag=key) for key, value in train.items()]
df = pd.DataFrame(data=flatten(arg=train_data))
    
print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 91.7MB/s]


encoded arborio data in 0:10:39.580925
encoded basmati data in 0:10:42.413303
encoded ipsala data in 0:10:39.080972
encoded jasmine data in 0:11:15.279020
encoded karacadag data in 0:11:13.370315
done in 0:54:35.137352


In [4]:
from plotly.express import histogram
histogram(data_frame=df, x='tag')

Our classes are well balanced, which is good news. Let's use dimension reduction to see if the vectors we have derived can be clustered effectively.

In [5]:
from umap import UMAP
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
plot_df = pd.concat(objs=[df, pd.DataFrame(data=umap.fit_transform(X=df['value'].apply(pd.Series)), columns=['x', 'y'])], axis=1)
print('done with UMAP in {}'.format(now() - time_start))

2024-02-21 18:02:19.752902: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-21 18:02:19.753039: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-21 18:02:19.933498: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Wed Feb 21 18:02:40 2024 Construct fuzzy simplicial set
Wed Feb 21 18:02:41 2024 Finding Nearest Neighbors
Wed Feb 21 18:02:41 2024 Building RP forest with 16 trees
Wed Feb 21 18:02:50 2024 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
	Stopping threshold met -- exiting after 5 iterations
Wed Feb 21 18:03:15 2024 Finished Nearest Neighbor Search
Wed Feb 21 18:03:20 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Feb 21 18:04:56 2024 Finished embedding
done with UMAP in 0:57:36.991202


In [6]:
from plotly.express import scatter
scatter(data_frame=plot_df, x='x', y='y', color='tag', hover_name='name', height=900)

In this plot we see each kind of rice clustering tightly, for the most part, so we have good reason to be optimistic that we can build a model that will identify each variety based on the available data.

Loading the test data will take about ten minutes.

In [7]:
test = {
    'arborio'   : '/kaggle/input/abhi-ruk/Test/Arborio/*.jpg',
    'basmati'   : '/kaggle/input/abhi-ruk/Test/Basmati/*.jpg',
    'ipsala'    : '/kaggle/input/abhi-ruk/Test/Ipsala/*.jpg',
    'jasmine'   : '/kaggle/input/abhi-ruk/Test/Jasmine/*.jpg',
    'karacadag' : '/kaggle/input/abhi-ruk/Test/Karacadag/*.jpg',
}

time_start = now()
test_data = [get_from_glob(arg=value, tag=key) for key, value in test.items()]
test_df = pd.DataFrame(data=flatten(arg=test_data))
print('done in {}'.format(now() - time_start))

encoded arborio data in 0:02:49.205006
encoded basmati data in 0:02:48.065311
encoded ipsala data in 0:02:47.304543
encoded jasmine data in 0:02:39.640570
encoded karacadag data in 0:02:42.587830
done in 0:13:48.022674


In [8]:
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from arrow import now

best_k = 1
best = 0
# let's step through a range of cluster sizes to find the one that will give us the best accuracy
for n_neighbors in range(2, 15):
    current = KNeighborsClassifier(n_neighbors=n_neighbors)
    current.fit(X=df['value'].apply(pd.Series), y=df['tag'])
    score = f1_score(average='weighted', labels=test_df['tag'].unique().tolist(), y_true=test_df['tag'], y_pred=current.predict(X=test_df['value'].apply(pd.Series)))
    if score > best:
        best = score
        best_k = n_neighbors
    print('neighbors: {} score: {:5.4f}'.format(n_neighbors, score))
        
time_start = now()
print('building best-k model with k = {}'.format(best_k))
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X=df['value'].apply(pd.Series), y=df['tag'])
print(classification_report(labels=test_df['tag'].unique().tolist(), y_true=test_df['tag'], y_pred=knn.predict(X=test_df['value'].apply(pd.Series))))
print('model time: {}'.format(now() - time_start))

neighbors: 2 score: 0.9875
neighbors: 3 score: 0.9887
neighbors: 4 score: 0.9901
neighbors: 5 score: 0.9890
neighbors: 6 score: 0.9895
neighbors: 7 score: 0.9895
neighbors: 8 score: 0.9892
neighbors: 9 score: 0.9896
neighbors: 10 score: 0.9900
neighbors: 11 score: 0.9896
neighbors: 12 score: 0.9898
neighbors: 13 score: 0.9894
neighbors: 14 score: 0.9896
building best-k model with k = 4
              precision    recall  f1-score   support

     arborio       0.99      0.98      0.98      2500
     basmati       1.00      0.99      0.99      2500
      ipsala       1.00      1.00      1.00      2500
     jasmine       0.98      0.99      0.99      2500
   karacadag       0.99      0.99      0.99      2500

    accuracy                           0.99     12500
   macro avg       0.99      0.99      0.99     12500
weighted avg       0.99      0.99      0.99     12500

model time: 0:00:24.046403
