In [1]:
# Install the img2vec_pytorch package with pip (quiet mode) before it is imported in a later cell.
!pip install --quiet img2vec_pytorch
print('pip installed img2vec')

from warnings import filterwarnings
# Ignore every FutureWarning and UserWarning from here on. These filters are category-wide,
# so they also hide warnings unrelated to the two issues named below.
filterwarnings(action='ignore', category=FutureWarning) # quiet a plotly issue
filterwarnings(action='ignore', category=UserWarning) # quiet an img2vec issue

pip installed img2vec


In [2]:
from img2vec_pytorch import Img2Vec
from PIL import Image
from arrow import now
from glob import glob
import pandas as pd
from os.path import basename

# Length of each image embedding: passed to Img2Vec as layer_output_size and used
# to reshape every vector returned by get_vec.
SIZE = 512
# Upper bound on the number of files encoded by a single get_from_glob call.
STOP = 10000

# https://stackoverflow.com/a/952952
def flatten(arg):
    """Concatenate the items of every inner iterable of ``arg`` into one list.

    Only a single level of nesting is removed, and items keep the order in
    which they are encountered.
    """
    merged = []
    for inner in arg:
        merged.extend(inner)
    return merged

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    """Encode up to ``stop`` images matching a glob pattern into embedding rows.

    Parameters
    ----------
    arg : str
        Glob pattern selecting the image files to encode.
    tag : str
        Label stored in the ``tag`` field of every returned row.
    stop : int
        Maximum number of files to encode; a value of zero or less encodes nothing.

    Returns
    -------
    list
        One ``pd.Series`` per encoded image, indexed by ``['tag', 'name', 'value']``:
        the given tag, the file's base name, and the embedding reshaped to ``(SIZE,)``.

    Notes
    -----
    Uses the module-level ``img2vec`` encoder and ``SIZE`` constant, and prints a
    one-line summary (tag, row count, elapsed time) before returning.
    """
    time_get = now()
    # glob makes no ordering promise, so sort the matches: when more than `stop`
    # files match, the same subset is then picked on every run. Slicing up front
    # also avoids walking the remaining paths only to skip them.
    selected_files = sorted(glob(pathname=arg))[:max(stop, 0)]
    result = []
    for input_file in selected_files:
        name = basename(input_file)
        # The context manager closes the image file as soon as it has been encoded.
        with Image.open(fp=input_file, mode='r') as image:
            vector = img2vec.get_vec(image, tensor=True).numpy().reshape(SIZE,)
        result.append(pd.Series(data=[tag, name, vector], index=['tag', 'name', 'value']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result


# Shared image encoder used by get_from_glob; runs on CPU and emits SIZE-long vectors.
img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=SIZE)

time_start = now()

# Map each entry under Train/ (keyed by its base name) to a glob pattern for its .jpg files.
train = dict(
    (basename(folder), folder + '/*.jpg')
    for folder in glob('/kaggle/input/multi-class-rice-image-dataset/Train/*')
)
# Encode each class separately (at most STOP files per class), then merge the
# per-class row lists into a single frame.
train_data = [
    get_from_glob(arg=pattern, tag=label, stop=STOP)
    for label, pattern in train.items()
]
df = pd.DataFrame(data=flatten(arg=train_data))

print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 188MB/s]


encoded Karacadag data 10000 rows in 0:07:02.708767
encoded Basmati data 10000 rows in 0:07:14.222208
encoded Jasmine data 10000 rows in 0:07:16.162434
encoded Arborio data 10000 rows in 0:07:24.783143
encoded Ipsala data 10000 rows in 0:07:25.903838
done in 0:36:27.512678


In [3]:
from arrow import now
from umap import UMAP

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
# Expand the per-row embedding vectors into one column per component, project
# them with UMAP, and attach the result to df as the columns x and y.
train_matrix = df['value'].apply(pd.Series)
projection = pd.DataFrame(data=umap.fit_transform(X=train_matrix), columns=['x', 'y'])
plot_df = pd.concat(objs=[df, projection], axis=1)
print('done with UMAP in {}'.format(now() - time_start))

2024-02-24 01:27:19.485566: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 01:27:19.485787: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 01:27:19.659421: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sat Feb 24 01:27:37 2024 Construct fuzzy simplicial set
Sat Feb 24 01:27:37 2024 Finding Nearest Neighbors
Sat Feb 24 01:27:37 2024 Building RP forest with 16 trees
Sat Feb 24 01:27:43 2024 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
	Stopping threshold met -- exiting after 5 iterations
Sat Feb 24 01:28:00 2024 Finished Nearest Neighbor Search
Sat Feb 24 01:28:03 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Feb 24 01:28:58 2024 Finished embedding
done with UMAP in 0:01:25.898960


In [4]:
from plotly.express import scatter
# Scatter the x/y projection, coloured by class tag, with the file name shown on hover.
projection_figure = scatter(data_frame=plot_df, x='x', y='y', color='tag', hover_name='name', height=900)
projection_figure.show()

In [5]:
from arrow import now

# Map each entry under Test/ (keyed by its base name) to a glob pattern for its .jpg files.
test = dict(
    (basename(folder), folder + '/*.jpg')
    for folder in glob('/kaggle/input/multi-class-rice-image-dataset/Test/*')
)

time_start = now()
# Encode each class separately (at most STOP files per class), then merge the
# per-class row lists into a single frame.
test_data = [
    get_from_glob(arg=pattern, tag=label, stop=STOP)
    for label, pattern in test.items()
]
test_df = pd.DataFrame(data=flatten(arg=test_data))
print('done in {}'.format(now() - time_start))

encoded Karacadag data 2500 rows in 0:01:47.478360
encoded Basmati data 2500 rows in 0:01:49.594619
encoded Jasmine data 2500 rows in 0:01:48.489658
encoded Arborio data 2500 rows in 0:01:48.419318
encoded Ipsala data 2500 rows in 0:01:49.256273
done in 0:09:04.057305


In [6]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from arrow import now

# Expand the per-row embedding vectors into feature matrices and collect the label
# list once, so the search loop and the final fit reuse them instead of rebuilding
# the same frames on every iteration.
train_features = df['value'].apply(pd.Series)
test_features = test_df['value'].apply(pd.Series)
test_labels = test_df['tag'].unique().tolist()

best_estimators = 0
best = 0
# Step through a range of estimator counts and keep the one with the highest weighted F1.
# NOTE(review): the count is chosen by scoring on test_df, and the final report below is
# computed on that same test_df, so the reported metrics are likely optimistic. Selecting
# on a validation split held out from df would avoid this.
for n_estimators in range(250, 270, 5):
    current = RandomForestClassifier(random_state=2024, n_estimators=n_estimators)
    current.fit(X=train_features, y=df['tag'])
    score = f1_score(average='weighted', labels=test_labels, y_true=test_df['tag'], y_pred=current.predict(X=test_features))
    # Strict comparison: on a tie the smaller estimator count seen first is kept.
    if score > best:
        best = score
        best_estimators = n_estimators
    print('estimators: {} score: {:5.4f}'.format(n_estimators, score))

time_start = now()
print('building best model with estimators count = {}'.format(best_estimators))
# Refit with the winning count; this uses the same random_state and training data
# as the corresponding loop iteration.
forest = RandomForestClassifier(verbose=0, random_state=2024, n_estimators=best_estimators)
forest.fit(X=train_features, y=df['tag'])
print(classification_report(labels=test_labels, y_true=test_df['tag'], y_pred=forest.predict(X=test_features)))
print('model time: {}'.format(now() - time_start))

estimators: 250 score: 0.9873
estimators: 255 score: 0.9873
estimators: 260 score: 0.9872
estimators: 265 score: 0.9874
building best model with estimators count = 265
              precision    recall  f1-score   support

   Karacadag       0.99      0.99      0.99      2500
     Basmati       1.00      0.97      0.99      2500
     Jasmine       0.97      0.99      0.98      2500
     Arborio       0.98      0.99      0.98      2500
      Ipsala       1.00      1.00      1.00      2500

    accuracy                           0.99     12500
   macro avg       0.99      0.99      0.99     12500
weighted avg       0.99      0.99      0.99     12500

model time: 0:08:33.943081
