In [1]:
!pip install --quiet img2vec_pytorch
print('pip installed img2vec')

from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning) # quiet a plotly issue
filterwarnings(action='ignore', category=UserWarning) # quiet an img2vec issue

pip installed img2vec


In [2]:
from img2vec_pytorch import Img2Vec
from PIL import Image
from arrow import now
from glob import glob
import pandas as pd
from os.path import basename

# Length of each image embedding; passed to the encoder as layer_output_size
# and used to flatten every vector it returns.
SIZE = 512
# Per-class cap on the number of files encoded; large enough to get all the data.
STOP = 100_000
# Directory holding one sub-folder of training images per flower class.
TRAIN_GLOB = '/kaggle/input/national-flowers/flowerdataset/train'

# https://stackoverflow.com/a/952952
def flatten(arg):
    """Concatenate the items of every inner iterable of ``arg`` into one flat list."""
    flat = []
    for inner in arg:
        flat.extend(inner)
    return flat

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    """Encode up to ``stop`` images matching a glob pattern into feature vectors.

    Args:
        arg: glob pattern selecting the image files to encode.
        tag: class label stored with every row produced from this pattern.
        stop: maximum number of files to encode; values <= 0 encode nothing.

    Returns:
        A list of ``pd.Series`` indexed by ``['tag', 'name', 'value']`` holding
        the label, the file's base name and its ``SIZE``-length embedding.

    Relies on the module-level ``img2vec`` encoder and ``SIZE`` constant.
    """
    time_get = now()
    # glob order depends on the file system; sorting makes the row order (and
    # which files survive the ``stop`` cut-off) the same on every run. Slicing
    # up front also avoids walking the rest of the listing once the cap is hit.
    input_files = sorted(glob(pathname=arg))[:max(stop, 0)]
    result = []
    for input_file in input_files:
        name = basename(input_file)
        with Image.open(fp=input_file, mode='r') as image:
            # The ResNet encoder takes three-channel input; convert non-RGB
            # files (grayscale, CMYK, palette) instead of failing on them.
            vector = img2vec.get_vec(image.convert('RGB'), tensor=True).numpy().reshape(SIZE,)
        result.append(pd.Series(data=[tag, name, vector], index=['tag', 'name', 'value']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result


# ResNet-18 feature extractor running on CPU; every image becomes a SIZE-length vector.
img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=SIZE)

time_start = now()

# Map each class folder name to a glob pattern matching the images inside it.
train = {
    basename(folder): folder + '/*.jp*g'
    for folder in glob(TRAIN_GLOB + '/*')
}
# Encode one class at a time, then stack the per-class rows into a single frame.
train_data = [
    get_from_glob(arg=pattern, tag=label, stop=STOP)
    for label, pattern in train.items()
]
df = pd.DataFrame(data=flatten(arg=train_data))

print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 198MB/s]


encoded Orchid data 400 rows in 0:00:20.776655
encoded Lavender data 400 rows in 0:00:19.145368
encoded Dandelion data 400 rows in 0:00:19.737570
encoded Daisy data 400 rows in 0:00:19.780182
encoded Sunflower data 400 rows in 0:00:19.689885
encoded Tulip data 400 rows in 0:00:20.690132
encoded Lotus data 400 rows in 0:00:20.294204
encoded Rose data 400 rows in 0:00:18.879041
encoded Lilly data 400 rows in 0:00:20.744380
done in 0:02:59.991266


In [3]:
from arrow import now
from umap import UMAP

time_start = now()
# Fixed random_state so the layout can be reproduced.
umap = UMAP(
    random_state=2024,
    verbose=True,
    n_jobs=1,
    low_memory=False,
    n_epochs=500,
)
# Expand each embedding into one column per dimension, project to two
# coordinates, and attach them to the original rows as 'x' and 'y'.
plot_df = pd.concat(
    objs=[
        df,
        pd.DataFrame(
            data=umap.fit_transform(X=df['value'].apply(pd.Series)),
            columns=['x', 'y'],
        ),
    ],
    axis=1,
)
print('done with UMAP in {}'.format(now() - time_start))

2024-02-24 00:16:40.851616: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 00:16:40.851766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 00:16:41.058527: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sat Feb 24 00:16:55 2024 Construct fuzzy simplicial set
Sat Feb 24 00:17:01 2024 Finding Nearest Neighbors
Sat Feb 24 00:17:05 2024 Finished Nearest Neighbor Search
Sat Feb 24 00:17:07 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Feb 24 00:17:12 2024 Finished embedding
done with UMAP in 0:00:17.951172


In [4]:
from plotly.express import scatter
# Interactive view of the 2-D projection: one point per image, coloured by
# class, with the file name shown on hover.
scatter(
    data_frame=plot_df,
    x='x',
    y='y',
    color='tag',
    hover_name='name',
    height=900,
).show()

We don't have a lot of training data relative to the amount of test data, and we have some classes that are not well distinguished (daisies, lotus flowers, maybe lilies), so we will probably not get highly accurate predictions out of simple models. Let's find out.

In [5]:
from arrow import now

GLOB_TEST = '/kaggle/input/national-flowers/flowerdataset/test'
# One glob pattern per class folder, keyed by the folder's name.
test = {
    basename(folder): folder + '/*.jp*g'
    for folder in glob(GLOB_TEST + '/*')
}

time_start = now()
test_data = [
    get_from_glob(arg=pattern, tag=label, stop=STOP)
    for label, pattern in test.items()
]
test_df = pd.DataFrame(data=flatten(arg=test_data))
# The test images filed under 'Lavender' are mislabeled and are all daisies,
# so fold that label into 'Daisy'.
test_df['tag'] = test_df['tag'].map(lambda label: 'Daisy' if label == 'Lavender' else label)
print('done in {}'.format(now() - time_start))

encoded Orchid data 89 rows in 0:00:04.940573
encoded Lavender data 100 rows in 0:00:04.876535
encoded Dandelion data 100 rows in 0:00:04.924970
encoded Daisy data 92 rows in 0:00:04.555988
encoded Sunflower data 100 rows in 0:00:05.034236
encoded Tulip data 100 rows in 0:00:05.233385
encoded Lotus data 100 rows in 0:00:05.347378
encoded Rose data 100 rows in 0:00:04.604658
encoded Lilly data 100 rows in 0:00:05.080256
done in 0:00:44.659506


In [6]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from arrow import now

# Expand the 'value' column (one embedding per row) into feature matrices once,
# instead of rebuilding them for every fit/predict call below.
train_features = df['value'].apply(pd.Series)
train_labels = df['tag']
test_features = test_df['value'].apply(pd.Series)
test_labels = test_df['tag']
# Score only the classes that actually occur in the test labels.
report_labels = test_labels.unique().tolist()

best_estimators = 0
best = 0
# Step through a range of estimator counts and keep the one with the best weighted F1.
# NOTE(review): the count is chosen on the same test set the final report uses,
# so the reported scores are optimistic; selecting on a validation split or via
# cross-validation of the training data would give an unbiased estimate.
for n_estimators in range(285, 315, 5):
    current = RandomForestClassifier(random_state=2024, n_estimators=n_estimators)
    current.fit(X=train_features, y=train_labels)
    score = f1_score(average='weighted', labels=report_labels, y_true=test_labels, y_pred=current.predict(X=test_features))
    if score > best:
        best = score
        best_estimators = n_estimators
    # The swept parameter is the number of trees, so label it as such.
    print('estimators: {} score: {:5.4f}'.format(n_estimators, score))

time_start = now()
print('building best model with estimators count = {}'.format(best_estimators))
# Refit with the winning count (same seed) and report per-class metrics.
forest = RandomForestClassifier(verbose=0, random_state=2024, n_estimators=best_estimators)
forest.fit(X=train_features, y=train_labels)
print(classification_report(labels=report_labels, y_true=test_labels, y_pred=forest.predict(X=test_features)))
print('model time: {}'.format(now() - time_start))

neighbors: 285 score: 0.7223
neighbors: 290 score: 0.7227
neighbors: 295 score: 0.7226
neighbors: 300 score: 0.7287
neighbors: 305 score: 0.7252
neighbors: 310 score: 0.7267
building best model with estimators count = 300
              precision    recall  f1-score   support

      Orchid       0.78      0.88      0.83        89
       Daisy       0.97      0.43      0.60       192
   Dandelion       0.47      0.85      0.60       100
   Sunflower       0.77      0.91      0.83       100
       Tulip       0.82      0.64      0.72       100
       Lotus       0.83      0.79      0.81       100
        Rose       0.87      0.72      0.79       100
       Lilly       0.76      0.81      0.78       100

   micro avg       0.75      0.72      0.73       881
   macro avg       0.78      0.75      0.75       881
weighted avg       0.80      0.72      0.73       881

model time: 0:00:19.027393


An overall weighted F1 score of 0.7287 hides some variability from class to class. Dandelions and daisies are hard; lotus flowers and lilies not so much.