In [1]:
!pip install --quiet img2vec_pytorch
print('pip installed img2vec')

pip installed img2vec


In [2]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning) # quiet a plotly issue
filterwarnings(action='ignore', category=UserWarning) # quiet an img2vec issue

In [3]:
from img2vec_pytorch import Img2Vec
from PIL import Image
from arrow import now
from glob import glob
import pandas as pd
from os.path import basename

img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=512)

train = {
    'dry' : '/kaggle/input/oily-dry-and-normal-skin-types-dataset/Oily-Dry-Skin-Types/train/dry/*.jpg',
    'normal' : '/kaggle/input/oily-dry-and-normal-skin-types-dataset/Oily-Dry-Skin-Types/train/normal/*.jpg',
    'oily' : '/kaggle/input/oily-dry-and-normal-skin-types-dataset/Oily-Dry-Skin-Types/train/oily/*.jpg'
}

def get_from_glob(arg: str, tag: str) -> list:
    result = []
    for input_file in glob(pathname=arg):
        name = basename(input_file)
        try:
            with Image.open(fp=input_file, mode='r') as image:
                vector = img2vec.get_vec(image, tensor=True).numpy().reshape(512,)
                result.append(pd.Series(data=[tag, name, vector], index=['tag', 'name', 'value']))
        except RuntimeError:
            # we only have a few failures so we're just going to discard them
            print('runtime failure: {}'.format(tag, name))
            pass
    return result

time_start = now()
data = []
for key, value in train.items():
    data.extend(get_from_glob(arg=value, tag=key))
    print('done encoding the {} images in {}'.format(key, now() - time_start))
df = pd.DataFrame(data=data)
print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 87.1MB/s]


done encoding the dry images in 0:00:47.230791
done encoding the normal images in 0:02:10.527074
done encoding the oily images in 0:03:29.507708
done in 0:03:29.805496


In [4]:
df.head()

Unnamed: 0,tag,name,value
0,dry,dry_ba43086ceb69089f048e_jpg.rf.10b84c97be7986...,"[1.5018677, 0.46136314, 0.2574649, 0.98578155,..."
1,dry,dry_cafd1685d1752d62aa8e_jpg.rf.4eaae9b54d3652...,"[0.8960048, 0.20302434, 0.24822909, 0.4686185,..."
2,dry,dry_ed34be4db7bc534192d4_jpg.rf.aad0e42dc72806...,"[0.13896553, 0.055222817, 0.21849212, 0.328174..."
3,dry,dry_7789231192a8286641d6_jpg.rf.9ca39660b31551...,"[1.2477133, 0.326777, 1.355015, 0.64007056, 0...."
4,dry,dry_cf50bf97d954d18138d2_jpg.rf.5f29150bde3a89...,"[1.815259, 0.6537706, 1.107674, 0.8154092, 0.2..."


Now let's get our test data.

In [5]:
test = {
    'dry' : '/kaggle/input/oily-dry-and-normal-skin-types-dataset/Oily-Dry-Skin-Types/test/dry/*.jpg',
    'normal' : '/kaggle/input/oily-dry-and-normal-skin-types-dataset/Oily-Dry-Skin-Types/test/normal/*.jpg',
    'oily' : '/kaggle/input/oily-dry-and-normal-skin-types-dataset/Oily-Dry-Skin-Types/test/oily/*.jpg'
}

time_start = now()
test_data = []
for key, value in test.items():
    test_data.extend(get_from_glob(arg=value, tag=key))
    print('done encoding the {} images in {}'.format(key, now() - time_start))
test_df = pd.DataFrame(data=data)
print('done in {}'.format(now() - time_start))

done encoding the dry images in 0:00:02.627488
done encoding the normal images in 0:00:06.981219
done encoding the oily images in 0:00:10.011117
done in 0:00:10.208146


In [6]:
from plotly.express import histogram
histogram(data_frame=df, x='tag')

In [7]:
df['tag'].value_counts(normalize=True)

tag
normal    0.400581
oily      0.362845
dry       0.236575
Name: proportion, dtype: float64

We have unbalanced classes, but not terribly so.

In [8]:
from umap import UMAP
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=200)
plot_df = pd.concat(objs=[df, pd.DataFrame(data=umap.fit_transform(X=df['value'].apply(pd.Series)), columns=['x', 'y'])], axis=1)
print('done with UMAP in {}'.format(now() - time_start))

2024-02-20 21:48:21.339051: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-20 21:48:21.339222: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-20 21:48:21.517664: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=200, n_jobs=1, random_state=2024, verbose=True)
Tue Feb 20 21:48:37 2024 Construct fuzzy simplicial set
Tue Feb 20 21:48:46 2024 Finding Nearest Neighbors
Tue Feb 20 21:48:51 2024 Finished Nearest Neighbor Search
Tue Feb 20 21:48:55 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Tue Feb 20 21:48:59 2024 Finished embedding
done with UMAP in 0:01:15.853825


In [9]:
from plotly.express import scatter
scatter(data_frame=plot_df, x='x', y='y', color='tag', hover_name='name', height=900)

We don't see a lot of separation between our classes, but we do see a lot of local clustering in twos and threes; there's some signal here that UMAP can find. Let's see how we do with a simple classifier

In [10]:
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from arrow import now

best_k = 1
best = 0
for n_neighbors in range(2, 10):
    current = KNeighborsClassifier(n_neighbors=n_neighbors)
    current.fit(X=df['value'].apply(pd.Series), y=df['tag'])
    score = f1_score(average='weighted', labels=test_df['tag'].unique().tolist(), y_true=test_df['tag'], y_pred=current.predict(X=test_df['value'].apply(pd.Series)))
    if score > best:
        best = score
        best_k = n_neighbors
        print('neighbors: {} score: {:5.4f}'.format(n_neighbors, score))
        
time_start = now()
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X=df['value'].apply(pd.Series), y=df['tag'])
print(classification_report(labels=test_df['tag'].unique().tolist(), y_true=test_df['tag'], y_pred=knn.predict(X=test_df['value'].apply(pd.Series))))
print('model time: {}'.format(now() - time_start))

neighbors: 2 score: 0.9837
              precision    recall  f1-score   support

         dry       0.96      1.00      0.98       652
      normal       0.98      0.99      0.98      1104
        oily       1.00      0.97      0.98      1000

    accuracy                           0.98      2756
   macro avg       0.98      0.99      0.98      2756
weighted avg       0.98      0.98      0.98      2756

model time: 0:00:01.857700


It turns out that using k-nearest neighbors with a small value for K works pretty well.