In [1]:
!pip install --upgrade --quiet pip
!pip install --quiet img2vec_pytorch
print('pip install/update complete.')

pip install/update complete.


In [2]:
import base64
import pandas as pd

from glob import glob
from io import BytesIO
from os.path import basename

from arrow import now
from img2vec_pytorch import Img2Vec
from PIL import Image

SIZE = 512
STOP =  100

# https://stackoverflow.com/a/952952
def flatten(arg):
    return [x for xs in arg for x in xs]

def get_from_glob(arg: str, tag: str, stop: int) -> list:
    time_get = now()
    result = []
    for index, input_file in enumerate(glob(pathname=arg)):
        if index < stop:
            name = basename(input_file)
            with Image.open(fp=input_file, mode='r') as image:
                vector = img2vec.get_vec(image, tensor=True).numpy().reshape(SIZE,)
                buffer = BytesIO()
                width, height = image.size
                size = (int(width / 2), int(height / 2))
                image.resize(size=size).save(buffer, format='png')
                result.append(pd.Series(data=[tag, name, vector, 
                                              'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode(),
                                             ], index=['tag', 'name', 'value', 'image']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result

img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=SIZE)

time_start = now()

train_files = {basename(folder) : folder + '/*.jpg' for folder in glob('/kaggle/input/melanoma-cancer-dataset/train' + '/*')}
train_data = [get_from_glob(arg=value, tag=key, stop=STOP) for key, value in train_files.items()]
train_df = pd.DataFrame(data=flatten(arg=train_data))

test_files = {basename(folder) : folder + '/*.jpg' for folder in glob('/kaggle/input/melanoma-cancer-dataset/test' + '/*')}
test_data = [get_from_glob(arg=value, tag=key, stop=STOP) for key, value in test_files.items()]
test_df = pd.DataFrame(data=flatten(arg=test_data))

print('done in {}'.format(now() - time_start))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 118MB/s]


encoded Benign data 100 rows in 0:00:06.478863
encoded Malignant data 100 rows in 0:00:06.487691
encoded Benign data 100 rows in 0:00:06.240026
encoded Malignant data 100 rows in 0:00:06.272819
done in 0:00:25.526609


In [3]:
from warnings import filterwarnings
from plotly import express
filterwarnings(action='ignore', category=UserWarning)

express.pie(data_frame=train_df, names='tag', color='tag')


In [4]:
from arrow import now
from umap import UMAP

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
train_df[['x', 'y']] = pd.DataFrame(data=umap.fit_transform(X=train_df['value'].apply(pd.Series)))
express.scatter(data_frame=train_df, x='x', y='y', color='tag').show()
print('done with UMAP in {}'.format(now() - time_start))

2024-03-27 15:38:11.789266: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 15:38:11.789386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 15:38:11.970771: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Wed Mar 27 15:38:22 2024 Construct fuzzy simplicial set
Wed Mar 27 15:38:22 2024 Finding Nearest Neighbors
Wed Mar 27 15:38:26 2024 Finished Nearest Neighbor Search
Wed Mar 27 15:38:29 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Mar 27 15:38:31 2024 Finished embedding






done with UMAP in 0:00:09.007027


In [5]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.palettes import Set1_3

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show
from bokeh.transform import factor_cmap

output_notebook()

# now let's build the plot and tooltip
datasource = ColumnDataSource(train_df)
mapper = factor_cmap('tag', palette=Set1_3, factors=train_df['tag'].unique().tolist())

plot_figure = figure(title='UMAP projection: cancer dataset', width=1000, height=800, tools=('pan, wheel_zoom, reset'))

plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>Tag:</span>
        <span style='font-size: 18px'>@tag</span>
    </div>
</div>
"""))

plot_figure.circle('x', 'y', source=datasource, color=mapper, line_alpha=0.6, fill_alpha=0.6, size=8,)
show(plot_figure)

In [6]:
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from arrow import now

best_k = 1
best = 0
# let's step through a range of cluster sizes to find the one that will give us the best accuracy
for n_neighbors in range(2, 15):
    current = KNeighborsClassifier(n_neighbors=n_neighbors)
    current.fit(X=train_df['value'].apply(pd.Series), y=train_df['tag'])
    score = f1_score(average='weighted', labels=test_df['tag'].unique().tolist(), y_true=test_df['tag'], y_pred=current.predict(X=test_df['value'].apply(pd.Series)))
    if score > best:
        best = score
        best_k = n_neighbors
    print('neighbors: {} score: {:5.4f}'.format(n_neighbors, score))
        
time_start = now()
print('building best-k model with k = {}'.format(best_k))
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X=train_df['value'].apply(pd.Series), y=train_df['tag'])
print(classification_report(labels=test_df['tag'].unique().tolist(), y_true=test_df['tag'], y_pred=knn.predict(X=test_df['value'].apply(pd.Series))))
print('model time: {}'.format(now() - time_start))

neighbors: 2 score: 0.6239
neighbors: 3 score: 0.7103
neighbors: 4 score: 0.6503
neighbors: 5 score: 0.6809
neighbors: 6 score: 0.6593
neighbors: 7 score: 0.6764
neighbors: 8 score: 0.6460
neighbors: 9 score: 0.6789
neighbors: 10 score: 0.6392
neighbors: 11 score: 0.6593
neighbors: 12 score: 0.6297
neighbors: 13 score: 0.6484
neighbors: 14 score: 0.6324
building best-k model with k = 3
              precision    recall  f1-score   support

      Benign       0.66      0.95      0.78       100
   Malignant       0.91      0.50      0.65       100

    accuracy                           0.73       200
   macro avg       0.78      0.72      0.71       200
weighted avg       0.78      0.72      0.71       200

model time: 0:00:00.084723
