Let's first add some code that will let us use a CNN to get embeddings for our images.

In [1]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np


# Device every tensor and the network are placed on, and the length of the
# embedding vector we read back from the extraction layer.
DEVICE = torch.device('cpu')
OUTPUT_SIZE = 2048

# Preprocessing steps; get_vec applies them as resize -> tensor -> normalize.
scaler = transforms.Resize((224, 224))
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Pretrained ResNeXt-50, moved to DEVICE and switched to evaluation mode.
model = models.resnext50_32x4d(weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2)
model = model.to(DEVICE).eval()

# The layer whose output is captured as the image embedding.
extraction_layer = model._modules.get('avgpool')

def get_vec(arg, model, extraction_layer):
    """Return what ``extraction_layer`` outputs when ``model`` sees one image.

    The image is resized, converted to a tensor, normalized, given a leading
    batch dimension and moved to ``DEVICE``. A temporary forward hook on
    ``extraction_layer`` copies that layer's output during a single forward
    pass run without gradient tracking.

    Args:
        arg: The input image, passed straight to the module-level ``scaler``
            transform (presumably a PIL image -- confirm against callers).
        model: The network to run the forward pass on.
        extraction_layer: The submodule of ``model`` whose output is captured.

    Returns:
        A tensor of shape ``(1, OUTPUT_SIZE, 1, 1)`` holding the captured output.
    """
    image = normalize(to_tensor(scaler(arg))).unsqueeze(0).to(DEVICE)
    result = torch.zeros(1, OUTPUT_SIZE, 1, 1)

    def copy_data(module, inputs, output):
        # Returning None leaves the layer's output unchanged for the rest of the pass.
        result.copy_(output.detach())

    hooked = extraction_layer.register_forward_hook(copy_data)
    try:
        with torch.no_grad():
            model(image)
    finally:
        # Detach the hook even if the forward pass raises; otherwise it would
        # stay registered on the layer and fire on every later call.
        hooked.remove()
    return result

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth" to /root/.cache/torch/hub/checkpoints/resnext50_32x4d-1a0047aa.pth
100%|██████████| 95.8M/95.8M [00:00<00:00, 126MB/s]


Now let's load up our data, including embeddings and thumbnails. We don't have labels for our test data, so while we load up images and labels for the training and validation data, we only load up images for the test data.

In [2]:
import arrow
import base64
import pandas as pd
from glob import glob
from io import BytesIO
from os.path import basename
from PIL import Image

# Root directory of the dataset; every split lives underneath it.
ROOT = '/kaggle/input/dataset-of-fertile-and-infertile-chicken-eggs'

# Training and validation splits have both images and label files.
TRAIN_IMAGES = f'{ROOT}/train/images'
TRAIN_LABELS = f'{ROOT}/train/labels'
VALID_IMAGES = f'{ROOT}/valid/images'
VALID_LABELS = f'{ROOT}/valid/labels'

# Only an image directory is defined for the test split.
TEST_IMAGES = f'{ROOT}/test/images'

# Pixel size (width, height) of the thumbnails embedded in the hover tooltips.
THUMBNAIL_SIZE = (128, 128)


def embed(model, filename: str):
    """Return the embedding of the image stored at ``filename``.

    The file is opened, converted to RGB and passed to ``get_vec`` together
    with the module-level ``extraction_layer``; the captured tensor is returned
    as a flat NumPy array of length ``OUTPUT_SIZE``.
    """
    with Image.open(fp=filename, mode='r') as image:
        rgb_image = image.convert('RGB')
        features = get_vec(arg=rgb_image, model=model, extraction_layer=extraction_layer)
        return features.numpy().reshape(OUTPUT_SIZE)

def label(filename: str, labels: str) -> int:
    """Return the integer label stored for the image named ``filename``.

    The label file sits in the ``labels`` directory and has the same name as
    the image with its final extension swapped for ``.txt``. Only the trailing
    extension is replaced, so a name that contains ``.jpg`` somewhere in the
    middle (e.g. ``a.jpg.rf.b.jpg``) still maps to the right file.

    Args:
        filename: Base name of the image file, e.g. ``egg1.jpg``.
        labels: Directory that holds the label files.

    Returns:
        The first whitespace-separated token of the label file, as an int
        (presumably the class id of the first annotation -- confirm against
        the dataset's label format).

    Raises:
        FileNotFoundError: If there is no label file for ``filename``.
        IndexError: If the label file is empty.
        ValueError: If the first token is not an integer.
    """
    from os.path import splitext

    stem, _extension = splitext(filename)
    label_file = labels + '/' + stem + '.txt'
    with open(file=label_file, mode='r') as input_fp:
        data = input_fp.read()
    return int(data.split()[0])

def png(filename: str) -> str:
    """Return the image at ``filename`` as a base64 PNG data URI thumbnail.

    The image is resized to exactly ``THUMBNAIL_SIZE``, converted to RGB and
    encoded as PNG in memory; the result can be used directly as an ``img``
    source.
    """
    with Image.open(fp=filename, mode='r') as image:
        # our images are pretty big; let's shrink the hover images to thumbnail size
        thumbnail = image.resize(size=THUMBNAIL_SIZE).convert('RGB')
        buffer = BytesIO()
        thumbnail.save(buffer, format='png')
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return 'data:image/png;base64,' + encoded

def _load_images(image_dir: str) -> pd.DataFrame:
    """Build a frame with the name, embedding and thumbnail of every .jpg in ``image_dir``."""
    # glob yields paths in arbitrary order; sorting keeps the row order (and
    # everything computed from it downstream) stable from run to run.
    rows = [
        {'item': basename(item), 'value': embed(model=model, filename=item), 'png': png(filename=item)}
        for item in sorted(glob(image_dir + '/*.jpg'))
    ]
    # Naming the columns explicitly keeps the frame well-formed even when the
    # directory contains no images.
    return pd.DataFrame(data=rows, columns=['item', 'value', 'png'])


time_start = arrow.now()
# Training and validation images have label files; test images do not.
train_df = _load_images(TRAIN_IMAGES)
train_df['label'] = train_df['item'].apply(func=label, args=(TRAIN_LABELS, ))
valid_df = _load_images(VALID_IMAGES)
valid_df['label'] = valid_df['item'].apply(func=label, args=(VALID_LABELS, ))
test_df = _load_images(TEST_IMAGES)
print('done in {}'.format(arrow.now() - time_start))

done in 0:00:36.817645


How much data do we have?

In [3]:
# Row counts, in the order: training, test, validation.
tuple(len(frame) for frame in (train_df, test_df, valid_df))

(139, 20, 40)

Is our target class balanced in our training/validation data? Almost.

In [4]:
# Count how often each label occurs across the training and validation frames.
pd.concat(objs=[train_df['label'], valid_df['label']]).value_counts().to_dict()

{0: 90, 1: 89}

Let's use our embeddings and UMAP to get x/y coordinates, so we can visualize all of our data in a single plot.

In [5]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=False)
# Fit the projection on the training embeddings only ...
train_df[['x', 'y']] = umap.fit_transform(X=train_df['value'].apply(func=pd.Series))
# ... then place the validation and test embeddings with the fitted projection.
for frame in (valid_df, test_df):
    frame[['x', 'y']] = umap.transform(X=frame['value'].apply(func=pd.Series))
print(f'done with UMAP in {arrow.now() - time_start}')

done with UMAP in 0:00:18.478406


We're going to train a logistic regression model on our training and validation data and predict labels for our test data, then plot them all together. We don't have ground truth labels for our test data, so we're going to color the predicted points differently.

In [6]:
from sklearn.linear_model import LogisticRegression

# Fit on every labelled row: training and validation data together.
labelled_df = pd.concat(objs=[train_df, valid_df])
logreg = LogisticRegression(tol=1e-12, random_state=2024)
logreg.fit(X=labelled_df['value'].apply(func=pd.Series), y=labelled_df['label'])

# Predict the unlabelled test rows; offset the predictions by 2 so they do not
# collide with the ground-truth labels 0 and 1.
y_pred = logreg.predict(X=test_df['value'].apply(func=pd.Series))
result_df = test_df.copy()
result_df['label'] = y_pred + 2

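We have shifted the predicted test labels by 2 (0 becomes 2 and 1 becomes 3) so that the test points get different colors from the training and validation points. How many instances do we have of each?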

In [7]:
# Count each label across training, validation and the shifted test predictions.
pd.concat(objs=[train_df['label'], valid_df['label'], result_df['label']]).value_counts().to_dict()

{0: 90, 1: 89, 3: 11, 2: 9}

Now we're ready to plot.

In [8]:
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show
from bokeh.palettes import TolRainbow
from bokeh.transform import linear_cmap

output_notebook()

# One data source holding the labelled rows and the predicted test rows.
datasource = ColumnDataSource(pd.concat(objs=[train_df, valid_df, result_df]))
# Colour each point by its 'label' column using a four-colour palette.
mapper = linear_cmap(field_name='label', palette=TolRainbow[4], low=0, high=4)

# Hover tooltip: the thumbnail stored in the 'png' column next to the label.
hover_tool = HoverTool(tooltips="""
<div>
    <div>
        <img src='@png' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>tag:</span>
        <span style='font-size: 18px'>@label</span>
    </div>
</div>
""")

plot_figure = figure(title='UMAP projection: eggs', width=1000, height=800, tools='pan, wheel_zoom, reset')
plot_figure.add_tools(hover_tool)
plot_figure.scatter(x='x', y='y', source=datasource, size=5, color=mapper, line_alpha=0.6, fill_alpha=0.6)
show(plot_figure)

What do we see? Our CNN gives us embeddings that split our data about as nicely as we could ask for: the images we have of fertile eggs look very different from the ones we have of infertile eggs, and the logistic regression model we built from our training and validation data appears to do a good job of distinguishing them. We don't have ground truth labels for our test data, so we can't measure accuracy there, and this plot will have to suffice.