In [1]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np


# Inference device, and the channel count of the buffer that get_vec allocates.
DEVICE = torch.device('cpu')
OUTPUT_SIZE = 2048

# ResNeXt-50 (32x4d) with the IMAGENET1K_V2 weights, moved to DEVICE and switched to eval mode.
model = models.resnext50_32x4d(weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2).to(DEVICE).eval()

# Sub-module registered under the name 'avgpool'; get_vec hooks its forward output.
extraction_layer = model._modules.get('avgpool')

# Preprocessing steps; get_vec applies them as scaler -> to_tensor -> normalize.
to_tensor = transforms.ToTensor()
scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

def get_vec(arg, model, extraction_layer):
    """Run one image through ``model`` and return the output of ``extraction_layer``.

    Args:
        arg: image accepted by the module-level ``scaler`` / ``to_tensor``
            transforms (the caller in this notebook passes an RGB PIL image).
        model: network to run; its forward pass must invoke ``extraction_layer``.
        extraction_layer: module whose forward output is captured via a hook.

    Returns:
        A ``(1, OUTPUT_SIZE, 1, 1)`` tensor holding a copy of the layer output.
    """
    image = normalize(to_tensor(scaler(arg))).unsqueeze(0).to(DEVICE)
    result = torch.zeros(1, OUTPUT_SIZE, 1, 1)

    def copy_data(m, i, o):
        # detach() instead of the legacy .data attribute; copy_ fills the
        # preallocated buffer in place.
        result.copy_(o.detach())

    hooked = extraction_layer.register_forward_hook(copy_data)
    try:
        with torch.no_grad():
            model(image)
    finally:
        # Always unregister the hook. If the forward pass raised, the hook
        # would otherwise stay attached and every later call would add another.
        hooked.remove()
    return result

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth" to /root/.cache/torch/hub/checkpoints/resnext50_32x4d-1a0047aa.pth
100%|██████████| 95.8M/95.8M [00:00<00:00, 140MB/s]


In [2]:
import arrow
import base64
import pandas as pd
from glob import glob
from io import BytesIO
from os.path import basename
from PIL import Image

# Root of the Kaggle dataset; the four split directories below all hang off it.
DATASET_ROOT = '/kaggle/input/remote-sensing-satellite-images/Remote Sensing Data.v2i.yolov8'
TEST_IMAGES = DATASET_ROOT + '/test/images'
TEST_LABELS = DATASET_ROOT + '/test/labels'
TRAIN_IMAGES = DATASET_ROOT + '/train/images'
TRAIN_LABELS = DATASET_ROOT + '/train/labels'
# Upper bound on the number of images read from each split.
STOP = 10000
# Size passed to Image.resize when png() builds the preview images.
THUMBNAIL_SIZE = (128, 128)


def embed(model, filename: str):
    """Return the feature vector for the image stored at ``filename``.

    The file is opened with PIL, converted to RGB, passed to ``get_vec`` with
    the module-level ``extraction_layer``, and the result is flattened to a
    NumPy array of shape ``(OUTPUT_SIZE,)``.
    """
    with Image.open(fp=filename, mode='r') as source:
        rgb = source.convert('RGB')
        features = get_vec(arg=rgb, model=model, extraction_layer=extraction_layer)
    return features.numpy().reshape(OUTPUT_SIZE,)

def label(filename: str, labels: str) -> int:
    """Return the class id of the first annotation recorded for an image.

    Args:
        filename: image file name without a directory, e.g. ``'tile.jpg'``.
        labels: directory holding the annotation ``.txt`` files.

    Returns:
        The first whitespace-separated token of the annotation file, as an int.

    Raises:
        FileNotFoundError: no annotation file exists for ``filename``.
        IndexError: the annotation file is empty.
    """
    from os.path import splitext  # local import: the notebook only imports basename

    # Swap only the trailing extension. str.replace would rewrite every '.jpg'
    # in the name, so 'a.jpg.rf.123.jpg' would map to a file that does not exist.
    stem = splitext(filename)[0]
    with open(file=labels + '/' + stem + '.txt', mode='r') as input_fp:
        data = input_fp.read()
    return int(data.split()[0])

def png(filename: str) -> str:
    """Encode a shrunken copy of the image at ``filename`` as a base64 PNG data URI."""
    with Image.open(fp=filename, mode='r') as source:
        # the originals are pretty big, so the hover images are resized to THUMBNAIL_SIZE
        thumbnail = source.resize(size=THUMBNAIL_SIZE).convert('RGB')
        sink = BytesIO()
        thumbnail.save(sink, format='png')
    encoded = base64.b64encode(sink.getvalue()).decode()
    return 'data:image/png;base64,' + encoded

def _image_frame(images: str, labels: str) -> pd.DataFrame:
    """Build one row per ``*.jpg`` in ``images``: file name, embedding, PNG preview, label.

    Args:
        images: directory that is globbed for ``*.jpg`` files.
        labels: directory passed to ``label`` to look up each image's class id.

    Returns:
        A frame with columns ``item``, ``value``, ``png`` and ``label``, holding
        at most ``STOP`` rows.
    """
    # glob() makes no ordering promise, so sort to keep both the [:STOP] subset
    # and the row order the same on every run.
    paths = sorted(glob(images + '/*.jpg'))[:STOP]
    frame = pd.DataFrame(
        data=[{'item': basename(path),
               'value': embed(model=model, filename=path),
               'png': png(filename=path)} for path in paths],
        # explicit columns so an empty directory still yields an 'item' column
        columns=['item', 'value', 'png'],
    )
    frame['label'] = frame['item'].apply(func=label, args=(labels, ))
    return frame


time_start = arrow.now()
train_df = _image_frame(images=TRAIN_IMAGES, labels=TRAIN_LABELS)
test_df = _image_frame(images=TEST_IMAGES, labels=TEST_LABELS)
print('done in {}'.format(arrow.now() - time_start))

done in 0:02:15.112998


In [3]:
from plotly import express

# Number of training images per label, drawn with one bin per distinct label.
label_counts = train_df['label'].value_counts().to_frame().reset_index()
express.histogram(data_frame=label_counts, x='label', y='count', nbins=train_df['label'].nunique())

In [4]:
import arrow
from umap import UMAP

# Fit UMAP on the training embeddings, then project the test embeddings with the fitted model.
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
# Each 'value' cell holds one embedding vector; expand them into one column per dimension.
train_features = train_df['value'].apply(func=pd.Series)
train_df[['x', 'y']] = umap.fit_transform(X=train_features)
test_features = test_df['value'].apply(func=pd.Series)
test_df[['x', 'y']] = umap.transform(X=test_features)
elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Wed Aug 28 14:19:02 2024 Construct fuzzy simplicial set
Wed Aug 28 14:19:03 2024 Finding Nearest Neighbors
Wed Aug 28 14:19:07 2024 Finished Nearest Neighbor Search
Wed Aug 28 14:19:11 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Wed Aug 28 14:19:13 2024 Finished embedding


Epochs completed:   0%|            0/67 [00:00]

	completed  0  /  67 epochs
	completed  6  /  67 epochs
	completed  12  /  67 epochs
	completed  18  /  67 epochs
	completed  24  /  67 epochs
	completed  30  /  67 epochs
	completed  36  /  67 epochs
	completed  42  /  67 epochs
	completed  48  /  67 epochs
	completed  54  /  67 epochs
	completed  60  /  67 epochs
	completed  66  /  67 epochs
done with UMAP in 0:00:17.659015


In [5]:
from plotly import express

# Train and test points share one scatter plot, coloured by label.
combined_df = pd.concat(objs=[train_df, test_df])
express.scatter(data_frame=combined_df, x='x', y='y', color='label')