Using opencv SIFT compute (as its fast), with evenly/densely sampled grid

In [1]:
from pycocotools.coco import COCO
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import cv2

In [2]:
data_type = 'val2017'
ann_file = 'dataset/coco/annotations/instances_{}.json'.format(data_type)

coco = COCO(ann_file)

loading annotations into memory...
Done (t=0.32s)
creating index...
index created!


In [3]:
# all categories (with ids)
cats = coco.loadCats(coco.getCatIds())

# load data
img_ids = [coco.dataset['images'][i]['id'] for i in range(len(coco.dataset['images']))]
img_ids_w_filename = {coco.dataset['images'][i]['id']: coco.dataset['images'][i]['file_name'] for i in range(len(coco.dataset['images']))}      # use dictionary for faster query
annotations = coco.loadAnns(coco.getAnnIds(imgIds=img_ids))

In [4]:
# load labels for each imgs (as one img may have multiple labels)
labels_per_imgs = []
for i in range(len(img_ids)):
    labels_per_imgs.append(coco.loadAnns(coco.getAnnIds(imgIds=img_ids[i])))

In [5]:
# now we create the real list of datasets
img_id_w_bb = []
label_per_obj = []

for labels in labels_per_imgs:
    for l in labels:
        img_id_w_bb.append((l['id'], l['image_id'], l['bbox']))
        label_per_obj.append(l['category_id'])

In [6]:
img_id_w_bb

[(82445, 397133, [217.62, 240.54, 38.99, 57.75]),
 (119568, 397133, [1.0, 240.24, 346.63, 186.76]),
 (200887, 397133, [388.66, 69.92, 109.41, 277.62]),
 (693231, 397133, [135.57, 249.43, 22.32, 28.79]),
 (713388, 397133, [31.28, 344.0, 68.12, 40.83]),
 (716434, 397133, [59.63, 287.36, 76.07, 41.3]),
 (1125079, 397133, [1.36, 164.33, 192.56, 98.37]),
 (1218137, 397133, [0.0, 262.81, 62.16, 36.77]),
 (1878837, 397133, [119.4, 272.51, 24.82, 34.25]),
 (1883614, 397133, [141.47, 267.91, 32.19, 35.86]),
 (1902250, 397133, [155.97, 168.95, 26.03, 17.13]),
 (1902971, 397133, [157.2, 114.15, 17.86, 15.82]),
 (1914453, 397133, [98.75, 304.78, 10.78, 5.57]),
 (2105658, 397133, [166.03, 256.36, 8.82, 18.58]),
 (2114911, 397133, [86.41, 293.97, 23.96, 11.18]),
 (2114949, 397133, [70.14, 296.16, 9.28, 4.58]),
 (2139366, 397133, [0.0, 210.9, 191.36, 98.98]),
 (2188144, 397133, [96.69, 297.09, 7.84, 4.86]),
 (2196309, 397133, [497.25, 203.4, 122.01, 28.61]),
 (22328, 37777, [102.49, 118.47, 7.9, 17.3

Create train-test split

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(img_id_w_bb, label_per_obj, test_size=0.2, random_state=42)

Define pipeline for creating keypoints

In [8]:
# using HuggingFace datasets to optimize memory usage

from pathlib import Path
from datasets import Dataset

# 1. First, prepare your data for the datasets library
def prepare_dataset_dict(X, img_ids_w_filename):
    # Create a lightweight dictionary containing only metadata (not images)
    dataset_dict = {
        "image_id": [],
        "bbox": [],
        "file_name": []
    }
    
    for sample in X:
        dataset_dict["image_id"].append(sample[1])
        dataset_dict["bbox"].append(sample[2])
        dataset_dict["file_name"].append(img_ids_w_filename[sample[1]])
        
    return dataset_dict

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
EXTRA_PADDING = 10

def _add_extra_padding(bbox, img_dim, extra_padding=EXTRA_PADDING):
    """
    Adds extra padding to the bounding box.
    """
    x, y, w, h = bbox
    x -= extra_padding
    y -= extra_padding
    w += 2 * extra_padding
    h += 2 * extra_padding

    # Ensure the bounding box is within image dimensions
    x = max(0, x)
    y = max(0, y)
    w = min(w, img_dim[1] - x)
    h = min(h, img_dim[0] - y)

    return [x, y, w, h]

In [10]:
# using evenly sampled interest points
# first define the interest points (x, y) coordinates
# then randomly sample the size (diameter) of the circle

MIN_PATCH_SIZE = 10
MAX_PATCH_SIZE = 40
GRID_SPACING = 10

def find_grid_random_size(hgt, wid, grid_spacing):
    """
    Create a grid of upper-left patch corners.
    
    Parameters:
        hgt (int): Height of the image/grid.
        wid (int): Width of the image/grid.
        grid_spacing (int): Spacing between patches.
    
    Returns:
        grid_x (numpy.ndarray): X-coordinates of the grid.
        grid_y (numpy.ndarray): Y-coordinates of the grid.
    """

    rem_x = (wid - MAX_PATCH_SIZE) % grid_spacing
    offset_x = (rem_x // 2) + 1
    rem_y = (hgt - MAX_PATCH_SIZE) % grid_spacing
    offset_y = (rem_y // 2) + 1
    
    grid_x, grid_y = np.meshgrid(
        np.arange(offset_x, wid - MIN_PATCH_SIZE + 1, grid_spacing),
        np.arange(offset_y, hgt - MIN_PATCH_SIZE + 1, grid_spacing)
    )
    
    return grid_x, grid_y

_seed = 20250321        # set the seed for reproducibility. seed = date
np.random.seed(_seed)

def generate_keypoints_from_grid(grid_x, grid_y):
    keypoints = []
    for i in range(grid_x.shape[0]):
        for j in range(grid_x.shape[1]):
            patch_size = np.random.randint(MIN_PATCH_SIZE, MAX_PATCH_SIZE + 1)

            keypoint_cv2 = cv2.KeyPoint(
                grid_x[i, j] + patch_size / 2.0,
                grid_y[i, j] + patch_size // 2.0,
                size=patch_size,
            )

            keypoints.append(keypoint_cv2)
    return keypoints

In [None]:
# 2. Define the SIFT processing function
def process_image_with_sift(example):
    """Process a single image, extracting SIFT features"""
    # Load image only when needed
    img_path = Path(f"dataset/coco/val2017/{example['file_name']}")
    img = cv2.imread(str(img_path))
    
    # Apply bounding box
    # x, y, w, h = example['bbox']
    # Add extra padding to the bounding box
    bbox = _add_extra_padding(example['bbox'], img.shape)
    x, y, w, h = bbox
    img_cropped = img[int(y): int(y + h) + 1, int(x):int(x + w) + 1]
    
    # Convert to grayscale
    img_gray = cv2.cvtColor(img_cropped, cv2.COLOR_BGR2GRAY)
    
    # create keypoints
    grid_x, grid_y = find_grid_random_size(img.shape[0], img.shape[1], GRID_SPACING)
    keypoints_evenly = generate_keypoints_from_grid(grid_x, grid_y)
    
    # Apply SIFT to produce descriptors
    # roughly 1000 - 1500 keypoints per image, depending on the size of the image
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.compute(img_gray, keypoints_evenly)

    # Convert keypoints to serializable format
    serialized_keypoints = []
    for kp in keypoints:
        serialized_keypoints.append({
            'x': float(kp.pt[0]),
            'y': float(kp.pt[1]), 
            'size': float(kp.size),
            'angle': float(kp.angle),
            'response': float(kp.response),
            'octave': int(kp.octave)
        })
    
    # Return only the features, together with the image_id, bbox, and filename
    # but not the image (pixel) itself
    return {
        'image_id': example['image_id'],
        'bbox': example['bbox'],
        'file_name': example['file_name'],
        'keypoints': serialized_keypoints,
        'descriptors': descriptors.tolist() if descriptors is not None else []
    }

In [12]:
# 3. Main pipeline
def create_sift_dataset(X_train, img_ids_w_filename):
    # Create the dataset dictionary
    dataset_dict = prepare_dataset_dict(X_train, img_ids_w_filename)
    
    # Create HF dataset
    raw_dataset = Dataset.from_dict(dataset_dict)
    
    # Process using (optional) multi-processing
    processed_dataset = raw_dataset.map(
        process_image_with_sift,
        num_proc=1,
        batched=False,
        desc="Extracting SIFT features"
    )
    
    return processed_dataset

In [13]:
sift_dataset_training = create_sift_dataset(X_train, img_ids_w_filename)

Extracting SIFT features: 100%|██████████| 29424/29424 [55:36<00:00,  8.82 examples/s]   


In [15]:
# write to disk
# sift_dataset_training.save_to_disk('dataset/sift_evenly_grid/sift_dataset_training')

Saving the dataset (145/169 shards):  86%|████████▌ | 25248/29424 [01:25<00:14, 293.97 examples/s]


OSError: [Errno 28] No space left on device

In [16]:
# get all keypoints and descriptors
all_keypoints = []
all_descriptors = []
for example in sift_dataset_training:
    all_keypoints.extend(example['keypoints'])
    all_descriptors.extend(example['descriptors'])
len(all_keypoints), len(all_descriptors)
# convert to numpy array
all_keypoints_np = np.array(all_keypoints)
all_descriptors_np = np.array(all_descriptors)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1080300d0>>
Traceback (most recent call last):
  File "/Users/michaelcheng/miniforge3/envs/comp61342_asm/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


: 

: 

Model

PCA -> K-Mean -> SVM

PCA-SIFT: from the paper "PCA-SIFT: a more distinctive representation for local image descriptors"

In [None]:
from sklearn.decomposition import PCA

PCA_N_COMPONENTS = 20

pca = PCA(n_components=PCA_N_COMPONENTS)
pca.fit(all_descriptors_np)

In [None]:
# transform the descriptors with map

def transform_descriptors(example, pca):

    des = np.array(example['descriptors'])

    if des.size == 0:
        example['descriptors'] = np.array([0] * PCA_N_COMPONENTS, dtype=np.float32)
        return example
    
    # apply PCA transformation
    if des.ndim == 1:
        des = des.reshape(1, -1)
    example['descriptors'] = pca.transform(des)

    return example

# apply the PCA transformation to all descriptors
sift_dataset_training = sift_dataset_training.map(
    transform_descriptors,
    fn_kwargs={'pca': pca},
    batched=False,
    desc="Transforming descriptors with PCA"
)


In [None]:
# get all keypoints and descriptors
all_keypoints = []
all_descriptors = []
for example in sift_dataset_training:
    all_keypoints.extend(example['keypoints'])
    all_descriptors.extend(example['descriptors'])
len(all_keypoints), len(all_descriptors)
# convert to numpy array
all_keypoints_np = np.array(all_keypoints)
all_descriptors_np = np.array(all_descriptors)

In [14]:
from sklearn.cluster import KMeans

K = 200
kmeans = KMeans(n_clusters=K, random_state=42)
kmeans.fit(all_descriptors_np)

kmean_cluster_centers = kmeans.cluster_centers_

NameError: name 'all_descriptors_np' is not defined

In [None]:
# extend the dataset with the cluster id
def assign_cluster_id(example, kmeans):
    """Assign cluster id to each keypoint based on the closest cluster center"""
    # Update the example with the cluster ids

    des = np.array(example['descriptors'])

    # early return if no descriptors
    if des.size == 0:
        example['cluster_ids'] = np.array([], dtype=np.int32)
        return example

    if des.ndim == 1:
        example['cluster_ids'] = kmeans.predict(des.reshape(1, -1))
    else:
        example['cluster_ids'] = kmeans.predict(des)
    
    return example

# Apply the filter to the dataset
sift_dataset_training_filtered = sift_dataset_training.map(
    assign_cluster_id,
    fn_kwargs={'kmeans': kmeans},
    num_proc=1,
    desc="Assigning cluster ids to keypoints"
)

In [None]:
# create a histogram of the cluster ids
# that will be used to compute TF-IDF

def create_histogram(example):
    """Create a histogram of cluster ids"""

    # early exit if descriptors are empty -> cluster_ids will be empty too
    if len(example['descriptors']) == 0:
        example['histogram'] = np.array([[]], dtype=np.int64)
        return example
    

    hist, _ = np.histogram(example['cluster_ids'], bins=np.arange(K + 1))
    
    example['histogram'] = hist.reshape(-1, K)

    return example

# Apply the histogram function to the dataset
sift_dataset_training_filtered = sift_dataset_training_filtered.map(
    create_histogram,
    num_proc=1,
    desc="Creating histogram of cluster ids"
)

In [None]:
# create histogram using TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

# convert the des_kmeans to a count-matrix
# then use TfidfVectorizer to convert it to a TF-IDF matrix
tfidf_transformer = TfidfTransformer()

# grab all non-empty histograms and concat them to a very large 2D array
des_histo = np.concatenate(
    [example['histogram'] for example in sift_dataset_training_filtered if len(example['histogram'][0]) > 0],
    axis=0
)

# Convert the list of descriptors to TF-IDF representation
tfidf_matrix = tfidf_transformer.fit_transform(des_histo)

In [None]:
# note that we drop some of the images in the y_train set
# because they have no descriptors
# so we need to filter the y_train set too

y_train_filtered = [y_train[i] for i, example in enumerate(sift_dataset_training_filtered) if len(example['histogram'][0]) > 0]
y_train_filtered = np.array(y_train_filtered)

In [None]:
# SVM

from sklearn.svm import SVC

svm = SVC(random_state=42)
svm.fit(tfidf_matrix, y_train_filtered)

Test set

In [None]:
sift_dataset_testing = create_sift_dataset(X_test, img_ids_w_filename)
sift_dataset_testing = sift_dataset_testing.map(
    process_image_with_sift,
    num_proc=1,
    batched=False,
    desc="Extracting SIFT features"
)