Inference with the validation set (as I forgot to save the result of the evaluation set)

In [1]:
from pycocotools.coco import COCO
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import cv2

from helper import print_log

In [2]:
train2017 = 'train2017'
val2017 = 'val2017'
ann_file = 'dataset/coco/annotations/instances_{}.json'

In [3]:
TOP_10_CATS_ID = set([1,  3, 62, 84, 44, 47, 67, 51, 10, 31])
CATS_NAMES = {
    1: 'person',
    3: 'car',
    62: 'chair',
    84: 'book',
    44: 'bottle',
    47: 'cup',
    67: 'dinning table',
    51: 'traffic light',
    10: 'bowl',
    31: 'handbag'
}
LABELS = [CATS_NAMES[id] for id in sorted(list(TOP_10_CATS_ID))]

In [4]:
coco_train = COCO(ann_file.format(train2017))
coco_val = COCO(ann_file.format(val2017))

loading annotations into memory...
Done (t=7.41s)
creating index...
index created!
loading annotations into memory...
Done (t=0.22s)
creating index...
index created!


In [5]:
def get_coco_images_and_labels(coco):

    # get all filenames
    img_ids_w_filename = {coco.dataset['images'][i]['id']: coco.dataset['images'][i]['file_name'] for i in range(len(coco.dataset['images']))}      # use dictionary for faster query

    # get all images
    img_ids = [coco.dataset['images'][i]['id'] for i in range(len(coco.dataset['images']))]

    # load labels for each imgs (as one img may have multiple labels)
    labels_per_imgs = []
    for i in range(len(img_ids)):
        labels_per_imgs.append(coco.loadAnns(coco.getAnnIds(imgIds=img_ids[i])))

    img_id_w_bb = []
    label_per_obj = []

    for labels in labels_per_imgs:
        for l in labels:
            img_id_w_bb.append((l['id'], l['image_id'], l['bbox']))
            label_per_obj.append(l['category_id'])

    return img_ids_w_filename, img_id_w_bb, label_per_obj

In [6]:
img_ids_w_filename_train, img_id_w_bb_train, label_per_obj_train = get_coco_images_and_labels(coco_train)
img_ids_w_filename_val, img_id_w_bb_val, label_per_obj_val = get_coco_images_and_labels(coco_val)

Dataset save/load

In [7]:
# load filtered dataset

import pickle

filtered_dataset_dir = Path('dataset/coco_top10_filtered_20250423')

# with open(filtered_dataset_dir / 'img_id_w_bb_train_top10_v2.pkl', 'rb') as f:
#     img_id_w_bb_train_top10_filtered = pickle.load(f)
# with open(filtered_dataset_dir / 'label_per_obj_train_top10_v2.pkl', 'rb') as f:
#     label_per_obj_train_top10_filtered = pickle.load(f)

with open(filtered_dataset_dir/ 'img_id_w_bb_val_top10.pkl', 'rb') as f:
    img_id_w_bb_val_top10 = pickle.load(f)
with open(filtered_dataset_dir / 'label_per_obj_val_top10.pkl', 'rb') as f:
    label_per_obj_val_top10 = pickle.load(f)

In [8]:
# len(img_id_w_bb_train_top10_filtered), len(label_per_obj_train_top10_filtered)

In [9]:
len(img_id_w_bb_val_top10), len(label_per_obj_val_top10)

(20312, 20312)

---

In [10]:
from datasets import Dataset

# 1. First, prepare your data for the datasets library
def prepare_dataset_dict(X, img_ids_w_filename):
    # Create a lightweight dictionary containing only metadata (not images)
    dataset_dict = {
        "image_id": [],
        "bbox": [],
        "file_name": []
    }
    
    for sample in X:
        dataset_dict["image_id"].append(sample[1])
        dataset_dict["bbox"].append(sample[2])
        dataset_dict["file_name"].append(img_ids_w_filename[sample[1]])
        
    return dataset_dict

# 2. Define the SIFT processing function

def process_image_with_sift(example, coco_ds):
    """Process a single image, extracting SIFT features"""
    # Load image only when needed
    img_path = Path(f"dataset/coco/{coco_ds}/{example['file_name']}")
    img = cv2.imread(str(img_path))
    
    # Apply bounding box
    x, y, w, h = example['bbox']
    img_cropped = img[int(y): int(y + h) + 1, int(x):int(x + w) + 1]
    
    # Convert to grayscale
    img_gray = cv2.cvtColor(img_cropped, cv2.COLOR_BGR2GRAY)
    
    # Apply SIFT
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(img_gray, None)
    
    # Convert keypoints to serializable format
    serialized_keypoints = []
    for kp in keypoints:
        serialized_keypoints.append({
            'x': float(kp.pt[0]),
            'y': float(kp.pt[1]), 
            'size': float(kp.size),
            'angle': float(kp.angle),
            'response': float(kp.response),
            'octave': int(kp.octave)
        })
    
    # Return only the features, together with the image_id, bbox, and filename
    # but not the image (pixel) itself
    return {
        'image_id': example['image_id'],
        'bbox': example['bbox'],
        'file_name': example['file_name'],
        'keypoints': serialized_keypoints,
        'descriptors': descriptors.tolist() if descriptors is not None else []
    }

# 3. Main pipeline
def create_sift_dataset(X_train, coco_ds, img_ids_w_filename):
    # Create the dataset dictionary
    dataset_dict = prepare_dataset_dict(X_train, img_ids_w_filename)
    
    # Create HF dataset
    raw_dataset = Dataset.from_dict(dataset_dict)
    
    # Process using (optional) multi-processing
    processed_dataset = raw_dataset.map(
        process_image_with_sift,
        fn_kwargs={'coco_ds': coco_ds},
        num_proc=2,
        batched=False,
        desc="Extracting SIFT features"
    )
    
    return processed_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
sift_dataset_val_path = Path('dataset/coco_top10_filtered_20250423/sift_dataset_val')
if sift_dataset_val_path.exists():
    sift_dataset_val = Dataset.load_from_disk(sift_dataset_val_path)

    print("Validation dataset already exists. Loading from disk...")
else:
    # create the dataset
    sift_dataset_val = create_sift_dataset(img_id_w_bb_val_top10, val2017, img_ids_w_filename_val)

    # try to save the dataset
    sift_dataset_val.save_to_disk('dataset/coco_top10_filtered_20250423/sift_dataset_val')

    print("Validation dataset is created and saved to disk.")

Validation dataset already exists. Loading from disk...


---

In [12]:
DESCIPTORS_DIM = 128

Hyperparameter selection

In [13]:
from itertools import product

K_GRID = [50, 100, 150, 200]
PCA_N_COMPONENTS_GRID = [20, 50, 128]       # 128 is the default for SIFT -> no PCA reduction

hyperparam_comb = list(product(K_GRID, PCA_N_COMPONENTS_GRID))

In [14]:
from datetime import datetime

tdy = datetime(2025, 4, 23, 22, 57, 33)
top_model_dir = Path(f'models/PCA-SIFT/{tdy.strftime("%Y%m%d-%H%M%S")}/')
if not top_model_dir.exists():
    top_model_dir.mkdir(parents=True)

In [15]:
# evaluation test-set function

# extend the dataset with the cluster id (equivalent to vector quantization)
def assign_cluster_id(example, pca, kmeans, pca_n_components):
    """Assign cluster id to each keypoint based on the closest cluster center"""
    # Update the example with the cluster ids

    des = np.array(example['descriptors'])
    # check if descriptors are empty
    if des.size == 0:
        example['cluster_ids'] = np.array([], dtype=np.int32)
        return example
    
    # apply PCA to the descriptors
    if pca_n_components < DESCIPTORS_DIM:
        red_des = pca.transform(des)
    else:
        red_des = des

    # early return if transformed descriptors are empty
    if red_des.size == 0:
        example['cluster_ids'] = np.array([], dtype=np.int32)
        return example

    if red_des.ndim == 1:
        example['cluster_ids'] = kmeans.predict(red_des.reshape(1, -1))
    else:
        example['cluster_ids'] = kmeans.predict(red_des)
    
    return example


In [16]:
# create a histogram of the cluster ids
# that will be used to compute TF-IDF

def create_histogram(example, K):
    """Create a histogram of cluster ids"""

    # early exit if descriptors are empty -> cluster_ids will be empty too
    if len(example['descriptors']) == 0:
        example['histogram'] = np.array([[]], dtype=np.int64)
        return example

    hist, _ = np.histogram(example['cluster_ids'], bins=np.arange(K + 1))
    
    example['histogram'] = hist.reshape(-1, K)

    return example

In [17]:
# evaluation sub-functions
# grab accuracy score, confusion matrix, and classification report

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

def compute_accuracy(y_true, y_pred):
    """Compute accuracy score"""
    return accuracy_score(y_true, y_pred)

def compute_classification_report(y_true, y_pred, labels):
    """Compute classification report"""
    return classification_report(y_true, y_pred, target_names=labels, zero_division=0)

def compute_confusion_matrix(y_true, y_pred, labels, save=False, save_path=None):
    """Compute confusion matrix"""
    cm_matrix = confusion_matrix(y_true, y_pred)

    cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm_matrix, display_labels=labels)
    fig, ax = plt.subplots(figsize=(12,12))
    ax.set_title('Confusion Matrix')
    cm_disp.plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation=90)

    if save:
        if save_path is None:
            raise ValueError("save_path must be provided if save is True")
        plt.savefig(save_path)

In [18]:
def evaluate_model(sift_dataset, y, pca, kmeans, tfidf, svm, K, PCA_N_COMPONENT):
    sift_dataset = sift_dataset.map(
        assign_cluster_id,
        fn_kwargs={'kmeans': kmeans, 'pca': pca, 'pca_n_components': PCA_N_COMPONENT},
        num_proc=1,
        desc="Assigning cluster ids to keypoints. Including PCA -> KMeans"
    )

    sift_dataset = sift_dataset.map(
        create_histogram,
        fn_kwargs={'K': K},
        num_proc=1,
        desc="Creating histogram of cluster ids."
    )

    des_histo = np.concatenate(
        [example['histogram'] for example in sift_dataset if len(example['histogram'][0]) > 0],
        axis=0
    )
    des_histo = des_histo.reshape(des_histo.shape[0], -1)

    # Convert the list of descriptors to TF-IDF representation
    tfidf_matrix = tfidf.transform(des_histo)

    y_filtered = [y[i] for i, example in enumerate(sift_dataset) if len(example['histogram'][0]) > 0]
    y_filtered = np.array(y_filtered)

    # Predict the labels using the SVM model
    y_pred = svm.predict(tfidf_matrix)

    return y_filtered, y_pred

In [19]:
def save_evaluations(y, y_pred, labels, model_dir, eval_stage=str):
    """Save the evaluation results
    
    eval_stage: str
        The stage of the evaluation. It can be 'train', 'test' or 'val.
    """
    # Save the accuracy score
    accuracy = compute_accuracy(y, y_pred)
    print(f"Accuracy [{eval_stage}]: {accuracy}")

    # Save the classification report
    report = compute_classification_report(y, y_pred, labels)
    with open(model_dir / f'classification_report_{eval_stage}.txt', 'w') as f:
        f.write(report)

    # Save the confusion matrix
    cm_path = model_dir / f'confusion_matrix_{eval_stage}.png'
    compute_confusion_matrix(y, y_pred, labels, save=True, save_path=cm_path)

hyperparameter re-inference

In [20]:
for i, (K, PCA_N_COMPONENT) in enumerate(hyperparam_comb):
    print_log(f'-' * 50)
    print_log(f'K: {K}, PCA_N_COMPONENT: {PCA_N_COMPONENT}')
    print_log(f'-' * 50)

    # create a directory for each model
    model_dir = top_model_dir / f'KMeans_{K}_PCA_{PCA_N_COMPONENT}'
    if not model_dir.exists():
        model_dir.mkdir(parents=True)
        print_log(f'Model directory {model_dir} created.')
    
    # check if the files already exist
    y_val_filtered_path = model_dir / f'y_val_filtered_{K}_{PCA_N_COMPONENT}.npy'
    y_pred_val_path = model_dir / f'y_pred_val_{K}_{PCA_N_COMPONENT}.npy'
    if y_val_filtered_path.exists() and y_pred_val_path.exists():
        print_log(f'Files already exist. Skipping evaluation for K={K} and PCA_N_COMPONENT={PCA_N_COMPONENT}.')
        continue


    pca_model_name = f'PCA-SIFT_PCA_PCA-N_{PCA_N_COMPONENT}' + '.pkl'
    kmeans_model_name = f'PCA-SIFT_KMeans_PCA-N_{PCA_N_COMPONENT}_KMeans-K_{K}' + '.pkl'
    tfidf_model_name = f'PCA-SIFT_TFIDF_PCA-N_{PCA_N_COMPONENT}_KMeans-K_{K}' + '.pkl'
    svm_model_name = f'PCA-SIFT_SVM-SGD_PCA-N_{PCA_N_COMPONENT}_KMeans-K_{K}.pkl'

    # load the models
    if (model_dir / pca_model_name).exists():
        pca = pickle.load(open(model_dir / pca_model_name, 'rb'))
    else:
        pca = None

    kmeans = pickle.load(open(model_dir / kmeans_model_name, 'rb'))
    tfidf = pickle.load(open(model_dir / tfidf_model_name, 'rb'))
    svm_sgd = pickle.load(open(model_dir / svm_model_name, 'rb'))


    # evaluate on validation set
    y_val_filtered, y_pred_val = evaluate_model(sift_dataset_val, label_per_obj_val_top10, pca, kmeans, tfidf, svm_sgd, K, PCA_N_COMPONENT)
    save_evaluations(y_val_filtered, y_pred_val, labels=LABELS,  model_dir=model_dir, eval_stage='val')
    print_log(f'Evaluation on validation set done.')

    # save the prediction result for future use (create further evaluation)
    np.save(y_val_filtered_path, y_val_filtered)
    np.save(y_pred_val_path, y_pred_val)

    print_log(f'Finished evaluation for K={K} and PCA_N_COMPONENT={PCA_N_COMPONENT}.')
    print_log(f'-' * 50)

[2025-04-25 01:00:25:685] - --------------------------------------------------
[2025-04-25 01:00:25:685] - K: 50, PCA_N_COMPONENT: 20
[2025-04-25 01:00:25:685] - --------------------------------------------------
[2025-04-25 01:00:25:685] - Files already exist. Skipping evaluation for K=50 and PCA_N_COMPONENT=20.
[2025-04-25 01:00:25:685] - --------------------------------------------------
[2025-04-25 01:00:25:685] - K: 50, PCA_N_COMPONENT: 50
[2025-04-25 01:00:25:685] - --------------------------------------------------
[2025-04-25 01:00:25:685] - Files already exist. Skipping evaluation for K=50 and PCA_N_COMPONENT=50.
[2025-04-25 01:00:25:685] - --------------------------------------------------
[2025-04-25 01:00:25:685] - K: 50, PCA_N_COMPONENT: 128
[2025-04-25 01:00:25:685] - --------------------------------------------------
[2025-04-25 01:00:25:685] - Files already exist. Skipping evaluation for K=50 and PCA_N_COMPONENT=128.
[2025-04-25 01:00:25:685] - -------------------------

Single evaluation

In [21]:
K = 100
PCA_N_COMPONENT = 20

In [22]:
# create a directory for each model
model_dir = top_model_dir / f'KMeans_{K}_PCA_{PCA_N_COMPONENT}'
if not model_dir.exists():
    model_dir.mkdir(parents=True)
    print_log(f'Model directory {model_dir} created.')

pca_model_name = f'PCA-SIFT_PCA_PCA-N_{PCA_N_COMPONENT}' + '.pkl'
kmeans_model_name = f'PCA-SIFT_KMeans_PCA-N_{PCA_N_COMPONENT}_KMeans-K_{K}' + '.pkl'
tfidf_model_name = f'PCA-SIFT_TFIDF_PCA-N_{PCA_N_COMPONENT}_KMeans-K_{K}' + '.pkl'
svm_model_name = f'PCA-SIFT_SVM-SGD_PCA-N_{PCA_N_COMPONENT}_KMeans-K_{K}.pkl'

# load the models
if (model_dir / pca_model_name).exists():
    pca = pickle.load(open(model_dir / pca_model_name, 'rb'))
else:
    pca = None

kmeans = pickle.load(open(model_dir / kmeans_model_name, 'rb'))
tfidf = pickle.load(open(model_dir / tfidf_model_name, 'rb'))
svm_sgd = pickle.load(open(model_dir / svm_model_name, 'rb'))


# evaluate on validation set
y_val_filtered, y_pred_val = evaluate_model(sift_dataset_val, label_per_obj_val_top10, pca, kmeans, tfidf, svm_sgd, K, PCA_N_COMPONENT)

In [24]:
from sklearn.metrics import f1_score, classification_report

f1 = f1_score(y_val_filtered, y_pred_val, average='weighted')
print(f"F1 Score: {f1}")

F1 Score: 0.4482640784237015


In [25]:
classification_report(y_val_filtered, y_pred_val, target_names=LABELS, zero_division=0, output_dict=True)

{'person': {'precision': 0.6419445310065441,
  'recall': 0.81030583144852,
  'f1-score': 0.7163660073896979,
  'support': 10169.0},
 'car': {'precision': 0.1933292155651637,
  'recall': 0.18810096153846154,
  'f1-score': 0.1906792567773378,
  'support': 1664.0},
 'bowl': {'precision': 0.15358361774744028,
  'recall': 0.19607843137254902,
  'f1-score': 0.1722488038277512,
  'support': 459.0},
 'handbag': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 463.0},
 'bottle': {'precision': 0.07848101265822785,
  'recall': 0.03311965811965812,
  'f1-score': 0.04658151765589782,
  'support': 936.0},
 'cup': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 826.0},
 'traffic light': {'precision': 0.09815950920245399,
  'recall': 0.02735042735042735,
  'f1-score': 0.0427807486631016,
  'support': 585.0},
 'chair': {'precision': 0.22476190476190477,
  'recall': 0.07181984175289105,
  'f1-score': 0.1088560885608856,
  'support': 1643.0},
 'dinning table': {'precision

In [26]:
y_val_filtered

array([44, 67,  1, ..., 47,  1,  1])