## Libraries and Imports

In [None]:
import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from datasets import load_dataset
from sklearn.cluster import KMeans

## Import data

In [None]:
# Download (or load from cache) the Caltech-101 dataset.
ds_dict = load_dataset("flwrlabs/caltech101")

# Seed Python's RNG so the same categories are selected on every run.
random.seed(47)

ds = ds_dict["train"]
class_names = ds.features["label"].names

# Sample `num_of_labels` distinct label IDs at random
num_of_labels = 20 # change as needed <--------------
num_classes = len(class_names)
selected_label_ids = random.sample(range(num_classes), num_of_labels)

# Keep only the samples of the selected categories. Passing just the "label"
# column to the predicate means the images do not have to be decoded while filtering.
ds = ds.filter(lambda label: label in selected_label_ids, input_columns=["label"])

# We map the ids to class names for printing so we know which labels were selected
selected_labels = [class_names[i] for i in selected_label_ids]
print("Selected labels:", selected_labels)


## Split data

In [None]:
# Shuffle and cut the filtered dataset in half; the fixed seed keeps the split reproducible.
# The part that is not assigned to "test" ends up under the "train" key.
ds_split = ds.train_test_split(test_size=0.5, seed=42)

# Each split holds 50% of the data.
train_ds, test_ds = ds_split["train"], ds_split["test"]

# 1 Codebook generation

## Extract features

In [None]:
sift = cv2.SIFT_create() # SIFT feature extractor, created once and reused for every image

def extract_features(pil_image):
    """Detect SIFT keypoints in an image and compute their descriptors.

    Parameters
    ----------
    pil_image
        Image object that ``np.array`` can convert to an array. Arrays with
        three dimensions are converted to grayscale assuming RGB channel order.

    Returns
    -------
    keypoints
        The keypoints returned by ``sift.detectAndCompute``.
    descriptors : np.ndarray
        One row per keypoint. When no descriptors are found this is an empty
        ``(0, 128)`` float32 array, so the function always returns a 2-tuple
        and the result can be stacked with ``np.vstack``.
    """
    # Convert PIL image to numpy array
    image = np.array(pil_image)

    # Convert to grayscale
    if image.ndim == 3:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Extract features
    keypoints, descriptors = sift.detectAndCompute(image, None)

    # Images with no keypoints yield descriptors=None; substitute an empty
    # matrix but keep the (keypoints, descriptors) return shape so callers
    # that unpack two values do not fail.
    if descriptors is None:
        descriptors = np.empty((0, 128), dtype=np.float32)

    return keypoints, descriptors

In [None]:
def collect_descriptors(dataset):
    """Run ``extract_features`` on every sample of ``dataset``.

    Returns a pair ``(features, labels)``: ``features`` is a list with one
    descriptor matrix per image and ``labels`` is the list of the samples'
    "label" values, both in dataset order.
    """
    features, labels = [], []
    for sample in dataset:
        # Only the descriptors are needed for the codebook; keypoints are discarded.
        _, descriptors = extract_features(sample["image"])
        features.append(descriptors)
        labels.append(sample["label"])
    return features, labels


train_features, train_labels = collect_descriptors(train_ds)
test_features, test_labels = collect_descriptors(test_ds)

# Stack the per-image descriptor matrices into one matrix per split
train_feature_matrix = np.vstack(train_features) # use for training k-means
test_feature_matrix = np.vstack(test_features)

train_feature_matrix.shape, test_feature_matrix.shape #check shapes

The shape above tells us that we currently have 87407 features (rows), each represented as a vector of length 128. Each feature describes a patch in a given image in the dataset. A feature could, e.g., be a pattern, blob, corner, edge or something else.

### Reduce number of features (maybe not necessary)

In [None]:
# Draw a random subset of the training descriptors so k-means has less data to fit.
num_samples = 1000  # Adjust as needed

# Never ask for more descriptors than the matrix actually contains.
num_dec = len(train_feature_matrix)
sample_size = min(num_samples, num_dec)

# Seeded generator: the same row indices are drawn (without replacement) on every run.
rng = np.random.default_rng(seed=42)
sample_indices = rng.choice(num_dec, size=sample_size, replace=False)

# Pick out the sampled rows and display the resulting shape as a sanity check.
train_sampled_descriptors = train_feature_matrix[sample_indices]
train_sampled_descriptors.shape

## K-means clustering

In [None]:
k = 500  # number of clusters / visual words
random_state = 42

# Use the explicitly imported KMeans class: a bare `import sklearn` does not
# guarantee that the `sklearn.cluster` submodule is available as an attribute.
KMeans_model = KMeans(n_clusters=k, random_state=random_state, n_init=10) # Model definition
KMeans_model.fit(train_sampled_descriptors) # Fit model, i.e. train the k-means clustering
codebook = KMeans_model.cluster_centers_  # The cluster centers are our visual words (our codebook)
print("Visual words shape:", codebook.shape)


## Form Bag of visual words for training data
Classify each training descriptor to its closest cluster center and form the bag of words (BoW) for each image in the training set.
For each image, we find out which words appear and how many times each of them appears.
Remember that the bag of visual words IS the word histogram.

In [None]:

# Which visual word (cluster) does each descriptor belong to?
train_word_ids = []

for descriptors in train_features:   # one image at a time
    if len(descriptors) == 0:
        # KMeans.predict rejects input with zero rows; an image without
        # descriptors simply contains no visual words.
        word_ids = np.empty(0, dtype=np.intp)
    else:
        word_ids = KMeans_model.predict(descriptors)
    train_word_ids.append(word_ids)

# Now we can build the bag-of-words histograms for each image, i.e.
# count how many times each visual word appears in the image.
# minlength=k keeps every histogram at length k, even when the highest word ids
# do not occur (or no words occur at all).
train_bow = np.array([np.bincount(word_ids, minlength=k) for word_ids in train_word_ids])
train_bow.shape  # Check shape

# 2 Indexing
For each image in the test set:

* Extract the SIFT descriptors of the feature points in the image, (DONE)
* Project the descriptors onto the codebook, i.e. for each descriptor, find the closest cluster center
* Construct the corresponding bag of words, i.e. the word histogram

## Form the bag of words for the testing data

In [None]:

# Which visual word (cluster) does each descriptor belong to?
test_word_ids = []

for descriptors in test_features:   # one image at a time
    if len(descriptors) == 0:
        # KMeans.predict rejects input with zero rows; an image without
        # descriptors simply contains no visual words.
        word_ids = np.empty(0, dtype=np.intp)
    else:
        word_ids = KMeans_model.predict(descriptors)
    test_word_ids.append(word_ids)

# Now we can build the bag-of-words histograms for each image, i.e.
# count how many times each visual word appears in the image.
# minlength=k keeps every histogram at length k, even when the highest word ids
# do not occur (or no words occur at all).
test_bow = np.array([np.bincount(word_ids, minlength=k) for word_ids in test_word_ids])
test_bow.shape  # Check shape

## Construct table
Per row the table should contain
* file name
* true category
* training or test set
* corresponding bag of words/word histogram

In [None]:
def build_split_rows(dataset, labels, bow, split):
    """Build one record per image for the given split.

    Each record holds the file name, the label, the split name and the image's
    bag-of-words histogram. If the dataset has no "filename" column, a
    placeholder name ``"<split>_<index>"`` is used instead.
    """
    # Read the filename column directly instead of iterating over full samples,
    # which would decode every image just to look up a name.
    if "filename" in dataset.column_names:
        filenames = list(dataset["filename"])
    else:
        filenames = [f"{split}_{i}" for i in range(len(dataset))]

    return [
        {"filename": filename, "label": label, "split": split, "bow": hist}
        for filename, label, hist in zip(filenames, labels, bow)
    ]


# Create training DataFrame
train_rows = build_split_rows(train_ds, train_labels, train_bow, "train")
train_table = pd.DataFrame(train_rows)

# Create test DataFrame
test_rows = build_split_rows(test_ds, test_labels, test_bow, "test")
test_table = pd.DataFrame(test_rows)

# Combine the two :)
Table = pd.concat([train_table, test_table], ignore_index=True)
# The shape is (number of images, number of table columns), not images x categories.
print("Combined table shape (rows, columns):", Table.shape)