In [None]:
import os
import pickle

import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

In [None]:
# ORB feature detector / descriptor extractor, created with OpenCV's default settings.
# Its class name is also used below to build the descriptor cache file name.
descriptor_extractor = cv.ORB_create()

In [None]:
# Path of the pickle file used to cache the computed descriptors. The extractor's
# class name is embedded in the file name, so each extractor class gets its own cache.
filename_descriptors_map = f'bins/{descriptor_extractor.__class__.__name__}-descriptors_map.bin'
print(filename_descriptors_map)

In [None]:
# Load the cached descriptors when the cache file exists; otherwise compute them
# from the map images and write the cache.
if os.path.isfile(filename_descriptors_map):
    print('Loading descriptors')
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files that were produced by this notebook.
    with open(filename_descriptors_map, 'rb') as f:
        descriptors = pickle.load(f)
else:
    print('Computing descriptors')
    # NOTE(review): `m_imgs` (the list of image file names) is not defined in any
    # cell visible in this notebook; it must exist before this cell is run.
    # Collect the per-image descriptor arrays and stack them once at the end
    # instead of re-allocating the full array on every iteration.
    descriptor_chunks = []
    for img_name in m_imgs:
        img = cv.imread(os.path.join('data02/', img_name))
        if img is None:
            # cv.imread returns None (it does not raise) when a file cannot be read.
            print(f'Skipping unreadable image: {img_name}')
            continue

        # Extract descriptors; the keypoints themselves are not needed here.
        _, descriptors_img = descriptor_extractor.detectAndCompute(img, None)
        if descriptors_img is None:
            # No keypoints were detected in this image, so there is nothing to stack.
            continue
        descriptor_chunks.append(descriptors_img)

    if not descriptor_chunks:
        raise RuntimeError('No descriptors could be extracted from the map images')
    descriptors = np.vstack(descriptor_chunks)

    # Cache the descriptors so the next run can skip the extraction step.
    cache_dir = os.path.dirname(filename_descriptors_map)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(filename_descriptors_map, 'wb') as f:
        pickle.dump(descriptors, f)

# One row per extracted descriptor.
# NOTE(review): an earlier comment claimed "50 descriptors per image, for 1000 images"
# with "length 256". ORB is created with default settings above, and an ORB descriptor
# is 256 bits, which OpenCV presumably returns packed as 32 uint8 values per row -
# confirm the expected shape against the printed value.
print(descriptors.shape)

In [None]:
def KMeansClustering(descriptors, n_clusters):
    """Cluster the descriptors with scikit-learn's KMeans.

    Args:
        descriptors: Samples handed unchanged to ``KMeans.fit`` (no scaling is
            applied in this function).
        n_clusters: Number of clusters to form.

    Returns:
        Tuple ``(centroids, labels)`` taken from the fitted model's
        ``cluster_centers_`` and ``labels_`` attributes.
    """
    fitted_model = KMeans(n_clusters=n_clusters).fit(descriptors)
    return fitted_model.cluster_centers_, fitted_model.labels_

In [None]:
def GaussianMixturesClustering(descriptors, n_components=8):
    """Cluster standardized descriptors with a Gaussian mixture model.

    The descriptors are first passed through ``StandardScaler.fit_transform``;
    the mixture is fitted on, and the labels predicted from, that scaled copy.

    Args:
        descriptors: Samples to cluster.
        n_components: Number of mixture components (default 8).

    Returns:
        Tuple ``(means, labels)``: the fitted mixture's ``means_`` (computed on
        the scaled data, so expressed in the scaled space) and the component
        index predicted for every sample.
    """
    standardized = StandardScaler().fit_transform(descriptors)

    mixture = GaussianMixture(n_components=n_components)
    mixture.fit(standardized)

    component_means = mixture.means_
    assignments = mixture.predict(standardized)
    return component_means, assignments

In [None]:
def BisectingKMeansClustering(descriptors, n_clusters=8):
    """Cluster standardized descriptors with a hand-rolled bisecting k-means.

    Starting from one cluster holding every sample, the cluster with the
    largest sum of squared errors (SSE) around its mean is repeatedly split in
    two with ``KMeans(n_clusters=2)`` until ``n_clusters`` clusters exist.

    Clusters are tracked as arrays of row indices rather than as copies of the
    rows. Labels are therefore written directly by index, which stays correct
    when the data contains duplicate rows and avoids searching the whole matrix
    for every point.

    Args:
        descriptors: Samples to cluster; they are standardized with
            ``StandardScaler`` before clustering.
        n_clusters: Requested number of clusters (default 8).

    Returns:
        Tuple ``(centers, labels)``: ``centers`` holds one row per cluster (the
        mean of its members in the standardized space) and ``labels`` is an
        int64 array with the cluster index of every sample. Fewer than
        ``n_clusters`` clusters are returned when the highest-SSE cluster
        cannot be split any further (fewer than two members, or the 2-means
        split leaves one side empty).
    """
    descriptors_scaled = StandardScaler().fit_transform(descriptors)
    n_samples = descriptors_scaled.shape[0]

    # Every cluster is an array of row indices into `descriptors_scaled`.
    clusters = [np.arange(n_samples)]

    while len(clusters) < n_clusters:
        # Pick the cluster with the highest SSE around its own mean.
        sse_list = [
            np.sum((descriptors_scaled[member_idx]
                    - descriptors_scaled[member_idx].mean(axis=0)) ** 2)
            for member_idx in clusters
        ]
        worst = int(np.argmax(sse_list))
        member_idx = clusters[worst]
        if member_idx.size < 2:
            # Even the most spread-out cluster has nothing left to split.
            break

        # Bisect the selected cluster with 2-means.
        split = KMeans(n_clusters=2).fit(descriptors_scaled[member_idx])
        halves = [member_idx[split.labels_ == side] for side in range(2)]
        if any(half.size == 0 for half in halves):
            # Degenerate split (e.g. all members identical): stop here rather
            # than carry an empty cluster whose centre would be NaN.
            break

        # Replace the split cluster by its two halves, appended at the end.
        clusters.pop(worst)
        clusters.extend(halves)

    labels = np.zeros(n_samples, dtype=np.int64)
    for cluster_id, member_idx in enumerate(clusters):
        labels[member_idx] = cluster_id

    centers = np.array(
        [descriptors_scaled[member_idx].mean(axis=0) for member_idx in clusters]
    )
    return centers, labels

In [None]:
def clustering_performance(descriptors, n_clusters=8):
    """Run the three clustering algorithms and score each labelling.

    K-Means, Gaussian mixtures and bisecting K-Means are run in that order with
    the same ``n_clusters``. Each resulting labelling is scored against the
    ``descriptors`` exactly as passed in (the scores are not computed on any
    scaled copy the clustering functions may build internally).

    Args:
        descriptors: Samples to cluster and to score against.
        n_clusters: Number of clusters/components for every algorithm
            (default 8). When ``None``, nothing is run.

    Returns:
        Dict mapping ``'KMeans'``, ``'GaussianMixtures'`` and
        ``'BisectingKMeans'`` to a dict with the keys ``'Silhouette'``,
        ``'Calinski-Harabasz'`` and ``'Davies-Bouldin'``; an empty dict when
        ``n_clusters`` is ``None``.
    """
    # Nothing to evaluate without a cluster count.
    if n_clusters is None:
        return {}

    algorithms = (
        ('KMeans', KMeansClustering),
        ('GaussianMixtures', GaussianMixturesClustering),
        ('BisectingKMeans', BisectingKMeansClustering),
    )
    scorers = (
        ('Silhouette', silhouette_score),
        ('Calinski-Harabasz', calinski_harabasz_score),
        ('Davies-Bouldin', davies_bouldin_score),
    )

    results = {}
    for algorithm_name, run_clustering in algorithms:
        # Only the labels are scored; the returned centres are not used here.
        _, assigned_labels = run_clustering(descriptors, n_clusters)
        results[algorithm_name] = {
            score_name: score(descriptors, assigned_labels)
            for score_name, score in scorers
        }
    return results

In [None]:
def plot_performance(performance_metrics):
    """Draw one bar chart per metric comparing the clustering algorithms.

    Args:
        performance_metrics: Dict mapping an algorithm name to a dict that
            holds the keys ``'Silhouette'``, ``'Calinski-Harabasz'`` and
            ``'Davies-Bouldin'``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    algorithms = list(performance_metrics.keys())
    metric_names = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']

    # One subplot per metric, laid out side by side.
    _, axes = plt.subplots(1, len(metric_names), figsize=(15, 5))

    # Iterate over the axes created above (the loop previously referenced an
    # undefined name, which raised NameError on every call).
    for ax, metric in zip(axes, metric_names):
        values = [performance_metrics[algorithm][metric] for algorithm in algorithms]

        ax.bar(algorithms, values)
        ax.set_title(metric)
        ax.set_xlabel('Algorithms')
        ax.set_ylabel('Score')

    plt.tight_layout()
    plt.show()

In [None]:
# Score the three clustering algorithms with 8 clusters and plot the comparison.
performance_metrics_8 = clustering_performance(descriptors, n_clusters=8)
plot_performance(performance_metrics_8)

In [None]:
# Repeat the comparison with 30 clusters.
performance_metrics_30 = clustering_performance(descriptors, n_clusters=30)
plot_performance(performance_metrics_30)