In [None]:
!pip install scikit-learn-extra


In [2]:
!pip install hdbscan


Defaulting to user installation because normal site-packages is not writeable
Collecting hdbscan
  Downloading hdbscan-0.8.41-cp313-cp313-win_amd64.whl.metadata (15 kB)
Downloading hdbscan-0.8.41-cp313-cp313-win_amd64.whl (671 kB)
   ---------------------------------------- 0.0/671.7 kB ? eta -:--:--
   ---------------------------------------- 671.7/671.7 kB 5.5 MB/s eta 0:00:00
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.41



[notice] A new release of pip is available: 24.3.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
!pip install scikit-fuzzy


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
   ---------------------------------------- 0.0/920.8 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/920.8 kB ? eta -:--:--
   ---------------------------------------- 920.8/920.8 kB 2.6 MB/s eta 0:00:00
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0



[notice] A new release of pip is available: 24.3.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.cluster import (
    KMeans, MiniBatchKMeans, AgglomerativeClustering,
    DBSCAN, OPTICS, SpectralClustering, Birch,
    AffinityPropagation, MeanShift
)

from sklearn.mixture import GaussianMixture
from sklearn_extra.cluster import KMedoids
import hdbscan
import skfuzzy as fuzz


In [2]:
from sklearn.preprocessing import LabelEncoder

original_df = pd.read_csv("CPN_working/labels.csv")

image_names = original_df["image_name"].values
original_labels = original_df["label"].values

le = LabelEncoder()
original_labels_enc = le.fit_transform(original_labels)

In [3]:
def compute_label_change(original, clustered):
    return (np.sum(original != clustered) / len(original)) * 100


def save_cluster_labels_csv(model, algo, image_names, original_labels, labels):
    os.makedirs("clustering_labels", exist_ok=True)

    df = pd.DataFrame({
        "image_name": image_names,
        "original_label": original_labels,   # ← human-readable
        "cluster_label": labels               # ← cluster assignment
    })

    filename = f"clustering_labels/{model}_{algo}.csv"
    df.to_csv(filename, index=False)



In [6]:
def get_algorithms(X):
    return {
        "KMeans": KMeans(n_clusters=NUM_CLASSES, random_state=42),
        "MiniBatchKMeans": MiniBatchKMeans(n_clusters=NUM_CLASSES),
        "KMedoids": KMedoids(n_clusters=NUM_CLASSES),
        "Agglomerative_Ward": AgglomerativeClustering(
            n_clusters=NUM_CLASSES, linkage="ward"
        ),
        "Agglomerative_Complete": AgglomerativeClustering(
            n_clusters=NUM_CLASSES, linkage="complete"
        ),
        "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
        "OPTICS": OPTICS(min_samples=5),
        "HDBSCAN": hdbscan.HDBSCAN(min_cluster_size=10),
        "GMM": GaussianMixture(n_components=NUM_CLASSES),
        # "Spectral": SpectralClustering(
        #     n_clusters=NUM_CLASSES, assign_labels="kmeans"
        # ),
        "BIRCH": Birch(n_clusters=NUM_CLASSES),
        "AffinityPropagation": AffinityPropagation(),
        "MeanShift": MeanShift(),
        "Spectral": SpectralClustering(
            n_clusters=NUM_CLASSES, assign_labels="kmeans"
        )
    }


In [6]:
NUM_CLASSES = 3
MODEL_NAME = "EfficientNetB0"   # change per cell

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy")

algorithms = get_algorithms(X)

results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Fit & predict
        if algo_name == "GMM":
            labels = algo.fit(X).predict(X)
        else:
            labels = algo.fit_predict(X)

        # Save only labels CSV
        save_cluster_labels_csv(
    MODEL_NAME,
    algo_name,
    image_names,
    original_labels,   # ← ADD THIS
    labels
)


        # Compute label change %
        change = compute_label_change(original_labels_enc, labels)

        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        print(f"   ✗ Failed: {str(e)}")
        continue


# Append results safely
results_df = pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
)

results_df.to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing EfficientNetB0 =====
→ Implementing KMeans ...
   ✓ Done | Label change: 92.67%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 43.99%
→ Implementing KMedoids ...
   ✓ Done | Label change: 86.95%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 89.65%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 66.83%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 54.95%
→ Implementing OPTICS ...
   ✓ Done | Label change: 99.90%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 87.34%
→ Implementing GMM ...
   ✗ Failed: Fitting the mixture model failed because some components have ill-defined empirical covariance (for instance caused by singleton or collapsed samples). Try to decrease the number of components, increase reg_covar, or scale the input data. The numerical accuracy can also be improved by passing float64 data instead of float32.
→ Implementing Spectral ...
   ✓ Done | Label change: 62.76%
→ Implementing BIRCH ...




   ✓ Done | Label change: 99.98%
→ Implementing MeanShift ...
   ✓ Done | Label change: 92.67%


In [7]:
NUM_CLASSES = 3
MODEL_NAME = "ResNet152"   # change per cell

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy")

algorithms = get_algorithms(X)

results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Fit & predict
        if algo_name == "GMM":
            labels = algo.fit(X).predict(X)
        else:
            labels = algo.fit_predict(X)

        # Save only labels CSV
        save_cluster_labels_csv(
    MODEL_NAME,
    algo_name,
    image_names,
    original_labels,   # ← ADD THIS
    labels
)


        # Compute label change %
        change = compute_label_change(original_labels_enc, labels)

        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        print(f"   ✗ Failed: {str(e)}")
        continue


# Append results safely
results_df = pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
)

results_df.to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing ResNet152 =====
→ Implementing KMeans ...


  return fit_method(estimator, *args, **kwargs)


   ✓ Done | Label change: 68.90%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 68.90%
→ Implementing KMedoids ...




   ✓ Done | Label change: 68.90%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 68.88%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 68.88%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 68.90%
→ Implementing OPTICS ...
   ✓ Done | Label change: 68.90%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 100.00%
→ Implementing GMM ...


  return fit_method(estimator, *args, **kwargs)


   ✓ Done | Label change: 68.90%
→ Implementing Spectral ...
   ✓ Done | Label change: 68.94%
→ Implementing BIRCH ...




   ✓ Done | Label change: 68.90%
→ Implementing AffinityPropagation ...




   ✓ Done | Label change: 68.90%
→ Implementing MeanShift ...
   ✓ Done | Label change: 68.90%


In [8]:
NUM_CLASSES = 3
MODEL_NAME = "MobileNetV3"   # change per cell

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy")

algorithms = get_algorithms(X)

results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Fit & predict
        if algo_name == "GMM":
            labels = algo.fit(X).predict(X)
        else:
            labels = algo.fit_predict(X)

        # Save only labels CSV
        save_cluster_labels_csv(
    MODEL_NAME,
    algo_name,
    image_names,
    original_labels,   # ← ADD THIS
    labels
)


        # Compute label change %
        change = compute_label_change(original_labels_enc, labels)

        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        print(f"   ✗ Failed: {str(e)}")
        continue


# Append results safely
results_df = pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
)

results_df.to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing MobileNetV3 =====
→ Implementing KMeans ...
   ✓ Done | Label change: 57.69%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 66.41%
→ Implementing KMedoids ...
   ✓ Done | Label change: 60.14%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 69.99%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 58.09%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 68.90%
→ Implementing OPTICS ...
   ✓ Done | Label change: 99.85%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 69.38%
→ Implementing GMM ...
   ✓ Done | Label change: 69.43%
→ Implementing Spectral ...
   ✓ Done | Label change: 51.38%
→ Implementing BIRCH ...
   ✓ Done | Label change: 68.90%
→ Implementing AffinityPropagation ...




   ✓ Done | Label change: 99.98%
→ Implementing MeanShift ...
   ✓ Done | Label change: 75.69%


In [9]:
NUM_CLASSES = 3
MODEL_NAME = "VGG19"   # change per cell

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy")

algorithms = get_algorithms(X)

results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Fit & predict
        if algo_name == "GMM":
            labels = algo.fit(X).predict(X)
        else:
            labels = algo.fit_predict(X)

        # Save only labels CSV
        save_cluster_labels_csv(
    MODEL_NAME,
    algo_name,
    image_names,
    original_labels,   # ← ADD THIS
    labels
)


        # Compute label change %
        change = compute_label_change(original_labels_enc, labels)

        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        print(f"   ✗ Failed: {str(e)}")
        continue


# Append results safely
results_df = pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
)

results_df.to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing VGG19 =====
→ Implementing KMeans ...


  return fit_method(estimator, *args, **kwargs)


   ✓ Done | Label change: 68.90%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 68.90%
→ Implementing KMedoids ...




   ✓ Done | Label change: 68.90%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 68.88%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 68.88%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 68.90%
→ Implementing OPTICS ...
   ✓ Done | Label change: 68.90%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 100.00%
→ Implementing GMM ...


  return fit_method(estimator, *args, **kwargs)


   ✓ Done | Label change: 68.90%
→ Implementing Spectral ...
   ✓ Done | Label change: 65.42%
→ Implementing BIRCH ...




   ✓ Done | Label change: 68.90%
→ Implementing AffinityPropagation ...




   ✓ Done | Label change: 68.90%
→ Implementing MeanShift ...
   ✓ Done | Label change: 68.90%


In [None]:
NUM_CLASSES = 3
MODEL_NAME = "Xception"   # change per cell

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy")

algorithms = get_algorithms(X)

results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Fit & predict
        if algo_name == "GMM":
            labels = algo.fit(X).predict(X)
        else:
            labels = algo.fit_predict(X)

        # Save only labels CSV
        save_cluster_labels_csv(
    MODEL_NAME,
    algo_name,
    image_names,
    original_labels,   # ← ADD THIS
    labels
)


        # Compute label change %
        change = compute_label_change(original_labels_enc, labels)

        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        print(f"   ✗ Failed: {str(e)}")
        continue


# Append results safely
results_df = pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
)

results_df.to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing Xception =====
→ Implementing KMeans ...
   ✓ Done | Label change: 84.58%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 47.40%
→ Implementing KMedoids ...
   ✓ Done | Label change: 39.69%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 46.46%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 65.09%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 87.26%
→ Implementing OPTICS ...
   ✓ Done | Label change: 99.85%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 70.14%
→ Implementing GMM ...
   ✓ Done | Label change: 78.06%
→ Implementing BIRCH ...
   ✓ Done | Label change: 71.16%
→ Implementing AffinityPropagation ...
   ✓ Done | Label change: 99.69%
→ Implementing MeanShift ...
   ✓ Done | Label change: 70.66%
→ Implementing Spectral ...


