In [None]:
!pip install scikit-learn-extra


In [2]:
!pip install hdbscan


Defaulting to user installation because normal site-packages is not writeable
Collecting hdbscan
  Downloading hdbscan-0.8.41-cp313-cp313-win_amd64.whl.metadata (15 kB)
Downloading hdbscan-0.8.41-cp313-cp313-win_amd64.whl (671 kB)
   ---------------------------------------- 0.0/671.7 kB ? eta -:--:--
   ---------------------------------------- 671.7/671.7 kB 5.5 MB/s eta 0:00:00
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.41



[notice] A new release of pip is available: 24.3.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
!pip install scikit-fuzzy


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
   ---------------------------------------- 0.0/920.8 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/920.8 kB ? eta -:--:--
   ---------------------------------------- 920.8/920.8 kB 2.6 MB/s eta 0:00:00
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0



[notice] A new release of pip is available: 24.3.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import numpy as np

from sklearn.cluster import (
    KMeans,
    MiniBatchKMeans,
    AgglomerativeClustering,
    SpectralClustering,
    MeanShift,
    AffinityPropagation,
    DBSCAN,
    OPTICS,
    Birch,
    BisectingKMeans
)

from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.preprocessing import normalize

from sklearn_extra.cluster import KMedoids
import hdbscan
import skfuzzy as fuzz


In [2]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Label table; the columns read below are "image_name" and "class".
original_df = pd.read_csv("BreaKHis/labels.csv")

image_names = original_df.loc[:, "image_name"].values
original_labels = original_df.loc[:, "class"].values

# Integer-encode the class names. `le` is kept at module level so the
# mapping between class names and integer codes stays available.
le = LabelEncoder()
original_labels_enc = le.fit(original_labels).transform(original_labels)

In [3]:
def compute_label_change(original, clustered):
    """
    Percentage of samples whose cluster label differs from the original label.

    Parameters
    ----------
    original : array-like
        Reference labels, one per sample.
    clustered : array-like
        Labels produced by a clustering algorithm; must have the same shape
        as ``original``.

    Returns
    -------
    float
        ``100 * (number of positions where the labels differ) / (number of samples)``.

    Raises
    ------
    ValueError
        If the two inputs have different shapes or are empty.

    Notes
    -----
    The comparison is a plain elementwise ``!=`` on the raw label ids. Cluster
    ids are assigned arbitrarily by clustering algorithms, so this value is not
    invariant to a relabelling of the clusters (for two clusters, swapping the
    ids turns p% into (100 - p)%).
    NOTE(review): confirm that an unaligned comparison is what is intended.
    """
    # Coerce to arrays: with plain Python lists ``original != clustered``
    # would evaluate to a single bool instead of an elementwise mask.
    original = np.asarray(original)
    clustered = np.asarray(clustered)

    if original.shape != clustered.shape:
        raise ValueError(
            f"Label arrays must have the same shape, got "
            f"{original.shape} and {clustered.shape}"
        )
    if original.size == 0:
        raise ValueError("Cannot compute label change for empty label arrays")

    return float(np.sum(original != clustered) / original.size * 100)


def save_cluster_labels_csv(model, algo, image_names, original_labels, labels):
    """
    Write the per-image labels of one (model, algorithm) run to a CSV file.

    The file is written to ``clustering_labels/<model>_<algo>.csv`` (the
    directory is created if missing) with three columns, in this order:
    ``image_name``, ``original_label`` and ``cluster_label``.

    Parameters
    ----------
    model : str
        Name used as the first part of the file name.
    algo : str
        Name used as the second part of the file name.
    image_names, original_labels, labels : sequence
        Column values; passed to ``pandas.DataFrame`` as-is.

    Returns
    -------
    None
    """
    out_dir = "clustering_labels"
    os.makedirs(out_dir, exist_ok=True)

    # Build the columns one at a time; insertion order fixes the CSV layout.
    columns = {}
    columns["image_name"] = image_names
    columns["original_label"] = original_labels  # labels as given by the caller
    columns["cluster_label"] = labels            # cluster assignment

    pd.DataFrame(columns).to_csv(
        f"clustering_labels/{model}_{algo}.csv",
        index=False,
    )



In [7]:


def get_algorithms(X, num_classes=None):
    """
    Build the collection of clustering algorithms to run on a feature matrix.

    Parameters
    ----------
    X : array-like
        Feature matrix. It is only used here to pre-compute a normalized copy
        (``sklearn.preprocessing.normalize`` with default settings) that the
        callable entries cluster instead of the raw features.
    num_classes : int, optional
        Number of clusters / mixture components requested from every
        algorithm that takes one. Defaults to the module-level
        ``NUM_CLASSES``, which must then be defined before this is called.

    Returns
    -------
    dict
        Maps an algorithm name to either
        * an unfitted estimator (to be used via ``fit_predict``), or
        * a one-argument callable returning hard labels. The callables ignore
          their argument and operate on the normalized copy of ``X`` captured
          when this function was called.
    """
    k = NUM_CLASSES if num_classes is None else num_classes

    # Normalize once for the spherical / fuzzy methods
    X_norm = normalize(X)

    return {

        # ================= PARTITION-BASED =================
        "KMeans": KMeans(
            n_clusters=k,
            n_init=10,
            random_state=42
        ),

        "MiniBatchKMeans": MiniBatchKMeans(
            n_clusters=k,
            n_init=10,
            random_state=42
        ),

        # NOTE: sklearn-extra is unstable with NumPy>=2
        # Use only if environment is fixed
        "KMedoids_PAM": KMedoids(
            n_clusters=k,
            method="pam",
            random_state=42
        ),

        "BisectingKMeans": BisectingKMeans(
            n_clusters=k,
            random_state=42
        ),

        # ================= SPHERICAL / FUZZY =================
        # Callable by design; the argument is ignored and the pre-normalized
        # features captured above are clustered instead.

        "SphericalKMeans": lambda _X: KMeans(
            n_clusters=k,
            n_init=10,
            random_state=42
        ).fit_predict(X_norm),

        # cmeans expects data as (features, samples), hence the transpose.
        # Element [1] of its result is the membership matrix; argmax over the
        # cluster axis turns it into hard labels.
        # seed=42 makes the random initialization reproducible, in line with
        # random_state=42 on the other stochastic algorithms.
        "FuzzyCMeans": lambda _X: np.argmax(
            fuzz.cluster.cmeans(
                X_norm.T,
                c=k,
                m=2.0,
                error=0.005,
                maxiter=1000,
                init=None,
                seed=42
            )[1],
            axis=0
        ),

        # ================= HIERARCHICAL =================
        "Agglomerative_Single": AgglomerativeClustering(
            n_clusters=k,
            linkage="single"
        ),

        "Agglomerative_Complete": AgglomerativeClustering(
            n_clusters=k,
            linkage="complete"
        ),

        "Agglomerative_Average": AgglomerativeClustering(
            n_clusters=k,
            linkage="average"
        ),

        # Ward requires Euclidean distance (OK for scaled CNN features)
        "Agglomerative_Ward": AgglomerativeClustering(
            n_clusters=k,
            linkage="ward"
        ),

        # ================= DENSITY-BASED =================
        # These do not take a cluster count; they may return any number of
        # clusters and use -1 for noise points.
        # Relaxed eps to avoid single-cluster collapse
        "DBSCAN": DBSCAN(
            eps=0.7,
            min_samples=5
        ),

        "OPTICS": OPTICS(
            min_samples=5
        ),

        "HDBSCAN": hdbscan.HDBSCAN(
            min_cluster_size=max(5, k * 2)
        ),

        # ================= MODEL-BASED =================
        # Diagonal covariance + reg for stability
        "GMM": GaussianMixture(
            n_components=k,
            covariance_type="diag",
            reg_covar=1e-3,
            max_iter=500,
            random_state=42
        ),

        "BayesianGMM": BayesianGaussianMixture(
            n_components=k,
            covariance_type="diag",
            reg_covar=1e-3,
            max_iter=500,
            random_state=42
        ),

        # ================= LARGE-SCALE =================
        "BIRCH": Birch(
            n_clusters=k,
            threshold=0.5
        ),
    }


In [8]:
import os


In [9]:
# Cluster the EfficientNetB0 features with every algorithm from
# get_algorithms(), save the per-image assignments, and append one summary
# row per successful algorithm to clustering_results.csv.
NUM_CLASSES = 2
MODEL_NAME = "EfficientNetB0"

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy").astype(np.float64)

algorithms = get_algorithms(X)
results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Case 1: callable entries (Fuzzy, Spherical)
        if callable(algo):
            labels = algo(X)

        # Case 2: estimators that expose predict() but no fit_predict()
        elif hasattr(algo, "predict") and not hasattr(algo, "fit_predict"):
            algo.fit(X)
            labels = algo.predict(X)

        # Case 3: estimators with fit_predict()
        else:
            labels = algo.fit_predict(X)

        labels = np.asarray(labels)

        # Skip degenerate clustering. The label -1 marks noise points in the
        # density-based methods and is not a cluster, so it is excluded from
        # the count (otherwise "one cluster + noise" would pass as two).
        n_clusters_found = len(np.unique(labels[labels != -1]))
        if n_clusters_found < 2:
            raise ValueError("Only one cluster formed")

        save_cluster_labels_csv(
            MODEL_NAME,
            algo_name,
            image_names,
            original_labels,
            labels
        )

        change = compute_label_change(original_labels_enc, labels)
        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        # Best effort: report the failure and move on to the next algorithm.
        print(f"   ✗ Failed: {str(e)}")


# Append mode: re-running this cell adds the same rows again. The header is
# written only when the file does not exist yet.
pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
).to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing EfficientNetB0 =====
→ Implementing KMeans ...
   ✓ Done | Label change: 38.25%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 83.40%
→ Implementing KMedoids_PAM ...
   ✓ Done | Label change: 9.55%
→ Implementing BisectingKMeans ...
   ✓ Done | Label change: 53.51%
→ Implementing SphericalKMeans ...
   ✓ Done | Label change: 82.54%
→ Implementing FuzzyCMeans ...
   ✓ Done | Label change: 82.44%
→ Implementing Agglomerative_Single ...
   ✓ Done | Label change: 68.66%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 68.81%
→ Implementing Agglomerative_Average ...
   ✓ Done | Label change: 69.02%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 91.40%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 86.71%
→ Implementing OPTICS ...
   ✓ Done | Label change: 99.80%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 68.68%
→ Implementing GMM ...
   ✓ Done | Label change: 79.26%
→ Implementing BayesianGMM ...
   ✓ Done | 

In [10]:
# Cluster the MobileNetV3 features with every algorithm from
# get_algorithms(), save the per-image assignments, and append one summary
# row per successful algorithm to clustering_results.csv.
NUM_CLASSES = 2
MODEL_NAME = "MobileNetV3"

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy").astype(np.float64)

algorithms = get_algorithms(X)
results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Case 1: callable entries (Fuzzy, Spherical)
        if callable(algo):
            labels = algo(X)

        # Case 2: estimators that expose predict() but no fit_predict()
        elif hasattr(algo, "predict") and not hasattr(algo, "fit_predict"):
            algo.fit(X)
            labels = algo.predict(X)

        # Case 3: estimators with fit_predict()
        else:
            labels = algo.fit_predict(X)

        labels = np.asarray(labels)

        # Skip degenerate clustering. The label -1 marks noise points in the
        # density-based methods and is not a cluster, so it is excluded from
        # the count (otherwise "one cluster + noise" would pass as two).
        n_clusters_found = len(np.unique(labels[labels != -1]))
        if n_clusters_found < 2:
            raise ValueError("Only one cluster formed")

        save_cluster_labels_csv(
            MODEL_NAME,
            algo_name,
            image_names,
            original_labels,
            labels
        )

        change = compute_label_change(original_labels_enc, labels)
        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        # Best effort: report the failure and move on to the next algorithm.
        print(f"   ✗ Failed: {str(e)}")


# Append mode: re-running this cell adds the same rows again. The header is
# written only when the file does not exist yet.
pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
).to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing MobileNetV3 =====
→ Implementing KMeans ...
   ✓ Done | Label change: 70.25%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 70.36%
→ Implementing KMedoids_PAM ...
   ✓ Done | Label change: 69.68%
→ Implementing BisectingKMeans ...
   ✓ Done | Label change: 73.74%
→ Implementing SphericalKMeans ...
   ✓ Done | Label change: 69.92%
→ Implementing FuzzyCMeans ...
   ✓ Done | Label change: 69.77%
→ Implementing Agglomerative_Single ...
   ✓ Done | Label change: 68.63%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 35.31%
→ Implementing Agglomerative_Average ...
   ✓ Done | Label change: 69.01%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 76.61%
→ Implementing DBSCAN ...
   ✗ Failed: Only one cluster formed
→ Implementing OPTICS ...
   ✓ Done | Label change: 99.90%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 69.01%
→ Implementing GMM ...
   ✗ Failed: Only one cluster formed
→ Implementing BayesianGMM ...
   ✓ D



In [11]:
# Cluster the Xception features with every algorithm from
# get_algorithms(), save the per-image assignments, and append one summary
# row per successful algorithm to clustering_results.csv.
NUM_CLASSES = 2
MODEL_NAME = "Xception"

print(f"\n===== Processing {MODEL_NAME} =====")

X = np.load(f"{MODEL_NAME}_features.npy").astype(np.float64)

algorithms = get_algorithms(X)
results = []

for algo_name, algo in algorithms.items():
    print(f"→ Implementing {algo_name} ...")

    try:
        # Case 1: callable entries (Fuzzy, Spherical)
        if callable(algo):
            labels = algo(X)

        # Case 2: estimators that expose predict() but no fit_predict()
        elif hasattr(algo, "predict") and not hasattr(algo, "fit_predict"):
            algo.fit(X)
            labels = algo.predict(X)

        # Case 3: estimators with fit_predict()
        else:
            labels = algo.fit_predict(X)

        labels = np.asarray(labels)

        # Skip degenerate clustering. The label -1 marks noise points in the
        # density-based methods and is not a cluster, so it is excluded from
        # the count (otherwise "one cluster + noise" would pass as two).
        n_clusters_found = len(np.unique(labels[labels != -1]))
        if n_clusters_found < 2:
            raise ValueError("Only one cluster formed")

        save_cluster_labels_csv(
            MODEL_NAME,
            algo_name,
            image_names,
            original_labels,
            labels
        )

        change = compute_label_change(original_labels_enc, labels)
        results.append([MODEL_NAME, algo_name, change])

        print(f"   ✓ Done | Label change: {change:.2f}%")

    except Exception as e:
        # Best effort: report the failure and move on to the next algorithm.
        print(f"   ✗ Failed: {str(e)}")

# Append mode: re-running this cell adds the same rows again. The header is
# written only when the file does not exist yet.
pd.DataFrame(
    results,
    columns=["model", "clustering", "label_percentage_change"]
).to_csv(
    "clustering_results.csv",
    mode="a",
    header=not os.path.exists("clustering_results.csv"),
    index=False
)



===== Processing Xception =====
→ Implementing KMeans ...
   ✓ Done | Label change: 94.08%
→ Implementing MiniBatchKMeans ...
   ✓ Done | Label change: 6.01%
→ Implementing KMedoids_PAM ...
   ✓ Done | Label change: 93.61%
→ Implementing BisectingKMeans ...
   ✓ Done | Label change: 5.92%
→ Implementing SphericalKMeans ...
   ✓ Done | Label change: 90.19%
→ Implementing FuzzyCMeans ...
   ✓ Done | Label change: 88.91%
→ Implementing Agglomerative_Single ...
   ✓ Done | Label change: 31.33%
→ Implementing Agglomerative_Complete ...
   ✓ Done | Label change: 68.76%
→ Implementing Agglomerative_Average ...
   ✓ Done | Label change: 68.76%
→ Implementing Agglomerative_Ward ...
   ✓ Done | Label change: 95.73%
→ Implementing DBSCAN ...
   ✓ Done | Label change: 99.79%
→ Implementing OPTICS ...
   ✓ Done | Label change: 99.91%
→ Implementing HDBSCAN ...
   ✓ Done | Label change: 99.94%
→ Implementing GMM ...
   ✓ Done | Label change: 95.12%
→ Implementing BayesianGMM ...
   ✓ Done | Label c

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score

# ============================
# LOAD FEATURES
# ============================

X_eff = np.load("EfficientNetB0_features.npy")
X_xcep = np.load("Xception_features.npy")

# ============================
# LOAD CLUSTER LABEL FILES
# ============================

# Each frame is read from a previously saved label CSV; only its
# "cluster_label" column is used for scoring.
eff_kmeans_df = pd.read_csv("clustering_labels/EfficientNetB0_KMeans.csv")
labels_eff = eff_kmeans_df.cluster_label.values

xcep_birch_df = pd.read_csv("clustering_labels/Xception_BIRCH.csv")
labels_xcep = xcep_birch_df.cluster_label.values

# ============================
# REMOVE NOISE IF PRESENT (-1)
# ============================

def compute_silhouette(X, labels, name):
    """
    Print and return the silhouette score of a clustering, ignoring noise.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample.
    labels : array-like
        Cluster label per sample; the value -1 is treated as noise and the
        corresponding rows are dropped before scoring.
    name : str
        Prefix used in the printed messages.

    Returns
    -------
    float or None
        The value returned by ``silhouette_score`` on the non-noise samples,
        or ``None`` (after printing the reason) when fewer than two distinct
        labels are present before or after removing noise.

    Raises
    ------
    ValueError
        If ``X`` and ``labels`` do not contain the same number of samples.
    """
    # Coerce to arrays so the boolean mask below is elementwise even when
    # plain Python lists are passed in.
    X = np.asarray(X)
    labels = np.asarray(labels)

    if len(X) != len(labels):
        raise ValueError(
            f"{name}: X has {len(X)} samples but labels has {len(labels)}"
        )

    if len(np.unique(labels)) < 2:
        print(f"{name}: Cannot compute silhouette (only one cluster)")
        return None

    # Remove noise label -1 if it exists
    keep = labels != -1
    X_clean = X[keep]
    labels_clean = labels[keep]

    if len(np.unique(labels_clean)) < 2:
        print(f"{name}: Not enough valid clusters after removing noise")
        return None

    score = silhouette_score(X_clean, labels_clean)
    print(f"{name} Silhouette Score: {score:.4f}")
    return score


# ============================
# COMPUTE SCORES
# ============================

# NOTE(review): the "Minimum Change" / "Maximum Change" titles are hard-coded;
# confirm these model/algorithm pairs really are the extremes recorded in
# clustering_results.csv.
score_min = compute_silhouette(
    X=X_eff,
    labels=labels_eff,
    name="Minimum Change (EfficientNetB0 + KMeans)",
)

score_max = compute_silhouette(
    X=X_xcep,
    labels=labels_xcep,
    name="Maximum Change (Xception + Birch)",
)


Minimum Change (EfficientNetB0 + KMeans) Silhouette Score: 0.6273
Maximum Change (Xception + Birch) Silhouette Score: 0.3694
