In [None]:
import numpy as np
import pandas as pd
import math
import csv
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Load the customer feature table. The file name suggests the features were
# already scaled upstream — TODO confirm against the notebook that produced it.
# NOTE(review): absolute '/content/...' path; adjust when running outside the
# environment this notebook was authored in.
df = pd.read_csv('/content/customer_features_scaled.csv')
# Preview the first 10 rows as the cell output.
df.head(10)

Unnamed: 0,CustomerID,Sum_Quantity,Mean_UnitPrice,Mean_TotalPrice,Sum_TotalPrice,Count_Invoice,Count_Stock,Mean_InvoiceCountPerStock,Mean_StockCountPerInvoice,Mean_UnitPriceMeanPerInvoice,Mean_QuantitySumPerInvoice,Mean_TotalPriceMeanPerInvoice,Mean_TotalPriceSumPerInvoice,Mean_UnitPriceMeanPerStock,Mean_QuantitySumPerStock,Mean_TotalPriceMeanPerStock,Mean_TotalPriceSumPerStock
0,000nan,4.984467,1.506343,-0.268408,4.595722,2.432481,4.773105,2.067962,2.330682,3.09408,0.761335,0.914696,2.100059,2.332104,1.979686,-0.076864,2.39238
1,012346,3.789233,-2.120154,3.800623,3.1617,-1.158633,-2.296006,-1.015864,-2.307499,-2.100457,8.84778,3.719362,7.377665,-2.141668,3.384916,3.763263,3.337067
2,012747,0.879252,0.879583,1.007144,1.397472,1.532615,0.156943,1.957285,-0.614124,0.749921,-0.306106,0.97371,0.485241,1.078802,1.041469,0.667948,1.526344
3,012748,3.038005,-0.106556,-1.068898,2.706127,2.3528,3.983682,1.982065,0.343236,1.872665,-0.238979,-0.080428,-0.71041,-0.049208,0.348309,-0.892187,-0.000881
4,012749,0.982814,1.125208,0.282588,1.379978,0.890138,1.397,0.537992,1.095989,1.056342,0.756329,0.259518,1.527988,1.056653,-0.159892,0.290885,0.329836
5,012820,0.466642,-0.817031,-0.01992,0.296923,0.65687,0.395429,-0.412569,-0.123289,-0.82486,0.189404,-0.105017,-0.181951,-0.935154,0.238579,0.0036,-0.128951
6,012821,-1.233887,-0.235351,-0.060898,-1.725684,-1.158633,-1.361354,-1.015864,-1.051,-0.261189,-0.848867,-0.151699,-1.489691,-0.233985,0.110322,-0.025617,-0.255883
7,012822,0.268811,-0.078187,0.286568,0.302366,-0.198605,0.135925,-0.081936,0.397982,-0.164225,0.676729,0.348066,0.78508,-0.141162,0.261716,0.260265,0.220589
8,012823,-0.366422,1.825216,2.528634,0.774998,0.890138,-2.296006,2.064151,-2.307499,1.669397,-1.28303,2.429015,0.374043,1.836621,2.29929,2.55467,2.796227
9,012824,-0.360106,-0.204132,-0.026684,-0.409951,-1.158633,-0.284543,-1.015864,0.500258,-0.231237,0.478007,-0.117808,0.540661,-0.202481,-0.149025,0.009031,-0.221666


In [None]:
# Build the feature matrix used for PCA and clustering.
# 'CustomerID' is an identifier, not a feature, so it is always removed (a
# missing column still raises KeyError, as before).
# 'Unnamed: 0' appears in the displayed head holding 0, 1, 2, ... — it looks like
# a row counter written out with the CSV rather than a customer feature. Leaving
# it in would feed an unscaled, meaningless dimension into PCA and into the
# distance computations, so it is dropped when present.
X = df.drop(columns=['CustomerID']).drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def run_hierarchical_with_metrics(X, n_clusters, linkage="ward"):
    """
    Fit agglomerative (hierarchical) clustering and score it on all samples.

    Parameters:
      X          : data passed unchanged to the clusterer and to each metric.
      n_clusters : number of clusters requested from AgglomerativeClustering.
      linkage    : linkage criterion forwarded to AgglomerativeClustering.

    Returns:
      labels, n_clusters, silhouette, dbi, ch
      - n_clusters is the requested value, echoed back to the caller.
      - silhouette / dbi / ch are floats, or all None when the labelling is
        degenerate (fewer than 2 clusters, or one cluster per sample).
    """
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    labels = model.fit_predict(X)

    # The three scores are only defined for 2 <= n_labels <= n_samples - 1;
    # scikit-learn raises ValueError outside that range. Checking both bounds
    # keeps a grid search alive when k happens to equal the number of samples.
    n_labels = len(np.unique(labels))
    if 1 < n_labels < len(labels):
        sil = float(silhouette_score(X, labels))
        dbi = float(davies_bouldin_score(X, labels))
        ch = float(calinski_harabasz_score(X, labels))
    else:
        sil, dbi, ch = None, None, None

    return labels, n_clusters, sil, dbi, ch


In [None]:
from sklearn.decomposition import PCA

def grid_pca_hierarchical(
    X_input,
    pca_dims,
    k_values=range(2, 11),
    linkage_list=("ward",),
    random_state=42,
    verbose=True
):
    """
    Grid search: PCA(dim) -> Hierarchical(k, linkage).

    For each PCA dimensionality the input is projected once, then clustered
    with every (linkage, k) combination. The best configuration is the one
    with the highest silhouette; ties are broken by lower DBI, then by higher
    CH. When verbose is True the best configuration is printed at the end.

    Parameters:
      X_input      : data handed to PCA.fit_transform.
      pca_dims     : iterable of n_components values for PCA.
      k_values     : iterable of cluster counts (a one-shot iterator is fine).
      linkage_list : iterable of linkage names (a one-shot iterator is fine).
      random_state : forwarded to PCA.
      verbose      : print per-configuration metrics and the final best row.

    Returns:
      results: list[dict]  (one row per configuration; metrics may be None)
      labels_dict: dict[(pca_dim, linkage, k)] = labels
      best_row: dict, or None when no configuration produced metrics
    """
    # Both collections are re-iterated inside the nested loops below. A
    # generator or iterator would be exhausted after the first pass and every
    # later PCA dimension / linkage would silently be skipped, so take a copy.
    k_values = list(k_values)
    linkage_list = list(linkage_list)

    results = []
    labels_dict = {}
    best_row = None
    best_key = None  # (silhouette, -dbi, ch) of best_row; larger is better

    for dim in pca_dims:
        pca = PCA(n_components=dim, random_state=random_state)
        X_pca = pca.fit_transform(X_input)
        explained_var = float(pca.explained_variance_ratio_.sum())

        for linkage in linkage_list:
            if verbose:
                print(f"\nPCA dim={dim} | ExplainedVar={explained_var:.3f} | linkage={linkage}")
                print("-" * 70)

            for k in k_values:
                labels, n_clusters, sil, dbi, ch = run_hierarchical_with_metrics(
                    X_pca, n_clusters=k, linkage=linkage
                )

                labels_dict[(dim, linkage, k)] = labels

                row = {
                    "pca_dim": dim,
                    "explained_variance": explained_var,
                    "method": "hierarchical",
                    "linkage": linkage,
                    "n_clusters": n_clusters,
                    "silhouette": sil,
                    "dbi": dbi,
                    "ch": ch
                }
                results.append(row)

                # Rows without metrics are still recorded above, but they are
                # neither printed nor considered for the best configuration.
                if sil is None or dbi is None or ch is None:
                    continue

                if verbose:
                    print(f"✓ k={k:>2} | Sil={sil:.3f} | DBI={dbi:.3f} | CH={ch:.1f}")

                # Lexicographic ranking: silhouette up, DBI down (negated),
                # CH up. Strict '>' keeps the earliest row on a full tie.
                key = (sil, -dbi, ch)
                if best_key is None or key > best_key:
                    best_row, best_key = row, key

    if verbose:
        print("\nBEST CONFIG (Hierarchical)")
        print("=" * 70)
        if best_row is None:
            print("Không có cấu hình hợp lệ để tính metric.")
        else:
            print(
                f"PCA dim={best_row['pca_dim']} | ExplainedVar={best_row['explained_variance']:.3f} | "
                f"linkage={best_row['linkage']} | k={best_row['n_clusters']} | "
                f"Sil={best_row['silhouette']:.3f} | DBI={best_row['dbi']:.3f} | CH={best_row['ch']:.1f}"
            )

    return results, labels_dict, best_row


In [None]:
# PCA dimensionalities and cluster counts to sweep.
pca_dims = [2, 3, 4, 5, 6, 8, 10]
k_values = range(2, 11)

# Run the PCA -> hierarchical grid over four linkage criteria. Keeps every
# result row, the labels per (pca_dim, linkage, k), and the best row.
hier_results, hier_labels, best_hier = grid_pca_hierarchical(
    X_input=X,                 # or X_scaled
    pca_dims=pca_dims,
    k_values=k_values,
    linkage_list=("ward", "average", "complete", "single"),
    verbose=True
)



PCA dim=2 | ExplainedVar=0.664 | linkage=ward
----------------------------------------------------------------------
✓ k= 2 | Sil=0.297 | DBI=1.287 | CH=1719.2
✓ k= 3 | Sil=0.339 | DBI=0.937 | CH=2129.4
✓ k= 4 | Sil=0.333 | DBI=0.990 | CH=2175.6
✓ k= 5 | Sil=0.286 | DBI=1.000 | CH=1990.2
✓ k= 6 | Sil=0.298 | DBI=0.943 | CH=1915.3
✓ k= 7 | Sil=0.288 | DBI=1.021 | CH=1941.8
✓ k= 8 | Sil=0.284 | DBI=1.007 | CH=1992.0
✓ k= 9 | Sil=0.287 | DBI=0.966 | CH=1948.2
✓ k=10 | Sil=0.293 | DBI=0.930 | CH=1913.6

PCA dim=2 | ExplainedVar=0.664 | linkage=average
----------------------------------------------------------------------
✓ k= 2 | Sil=0.517 | DBI=0.596 | CH=230.5
✓ k= 3 | Sil=0.391 | DBI=0.576 | CH=146.9
✓ k= 4 | Sil=0.314 | DBI=0.845 | CH=366.3
✓ k= 5 | Sil=0.267 | DBI=0.721 | CH=349.9
✓ k= 6 | Sil=0.257 | DBI=0.685 | CH=285.3
✓ k= 7 | Sil=0.228 | DBI=0.694 | CH=240.9
✓ k= 8 | Sil=0.249 | DBI=0.783 | CH=575.9
✓ k= 9 | Sil=0.239 | DBI=0.787 | CH=510.1
✓ k=10 | Sil=0.237 | DBI=0.769 | CH=45