In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Read the customer feature table from CSV and preview its first ten rows.
# NOTE(review): the file name suggests the features are already scaled, and the
# absolute '/content/...' path is presumably a Colab runtime location — confirm
# both before running this notebook in another environment.
csv_path = '/content/customer_features_scaled.csv'
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,CustomerID,Sum_Quantity,Mean_UnitPrice,Mean_TotalPrice,Sum_TotalPrice,Count_Invoice,Count_Stock,Mean_InvoiceCountPerStock,Mean_StockCountPerInvoice,Mean_UnitPriceMeanPerInvoice,Mean_QuantitySumPerInvoice,Mean_TotalPriceMeanPerInvoice,Mean_TotalPriceSumPerInvoice,Mean_UnitPriceMeanPerStock,Mean_QuantitySumPerStock,Mean_TotalPriceMeanPerStock,Mean_TotalPriceSumPerStock
0,000nan,4.984467,1.506343,-0.268408,4.595722,2.432481,4.773105,2.067962,2.330682,3.09408,0.761335,0.914696,2.100059,2.332104,1.979686,-0.076864,2.39238
1,012346,3.789233,-2.120154,3.800623,3.1617,-1.158633,-2.296006,-1.015864,-2.307499,-2.100457,8.84778,3.719362,7.377665,-2.141668,3.384916,3.763263,3.337067
2,012747,0.879252,0.879583,1.007144,1.397472,1.532615,0.156943,1.957285,-0.614124,0.749921,-0.306106,0.97371,0.485241,1.078802,1.041469,0.667948,1.526344
3,012748,3.038005,-0.106556,-1.068898,2.706127,2.3528,3.983682,1.982065,0.343236,1.872665,-0.238979,-0.080428,-0.71041,-0.049208,0.348309,-0.892187,-0.000881
4,012749,0.982814,1.125208,0.282588,1.379978,0.890138,1.397,0.537992,1.095989,1.056342,0.756329,0.259518,1.527988,1.056653,-0.159892,0.290885,0.329836
5,012820,0.466642,-0.817031,-0.01992,0.296923,0.65687,0.395429,-0.412569,-0.123289,-0.82486,0.189404,-0.105017,-0.181951,-0.935154,0.238579,0.0036,-0.128951
6,012821,-1.233887,-0.235351,-0.060898,-1.725684,-1.158633,-1.361354,-1.015864,-1.051,-0.261189,-0.848867,-0.151699,-1.489691,-0.233985,0.110322,-0.025617,-0.255883
7,012822,0.268811,-0.078187,0.286568,0.302366,-0.198605,0.135925,-0.081936,0.397982,-0.164225,0.676729,0.348066,0.78508,-0.141162,0.261716,0.260265,0.220589
8,012823,-0.366422,1.825216,2.528634,0.774998,0.890138,-2.296006,2.064151,-2.307499,1.669397,-1.28303,2.429015,0.374043,1.836621,2.29929,2.55467,2.796227
9,012824,-0.360106,-0.204132,-0.026684,-0.409951,-1.158633,-0.284543,-1.015864,0.500258,-0.231237,0.478007,-0.117808,0.540661,-0.202481,-0.149025,0.009031,-0.221666


In [None]:
# Feature matrix for clustering: every column of df except the customer identifier.
# NOTE(review): the rendered preview of df lists an 'Unnamed: 0' header. If that is a
# real column (e.g. an index saved into the CSV) rather than a display artifact, it is
# kept as a feature here — confirm against df.columns.
X = df.drop('CustomerID', axis=1)

In [None]:
# K-Means grid search over PCA dimensionality and number of clusters.
#
# For each PCA dimensionality in pca_dims, X is projected with PCA and K-Means is
# fitted for every k in K_range. Silhouette, Davies-Bouldin and Calinski-Harabasz
# scores are printed per k, and one summary record per dimensionality is appended
# to `results`.

# PCA dimensionalities to try
pca_dims = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12]

# Range of k values for K-Means
K_range = range(2, 11)

# Accumulates one summary dict per PCA dimensionality
results = []

print("PCA + K-Means Grid Search")
print("=" * 60)

for n_comp in pca_dims:
    print(f"\nPCA with n_components = {n_comp}")
    print("-" * 40)

    # Project X onto n_comp principal components
    pca = PCA(n_components=n_comp, random_state=42)
    X_pca = pca.fit_transform(X)

    # Total fraction of variance retained by the kept components
    explained_var = pca.explained_variance_ratio_.sum()

    # Per-k metric traces for this projection (same order as K_range)
    inertias, silhouettes, dbis, chs = [], [], [], []

    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
        labels = kmeans.fit_predict(X_pca)

        # Score the clustering in the PCA-projected space
        sil_k = silhouette_score(X_pca, labels)
        dbi_k = davies_bouldin_score(X_pca, labels)
        ch_k = calinski_harabasz_score(X_pca, labels)

        inertias.append(kmeans.inertia_)
        silhouettes.append(sil_k)
        dbis.append(dbi_k)
        chs.append(ch_k)

        print(
            f"k={k:2d} | "
            f"Sil={sil_k:.3f} | "
            f"DBI={dbi_k:.3f} | "
            f"CH={ch_k:.1f}"
        )

    # Optimal k by Silhouette: the k whose silhouette score is highest
    # (np.argmax returns the first position on ties)
    optimal_k = K_range[np.argmax(silhouettes)]

    print(
        f"→ Optimal k (Silhouette) = {optimal_k} "
        f"| Explained variance = {explained_var:.3f}"
    )

    # Store the summary. Each "best_*" value is the best over all k for that
    # metric on its own, so best_dbi / best_ch may come from a k other than optimal_k.
    summary = {
        "pca_dim": n_comp,
        "explained_variance": explained_var,
        "optimal_k": optimal_k,
        "best_silhouette": max(silhouettes),
        "best_dbi": min(dbis),
        "best_ch": max(chs)
    }
    results.append(summary)


PCA + K-Means Grid Search

PCA with n_components = 2
----------------------------------------
k= 2 | Sil=0.310 | DBI=1.233 | CH=1850.8
k= 3 | Sil=0.375 | DBI=0.912 | CH=2416.1
k= 4 | Sil=0.381 | DBI=0.865 | CH=2454.3
k= 5 | Sil=0.348 | DBI=0.902 | CH=2403.9
k= 6 | Sil=0.341 | DBI=0.950 | CH=2324.9
k= 7 | Sil=0.351 | DBI=0.882 | CH=2339.5
k= 8 | Sil=0.330 | DBI=0.915 | CH=2329.5
k= 9 | Sil=0.338 | DBI=0.870 | CH=2278.8
k=10 | Sil=0.340 | DBI=0.866 | CH=2265.4
→ Optimal k (Silhouette) = 4 | Explained variance = 0.664

PCA with n_components = 3
----------------------------------------
k= 2 | Sil=0.243 | DBI=1.523 | CH=1294.1
k= 3 | Sil=0.285 | DBI=1.226 | CH=1460.5
k= 4 | Sil=0.294 | DBI=1.139 | CH=1428.7
k= 5 | Sil=0.290 | DBI=1.098 | CH=1397.8
k= 6 | Sil=0.250 | DBI=1.112 | CH=1313.5
k= 7 | Sil=0.248 | DBI=1.185 | CH=1244.9
k= 8 | Sil=0.250 | DBI=1.162 | CH=1200.6
k= 9 | Sil=0.247 | DBI=1.125 | CH=1166.4
k=10 | Sil=0.247 | DBI=1.099 | CH=1129.7
→ Optimal k (Silhouette) = 4 | Explained v