<a href="https://colab.research.google.com/github/muajnstu/Bank-Marketing-using-rough-set-approach/blob/main/Data_Analysis_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install k_means_constrained

In [None]:
import pandas as pd
import numpy as np
from k_means_constrained import KMeansConstrained
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from google.colab import files

In [None]:
# Location of the Universal Bank CSV; pandas reads it straight from the URL.
universal_bank_dataset = "https://media.githubusercontent.com/media/shahriariit/opendataset/refs/heads/master/UniversalBank.csv"
df = pd.read_csv(filepath_or_buffer=universal_bank_dataset)

In [None]:
# Target column on one side, every remaining column as features on the other.
y = df['Personal Loan']
X = df.drop(columns=['Personal Loan'])

# Quick summary: overall frame shape and the number of rows per class.
print(f"Dataset shape   : {df.shape}")
print(f"Class 0 samples : {y.eq(0).sum()}")
print(f"Class 1 samples : {y.eq(1).sum()}")
df.head()

In [None]:
# Partition the features by class; each class is standardised with its own
# scaler, so the scaling statistics come from that class only.
X_0 = X.loc[y == 0].copy()
X_1 = X.loc[y == 1].copy()

scaler_0, scaler_1 = StandardScaler(), StandardScaler()

# Fit on a class, then transform that same class.
X_0_scaled = scaler_0.fit(X_0).transform(X_0)
X_1_scaled = scaler_1.fit(X_1).transform(X_1)

print(f"Class 0 scaled shape : {X_0_scaled.shape}")
print(f"Class 1 scaled shape : {X_1_scaled.shape}")

In [None]:
def show_distribution(labels, title, X_data=None):
    """Print a per-cluster size report and return the cluster sizes.

    Parameters
    ----------
    labels : array-like of non-negative ints
        Cluster label of every sample. May be empty.
    title : str
        Heading printed above the report.
    X_data : array-like, optional
        Feature matrix aligned with ``labels``. When supplied, and the labels
        form a partition a silhouette score is defined for, the silhouette
        score is printed as well.

    Returns
    -------
    numpy.ndarray
        ``counts[i]`` is the number of samples carrying label ``i``
        (an empty array when ``labels`` is empty).
    """
    labels = np.asarray(labels)
    rule = "=" * 50
    print(f"\n{rule}")
    print(f"  {title}")
    print(rule)

    if labels.size == 0:
        # min()/max() on an empty count array would raise, so report and stop.
        print("  (no samples)")
        return np.zeros(0, dtype=int)

    counts = np.bincount(labels)
    total = counts.sum()
    for i, c in enumerate(counts):
        print(f"  Cluster {i:>2}: {c:>5} samples  ({c/total*100:.1f}%)")
    print(f"  Total         : {total}")
    print(f"  Max-Min Diff  : {counts.max() - counts.min()}")
    print(f"  Balance Ratio : {counts.min()/counts.max():.3f}")

    # silhouette_score needs at least 2 distinct labels and fewer distinct
    # labels than samples; skip it quietly outside that range.
    n_distinct = len(np.unique(labels))
    if X_data is not None and 1 < n_distinct < labels.size:
        score = silhouette_score(X_data, labels)
        print(f"  Silhouette    : {score:.4f}")
    return counts


def run_constrained_kmeans(X_data, n_clusters, label, size_min=None, size_max=None):
    """Cluster ``X_data`` with size-constrained k-means and print a summary.

    Parameters
    ----------
    X_data : array-like
        Samples to cluster; ``len(X_data)`` is taken as the sample count.
    n_clusters : int
        Number of clusters to form; must be between 1 and the sample count.
    label : str
        Tag printed in the log line that identifies this run.
    size_min : int, optional
        Smallest allowed cluster size. Defaults to 75% of the even share
        ``n // n_clusters``.
    size_max : int, optional
        Largest allowed cluster size. Defaults to 125% of the even share,
        but never less than ``ceil(n / n_clusters)``.

    Returns
    -------
    tuple
        ``(labels, n_clusters, score)``: the per-sample cluster labels, the
        number of clusters requested, and the silhouette score of the result.

    Raises
    ------
    ValueError
        If ``n_clusters`` is not in ``1..len(X_data)``.
    """
    n = len(X_data)
    if not 1 <= n_clusters <= n:
        raise ValueError(
            f"n_clusters must be between 1 and the number of samples ({n}), got {n_clusters}"
        )

    ideal = n // n_clusters
    if size_min is None:
        size_min = int(ideal * 0.75)
    if size_max is None:
        # The clusters must be able to hold every sample (k * size_max >= n).
        # For small inputs int(ideal * 1.25) truncates back to `ideal`, which
        # is too small whenever n is not a multiple of k; fall back to
        # ceil(n / k), written with integer arithmetic.
        size_max = max(int(ideal * 1.25), -(-n // n_clusters))

    print(f"\n[{label}] n={n} | k={n_clusters} | size_min={size_min} | size_max={size_max}")

    # Fixed seed and restart count keep repeated runs comparable.
    kmeans = KMeansConstrained(
        n_clusters=n_clusters,
        size_min=size_min,
        size_max=size_max,
        random_state=42,
        n_init=10
    )
    labels = kmeans.fit_predict(X_data)

    # minlength keeps one entry per requested cluster even if a trailing
    # cluster ends up empty (possible when size_min is 0).
    counts = np.bincount(labels, minlength=n_clusters)
    score = silhouette_score(X_data, labels)
    balance_ratio = counts.min() / counts.max()

    print(f"  Silhouette Score : {score:.4f}")
    print(f"  Balance Ratio    : {balance_ratio:.3f}")
    for i, c in enumerate(counts):
        print(f"  Cluster {i}: {c} samples ({c/n*100:.1f}%)")

    return labels, n_clusters, score

In [None]:
# First-level split: each class is clustered on its own into two clusters,
# using the helper's default size bounds.
labels_c0, k_c0, score_c0 = run_constrained_kmeans(X_0_scaled, 2, "Class 0 - Initial")
labels_c1, k_c1, score_c1 = run_constrained_kmeans(X_1_scaled, 2, "Class 1 - Initial")

In [None]:
# Size report for the first-level clusters of each class.
show_distribution(labels_c0, "Class 0 — Initial Cluster Distribution", X_0_scaled)
# The trailing ';' stops the returned counts array from being echoed as the
# cell output underneath the printed report.
show_distribution(labels_c1, "Class 1 — Initial Cluster Distribution", X_1_scaled);

In [None]:
# Positions, within the class-0 arrays, of the samples that fell into
# first-level cluster 0.
c0_cluster0_idx = np.where(labels_c0 == 0)[0]
X_c0_cluster0 = X_0_scaled[c0_cluster0_idx]

print(f"Class 0 - Cluster 0 size: {len(c0_cluster0_idx)}")

# Second-level split of that cluster into four sub-clusters.
re_labels_c0_0, re_k_c0_0, re_score_c0_0 = run_constrained_kmeans(
    X_c0_cluster0,
    n_clusters=4,
    label="Class 0 - Cluster 0 Recluster"
)

# Trailing ';' suppresses the echo of the returned counts array.
show_distribution(re_labels_c0_0, "Class 0 — Cluster 0 Sub-clusters", X_c0_cluster0);

In [None]:
# Positions, within the class-0 arrays, of the samples that fell into
# first-level cluster 1.
c0_cluster1_idx = np.where(labels_c0 == 1)[0]
X_c0_cluster1 = X_0_scaled[c0_cluster1_idx]

print(f"Class 0 - Cluster 1 size: {len(c0_cluster1_idx)}")

# Second-level split of that cluster into four sub-clusters.
re_labels_c0_1, re_k_c0_1, re_score_c0_1 = run_constrained_kmeans(
    X_c0_cluster1,
    n_clusters=4,
    label="Class 0 - Cluster 1 Recluster"
)

# Trailing ';' suppresses the echo of the returned counts array.
show_distribution(re_labels_c0_1, "Class 0 — Cluster 1 Sub-clusters", X_c0_cluster1);

In [None]:
# Merge the two second-level clusterings of class 0 into one label space.
# -1 marks "not assigned yet". Note that the micro-clustering cell that follows
# re-initialises both `final_labels_class0` and `cluster_id`.
final_labels_class0 = np.full(len(X_0_scaled), -1, dtype=int)
cluster_id = 0

# Sub-clusters from Cluster 0 keep their ids (0 .. re_k_c0_0 - 1).
final_labels_class0[c0_cluster0_idx] = cluster_id + re_labels_c0_0
cluster_id += re_k_c0_0

# Sub-clusters from Cluster 1 are shifted past the ids already used.
final_labels_class0[c0_cluster1_idx] = cluster_id + re_labels_c0_1
cluster_id += re_k_c0_1

# A surviving -1 would make np.bincount fail with an unhelpful message.
assert (final_labels_class0 >= 0).all(), "some Class 0 samples were never assigned a cluster"

print(f"Total final clusters for Class 0: {cluster_id}")
# Trailing ';' suppresses the echo of the returned counts array.
show_distribution(final_labels_class0, "Class 0 — All Final Clusters Combined", X_0_scaled);

In [None]:
# Store all final micro-cluster labels for Class 0 (-1 = not assigned yet).
final_labels_class0 = np.full(len(X_0_scaled), -1, dtype=int)
cluster_id = 0

# One entry per first-level cluster: the positions of its samples within the
# class-0 arrays, their second-level labels, and the number of sub-clusters.
parent_clusters = (
    (c0_cluster0_idx, re_labels_c0_0, re_k_c0_0),
    (c0_cluster1_idx, re_labels_c0_1, re_k_c0_1),
)

for parent, (parent_idx, parent_labels, parent_k) in enumerate(parent_clusters):
    for sub_cluster in range(parent_k):
        # Class-0 positions of the samples belonging to this sub-cluster.
        sub_idx_global = parent_idx[parent_labels == sub_cluster]
        X_sub = X_0_scaled[sub_idx_global]

        print(f"\nReclustering Class0 → Cluster{parent} → Sub{sub_cluster} | size={len(sub_idx_global)}")

        micro_labels, micro_k, _ = run_constrained_kmeans(
            X_sub,
            n_clusters=2,
            label=f"C0-Cluster{parent}-Sub{sub_cluster}"
        )

        # Shift the local labels (0 .. micro_k - 1) into the global id space,
        # then advance by however many clusters were actually requested.
        final_labels_class0[sub_idx_global] = cluster_id + micro_labels
        cluster_id += micro_k

# Every class-0 sample must have received a micro-cluster id.
assert (final_labels_class0 >= 0).all(), "some Class 0 samples were never assigned a micro-cluster"

print(f"\nTotal final micro-clusters for Class 0: {cluster_id}")

In [None]:
# Take the cluster count from the labels themselves so the heading stays
# correct if the number of splits upstream changes.
n_micro_clusters = int(final_labels_class0.max()) + 1
# Trailing ';' suppresses the echo of the returned counts array.
show_distribution(final_labels_class0, f"Class 0 — Final {n_micro_clusters} Micro-Clusters", X_0_scaled);

In [None]:
# Side-by-side size summary of the final clusterings of both classes.
print("\n FINAL COMPARISON")

per_class_counts = []
for heading, class_labels in (
    (f"\nClass 0 — {cluster_id} clusters:", final_labels_class0),
    (f"\nClass 1 — {k_c1} clusters (unchanged):", labels_c1),
):
    print(heading)
    sizes = np.bincount(class_labels)
    n_samples = len(class_labels)
    for i, c in enumerate(sizes):
        print(f"  Cluster {i:>2}: {c:>4} samples ({c/n_samples*100:.1f}%)")
    per_class_counts.append(sizes)

counts_0, counts_1 = per_class_counts

# Average cluster size and smallest-to-largest size ratio for each class.
print(f"\nClass 0 avg per cluster : {counts_0.mean():.0f} samples")
print(f"Class 1 avg per cluster : {counts_1.mean():.0f} samples")
print(f"Class 0 balance ratio   : {counts_0.min()/counts_0.max():.3f}")
print(f"Class 1 balance ratio   : {counts_1.min()/counts_1.max():.3f}")

In [None]:
# Index labels of the rows of each class. X_0 / X_1 were sliced with the same
# masks, so the per-class label arrays line up with these indexes in order.
idx0 = y[y == 0].index
idx1 = y[y == 1].index

# Offset Class 1 labels to avoid overlap
offset = final_labels_class0.max() + 1
labels_c1_offset = labels_c1 + offset

final_df = X.copy()

# Start from an integer sentinel. A Series built from an index alone is
# NaN-filled, which an integer dtype cannot represent, so the labels would
# not be stored (or written to the CSV) as integers.
cluster_series = pd.Series(-1, index=final_df.index, dtype=int)
cluster_series.loc[idx0] = final_labels_class0
cluster_series.loc[idx1] = labels_c1_offset

# Every row must belong to one of the two classes and carry a real label.
assert (cluster_series >= 0).all(), "some rows were not assigned a cluster label"

final_df['Cluster_Label'] = cluster_series
final_df['Personal_Loan'] = y

output_path = 'MicroClustered_bank_data.csv'
final_df.to_csv(output_path, index=False)

print(f" Saved : {output_path}")
print(f"Shape   : {final_df.shape}")
# tolist() yields plain Python ints so the printed list stays readable.
print(f"Unique Cluster Labels : {sorted(final_df['Cluster_Label'].unique().tolist())}")
final_df.head()

In [None]:
# Download the file written by the previous cell; reusing `output_path`
# keeps the two cells pointing at the same file if the name changes.
files.download(output_path)