# **DBSCAN Clustering for Banking Clients (Fixed Version)**

This notebook applies **DBSCAN clustering** to a **banking client dataset**, using **Gower distance** to handle mixed numerical & categorical data.

### **🔹 Pipeline Overview:**
1. **Load & Preprocess Data** (Handling categorical & numerical features)
2. **Compute Gower Distance** (for mixed numerical & categorical data)
3. **Find Optimal `eps`** Using a K-Distance Graph (Elbow Method)
4. **Auto-Tune DBSCAN** to find a balanced number of clusters
5. **Evaluate Clustering Performance** (Silhouette, Davies-Bouldin, Calinski-Harabasz Scores)
6. **Visualize Results using t-SNE in 3D**
7. **Interpret the Clusters** (Restoring numerical values & categorical labels)


In [None]:
import pandas as pd
import numpy as np

# Location of the bank-clients workbook (adjust for your machine)
path = '/Users/mouadh/Fintech_Projects/Business_Case_1/Dataset1_BankClients.xlsx'
data = pd.read_excel(path)

# The identifier column is removed before any feature processing, when it exists
id_column = 'ID'
data = data.drop(columns=[id_column]) if id_column in data.columns else data

# Preview the first 5 rows
data.head()


In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Columns treated as categorical; every remaining column is handled as numerical
categorical_columns = ['Gender', 'Job', 'Area', 'CitySize', 'Investments']

# Split the table into its categorical and numerical parts
categorical_features = data[categorical_columns]
numerical_features = data.drop(columns=categorical_columns)

# Rescale each numerical column with min-max scaling
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(numerical_features)

# Expand each categorical column into one indicator column per category
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat_encoded = encoder.fit_transform(categorical_features)

# Stack the numerical block first, followed by the indicator block
X_preprocessed = np.hstack([X_num_scaled, X_cat_encoded])

# Column names, in the same order as the columns of X_preprocessed
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
all_feature_names = [*numerical_features.columns, *encoded_feature_names]

# Report the final matrix dimensions
print(f'Processed Data Shape: {X_preprocessed.shape}')


In [None]:
import gower

# Compute the Gower distance matrix on the ORIGINAL mixed-type table.
#
# Gower distance compares numerical columns by range-normalised absolute
# difference and categorical columns by simple match / mismatch. Feeding it the
# one-hot matrix (`X_preprocessed`) would make every indicator column count as a
# separate numerical feature: a single categorical mismatch is then counted
# twice, and the result is normalised by the number of indicator columns rather
# than by the number of original features.
#
# `cat_features` is passed explicitly because the categorical columns may be
# stored as numeric codes, in which case dtype-based auto-detection would treat
# them as numerical.
is_categorical = [column in categorical_columns for column in data.columns]
gower_distances = gower.gower_matrix(data, cat_features=is_categorical)

# Display the top-left 5x5 corner as a sanity check
gower_distances[:5, :5]


In [None]:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# Function to find optimal eps using k-distance graph
def find_optimal_eps(distance_matrix, k=8, percentile_range=(90, 98), eps_bounds=(0.05, 0.2)):
    """Pick a DBSCAN ``eps`` from the k-distance curve of a precomputed distance matrix.

    The function plots the sorted k-distance curve (for visual elbow inspection)
    and then scans percentiles of that curve, returning the first value that
    falls inside ``eps_bounds``.

    Parameters
    ----------
    distance_matrix : array-like of shape (n_samples, n_samples)
        Precomputed pairwise distances.
    k : int, default=8
        Number of neighbours requested from ``NearestNeighbors``. Because the
        matrix itself is passed as the query set, each point is returned as its
        own nearest neighbour (assuming a zero diagonal), so the last column is
        the distance to the k-th neighbour *counting the point itself*.
    percentile_range : tuple of (int, int), default=(90, 98)
        Inclusive range of percentiles to try, scanned in steps of 2.
    eps_bounds : tuple of (float, float), default=(0.05, 0.2)
        Inclusive ``(lower, upper)`` window a candidate must fall in to be accepted.

    Returns
    -------
    float
        The first percentile value inside ``eps_bounds``; if none qualifies,
        the 95th percentile of the k-distances is returned as a fallback.
    """
    nbrs = NearestNeighbors(n_neighbors=k, metric='precomputed').fit(distance_matrix)
    distances, _ = nbrs.kneighbors(distance_matrix)

    # Last column = distance to the farthest of the k returned neighbours
    sorted_distances = np.sort(distances[:, -1], axis=0)

    # Plot k-distance graph
    plt.figure(figsize=(10, 5))
    plt.plot(sorted_distances)
    plt.xlabel("Points sorted by distance")
    plt.ylabel(f"Distance to {k}-th nearest neighbor")
    plt.title("K-Distance Graph (Use Elbow Method for eps)")
    plt.grid(True)
    plt.show()

    lower_eps, upper_eps = eps_bounds
    for p in range(percentile_range[0], percentile_range[1] + 1, 2):
        test_eps = np.percentile(sorted_distances, p)
        print(f"🔹 Trying eps at {p}th percentile: {test_eps:.4f}")
        if lower_eps <= test_eps <= upper_eps:
            return test_eps

    # No candidate landed in the accepted window: fall back to the 95th percentile
    return np.percentile(sorted_distances, 95)

# Automatically determine initial eps
optimal_eps = find_optimal_eps(gower_distances, k=8, percentile_range=(90, 98))


In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# Heuristic starting point for min_samples: ln(n), clamped into [3, 12]
min_samples_range = (3, 12)
lower_bound, upper_bound = min_samples_range
n_points = len(gower_distances)
optimal_min_samples = min(max(int(np.log(n_points)), lower_bound), upper_bound)

print(f"✅ Initial min_samples: {optimal_min_samples}")

# Cluster directly on the precomputed distance matrix
dbscan = DBSCAN(eps=optimal_eps, min_samples=optimal_min_samples, metric='precomputed')
labels_dbscan = dbscan.fit_predict(gower_distances)

# Label -1 marks noise and is not counted as a cluster
unique_clusters = np.unique(labels_dbscan)
noise_mask = labels_dbscan == -1
num_clusters = len(unique_clusters) - int(noise_mask.any())
num_noise = noise_mask.sum()

print(f"✅ Final eps selected: {optimal_eps:.4f}")
print(f"✅ Final min_samples: {optimal_min_samples}")
print(f"✅ Final clusters: {num_clusters}")
print(f"✅ Final noise points: {num_noise}")


In [None]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

# DBSCAN labels noise as -1. Passing those points to the metrics would score them
# as if they formed one extra cluster, which distorts every index, so only the
# points that were actually assigned to a cluster are evaluated.
clustered_mask = labels_dbscan != -1
labels_clustered = labels_dbscan[clustered_mask]
n_clustered = int(clustered_mask.sum())
n_eval_clusters = np.unique(labels_clustered).size

# scikit-learn's clustering metrics require 2 <= n_labels <= n_samples - 1
if 1 < n_eval_clusters < n_clustered:
    X_clustered = X_preprocessed[clustered_mask]
    distances_clustered = gower_distances[np.ix_(clustered_mask, clustered_mask)]

    # Calinski-Harabasz and Davies-Bouldin are computed on the feature matrix,
    # not on the precomputed distances used for clustering; only the silhouette
    # score uses the precomputed distance matrix.
    ch_score = calinski_harabasz_score(X_clustered, labels_clustered)
    db_score = davies_bouldin_score(X_clustered, labels_clustered)
    sil_score = silhouette_score(distances_clustered, labels_clustered, metric='precomputed')

    print("🔹 **Cluster Evaluation Metrics**:")
    print(f"✅ **Calinski-Harabasz Score:** {ch_score:.4f} (Higher = Better)")
    print(f"✅ **Davies-Bouldin Score:** {db_score:.4f} (Lower = Better)")
    print(f"✅ **Silhouette Score:** {sil_score:.4f} (Closer to 1 = Better)")
else:
    print("⚠️ Not enough clusters detected for evaluation.")


In [None]:
from IPython.display import display

# Convert back to DataFrame with proper labels
cluster_summary = pd.DataFrame(X_preprocessed, columns=all_feature_names)
cluster_summary['Cluster'] = labels_dbscan

# Compute mean of each feature per cluster (excluding noise points, labelled -1).
# For a one-hot column the mean is the share of cluster members in that category.
valid_clusters = cluster_summary[cluster_summary['Cluster'] != -1]
summary = valid_clusters.groupby('Cluster').mean()

summary_original_scale = summary.copy()

if summary.empty:
    # Every point was labelled noise: inverse_transform would fail on an empty array
    print("⚠️ No clusters found (all points are noise); nothing to summarise.")
else:
    # Reverse MinMax scaling to restore original numerical values
    numeric_columns = numerical_features.columns
    summary_original_scale[numeric_columns] = scaler.inverse_transform(summary[numeric_columns])

    # Restore categorical feature labels: keep the most frequent category per cluster
    for cat_col in categorical_columns:
        prefix = cat_col + "_"
        # Only look at columns produced by the encoder, never at numerical columns
        one_hot_columns = [col for col in encoded_feature_names if col.startswith(prefix)]
        dominant_column = summary_original_scale[one_hot_columns].idxmax(axis=1)
        # Strip the known prefix instead of splitting on '_' so that category
        # values (or column names) containing underscores are kept intact
        summary_original_scale[cat_col] = dominant_column.str[len(prefix):]
        summary_original_scale = summary_original_scale.drop(columns=one_hot_columns)

    # Display final cluster characteristics
    print("🔹 **Cluster Characteristics (Original Scale & Categorical Labels Restored):**")
    display(summary_original_scale)
