In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler

import umap.umap_ as umap
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN

In [None]:
# Location of the zipped CSV; pandas infers the compression from the extension.
DATA_PATH = '/content/bank-full_clustring.csv.zip'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame to sanity-check parsing.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Keep an independent snapshot of the loaded frame; `df` itself is encoded and
# scaled in the cells below.
data = df.copy(deep=True)

In [None]:
# Column dtypes and non-null counts, used to decide how each column is encoded.
df.info()

In [None]:
######### ' DATA CLEANING '  #########

In [None]:
# Number of missing values per column.
df.isna().sum()   # author's observation: no missing data

In [None]:
# Remove exact duplicate rows. The method must be called and its result kept:
# a bare `df.drop_duplicates` only references the bound method and leaves the
# frame unchanged. The index is rebuilt so row positions stay contiguous.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Clustering is unsupervised, so the `subscribed` column is removed.
# `columns=` states the intent directly (the previous `axis=True` only worked
# because True == 1), and `errors="ignore"` keeps the cell safe to re-run after
# the column has already been dropped.
df = df.drop(columns=['subscribed'], errors='ignore')

In [None]:
# Integer-encode the yes/no style columns.
# LabelEncoder numbers the distinct values in sorted order, so a column holding
# only 'no'/'yes' becomes no -> 0, yes -> 1.
# NOTE(review): confirm these columns really have just two values; any third
# value (e.g. 'unknown') would get its own code and shift the mapping.
binary_cols = ['default', 'housing', 'loan']

le = LabelEncoder()

df = df.assign(**{col: le.fit_transform(df[col]) for col in binary_cols})

In [None]:
# Columns with more than two categories are also integer-coded; the encoder is
# refit on each column, so codes follow the sorted order of that column's values.
for col in ('education', 'month'):
    df[col] = le.fit_transform(df[col])

In [None]:
# Nominal columns (no natural ordering) are expanded into one indicator column
# per category.
nominal_cols = ['job', 'marital', 'contact', 'day_of_week', 'poutcome']
df = pd.get_dummies(data=df, columns=nominal_cols)

In [None]:
# Convert every indicator column produced by get_dummies to int64.
# The columns are discovered by dtype (bool in recent pandas, uint8 in older
# releases) instead of a hand-written list: the previous list skipped the
# job_* and marital_* indicators, which therefore kept a different dtype and
# were ignored by the later int64/float64 selection, and it would raise
# KeyError if an expected category were absent from the data.
binary_cols = df.select_dtypes(include=['bool', 'uint8']).columns.tolist()

if binary_cols:
    df[binary_cols] = df[binary_cols].astype('int64')

In [None]:
# Preview the frame after the encoding steps above.
df.head()

In [None]:
# Box plots of every int64/float64 column on a shared axis, to expose outliers
# and scale differences before scaling.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns

# Long format: one (Feature, Value) row per cell so seaborn can group by feature.
df_melted = df[numeric_cols].melt(var_name='Feature', value_name='Value')

fig, ax = plt.subplots(figsize=(11, 8))
sns.boxplot(x='Feature', y='Value', data=df_melted, ax=ax)
plt.xticks(rotation=45)
ax.set_title("Boxplot of All Numeric Features")
fig.tight_layout()
plt.show()

In [None]:
# Scale every int64/float64 column with RobustScaler (centres on the median and
# divides by the inter-quartile range).
# Note: the scaled values overwrite `df`, so re-running this cell refits the
# scaler on already-scaled data.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
scaler = RobustScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [None]:
# Preview the frame after scaling.
df.head()

In [None]:
########### ' EDA ' ###########

In [None]:
# Pairwise correlation of all numeric features, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(25, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
######### ' MODELING ' #########

In [None]:
# 2-D t-SNE embedding of the processed frame, seeded for reproducibility.
tsne_params = dict(n_components=2, random_state=42, perplexity=30, learning_rate=200)
tsne = TSNE(**tsne_params)
tsne_result = tsne.fit_transform(df)
df_tsne = pd.DataFrame(data=tsne_result, columns=['TSNE-1', 'TSNE-2'])

In [None]:
# Scatter plot of the t-SNE embedding.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='TSNE-1', y='TSNE-2', data=df_tsne, s=40, alpha=0.7, ax=ax)
ax.set_title("t-SNE projection")
plt.show()

In [None]:
# 2-D UMAP embedding of the processed frame, seeded for reproducibility.
reducer = umap.UMAP(n_components=2, random_state=42)
umap_result = reducer.fit_transform(df)

umap_axes = ['UMAP-1', 'UMAP-2']
df_umap = pd.DataFrame(data=umap_result, columns=umap_axes)

In [None]:
# Scatter plot of the UMAP embedding.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='UMAP-1', y='UMAP-2', data=df_umap, s=40, alpha=0.7, ax=ax)
ax.set_title("UMAP projection")
plt.show()

In [None]:
# Partition the rows into five clusters with a seeded KMeans and keep the
# per-row cluster labels.
kmeans = KMeans(n_clusters=5, random_state=42).fit(df)
kmeans_labels = kmeans.labels_

In [None]:
# Density-based clustering on the same features; DBSCAN labels noise points -1.
dbscan = DBSCAN(eps=1.8, min_samples=10).fit(df)
dbscan_labels = dbscan.labels_

In [None]:
# The UMAP embedding of `df` was already computed in the projection cell above
# with the same parameters and seed, and `df` is not modified in between, so
# refitting here would only repeat an expensive computation. Recompute only if
# that earlier result is missing or no longer matches the frame's row count.
if 'umap_result' not in globals() or len(umap_result) != len(df):
    reducer = umap.UMAP(n_components=2, random_state=42)
    umap_result = reducer.fit_transform(df)

In [None]:
# Plotting frame: UMAP coordinates plus the label assigned by each algorithm.
df_vis = pd.DataFrame(umap_result, columns=["UMAP1", "UMAP2"]).assign(
    KMeans=kmeans_labels,
    DBSCAN=dbscan_labels,
)

In [None]:
# KMeans labels on the UMAP embedding.
# Drawn on its own explicit figure: the previous 1x2 grid filled only the left
# panel, leaving half the canvas blank, and the cell never finalised the layout.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_vis, x="UMAP1", y="UMAP2", hue="KMeans", palette="Set2", s=40, ax=ax)
ax.set_title("KMeans Clustering (UMAP Projection)")
# Legend sits outside the axes; tight_layout makes room for it.
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# DBSCAN labels on the UMAP embedding (label -1 marks noise points).
# An explicit figure replaces `plt.subplot(1, 1, 1)`, which drew on whatever
# figure happened to be current and covered any axes already on it.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_vis, x="UMAP1", y="UMAP2", hue="DBSCAN", palette="tab10", s=40, ax=ax)
ax.set_title("DBSCAN Clustering (UMAP Projection)")
# Legend sits outside the axes; tight_layout makes room for it.
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# Attach the KMeans label to a copy of the feature frame and average every
# numeric feature per cluster. The means are in scaled units, because `df` was
# overwritten with RobustScaler output earlier.
df_clustered = df.assign(Cluster=kmeans_labels)

cluster_summary = df_clustered.groupby('Cluster').mean(numeric_only=True)
print("📊 Cluster-wise Averages:", cluster_summary, sep="\n")

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette coefficient of the KMeans partition, computed on the same
# feature frame the model was fitted on (range -1..1, higher is better).
score = silhouette_score(X=df, labels=kmeans_labels)
print(f"📈 Silhouette Score for KMeans: {score:.3f}")