In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the customer and transaction tables from the local datasets folder
customers = pd.read_csv("datasets/Customers.csv")
transactions = pd.read_csv("datasets/Transactions.csv")

# Join the two tables on CustomerID. No `how` is given, so pandas performs
# its default inner join: only CustomerIDs present in both tables are kept.
merged_df = customers.merge(transactions, on="CustomerID")

In [3]:
# Encode categorical features
# merged_df[col] is overwritten with integer codes and the fitted encoders are
# kept in `label_encoders`, so the codes can be mapped back to the original
# labels later with `label_encoders[col].inverse_transform(...)`.
categorical_features = ['Region']
label_encoders = {col: LabelEncoder().fit(merged_df[col]) for col in categorical_features}
for col, le in label_encoders.items():
    merged_df[col] = le.transform(merged_df[col])

# Normalize numerical features
# The standardized values are written back into merged_df, so later cells that
# read these columns see scaled values, not the raw ones.
numerical_features = ['Quantity', 'TotalValue']
scaler = StandardScaler()
merged_df[numerical_features] = scaler.fit_transform(merged_df[numerical_features])

# Select features for clustering
# The integer codes produced above are arbitrary labels. Feeding them to
# K-Means directly would make its Euclidean distance treat code 0 as closer
# to code 1 than to code 3, and the raw codes are not on the same scale as the
# standardized numeric columns. Each categorical column is therefore expanded
# into 0/1 indicator columns, which puts every pair of distinct categories at
# the same distance from each other.
categorical_indicators = [
    pd.get_dummies(merged_df[col], prefix=col, dtype=float)
    for col in categorical_features
]
X = pd.concat([merged_df[numerical_features], *categorical_indicators], axis=1)

# `features` names the columns of X (numeric columns first, then one
# indicator column per category code, e.g. 'Region_0', 'Region_1', ...).
features = list(X.columns)

In [None]:
# Sweep candidate cluster counts, score every clustering, then refit K-Means
# with the count that scores best on the Davies-Bouldin Index.
RANDOM_STATE = 42
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'
# in release 1.4, with a FutureWarning in 1.2/1.3), so leaving it implicit
# makes the number of restarts depend on the installed version.
N_INIT = 10

db_scores = []
silhouette_scores = []
k_values = range(2, 11)

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=N_INIT, random_state=RANDOM_STATE)
    cluster_labels = kmeans.fit_predict(X)

    # Calculate Davies-Bouldin Index (lower means better-separated clusters)
    db_index = davies_bouldin_score(X, cluster_labels)
    db_scores.append(db_index)

    # Calculate Silhouette Score (higher is better); recorded for the report,
    # not used to choose k
    sil_score = silhouette_score(X, cluster_labels)
    silhouette_scores.append(sil_score)

# Find the optimal number of clusters (lowest DB Index).
# db_scores is aligned with k_values, so the argmin position indexes both.
best_idx = int(np.argmin(db_scores))
optimal_k = k_values[best_idx]
print(f"Optimal number of clusters: {optimal_k}")

# Fit K-Means with the optimal number of clusters. The same seed and n_init as
# in the sweep are used, so this reproduces the clustering that was scored.
kmeans_optimal = KMeans(n_clusters=optimal_k, n_init=N_INIT, random_state=RANDOM_STATE)
merged_df['Cluster'] = kmeans_optimal.fit_predict(X)

# Calculate DB Index for optimal clustering
final_db_index = davies_bouldin_score(X, merged_df['Cluster'])
print(f"Davies-Bouldin Index for optimal clustering: {final_db_index}")


In [None]:
# Visualization of clusters without PCA
# Plots the Quantity and TotalValue columns exactly as they are currently
# stored in merged_df, coloured by the assigned cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_df,
    x='Quantity',
    y='TotalValue',
    hue='Cluster',
    palette='tab10',
    s=100,
    alpha=0.7,
    ax=ax,
)
ax.set_title("Customer Clustering (K-Means)")
ax.set_xlabel("Quantity")
ax.set_ylabel("TotalValue")
ax.legend(title='Cluster')
plt.show()

# Save clustering report
# The silhouette score is looked up at the position of the lowest
# Davies-Bouldin score, i.e. the entry belonging to the chosen cluster count.
best_position = np.argmin(db_scores)
report = {
    "Number of Clusters": optimal_k,
    "Final DB Index": final_db_index,
    "Silhouette Score (Optimal Clusters)": silhouette_scores[best_position],
}
report_path = "KMeans_Clustering_Report_WithoutPCA.csv"
report_df = pd.DataFrame([report])
report_df.to_csv(report_path, index=False)

print(f"Clustering report saved to {report_path}")

In [None]:
# Clustering
# Sweep candidate cluster counts, score every clustering, then refit K-Means
# with the count that scores best on the Davies-Bouldin Index.
candidate_ks = range(2, 11)  # Clusters between 2 and 10
seed = 42
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'
# in release 1.4, with a FutureWarning in 1.2/1.3), so leaving it implicit
# makes the number of restarts depend on the installed version.
n_restarts = 10

db_scores = []
silhouette_scores = []

for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, n_init=n_restarts, random_state=seed)
    cluster_labels = kmeans.fit_predict(X)

    # Calculate Davies-Bouldin Index (lower means better-separated clusters)
    db_index = davies_bouldin_score(X, cluster_labels)
    db_scores.append(db_index)

    # Calculate Silhouette Score (higher is better); recorded for the report,
    # not used to choose k
    sil_score = silhouette_score(X, cluster_labels)
    silhouette_scores.append(sil_score)

# Determine the optimal number of clusters (based on DB Index).
# The k is looked up from the candidate range instead of adding a hard-coded
# offset, so the result stays correct if the range is changed.
optimal_clusters = candidate_ks[int(np.argmin(db_scores))]
print(f"Optimal number of clusters: {optimal_clusters}")

# Fit KMeans with optimal clusters. The same seed and n_init as in the sweep
# are used, so this reproduces the clustering that was scored.
final_kmeans = KMeans(n_clusters=optimal_clusters, n_init=n_restarts, random_state=seed)
merged_df['Cluster'] = final_kmeans.fit_predict(X)

# Calculate final DB Index
final_db_index = davies_bouldin_score(X, merged_df['Cluster'])
print(f"Davies-Bouldin Index for optimal clusters: {final_db_index}")

In [None]:
# Visualization using PCA
# PCA comes from sklearn.decomposition and must be imported in the notebook's
# import cell; without that import this cell fails with a NameError.
# The clustering features are projected onto their first two principal
# components purely for plotting; the cluster labels were computed on X itself.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X)
merged_df['PCA1'] = pca_components[:, 0]
merged_df['PCA2'] = pca_components[:, 1]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_df,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='tab10',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segmentation Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

# Save clustering report
# The silhouette score is taken at the position of the lowest Davies-Bouldin
# score (the entry for the chosen cluster count), which avoids depending on
# the literal start of the k range.
best_position = int(np.argmin(db_scores))
report = {
    "Number of Clusters": optimal_clusters,
    "DB Index": final_db_index,
    "Silhouette Score (Optimal Clusters)": silhouette_scores[best_position],
}
report_df = pd.DataFrame([report])
report_path = "Clustering_Report.csv"
report_df.to_csv(report_path, index=False)

print(f"Clustering report saved to {report_path}")