# Customer Segmentation - Clustering Analysis

# In [None]:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score, silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load the raw tables -------------------------------------------------
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Inner join (the pandas default), so only transactions whose CustomerID
# also appears in Customers.csv are kept.
merged_df = transactions_df.merge(customers_df, on='CustomerID')

# --- One row of purchase-behaviour features per customer -----------------
# Named aggregations: output column -> (source column, reducer).
feature_spec = {
    'total_spending': ('TotalValue', 'sum'),
    'avg_spending': ('TotalValue', 'mean'),
    'purchase_count': ('TransactionID', 'count'),
    'product_variety': ('ProductID', 'nunique'),
}
per_customer = merged_df.groupby('CustomerID')
customer_features = per_customer.agg(**feature_spec).reset_index()

# --- One-hot encode each customer's region and attach it -----------------
# The dummies are indexed by CustomerID, hence the column-to-index merge.
region_by_customer = customers_df.set_index('CustomerID')['Region']
region_dummies = pd.get_dummies(region_by_customer, prefix='region')
customer_features = pd.merge(
    customer_features,
    region_dummies,
    left_on='CustomerID',
    right_index=True,
)

# --- Standardise every feature column ------------------------------------
# CustomerID is an identifier, not a feature, so it is excluded before
# scaling. The result is a plain array aligned row-for-row with
# customer_features.
scaler = StandardScaler()
feature_matrix = customer_features.drop(columns='CustomerID')
normalized_features = scaler.fit_transform(feature_matrix)

# --- Choose the number of clusters ---------------------------------------
# Fit KMeans for each candidate k, score every clustering, and keep the one
# with the lowest Davies-Bouldin index (lower is better). Silhouette scores
# are recorded alongside for reporting.
db_scores = []
silhouette_scores = []
cluster_labels_list = {}  # k -> label array for that clustering

# Both metrics are only defined for 2 <= k <= n_samples - 1, so cap the
# sweep on small datasets instead of crashing part-way through it.
n_customers = len(normalized_features)
max_k = min(10, n_customers - 1)
if max_k < 2:
    raise ValueError(
        "Need at least 3 customers to evaluate clusterings; "
        f"got {n_customers}"
    )
candidate_ks = range(2, max_k + 1)

for k in candidate_ks:
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4, so leaving it implicit makes the chosen clustering
    # depend on the installed version (and warns on 1.2/1.3).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(normalized_features)
    db_index = davies_bouldin_score(normalized_features, cluster_labels)
    silhouette_avg = silhouette_score(normalized_features, cluster_labels)

    db_scores.append(db_index)
    silhouette_scores.append(silhouette_avg)
    cluster_labels_list[k] = cluster_labels

# db_scores is parallel to candidate_ks, so map the winning position back
# to its k directly rather than through a hard-coded "2 +" offset. On ties
# list.index returns the first hit, i.e. the smallest k.
optimal_k = candidate_ks[db_scores.index(min(db_scores))]
optimal_labels = cluster_labels_list[optimal_k]

customer_features['Cluster'] = optimal_labels

# Project the scaled features onto two principal components so that the
# clusters can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(normalized_features)

# Transposing yields one row per component, which unpacks straight into
# the two coordinate columns.
customer_features['PCA1'], customer_features['PCA2'] = reduced_features.T


# Scatter the customers in PCA space, coloured by their assigned cluster.
plt.figure(figsize=(10, 6))
# seaborn draws on the current Axes and returns it; labelling goes through
# that Axes object instead of the pyplot state machine.
ax = sns.scatterplot(
    data=customer_features,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='Set2',
    s=100,
)
ax.set_title(f'Customer Segmentation (Optimal Clusters: {optimal_k})')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.legend(title='Cluster')
plt.show()


# Report the selected clustering. db_scores and silhouette_scores are
# parallel lists (one entry per candidate k), so the position of the lowest
# Davies-Bouldin index also locates the matching silhouette score.
best_idx = db_scores.index(min(db_scores))
print(
    f"Optimal Number of Clusters: {optimal_k}",
    f"DB Index for Optimal Clusters: {db_scores[best_idx]:.4f}",
    f"Silhouette Score: {silhouette_scores[best_idx]:.4f}",
    sep="\n",
)
    