# In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

# Load the two input tables: customer profiles and the raw transaction log
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Collapse the transaction log to one row per customer: total spend and
# number of transactions. Named aggregation produces the final column names
# directly, so no separate rename step is needed.
transaction_summary = transactions.groupby('CustomerID', as_index=False).agg(
    TotalValue=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'count'),
)

# Attach the per-customer aggregates to the profile table. merge() defaults
# to an inner join, so customers without any transaction are not carried
# forward into the clustering data.
cluster_data = pd.merge(customers, transaction_summary, on='CustomerID')

# Only the two behavioural columns are used as clustering features
feature_columns = ['TotalValue', 'TransactionCount']
features = cluster_data[feature_columns]

# Standardize the features (zero mean, unit variance) so that the column
# with the larger numeric range does not dominate the distance computation
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)

# Evaluate K-Means over a range of cluster counts and record three
# diagnostics per fit: inertia (for the elbow plot), silhouette score
# (higher is better) and Davies-Bouldin index (lower is better).
inertia = []
silhouette_scores = []
db_indices = []

# silhouette_score / davies_bouldin_score require
# 2 <= n_clusters <= n_samples - 1, so cap the scan for small datasets.
# With 11 or more rows this is the usual 2..10.
max_clusters = min(10, len(scaled_features) - 1)
cluster_range = range(2, max_clusters + 1)

# Shared by the scan and the final fit so both use the same model settings.
# n_init is pinned explicitly because scikit-learn changed its default
# (10 -> 'auto') in 1.4; leaving it implicit makes the results depend on the
# installed version and triggers a FutureWarning on 1.2/1.3.
kmeans_params = {'n_init': 10, 'random_state': 42}

for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, **kmeans_params)
    kmeans.fit(scaled_features)

    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(scaled_features, kmeans.labels_))
    db_indices.append(davies_bouldin_score(scaled_features, kmeans.labels_))

# Plot the three diagnostics side by side: (values, panel title, y-axis label)
diagnostics = [
    (inertia, 'Elbow Method for Optimal Clusters', 'Inertia'),
    (silhouette_scores, 'Silhouette Scores for Different Clusters', 'Silhouette Score'),
    (db_indices, 'DB Index for Different Clusters', 'DB Index'),
]
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
for ax, (values, title, ylabel) in zip(axes, diagnostics):
    ax.plot(cluster_range, values, marker='o')
    ax.set_title(title)
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel(ylabel)

fig.tight_layout()
plt.show()

# Final model. The cluster count is a manual choice made by inspecting the
# diagnostic plots above; adjust it based on your analysis.
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, **kmeans_params)
cluster_data['Cluster'] = kmeans.fit_predict(scaled_features)

# Quality metrics of the final clustering, computed on the scaled features
db_index_value = davies_bouldin_score(scaled_features, cluster_data['Cluster'])
silhouette_avg = silhouette_score(scaled_features, cluster_data['Cluster'])

# Print clustering results
print(f'Number of Clusters Formed: {optimal_clusters}')
print(f'DB Index Value: {db_index_value:.4f}')
print(f'Average Silhouette Score: {silhouette_avg:.4f}')

# Scatter the customers in the original (unscaled) feature space, colored by
# the cluster label stored in the 'Cluster' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=cluster_data,
    x='TotalValue',
    y='TransactionCount',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segmentation Clusters')
ax.set_xlabel('Total Value of Transactions')
ax.set_ylabel('Number of Transactions')
ax.legend(title='Cluster')
plt.show()

# Write the full table (including the 'Cluster' column) to CSV, without the
# DataFrame index
output_path = 'ClusteringResults.csv'
cluster_data.to_csv(output_path, index=False)