In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler

# Read the two input tables: customer attributes and the transaction log
customers = pd.read_csv('path_to_customers.csv')  # Replace with your actual path
transactions = pd.read_csv('path_to_transactions.csv')  # Replace with your actual path

# Attach customer attributes to every transaction row. The default inner join
# keeps only rows whose customer_id is present in both tables.
merged_data = pd.merge(transactions, customers, on='customer_id')

# Feature Engineering: collapse the transaction-level rows into one row per customer
per_customer = merged_data.groupby('customer_id')
customer_features = per_customer.agg(
    age=('age', 'first'),
    gender=('gender', 'first'),
    transaction_amount=('transaction_amount', 'sum'),  # Total amount per customer
    product_id=('product_id', 'nunique'),  # Number of distinct products
    transaction_date=('transaction_date', 'count'),  # Number of transactions
).reset_index()

# One-hot encode gender, dropping the first level
customer_features = pd.get_dummies(customer_features, columns=['gender'], drop_first=True)

# Standardize the numeric columns that feed the clustering step
# (the gender dummy columns are not part of the scaled matrix)
numeric_columns = ['age', 'transaction_amount', 'product_id', 'transaction_date']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features[numeric_columns])

# Sweep candidate cluster counts (2 through 10) and record three quality
# metrics for each fitted model:
#   inertia            - within-cluster sum of squares, used for the elbow plot
#   silhouette score   - higher values indicate better-separated clusters
#   Davies-Bouldin     - lower values indicate better-separated clusters
# silhouette_score comes from sklearn.metrics and must be imported at the top
# of the file together with davies_bouldin_score.
inertia = []
silhouette_scores = []
db_indices = []

for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    # Fit once and reuse the resulting labels for both label-based metrics
    labels = kmeans.fit_predict(features_scaled)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(features_scaled, labels))
    db_indices.append(davies_bouldin_score(features_scaled, labels))

# Plot every metric collected during the cluster-count sweep side by side.
# The silhouette scores are recorded by the sweep alongside inertia and the
# Davies-Bouldin index, so they get their own panel instead of being discarded.
cluster_counts = range(2, 11)  # must mirror the range used by the sweep loop
metric_panels = [
    # (panel title, y-axis label, values per cluster count)
    ('Elbow Method', 'Inertia', inertia),
    ('Silhouette Score', 'Silhouette Score', silhouette_scores),
    ('Davies-Bouldin Index', 'DB Index', db_indices),
]

plt.figure(figsize=(18, 6))
for position, (title, y_label, values) in enumerate(metric_panels, start=1):
    plt.subplot(1, len(metric_panels), position)
    plt.plot(cluster_counts, values, marker='o')
    plt.title(title)
    plt.xlabel('Number of Clusters')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()

# Fit the final model with the selected cluster count
# (4 is an example choice; pick it from the metric plots)
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
kmeans.fit(features_scaled)
customer_features['cluster'] = kmeans.labels_

# Davies-Bouldin index of the final cluster assignment
db_index = davies_bouldin_score(features_scaled, customer_features['cluster'])

# Scatter the customers by total spend versus transaction count, colored by cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=customer_features,
    x='transaction_amount',
    y='transaction_date',
    hue='cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segmentation Clusters')
ax.set_xlabel('Total Transaction Amount')
ax.set_ylabel('Number of Transactions')
ax.legend(title='Cluster')
plt.show()

# Persist the per-customer features together with their cluster labels
results_path = 'Customer_Segmentation_Results.csv'
customer_features.to_csv(results_path, index=False)

# Report the chosen cluster count and the Davies-Bouldin index of the final model
print(
    f'Number of clusters formed: {optimal_clusters}',
    f'Davies-Bouldin Index: {db_index}',
    sep='\n',
)