# In [None]:
# Mrigank Raj Dubey - Customer Segmentation (Clustering) for Data Science Assignment

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Read the customer table and the transaction log from the working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach customer attributes to every transaction row. The join uses pandas'
# default (inner) mode, so only CustomerIDs present in both files survive.
data = pd.merge(transactions, customers, on='CustomerID')

# Feature Engineering: collapse the transaction rows to one row per customer.
# Named aggregation gives each output column its final name directly, so no
# separate rename step is needed.
customer_features = (
    data.groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),           # total spending
        TotalTransactions=('TransactionID', 'count'),  # number of transactions
        UniqueProducts=('ProductID', 'nunique'),       # product diversity
    )
    .reset_index()
)

# Normalize data: standardize every feature column (all columns except the
# CustomerID key) so the distance-based clustering is not dominated by the
# feature with the largest raw scale.
feature_frame = customer_features.drop(columns='CustomerID')
scaler = StandardScaler().fit(feature_frame)
normalized_features = scaler.transform(feature_frame)

# Perform K-Means Clustering
# Try every cluster count from 2 to 10 and score each fit with the
# Davies-Bouldin index (lower is better). davies_bouldin_score accepts at most
# n_samples - 1 distinct labels, so the upper bound is capped for small inputs.
n_samples = len(normalized_features)
k_values = range(2, min(10, n_samples - 1) + 1)
if not k_values:
    raise ValueError(
        f"Need at least 3 customers to compare clusterings, got {n_samples}"
    )

db_scores = []
fitted_by_k = {}  # k -> (fitted model, labels), so the winner is not refitted

for k in k_values:
    # n_init is pinned explicitly: scikit-learn's default changed between
    # releases, which would otherwise make results depend on the installed
    # version (and emit a FutureWarning on some of them).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(normalized_features)
    db_index = davies_bouldin_score(normalized_features, labels)
    db_scores.append(db_index)
    fitted_by_k[k] = (kmeans, labels)

# Choose the optimal number of clusters (lowest DB Index); on ties the
# smallest k wins because list.index returns the first match.
optimal_k = k_values[db_scores.index(min(db_scores))]
final_kmeans, final_labels = fitted_by_k[optimal_k]

# Build the output table: a copy of the per-customer features with the chosen
# cluster label appended as a new 'Cluster' column (the original frame is
# left untouched).
data_with_clusters = customer_features.assign(Cluster=final_labels)

# Visualize Clusters: project the standardized features onto two principal
# components so the segments can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(normalized_features)
component_1, component_2 = pca_features[:, 0], pca_features[:, 1]

# Draw on an explicit Axes and colour each point by its cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=component_1, y=component_2, hue=final_labels, palette='Set1', ax=ax)
ax.set_title('Customer Segments (PCA Visualization)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

# Persist the per-customer features together with their cluster assignment.
data_with_clusters.to_csv('Mrigank_Raj_Dubey_Clustering.csv', index=False)

# Report the chosen cluster count, its Davies-Bouldin score, and the output
# location, one item per line.
print(
    f"Optimal Number of Clusters: {optimal_k}",
    f"Davies-Bouldin Index: {min(db_scores):.2f}",
    "Clustered data saved to 'Mrigank_Raj_Dubey_Clustering.csv'",
    sep="\n",
)
