In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load datasets
# Customer master data, written out one record per customer.
customer_columns = ["CustomerID", "CustomerName", "Region", "SignupDate"]
customer_rows = [  # Include your data here
    ("C0001", "Lawrence Carroll", "South America", "7/10/2022"),
    ("C0002", "Elizabeth Lutz", "Asia", "2/13/2022"),
    ("C0003", "Michael Rivera", "South America", "3/7/2024"),
    ("C0004", "Kathleen Rodriguez", "South America", "10/9/2022"),
]
customers_data = pd.DataFrame(customer_rows, columns=customer_columns)

# Transaction records are read from a CSV file on disk.
transactions_file = "/mnt/data/Transactions.csv"
transactions_data = pd.read_csv(transactions_file)


In [None]:
# Merge and preprocess data for clustering
# Attach customer attributes to each transaction (pandas' default inner join,
# so only CustomerIDs present in both tables are kept).
customer_transactions = transactions_data.merge(customers_data, on="CustomerID")

# One row per customer: total spend, mean quantity per transaction, and the
# number of transactions.
summary_spec = {
    "total_spent": ("TotalValue", "sum"),
    "avg_quantity": ("Quantity", "mean"),
    "transaction_count": ("TransactionID", "count"),
}
per_customer = customer_transactions.groupby("CustomerID")
customer_summary = per_customer.agg(**summary_spec).reset_index()

In [None]:
# Normalize the data
# Standardize the clustering features so that no single feature dominates the
# distance computation purely because of its scale.
features = ["total_spent", "avg_quantity", "transaction_count"]
scaler = StandardScaler()
feature_frame = customer_summary[features]
customer_summary_scaled = scaler.fit(feature_frame).transform(feature_frame)

In [None]:
# Perform clustering
# Fit KMeans for each candidate cluster count and keep the count with the
# lowest Davies-Bouldin index (lower is better for this metric).
optimal_clusters = 0
lowest_db_index = float("inf")

# KMeans requires n_clusters <= n_samples and davies_bouldin_score requires
# 2 <= n_labels <= n_samples - 1, so the search range is capped by the number
# of customers instead of always running up to 10.
n_customers = len(customer_summary_scaled)
max_clusters = min(10, n_customers - 1)
if max_clusters < 2:
    # Fail with a clear message rather than leaving optimal_clusters at 0,
    # which would only surface later as an obscure KMeans error.
    raise ValueError(
        f"Need at least 3 customers to evaluate clusterings, got {n_customers}"
    )

for n_clusters in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(customer_summary_scaled)
    db_index = davies_bouldin_score(customer_summary_scaled, labels)

    print(f"Number of clusters: {n_clusters}, Davies-Bouldin Index: {db_index}")

    # Strict '<' keeps the smallest cluster count when scores tie.
    if db_index < lowest_db_index:
        lowest_db_index = db_index
        optimal_clusters = n_clusters

In [None]:
# Final clustering with optimal clusters
# Refit a single model at the selected cluster count and attach each
# customer's cluster label to the summary table.
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
kmeans.fit(customer_summary_scaled)
customer_summary["Cluster"] = kmeans.labels_

In [None]:
# Visualize the clusters
# Scatter of total spend against average quantity, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=customer_summary,
    x="total_spent",
    y="avg_quantity",
    hue="Cluster",
    palette="tab10",
    s=100,
    ax=ax,
)
ax.set_title("Customer Clusters Based on Spending and Quantity")
ax.set_xlabel("Total Spent ($)")
ax.set_ylabel("Average Quantity")
ax.legend(title="Cluster")
plt.show()

In [None]:
# Save clustering results
# Write the per-customer summary (including the Cluster column) as CSV.
# The output previously used an ".ipynb" file name, which mislabels CSV content
# as a notebook and can overwrite a notebook of the same name; use ".csv".
customer_summary.to_csv("Arun_Gajraj_Clustering.csv", index=False)

In [None]:
# Print clustering insights
# Report the selected cluster count and the score that selected it.
insight_lines = (
    f"Optimal Number of Clusters: {optimal_clusters}",
    f"Lowest Davies-Bouldin Index: {lowest_db_index}",
)
print(*insight_lines, sep="\n")
