In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
def loadData():
    """Read the customer, transaction and product CSV files.

    The files ``Customers.csv``, ``Transactions.csv`` and ``Products.csv``
    are read, in that order, from the current working directory.

    Returns:
        A ``(customers, transactions, products)`` tuple of DataFrames, or
        ``(None, None, None)`` when any read fails. The failure is printed,
        not raised, so callers must check for ``None``.
    """
    csvNames = ('Customers.csv', 'Transactions.csv', 'Products.csv')
    try:
        # The generator is consumed left to right by the unpacking, so the
        # first unreadable file aborts the remaining reads.
        customers, transactions, products = (
            pd.read_csv(name) for name in csvNames
        )
    except Exception as err:
        print(f"Error loading data: {err}")
        return None, None, None
    return customers, transactions, products
def prepareFeatures(tDf):
    """Collapse the transaction table into one feature row per customer.

    Args:
        tDf: Transactions DataFrame. The columns read are ``CustomerID``,
            ``TransactionID``, ``TotalValue`` and ``Quantity``.

    Returns:
        DataFrame with one row per ``CustomerID`` and the columns
        ``CustomerID``, ``transCount`` (number of transactions),
        ``totalSpend`` / ``avgTransVal`` (sum / mean of ``TotalValue``) and
        ``totalQty`` / ``avgQty`` (sum / mean of ``Quantity``).
    """
    perCustomer = tDf.groupby('CustomerID')
    # Named aggregation yields the final flat column names directly, in the
    # order the keywords are listed.
    features = perCustomer.agg(
        transCount=('TransactionID', 'count'),
        totalSpend=('TotalValue', 'sum'),
        avgTransVal=('TotalValue', 'mean'),
        totalQty=('Quantity', 'sum'),
        avgQty=('Quantity', 'mean'),
    )
    return features.reset_index()
def performClustering(fDf, nClust):
    """Cluster customers with k-means on standardized features.

    Args:
        fDf: Feature DataFrame with a ``CustomerID`` column. Every other
            column, except a pre-existing ``Cluster`` column, is passed to
            ``StandardScaler`` and used as a clustering feature.
        nClust: Number of clusters requested from ``KMeans``.

    Returns:
        A ``(fDf, dbIdx, centers)`` tuple: the same DataFrame object with a
        ``Cluster`` column added or overwritten in place, the Davies-Bouldin
        index computed on the scaled features, and the k-means cluster
        centers (in the scaled feature space the model was fitted on).

    Raises:
        KeyError: If ``fDf`` has no ``CustomerID`` column.
    """
    # A 'Cluster' column left behind by an earlier call is an output, not a
    # feature. Keeping it would feed the previous labels back into the
    # model when the same frame is clustered again.
    fForClust = (
        fDf.drop(columns='CustomerID')
           .drop(columns='Cluster', errors='ignore')
    )
    scaler = StandardScaler()
    fScaled = scaler.fit_transform(fForClust)
    # Fixed seed so repeated runs produce the same assignment.
    kmeans = KMeans(n_clusters=nClust, random_state=42, n_init=10)
    clustLabels = kmeans.fit_predict(fScaled)
    dbIdx = davies_bouldin_score(fScaled, clustLabels)
    # Written in place: callers that kept a reference to fDf see the labels.
    fDf['Cluster'] = clustLabels
    return fDf, dbIdx, kmeans.cluster_centers_
def analyzeClusters(fDf):
    """Summarize each cluster with its size and per-feature means.

    Args:
        fDf: Clustered feature DataFrame containing ``Cluster``,
            ``CustomerID`` and the five feature columns ``transCount``,
            ``totalSpend``, ``avgTransVal``, ``totalQty`` and ``avgQty``.

    Returns:
        DataFrame indexed by ``Cluster``, rounded to two decimals, with the
        columns ``CustomerCount``, ``AvgTrans``, ``AvgSpend``,
        ``AvgTransVal``, ``AvgQty`` (mean of ``totalQty``) and
        ``AvgQtyPerTrans`` (mean of ``avgQty``).
    """
    # Source column -> aggregation; insertion order fixes the column order.
    aggSpec = {
        'CustomerID': 'count',
        'transCount': 'mean',
        'totalSpend': 'mean',
        'avgTransVal': 'mean',
        'totalQty': 'mean',
        'avgQty': 'mean',
    }
    # Source column -> label used in the summary table.
    summaryLabels = {
        'CustomerID': 'CustomerCount',
        'transCount': 'AvgTrans',
        'totalSpend': 'AvgSpend',
        'avgTransVal': 'AvgTransVal',
        'totalQty': 'AvgQty',
        'avgQty': 'AvgQtyPerTrans',
    }
    perCluster = fDf.groupby('Cluster').agg(aggSpec).round(2)
    return perCluster.rename(columns=summaryLabels)
def createVisualizations(fDf):
    """Render the cluster overview figure and save it as ``clustAnalysis.png``.

    Three panels are drawn on a 2x2 grid (the fourth slot stays empty):
    two scatter plots coloured by cluster label and a bar chart of the
    number of customers per cluster.

    Args:
        fDf: Clustered feature DataFrame; the columns read are ``Cluster``,
            ``totalSpend``, ``transCount``, ``avgTransVal`` and ``totalQty``.
    """
    fig = plt.figure(figsize=(15, 10))

    # (grid slot, x column, y column, x label, y label, title)
    scatterPanels = (
        (1, 'totalSpend', 'transCount',
         'Total Spend', 'Transaction Count',
         'Clusters by Spend and Trans Count'),
        (2, 'avgTransVal', 'totalQty',
         'Avg Trans Value', 'Total Quantity',
         'Clusters by Trans Value and Qty'),
    )
    for slot, xCol, yCol, xText, yText, heading in scatterPanels:
        panel = fig.add_subplot(2, 2, slot)
        panel.scatter(fDf[xCol], fDf[yCol], c=fDf['Cluster'], cmap='viridis')
        panel.set_xlabel(xText)
        panel.set_ylabel(yText)
        panel.set_title(heading)

    # Bar chart of cluster sizes, ordered by cluster label.
    sizePanel = fig.add_subplot(2, 2, 3)
    customersPerCluster = fDf['Cluster'].value_counts().sort_index()
    customersPerCluster.plot(kind='bar', ax=sizePanel)
    sizePanel.set_xlabel('Cluster')
    sizePanel.set_ylabel('No. of Customers')
    sizePanel.set_title('Cluster Sizes')

    fig.tight_layout()
    fig.savefig('clustAnalysis.png')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def main():
    """Run the end-to-end customer segmentation pipeline.

    Steps: load the CSV inputs, build per-customer features, score k-means
    for 2 to 10 clusters with the Davies-Bouldin index, re-cluster with the
    lowest-scoring count, then write ``customerClusts.csv``,
    ``clustSummary.csv`` and ``clustAnalysis.png``.

    Returns early (after printing a message) when the input files cannot be
    loaded or when there are too few customers to evaluate any clustering.
    """
    print("Loading data...")
    # Only the transactions feed the features; loadData() returns all-None
    # on failure, so checking one slot is enough.
    cDf, tDf, _ = loadData()
    if cDf is None:
        return
    print("Preparing features...")
    fDf = prepareFeatures(tDf)

    # davies_bouldin_score requires 2 <= n_clusters <= n_samples - 1, and
    # KMeans rejects n_clusters > n_samples, so cap the candidate range for
    # small data sets instead of crashing part-way through the sweep.
    maxClust = min(10, len(fDf) - 1)
    if maxClust < 2:
        print("Not enough customers to evaluate clustering (need at least 3).")
        return
    nClustRange = range(2, maxClust + 1)

    dbScores = []
    print("\nEvaluating different numbers of clusters...")
    for n in nClustRange:
        # Cluster a copy so the trial labels never reach fDf.
        _, dbIdx, _ = performClustering(fDf.copy(), n)
        dbScores.append(dbIdx)
        print(f"Clusters: {n}, DB Index: {dbIdx:.3f}")
    # Lower Davies-Bouldin index means better-separated clusters.
    optClust = nClustRange[int(np.argmin(dbScores))]
    print(f"\nOptimal clusters: {optClust}")
    print("\nPerforming final clustering...")
    clustDf, finalDbIdx, _ = performClustering(fDf, optClust)
    print("\nAnalyzing clusters...")
    clustSummary = analyzeClusters(clustDf)
    print("Creating visualizations...")
    createVisualizations(clustDf)
    print("\nFinal DB Index:", finalDbIdx)
    print("\nCluster Summary:")
    print(clustSummary)
    clustDf.to_csv('customerClusts.csv', index=False)
    clustSummary.to_csv('clustSummary.csv')
    print("\nResults saved to:")
    print("- customerClusts.csv (all customers with their cluster assignments)")
    print("- clustSummary.csv (summary stats for each cluster)")
    print("- clustAnalysis.png (visualization plots)")
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Loading data...
Preparing features...

Evaluating different numbers of clusters...




Clusters: 2, DB Index: 1.073




Clusters: 3, DB Index: 1.227




Clusters: 4, DB Index: 1.104




Clusters: 5, DB Index: 1.045




Clusters: 6, DB Index: 1.075




Clusters: 7, DB Index: 1.110




Clusters: 8, DB Index: 1.092




Clusters: 9, DB Index: 1.023




Clusters: 10, DB Index: 1.040

Optimal clusters: 9

Performing final clustering...





Analyzing clusters...
Creating visualizations...

Final DB Index: 1.022951986409254

Cluster Summary:
         CustomerCount  AvgTrans  AvgSpend  AvgTransVal  AvgQty  \
Cluster                                                           
0                   29      6.97   4931.24       717.09   17.38   
1                   24      2.38   1657.32       715.89    5.75   
2                   19      5.74   5486.41       964.90   18.68   
3                   15      3.40   3736.68      1108.35    9.80   
4                   16      2.56    742.27       280.30    4.38   
5                   16      8.88   6875.24       782.19   24.75   
6                   15      3.40   2101.49       634.33   11.13   
7                   46      4.72   2961.49       628.34   11.41   
8                   19      6.84   3015.29       446.04   12.37   

         AvgQtyPerTrans  
Cluster                  
0                  2.51  
1                  2.41  
2                  3.26  
3                  2.89  
4  