# In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt


# Load the raw customer and transaction tables from the working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Integer codes for the four known regions. The codes are arbitrary labels:
# their order and spacing carry no meaning beyond telling regions apart.
region_mapping = {
    'South America': 1,
    'Asia': 2,
    'North America': 3,
    'Europe': 4,
}
customers['Regions'] = customers['Region'].map(region_mapping)

# Series.map produces NaN for any value that is not a key of the mapping.
# Fail here with a clear message instead of letting NaN reach the clustering
# steps further down.
unmapped_regions = customers.loc[customers['Regions'].isna(), 'Region'].unique()
if len(unmapped_regions) > 0:
    raise ValueError(
        f'Region values without a numeric code in region_mapping: {list(unmapped_regions)}'
    )

# Join on the customer key explicitly. Without `on`, pandas joins on every
# column name the two frames happen to share, so an unrelated shared column
# would silently change which rows match.
merged_df = pd.merge(transactions, customers, on='CustomerID')

# One row per customer with the number of transactions they made.
transaction_count = (
    merged_df.groupby('CustomerID')['TransactionID']
    .count()
    .reset_index(name='TransactionCount')
)

# Standardise the single feature (zero mean, unit variance) before clustering.
scaler = StandardScaler()
transaction_count_normalized = scaler.fit_transform(transaction_count[['TransactionCount']])


# Scan k = 2..10 on the standardised transaction counts and keep the
# Davies-Bouldin index of each clustering (lower is better).
db_scores_transaction = []
cluster_range = range(2, 11)

for n_clusters in cluster_range:
    # Fixed seed and n_init (the same settings the total-transaction-value
    # model uses) so the partition scored here is the one the final model
    # below reproduces; an unseeded refit could yield different clusters than
    # the ones whose score is reported.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(transaction_count_normalized)
    db_index = davies_bouldin_score(transaction_count_normalized, cluster_labels)
    db_scores_transaction.append(db_index)

# The first occurrence of the minimum wins, so ties resolve to the smaller k.
best_db_index_transaction = min(db_scores_transaction)
best_n_clusters_transaction = cluster_range[db_scores_transaction.index(best_db_index_transaction)]
print(f'perfect no. of clusters for transaction count: {best_n_clusters_transaction}')
print(f'lowest DB Index score for transaction count: {best_db_index_transaction}')


# Refit with the winning k (same seed as the scan) and label every customer.
kmeans_transaction = KMeans(n_clusters=best_n_clusters_transaction, random_state=42, n_init=10)
transaction_count['Cluster'] = kmeans_transaction.fit_predict(transaction_count_normalized)


# Take an explicit copy: the 'cluster' column is added to this frame below,
# and assigning into a slice of `customers` triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
region_data = customers[['CustomerID', 'Regions']].copy()
scaler_region = StandardScaler()
region_normalized = scaler_region.fit_transform(region_data[['Regions']])

# Davies-Bouldin index for each candidate k (lower is better).
# NOTE(review): 'Regions' is an arbitrary integer code with only as many
# distinct values as there are mapped regions, so candidate k values above
# that count cannot produce additional clusters - confirm that clustering a
# nominal code this way is intended. The full candidate range is kept so
# db_scores_region stays aligned with cluster_range.
db_scores_region = []

for n_clusters in cluster_range:
    # Fixed seed and n_init so the scored partition and the final model below
    # agree, matching the settings of the total-transaction-value model.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(region_normalized)
    db_index = davies_bouldin_score(region_normalized, cluster_labels)
    db_scores_region.append(db_index)

# The first occurrence of the minimum wins, so ties resolve to the smaller k.
best_db_index_region = min(db_scores_region)
best_n_clusters_region = cluster_range[db_scores_region.index(best_db_index_region)]
print(f'perfect no. of clusters for Region: {best_n_clusters_region}')
print(f'best DB Index score for Region: {best_db_index_region}')

# Refit with the winning k (same seed as the scan) and label every customer.
kmeans_region = KMeans(n_clusters=best_n_clusters_region, random_state=42, n_init=10)
region_data['cluster'] = kmeans_region.fit_predict(region_normalized)

# Account age in whole days, measured from the moment the script runs.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['DaysSinceSignup'] = (pd.to_datetime('today') - customers['SignupDate']).dt.days

scaler_signup = StandardScaler()
days_normalized = scaler_signup.fit_transform(customers[['DaysSinceSignup']])

# Davies-Bouldin index for each candidate k (lower is better).
db_scores_signup = []

for n_clusters in cluster_range:
    # Fixed seed and n_init so the scored partition and the final model below
    # agree, matching the settings of the total-transaction-value model.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(days_normalized)
    db_index = davies_bouldin_score(days_normalized, cluster_labels)
    db_scores_signup.append(db_index)

# The first occurrence of the minimum wins, so ties resolve to the smaller k.
best_db_index_signup = min(db_scores_signup)
best_n_clusters_signup = cluster_range[db_scores_signup.index(best_db_index_signup)]
print(f'appropriate no. of clusters for SignupDate: {best_n_clusters_signup}')
print(f' DB Index for SignupDate metric: {best_db_index_signup}')

# Refit with the winning k (same seed as the scan) and label every customer.
kmeans_signup = KMeans(n_clusters=best_n_clusters_signup, random_state=42, n_init=10)
customers['SignupCluster'] = kmeans_signup.fit_predict(days_normalized)

# One row per customer with the sum of all their transaction values.
total_value = merged_df.groupby('CustomerID')['TotalValue'].sum().reset_index(name='TotalTransactionValue')

scaler_value = StandardScaler()
total_value_normalized = scaler_value.fit_transform(total_value[['TotalTransactionValue']])

# Davies-Bouldin index for each candidate k (lower is better).
db_scores_value = []

for n_clusters in cluster_range:
    # Use the same random_state / n_init as the final model below. Scoring
    # unseeded default models and then fitting a differently configured one
    # means the reported index would not describe the clusters that are
    # actually assigned.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(total_value_normalized)
    db_index = davies_bouldin_score(total_value_normalized, cluster_labels)
    db_scores_value.append(db_index)

# The first occurrence of the minimum wins, so ties resolve to the smaller k.
best_db_index_value = min(db_scores_value)
best_n_clusters_value = cluster_range[db_scores_value.index(best_db_index_value)]

print(f'best no. of clusters for Total Transaction Value: {best_n_clusters_value}')
print(f'best DB Index score for Total Transaction Value: {best_db_index_value}')

# Refit with the winning k and label every customer.
kmeans_best_value = KMeans(n_clusters=best_n_clusters_value, random_state=42, n_init=10)
total_value['Cluster'] = kmeans_best_value.fit_predict(total_value_normalized)

def _strip_plot(figsize, x_values, cluster_ids, title, xlabel, ylabel=None):
    """Show a one-dimensional scatter of x_values coloured by cluster id.

    Every point is drawn at y = 0; the colour (explained by a colourbar
    labelled 'cluster') is what distinguishes the clusters. The y-axis label
    is only set when `ylabel` is given.
    """
    plt.figure(figsize=figsize)
    plt.scatter(x_values, [0] * len(x_values), c=cluster_ids)
    plt.title(title)
    plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.colorbar(label='cluster')
    plt.grid(True)
    plt.show()


# Transaction-count clusters.
_strip_plot(
    (10, 8),
    transaction_count['TransactionCount'],
    transaction_count['Cluster'],
    'Transaction count clusters',
    'transaction frequency',
    ylabel='Cluster',
)

# Region clusters.
_strip_plot(
    (10, 6),
    region_data['Regions'],
    region_data['cluster'],
    'Region clusters',
    'Region',
    ylabel='cluster',
)

# Signup-date clusters (this figure has no y-axis label).
_strip_plot(
    (6, 6),
    customers['DaysSinceSignup'],
    customers['SignupCluster'],
    'Customer segmentation on the basis of Signup Date',
    'Days since Signup',
)

# Total-transaction-value clusters: one scatter series per cluster id so each
# cluster gets its own legend entry instead of a shared colourbar.
plt.figure(figsize=(10, 6))
for cluster_id in range(best_n_clusters_value):
    in_cluster = total_value['Cluster'] == cluster_id
    cluster_data = total_value[in_cluster]
    plt.scatter(
        cluster_data['TotalTransactionValue'],
        [0] * len(cluster_data),
        label=f'Cluster {cluster_id}',
    )

plt.title(f'Customer clusters based on total transaction value (K={best_n_clusters_value})')
plt.xlabel('Total Transaction Value')
plt.legend()
plt.grid(True)
plt.show()
