In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from tqdm import tqdm
import seaborn as sns
import os

In [3]:
# Switch to the folder holding the case data so the relative CSV path used
# when loading the data resolves. The original machine-specific location stays
# the default; set the RFM_DATA_DIR environment variable to run this notebook
# on another machine without editing the cell.
os.chdir(
    os.environ.get(
        "RFM_DATA_DIR",
        "D:\\meridianthe4\\PML\\Cases\\Recency Frequency Monetary",
    )
)

In [8]:
# Load the customer RFM table from the working directory; the first CSV
# column is used as the row index.
rfm = pd.read_csv(
    "rfm_data_customer.csv",
    index_col=0,
)

In [9]:
# Remove 'most_recent_visit' before scaling and clustering.
# `rfm` is rebound in place, so a plain drop raises KeyError when this cell is
# executed a second time (the column is already gone). errors="ignore" keeps
# the cell idempotent: re-running it leaves `rfm` unchanged.
rfm = rfm.drop(columns="most_recent_visit", errors="ignore")

In [11]:
# Standardize every column of `rfm`. set_output(transform="pandas") makes the
# scaler return a DataFrame instead of a bare array.
scaler = StandardScaler()
scaler.set_output(transform="pandas")
df_scaled = scaler.fit_transform(rfm)

In [12]:
# Baseline: 4-cluster KMeans on the standardized features (fixed seed), with
# its silhouette score shown as the cell output.
km = KMeans(n_clusters=4, random_state=25).fit(df_scaled)
silhouette_score(df_scaled, labels=km.labels_)

0.3232148393081134

In [15]:
# Sweep k = 2..8 for KMeans and rank the candidates by silhouette score.
clusters = list(range(2, 9))
scores = []
for k in tqdm(clusters):
    # Same seed for every k so the runs differ only in the cluster count.
    km = KMeans(n_clusters=k, random_state=25).fit(df_scaled)
    scores.append([k, silhouette_score(df_scaled, labels=km.labels_)])

# One row per k, best silhouette first.
df_scores = pd.DataFrame(scores, columns=["Clusters", "Score"])
df_scores.sort_values(by="Score", ascending=False)

100%|██████████| 7/7 [01:57<00:00, 16.82s/it]


Unnamed: 0,Clusters,Score
1,3,0.370883
0,2,0.361973
3,5,0.32546
2,4,0.323215
4,6,0.288045
5,7,0.286497
6,8,0.26761


In [17]:
# Refit KMeans with k=3 and profile each cluster on the original (unscaled)
# columns so the per-cluster means are in the data's own units.
km = KMeans(n_clusters=3, random_state=25).fit(df_scaled)
# assign() returns a new frame, so `rfm` itself is left untouched.
df_cluster = rfm.assign(Cluster=km.labels_)
df_cluster.groupby('Cluster').mean()

Unnamed: 0_level_0,revenue,number_of_orders,recency_days
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,749.731434,8.231623,135.921294
1,1281.638913,12.91372,133.277357
2,692.56445,7.354138,416.185286


In [22]:
# Grid search over DBSCAN's eps / min_samples, scored by the silhouette of the
# points that were actually assigned to a cluster.
eps_values = [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
min_samples_values = [2, 3, 4, 5, 7]
scores = []
for eps in eps_values:
    # One progress bar per eps value, ticking once per min_samples setting.
    for min_samples in tqdm(min_samples_values):
        clust = DBSCAN(eps=eps, min_samples=min_samples).fit(df_scaled)
        # Rows labelled -1 are excluded from the score.
        clustered = clust.labels_ != -1
        kept_labels = clust.labels_[clustered]
        if len(set(kept_labels)) < 2:
            # Fewer than two clusters remain: record the combination with no score.
            scores.append([eps, min_samples, None])
            continue
        score = silhouette_score(df_scaled[clustered], kept_labels)
        scores.append([eps, min_samples, score])

# One row per (eps, min_samples) pair, best silhouette first.
df_scores = pd.DataFrame(scores, columns=["EPS", "Min Samples", "Score"])
df_scores.sort_values(by="Score", ascending=False)

100%|██████████| 5/5 [01:12<00:00, 14.50s/it]
100%|██████████| 5/5 [01:21<00:00, 16.21s/it]
100%|██████████| 5/5 [01:23<00:00, 16.79s/it]
100%|██████████| 5/5 [00:13<00:00,  2.77s/it]
100%|██████████| 5/5 [00:24<00:00,  4.92s/it]
100%|██████████| 5/5 [00:35<00:00,  7.19s/it]


Unnamed: 0,EPS,Min Samples,Score
14,0.3,7,-0.014926
12,0.3,4,-0.027591
13,0.3,5,-0.027708
11,0.3,3,-0.051813
9,0.2,7,-0.072156
8,0.2,5,-0.090718
7,0.2,4,-0.09688
10,0.3,2,-0.111193
6,0.2,3,-0.129414
5,0.2,2,-0.167543


In [18]:
# Fit DBSCAN with eps=0.3, min_samples=7 and profile each resulting label on
# the original (unscaled) columns. Label -1 appears as its own group in the
# summary.
clust = DBSCAN(eps=0.3, min_samples=7).fit(df_scaled)
# assign() returns a new frame, so `rfm` itself is left untouched.
df_cluster = rfm.assign(Cluster=clust.labels_)
df_cluster.groupby('Cluster').mean()

Unnamed: 0_level_0,revenue,number_of_orders,recency_days
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,1216.65808,12.255269,382.702576
0,858.85222,9.0,182.156153
1,1527.831373,16.0,117.385621
2,479.377488,5.0,261.477612
3,1142.161753,12.0,147.624966
4,1053.807184,11.0,154.830101
5,1602.8,17.0,108.509333
6,951.390926,10.0,166.769091
7,1340.022537,14.0,131.083333
8,1232.222141,13.0,142.095725
