In [23]:
from sklearn.cluster import MiniBatchKMeans,AgglomerativeClustering,KMeans
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage,dendrogram
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score

In [11]:
rfm_df=pd.read_csv(r'..\Datasets\ecom_data_rfm.csv',index_col=0)
rfm_df.head()

Unnamed: 0,CustomerID,Frequency,Recency,Monetary,rankR,rankF,rankM,groupRFM,Country,Customer_Segment
1,12346,2,358,2.08,2,1,1,211,United Kingdom,Lost Lowest
2,12347,182,35,481.21,5,4,3,543,Iceland,Loyal Customers
3,12348,31,108,178.71,5,1,2,512,Finland,Potential Loyalist
4,12349,73,51,605.1,5,2,4,524,Italy,Recent High Spender
5,12350,17,343,65.3,2,1,1,211,Norway,Lost Lowest


In [12]:
rfm_df.shape

(4380, 10)

In [13]:
rfm_df.drop(columns=['CustomerID','rankR','rankF','rankM','groupRFM','Country','Customer_Segment'],axis=1, inplace=True)
rfm_df.head()

Unnamed: 0,Frequency,Recency,Monetary
1,2,358,2.08
2,182,35,481.21
3,31,108,178.71
4,73,51,605.1
5,17,343,65.3


In [14]:
scaler=StandardScaler().set_output(transform='pandas')
rfm_scaled=scaler.fit_transform(rfm_df)
rfm_scaled.head()

Unnamed: 0,Frequency,Recency,Monetary
1,-0.392218,2.318901,-0.249443
2,0.382791,-0.888861,0.123841
3,-0.267356,-0.163887,-0.111833
4,-0.08652,-0.729962,0.220362
5,-0.327634,2.169934,-0.20019


### Using MiniBatchKMeans

In [18]:
ks=[2,3,4,5,6,7]
scores=[]
for i in ks:
    kmean=MiniBatchKMeans(n_clusters=i)
    kmean.fit(rfm_scaled)
    scores.append([i,silhouette_score(rfm_scaled,kmean.labels_)])
score_df=pd.DataFrame(scores,columns=['clusters','score'])
score_df.sort_values(by='score',ascending=False,inplace=True)
score_df.head()

Unnamed: 0,clusters,score
0,2,0.598302
2,4,0.51238
3,5,0.466254
1,3,0.427497
4,6,0.415648


### Agglomerative clustering

In [21]:
ks=[2,3,4,5,6,7]
linkage=['ward','single','complete','average']
scores=[]
for i in ks:
    for link in linkage:
        clust=AgglomerativeClustering(n_clusters=i,linkage=link)
        clust.fit(rfm_scaled)
        scores.append([i,link,silhouette_score(rfm_scaled,clust.labels_)])
score_df=pd.DataFrame(scores,columns=['clusters','linkage','score'])
score_df.sort_values(by='score',ascending=False,inplace=True)
score_df.head()

Unnamed: 0,clusters,linkage,score
1,2,single,0.961413
3,2,average,0.961413
5,3,single,0.960839
2,2,complete,0.957642
6,3,complete,0.954639


### Using Kmeans

In [24]:
ks=[2,3,4,5,6,7]
scores=[]
for i in ks:
    kmean=KMeans(n_clusters=i)
    kmean.fit(rfm_scaled)
    scores.append([i,silhouette_score(rfm_scaled,kmean.labels_)])
score_df=pd.DataFrame(scores,columns=['clusters','score'])
score_df.sort_values(by='score',ascending=False,inplace=True)
score_df.head()

Unnamed: 0,clusters,score
0,2,0.951687
3,5,0.622155
2,4,0.61762
1,3,0.59982
5,7,0.532806


### Finding the best parameter for clustering

In [25]:
clust=AgglomerativeClustering(n_clusters=2,linkage='single')
clust.fit(rfm_scaled)

In [26]:
df1=rfm_df.copy()
df1['Cluster']=clust.labels_
df1.groupby('Cluster').mean()

Unnamed: 0_level_0,Frequency,Recency,Monetary
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6338.0,35.0,30923.39
1,88.814485,124.563628,301.279868
