# RFM clustering using Hierarchical Clustering

## Agglomerative

The principle is simple : at start all points are clusters and then using distance we try step by step to group clusters in bigger clusters.

To do that, the algorithm consider two data:
* the intra cluster variance ;
* the inter cluster variance ;

Like in K-Means, we need to define the number of clusters `n_clusters` and the `linkage` property.

The `linkage` property determines which distance to use between observations.

In [33]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

rfmsppl = pd.read_csv("./../../../output/rfmsppl.csv")

y = rfmsppl["review_score"]

X_train, X_test, y_train, y_test = train_test_split(
    rfmsppl, y, test_size=0.2, random_state=42, stratify=y
)
X = X_test[['recency', 'frequency', 'monetary', 'scoring_RFM', 'review_score', 'payment_installments', 'payment_sequential', 'payment_mean_boleto_value', 'payment_mean_voucher_value', 'payment_mean_debit_card_value', 'payment_mean_credit_card_value', 'nb_products']]

X.head()

Unnamed: 0,recency,frequency,monetary,scoring_RFM,review_score,payment_installments,payment_sequential,payment_mean_boleto_value,payment_mean_voucher_value,payment_mean_debit_card_value,payment_mean_credit_card_value,nb_products
43517,450.0,0.0625,29.5,3,4.0,1.0,1.0,0.0,0.0,0.0,29.5,1
52729,34.0,0.5,72.07,12,5.0,1.0,1.0,0.0,0.0,0.0,72.07,1
58266,468.0,0.058824,194.91,6,3.0,1.0,1.0,0.0,0.0,0.0,194.910004,1
7281,521.0,0.055556,47.95,3,5.0,4.0,1.0,0.0,0.0,0.0,47.950001,1
19387,225.0,0.125,39.84,7,3.0,1.5,1.5,0.0,0.0,0.0,19.92,1


We scale the data:

In [34]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scalar_vars = X[
    X.select_dtypes(exclude=["object", "category"]).columns
]

scaled_customers = scaler.fit_transform(scalar_vars)

scaled_customers

array([[ 1.41751156, -0.69331937, -0.59786197, ..., -0.05569846,
        -0.48407048, -0.20870784],
       [-1.33051672,  1.69928597, -0.38657031, ..., -0.05569846,
        -0.22119738, -0.20870784],
       [ 1.53641663, -0.7134253 ,  0.22313295, ..., -0.05569846,
         0.53734931, -0.20870784],
       ...,
       [-0.28679444, -0.25386124, -0.07749979, ..., -0.05569846,
        -0.6662353 , -0.20870784],
       [-0.17449521, -0.3515186 , -0.53040945, ..., -0.05569846,
        -0.6662353 , -0.20870784],
       [-0.84168477,  0.05864231,  0.01869077, ..., -0.05569846,
         0.2829978 , -0.20870784]])

In [35]:

#
# report = []
# linkage_methods = ["ward", "single", "average", "complete"]
# affinities = ["euclidean", "l1", "l2", "manhattan", "cosine"]
# for nb_clusters in range(2, 8):
#     for linkage_method in linkage_methods:
#         for affinity in affinities:
#             if linkage_method == "ward" and affinity != "euclidean":
#                 continue
#             agg_clustering = AgglomerativeClustering(
#                 n_clusters=nb_clusters,
#                 linkage=linkage_method,
#                 affinity=affinity,
#                 compute_full_tree=False,
#             ).fit(scaled_customers)
#
#             report.append(
#                 {
#                     "nb_clusters": nb_clusters,
#                     "silhouette_score": metrics.silhouette_score(
#                         scaled_customers, agg_clustering.labels_
#                     ),
#                     "linkage_method": linkage_method,
#                     "affinity": affinity,
#                 }
#             )
#
# report = pd.DataFrame(report)
# print(report)
# for affinity in affinities:
#     report_affinity = report[report.affinity == affinity]
#     fig = px.line(
#         report_affinity,
#         title="Silhouette score versus nb of clusters ({0})".format(affinity),
#         x="nb_clusters",
#         y="silhouette_score",
#         color="linkage_method",
#     )
#
#     fig.show()

Let's display the dendrogram to confirm that the optimal number of clusters is **4** for average method and l2 as affinity parameter.


In [36]:
# from scipy.cluster.hierarchy import linkage, dendrogram
# import matplotlib.pyplot as plt
#
# Z = linkage(scaled_customers, method="ward")
#
# fig = plt.figure(figsize=(16, 8))
# dendrogram(Z, orientation="left", truncate_mode="level", p=4)
#
# plt.show()

In [37]:
agg_clustering = AgglomerativeClustering(
    n_clusters=5, linkage="ward", affinity="euclidean", compute_full_tree=False
).fit(scaled_customers)

X["cluster"] = agg_clustering.labels_
X["cluster"] = X["cluster"].astype("category")

print(X["cluster"].value_counts())

0    9569
3    5971
1    1474
2     611
4       3
Name: cluster, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### Clustering visualisation

In [38]:
from sklearn.manifold import TSNE
import plotly.express as px

tsne = TSNE(n_components=2, learning_rate="auto", init="random", n_jobs=-1)
projected_customers = tsne.fit_transform(scaled_customers)

fig = px.scatter(
    projected_customers,
    x=0,
    y=1,
    color=X.cluster,
    labels={"color": "cluster"},
    opacity=0.8,
)
fig.show()

In [39]:
# cluster for value equals to -1 is noise
fig2 = px.scatter_3d(X, x="recency", y="monetary", z="frequency", color="cluster")
fig2.show()

### Description of the clusters

In [43]:
report = X.groupby('cluster').agg({
    'monetary': 'mean',
    'frequency': 'mean',
    'recency': 'mean',
    'review_score': 'mean',
    'nb_products': 'mean',
    'payment_mean_credit_card_value': 'mean',
    'payment_mean_debit_card_value': 'mean',
    'payment_mean_voucher_value': 'mean',
    'payment_mean_boleto_value': 'mean',
    'payment_installments': 'mean',
    'payment_sequential': 'mean',
    'scoring_RFM': ['min', 'mean', 'max']
}).reset_index()

report['members'] = 0
report['members'] = X["cluster"].value_counts()

report.to_csv('./../../../output/hac_report.csv')

### Conclusions

The hierarchical clustering is in favor of 5 clusters to describe the dataset and have the best performance silhouette score but a really poor clustering performance.

We can't use it as the best model.