# RFM clustering using Hierarchical Clustering

## Agglomerative

The principle is simple: at the start, every point is its own cluster; then, using a distance measure, the closest clusters are merged step by step into bigger clusters.

To do that, the algorithm considers two quantities:
* the intra-cluster variance;
* the inter-cluster variance.

As in K-Means, we need to set the number of clusters `n_clusters`; hierarchical clustering additionally requires a `linkage` criterion.

The `linkage` property determines which distance to use between sets of observations (i.e. between clusters) when deciding which clusters to merge.

In [18]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

# Load the per-customer RFM table (the path is relative to this notebook).
customers_csv = "./../../../output/rfmsppl.csv"
rfmsppl = pd.read_csv(customers_csv)

# The review score is only used here as the stratification key of the split.
y = rfmsppl["review_score"]

# Reproducible 80/20 split that keeps the review_score distribution in both
# parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(rfmsppl, y, **split_options)

# From here on the notebook only works on the 20% hold-out
# (presumably to keep hierarchical clustering tractable -- TODO confirm).
rfmsppl = X_test

rfmsppl.head()

Unnamed: 0,recency,frequency,monetary,nb_orders,first_order,last_order,days_between_orders,scoring_R,scoring_F,scoring_M,scoring_RFM,segment_RFM,review_score,review_completion_percentage,review_behavior,nb_products,customer_zip_code_prefix,customer_city,customer_state
86359,147.0,0.166667,143.56,1.0,2018-04-04 21:53:53,2018-04-04 21:53:53,0,4,4,4,12,Bons clients,5.0,33.333333,Best Contributor,3,6315,carapicuiba,SP
42247,174.0,0.142857,159.03,1.0,2018-03-08 16:13:44,2018-03-08 16:13:44,0,4,3,4,11,Bons clients,3.0,66.666667,Best Contributor,1,7841,franco da rocha,SP
70502,114.0,0.2,701.66,1.0,2018-05-07 22:52:58,2018-05-07 22:52:58,0,4,4,5,13,Très bons clients,5.0,100.0,Best Contributor,1,18045,sorocaba,SP
1832,167.0,0.142857,184.56,1.0,2018-03-16 09:17:05,2018-03-16 09:17:05,0,4,3,4,11,Bons clients,1.0,66.666667,Best Contributor,1,23071,rio de janeiro,RJ
18729,583.0,0.05,91.07,1.0,2017-01-24 09:24:11,2017-01-24 09:24:11,0,1,1,3,5,Clients tièdes,4.0,66.666667,Best Contributor,1,38735,cruzeiro da fortaleza,MG


We one-hot encode the categorical columns, then scale the data:

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Build the feature matrix used by the clustering cells below: the text /
# categorical columns are one-hot encoded, the remaining columns are kept
# as-is, and the combined matrix is standardized.
encoder = OneHotEncoder()

# Despite its name, `ordinal_vars` holds the object/category columns, which
# are one-hot encoded (i.e. treated as unordered categories).
# NOTE(review): the sample rows show free-text and timestamp columns
# (customer_city, first_order, last_order); one-hot encoding them yields a
# very wide matrix -- consider dropping or re-encoding them.
ordinal_vars = rfmsppl[rfmsppl.select_dtypes(include=["object", "category"]).columns]

# OneHotEncoder returns a SciPy sparse matrix by default. pandas does not
# expand a sparse matrix into one numeric column per indicator, so it must be
# converted to a dense array before being wrapped in a DataFrame.
encoded_customers = encoder.fit_transform(ordinal_vars).toarray()

scalar_vars = rfmsppl.select_dtypes(exclude=["object", "category"]).values

scaler = StandardScaler()

# Both frames carry a fresh 0..n-1 row index, so they align row by row;
# ignore_index=True relabels the columns 0..k-1 to avoid duplicate labels.
scalar_vars = pd.concat(
    [pd.DataFrame(scalar_vars), pd.DataFrame(encoded_customers)],
    axis=1,
    ignore_index=True,
)
scaled_customers = scaler.fit_transform(scalar_vars)

scaled_customers

In [19]:
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

# Grid search over the hierarchical clustering settings: every cluster count
# from 2 to 7 is combined with each linkage method / affinity pair, and the
# silhouette score of the resulting partition is recorded.
linkage_methods = ["ward", "single", "average", "complete"]
affinities = ["euclidean", "l1", "l2", "manhattan", "cosine"]

# "ward" is only evaluated with the euclidean affinity; every other linkage
# method is paired with all affinities.
settings = [
    (linkage_method, affinity)
    for linkage_method in linkage_methods
    for affinity in affinities
    if linkage_method != "ward" or affinity == "euclidean"
]

# NOTE(review): recent scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version.
report = []
for nb_clusters in range(2, 8):
    for linkage_method, affinity in settings:
        agg_clustering = AgglomerativeClustering(
            n_clusters=nb_clusters,
            linkage=linkage_method,
            affinity=affinity,
            compute_full_tree=False,
        )
        agg_clustering.fit(scaled_customers)

        # silhouette_score is called without a metric argument, so the same
        # default metric is used whatever the affinity being evaluated.
        score = metrics.silhouette_score(scaled_customers, agg_clustering.labels_)
        report.append(
            {
                "nb_clusters": nb_clusters,
                "silhouette_score": score,
                "linkage_method": linkage_method,
                "affinity": affinity,
            }
        )

report = pd.DataFrame(report)
print(report)

# One figure per affinity, with one line per linkage method.
title_template = "Silhouette score versus nb of clusters ({0})"
for affinity in affinities:
    report_affinity = report.loc[report["affinity"] == affinity]
    fig = px.line(
        report_affinity,
        x="nb_clusters",
        y="silhouette_score",
        color="linkage_method",
        title=title_template.format(affinity),
    )

    fig.show()

ERROR! Session/line number was not unique in database. History logging moved to new session 370



KeyboardInterrupt



Let's display the dendrogram to confirm that the optimal number of clusters is **4** when using the `average` linkage method with the `l2` affinity.


In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Average-linkage merge tree of the scaled customers. No metric is passed, so
# SciPy's default (euclidean) distance is used.
Z = linkage(scaled_customers, "average")

# Horizontal dendrogram, truncated to the top 4 levels of the tree to keep
# the figure readable.
fig = plt.figure(figsize=(16, 8))
dendrogram(Z, p=4, truncate_mode="level", orientation="left")

plt.show()

In [None]:
# Fit the retained model and attach its labels to the customer table.
# NOTE(review): the surrounding text argues for 4 clusters with average
# linkage, whereas this cell fits 5 clusters with Ward linkage -- confirm
# which configuration is intended.
final_params = {
    "n_clusters": 5,
    "linkage": "ward",
    "affinity": "euclidean",
    "compute_full_tree": False,
}
agg_clustering = AgglomerativeClustering(**final_params).fit(scaled_customers)

# Store the labels as a categorical column so that plots treat the clusters as
# discrete groups rather than as a numeric scale.
rfmsppl["cluster"] = pd.Series(agg_clustering.labels_, index=rfmsppl.index).astype(
    "category"
)

print(rfmsppl["cluster"].value_counts())

#### Clustering visualisation

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

# Project the standardized features to 2-D with t-SNE to eyeball the clusters.
# t-SNE with init="random" is stochastic: without a seed the figure changes on
# every run, so the seed is pinned (42, the same value used for the
# train/test split) to make the visualisation reproducible.
tsne = TSNE(
    n_components=2,
    learning_rate="auto",
    init="random",
    n_jobs=-1,
    random_state=42,
)
projected_customers = tsne.fit_transform(scaled_customers)

# Columns 0 and 1 of the projection are the two t-SNE components; the points
# are colored by the cluster label stored on the customer table.
fig = px.scatter(
    projected_customers,
    x=0,
    y=1,
    color=rfmsppl.cluster,
    labels={"color": "cluster"},
    opacity=0.8,
)
fig.show()

In [None]:
# 3-D scatter of the customers on the recency / monetary / frequency axes,
# colored by the "cluster" column. Agglomerative clustering assigns every
# point to a cluster, so there is no -1 "noise" label to look for here.
fig2 = px.scatter_3d(
    rfmsppl,
    x="recency",
    y="monetary",
    z="frequency",
    color="cluster",
)
fig2.show()

### Conclusions

Hierarchical clustering favors 4 clusters to describe the dataset: this setting gives the best silhouette score, but the resulting clustering quality is still really poor.

We can't use it as the best model.