# Comparison of obtained models

In [None]:
import pandas as pd

# Load the clustered RFM table, shift the k-means cluster labels by one and
# store them as a categorical column; the frame is displayed as the cell output.
rfm = pd.read_csv('./../data/clustered/rfm.csv').assign(
    kmeans_cluster=lambda frame: (frame['kmeans_cluster'] + 1).astype('category')
)
rfm

In [None]:
# Load the clustered RFMV table, shift the k-means cluster labels by one and
# store them as a categorical column; the frame is displayed as the cell output.
rfmv = pd.read_csv('./../data/clustered/rfmv.csv').assign(
    kmeans_cluster=lambda frame: (frame['kmeans_cluster'] + 1).astype('category')
)

rfmv

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Shared accumulator: the experiment cells below append one metrics dict per
# clustering run (for both models), and it is later turned into a DataFrame.
experiments = []

## RFM : 100 experiments

In [None]:
# Evaluate the RFM modelisation on 100 stratified 10% subsamples of the customers.
# Each subsample is standardised, clustered with k-means (5 clusters) and scored
# with three internal validity metrics.
y = rfm['kmeans_cluster']

# Discard RFM records left by a previous run of this cell so that re-running it
# does not duplicate rows in `experiments` (in-place, the list object is shared).
experiments[:] = [record for record in experiments if record['model'] != 'RFM']

for i in range(100):
    # Only the held-out 10% is clustered. The experiment number doubles as the
    # seed: every subsample differs, but the whole run is reproducible.
    _rest, sample = train_test_split(
        rfm, test_size=0.1, stratify=y, random_state=i
    )

    scaler = StandardScaler()
    scaled_customers = scaler.fit_transform(sample[['recency', 'frequency', 'monetary']])

    # n_init is pinned so the result does not depend on the installed
    # scikit-learn's default; random_state makes the centroid seeding repeatable.
    kmeans = KMeans(n_clusters=5, n_init=10, random_state=i)
    kmeans.fit(scaled_customers)
    labels = kmeans.labels_

    # NOTE(review): the metrics are rounded to 2 decimals before the statistical
    # tests further down; consider keeping full precision — confirm intent.
    experiments.append({
        'model': 'RFM',
        'experiment_no': i,
        'silouhette_score': round(silhouette_score(scaled_customers, labels), 2),
        'calinski_harabasz_score': round(calinski_harabasz_score(scaled_customers, labels), 2),
        'davies_bouldin_score': round(davies_bouldin_score(scaled_customers, labels), 2)
    })

## RFMV : 100 experiments

In [80]:
# Evaluate the RFMV modelisation on 100 stratified 10% subsamples of the customers.
# Each subsample is standardised, clustered with k-means (4 clusters) and scored
# with three internal validity metrics.
y = rfmv['kmeans_cluster']

# Discard RFMV records left by a previous run of this cell so that re-running it
# does not duplicate rows in `experiments` (in-place, the list object is shared).
experiments[:] = [record for record in experiments if record['model'] != 'RFMV']

for i in range(100):
    # Only the held-out 10% is clustered. The experiment number doubles as the
    # seed: every subsample differs, but the whole run is reproducible.
    _rest, sample = train_test_split(
        rfmv, test_size=0.1, stratify=y, random_state=i
    )

    scaler = StandardScaler()
    scaled_customers = scaler.fit_transform(sample[['recency', 'frequency', 'monetary', 'variety']])

    # n_init is pinned so the result does not depend on the installed
    # scikit-learn's default; random_state makes the centroid seeding repeatable.
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=i)
    kmeans.fit(scaled_customers)
    labels = kmeans.labels_

    # NOTE(review): the metrics are rounded to 2 decimals before the statistical
    # tests further down; consider keeping full precision — confirm intent.
    experiments.append({
        'model': 'RFMV',
        'experiment_no': i,
        'silouhette_score': round(silhouette_score(scaled_customers, labels), 2),
        'calinski_harabasz_score': round(calinski_harabasz_score(scaled_customers, labels), 2),
        'davies_bouldin_score': round(davies_bouldin_score(scaled_customers, labels), 2)
    })

In [81]:
# Track results: build one row per recorded experiment dict, with one column
# per dict key (model, experiment_no and the three scores).
report = pd.DataFrame(experiments)


## Statistics review


### Looking for a gaussian variable

* H<sub>0</sub>: our variables are gaussian
* H<sub>1</sub>: our variables are not gaussian

In [82]:
import scipy.stats as st

# Risk threshold used to decide whether normality (H0) is rejected.
alpha = 0.01

# Run scipy's normality test on each metric column of `report` and print the
# verdict. `report` holds the rows of both models, so each test is run on the
# pooled RFM + RFMV values.
for var in ['silouhette_score', 'calinski_harabasz_score', 'davies_bouldin_score']:
    k2, p = st.normaltest(report[var])
    
    # H0 (gaussian) is kept only when the p-value exceeds the risk threshold.
    if (p > alpha):
        print(f"The variable '{var}' follow the normal law. (p-value = {p:.4f})")
    else:
        print(f"The variable '{var}' doesn't follow the normal law. (p-value = {p:.4f})")


The variable 'silouhette_score' doesn't follow the normal law. (p-value = 0.0000)
The variable 'calinski_harabasz_score' doesn't follow the normal law. (p-value = 0.0025)
The variable 'davies_bouldin_score' doesn't follow the normal law. (p-value = 0.0000)


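The normality test above rejects H<sub>0</sub> for all three variables at the 1% level, so none of them, the silhouette score included, can be considered gaussian when the RFM and RFMV experiments are pooled together. Student and Bartlett tests both assume gaussian samples, so their results below should be read as indicative only
when checking whether the distribution of the silhouette score differs enough between the two modelisation methods.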

To assess the quality of our groups, we must prove that they are statistically different.

For this, we will use two statistical tests:

* Bartlett's test which is a variance adequacy test;
* Student's test which is a test of adequacy on the average;

For each test, the null hypothesis H<sub>0</sub> is that the statistical property under test (the variance for Bartlett, the mean for Student) is the same in both groups; if that property varies significantly from one group to the other, the hypothesis is rejected.

We set our risk threshold α at 1%.

In [83]:
# Split the silhouette scores by model, then compare their variances with
# Bartlett's test (the result is the cell output).
# NOTE(review): this rebinds `rfm` and `rfmv`, which held the loaded DataFrames;
# the experiment cells cannot be re-run after this cell without reloading them.
is_rfm = report['model'] == 'RFM'
is_rfmv = report['model'] == 'RFMV'

rfm = report.loc[is_rfm, 'silouhette_score']
rfmv = report.loc[is_rfmv, 'silouhette_score']

st.bartlett(rfm, rfmv)

BartlettResult(statistic=0.43876470124869965, pvalue=0.5077192848096231)

The H<sub>0</sub> hypothesis on the variance is not rejected since the p-value (≈ 0.51) is greater than 0.01: the variances of the two groups are not significantly different.

In [84]:
# Two-sample Student t-test on the silhouette scores of the two models,
# run under the equal-variance assumption (equal_var=True).
st.ttest_ind(rfm, rfmv, equal_var=True)

Ttest_indResult(statistic=53.43939773958197, pvalue=1.3757318836977976e-119)

The H<sub>0</sub> hypothesis on the mean is rejected since the p-value is less than 0.01.

### Findings

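The two groups are statistically different regarding the silhouette score. Since the t statistic (RFM compared to RFMV) is positive, the mean silhouette score is higher for RFM, so we can assert that the RFMV modelisation produces a less performant clustering than the RFM modelisation.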

In [85]:
# Persist the per-experiment metrics as CSV, without writing the DataFrame index.
report.to_csv('./../data/metrics/report.csv',index=False)