In [3]:
import pandas as pd

# Read the CSV and drop the CUST_ID and TENURE columns in one chained
# expression, so the cell yields the same frame every time it is re-run.
df = pd.read_csv('data.csv').drop(columns=['CUST_ID', 'TENURE'])
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0


In [4]:
# Tally the missing entries in every column to see which ones need imputing.
missing = df.isnull().sum()
print(missing)

BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
dtype: int64


In [7]:
# Fill each missing entry with the median of its own column; rebinding `df`
# (rather than mutating in place) keeps the step explicit.
df = df.fillna(value=df.median())

# Re-count the missing entries to confirm the imputation left none behind.
missing = df.isna().sum()
print(missing)

BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
dtype: int64


In [9]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales every row independently to unit norm; 'l2' is its
# default norm and is spelled out here only for readability.
values = Normalizer(norm='l2').fit_transform(df.to_numpy())
print(values)

[[3.93555441e-02 7.87271593e-04 9.17958473e-02 ... 1.94178127e-01
  1.34239194e-01 0.00000000e+00]
 [2.93875903e-01 8.34231560e-05 0.00000000e+00 ... 3.76516684e-01
  9.84037959e-02 2.03923046e-05]
 [3.10798149e-01 1.24560965e-04 9.63068011e-02 ... 7.74852335e-02
  7.81351982e-02 0.00000000e+00]
 ...
 [2.27733092e-02 8.11060955e-04 1.40540698e-01 ... 7.90986945e-02
  8.02156174e-02 2.43318384e-04]
 [2.65257948e-02 1.64255731e-03 0.00000000e+00 ... 1.03579625e-01
  1.09898221e-01 4.92767391e-04]
 [1.86406219e-01 3.33426837e-04 5.46778061e-01 ... 3.15915455e-02
  4.41568390e-02 0.00000000e+00]]


In [10]:
from sklearn.cluster import KMeans

# Partition the normalized rows into 5 clusters.
# K-Means starts from randomly chosen centroids, so without a seed the cluster
# assignment -- and every validation metric computed from it further down --
# can change between runs. Fixing `random_state` makes the notebook
# reproducible under Restart & Run All.
kmeans = KMeans(
    n_clusters=5,
    n_init=10,        # number of independent initialisations; the best one is kept
    max_iter=300,     # iteration cap for each initialisation
    random_state=42,  # fixed seed for reproducible centroids
)
y_pred = kmeans.fit_predict(values)

In [11]:
from sklearn import metrics

# Cluster assignment of every row, as stored on the fitted model.
labels = kmeans.labels_

# Mean silhouette coefficient over all samples, using Euclidean distance.
# Values lie in [-1, 1]; higher means tighter, better separated clusters.
silhouette = metrics.silhouette_score(
    values,
    labels,
    metric='euclidean',
)

print(silhouette)

0.3645055324417281


## Validation metrics



### Compression Criteria

The internal metrics are based on two validation criteria. The first is compression (also called compactness), which tells us how close together the points of the same cluster are. The closer the points are to one another, the more compact the cluster is.


### Separation Criteria

The second validation criterion is separation, which indicates how well separated the points of different clusters are. The idea is the opposite of compaction: the more distant the clusters are from each other, the better.

## Silhouette coefficient

The silhouette coefficient is calculated from two main values, alpha and beta: $s = \dfrac{\beta - \alpha}{\max(\alpha, \beta)}$.


<img src="imgs/silhouette-coefficient.png" style="width: 300px"/>

We will start by calculating the alpha value, that is, the average distance between the point and all other points in the same cluster. Let's check the compression of the elements.

We will use the same example of the three clusters, and each contains three elements.

<img src="imgs/compaction.png" style="width: 300px"/>

We will work in two dimensions (x, y), using two of the 16 attributes that we have available in our dataframe.

We will select a specific point, a customer, and from there we will calculate the average distance to the other elements of the same cluster. We will call this first point "A".

For point "A" we will use the value 1.0 on the x-axis and 0.9 on the y-axis. That done, we will measure the distance from point "A" to points "B" and "C".

<img src="imgs/graphic.png" style="width: 300px"/>

We are using Euclidean distance, so we will use our formula.

<img src="imgs/distance.png" style="width: 300px"/>

The result of the calculations will be 0.8. This is the Euclidean distance between A and B. We will do the same procedure for C. The distance between point A and C is 1.12.

Once this is done, we take the average of the two distances: (0.8 + 1.12) / 2 = 0.96. We then go back to the formula for the silhouette coefficient that we presented at the beginning of the class and insert this value as alpha.

Our next step is to calculate the beta value, which is the average distance between the point and all other points in the nearest cluster.

To find out which cluster is the closest, we compute the average distance from point A to the points of each of the two other clusters and keep the lowest value. The procedure is the same as before, but with different points.

<img src="imgs/new-points.png" style="width: 300px"/>

We arrive at an average distance of 3.1 from point A to the points of the red cluster. For the yellow cluster the result is 1.31. The yellow cluster has the smaller average distance, so it is the nearest cluster and beta = 1.31.

<img src="imgs/formula.png" style="width: 300px"/>

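Substituting alpha = 0.96 and beta = 1.31 into the formula, the final result will be about 0.26. The result is always between -1 and 1, so in this case we have a good, positive value: point A is, on average, closer to the points of its own cluster than to those of the nearest neighbouring cluster.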

In [12]:
# Davies-Bouldin index for the same clustering; its minimum is 0 and lower
# values indicate better separated clusters.
dbs = metrics.davies_bouldin_score(X=values, labels=labels)
print(dbs)

1.0756856623533984


In [13]:
# Calinski-Harabasz index (variance ratio criterion) for the same clustering;
# higher values indicate denser, better separated clusters.
calinski = metrics.calinski_harabasz_score(X=values, labels=labels)
print(calinski)

3431.800717231778
