## Importing the data

In [21]:
import pandas as pd

# Read the credit-card dataset from disk and preview its first rows.
csv_path = 'data/credit-card.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


## Preprocessing the data

Right off the bat, there are columns we don't need. Let's drop them right away.

In [22]:
# Remove the customer identifier and the tenure column before further processing.
# The result is reassigned instead of using inplace=True, and errors='ignore' is
# passed so that re-running this cell does not raise a KeyError once the columns
# have already been removed.
df = df.drop(columns=['CUST_ID', 'TENURE'], errors='ignore')
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0


It's also important to look for missing data

In [23]:
# Per-column count of NaN entries.
nan_flags = df.isna()
missing = nan_flags.sum()
missing

BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
dtype: int64

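For this case, instead of dropping the entire *MINIMUM_PAYMENTS* column (or every row with a missing value), we're going to fill the missing values with the median of their column. Note that this also fills the single missing *CREDIT_LIMIT* value.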

In [24]:
# Impute every NaN with the median of its own column, then re-count the
# missing values to confirm that none are left.
column_medians = df.median()
df.fillna(value=column_medians, inplace=True)
missing = df.isna().sum()
missing

BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
dtype: int64

### Normalizing the data

In [26]:
from sklearn import preprocessing

# Rescale every column with MinMaxScaler so that all features share the same range.
min_max_scaler = preprocessing.MinMaxScaler()
feature_names = df.columns
np_df = min_max_scaler.fit_transform(df)  # fit_transform returns a NumPy array, not a DataFrame
df = pd.DataFrame(data=np_df, columns=feature_names)  # wrap the array back into a DataFrame

df.describe()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT
count,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0
mean,0.082154,0.877271,0.020457,0.014534,0.01827,0.020766,0.490351,0.202458,0.364437,0.090096,0.026413,0.041089,0.14839,0.03417,0.011058,0.153715
std,0.109306,0.236904,0.04357,0.040722,0.040193,0.044491,0.401371,0.298336,0.397448,0.133414,0.055485,0.069435,0.121491,0.057078,0.030531,0.292499
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.006736,0.888889,0.000808,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.002793,0.051753,0.007556,0.002236,0.0
50%,0.045864,1.0,0.007367,0.000932,0.003956,0.0,0.5,0.083333,0.166667,0.0,0.0,0.019553,0.098497,0.016894,0.004088,0.0
75%,0.107868,1.0,0.022637,0.014166,0.020828,0.023629,0.916667,0.3,0.75,0.148148,0.03252,0.047486,0.215359,0.037482,0.010322,0.142857
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Clustering

In [32]:
from sklearn.cluster import KMeans

# Partition the scaled customers into 5 clusters.
# K-Means starts from randomly chosen centroids, so random_state is fixed to make
# the labels (and every score computed from them) reproducible after a kernel
# restart.
kmeans = KMeans(n_clusters=5, n_init=10, max_iter=300, random_state=42)
pred_labels = kmeans.fit_predict(df)
pred_labels

array([3, 3, 1, ..., 0, 3, 1])

Before we try to interpret the clusters, we should make sure that the clustering we obtained is actually good.

## Validation

### The Silhouette Score

In [40]:
from sklearn.metrics import silhouette_score

# Silhouette score of the K-Means labelling computed on the scaled data.
s_score = silhouette_score(X=df, labels=pred_labels)
s_score

0.35382830732160386

Our Silhouette Score was 0.3538. The score ranges from -1 to 1, with higher values indicating denser, better-separated clusters, so this result is acceptable but not great. Let's see how changing the number of clusters affects the silhouette score!

In [35]:
# Sweep the number of clusters and record the silhouette score for each value.
n_clusters_values = list(range(2, 11))  # [2, 3, 4, 5, 6, 7, 8, 9, 10]
s_scores = []

for n_clusters in n_clusters_values:
    # Use the same n_init / max_iter as the 5-cluster model fitted earlier so the
    # scores are comparable with it, and fix random_state so the sweep returns
    # the same numbers on every run.
    cluster = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, random_state=42)
    labels = cluster.fit_predict(df)

    s_scores.append(silhouette_score(df, labels))

s_scores

[0.4134923417305637,
 0.4023268704570294,
 0.35383393504730803,
 0.3538065073627111,
 0.3591053303790778,
 0.34848188967647287,
 0.36081038969400636,
 0.3673123140692698,
 0.3153395811475611]

### The Davies-Bouldin Score

In [42]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin score of the K-Means labelling computed on the scaled data.
dbs = davies_bouldin_score(X=df, labels=pred_labels)
dbs

1.1818712154511037

In [44]:
# Sweep the number of clusters and record the Davies-Bouldin score for each value.
n_clusters_values = list(range(2, 11))  # [2, 3, 4, 5, 6, 7, 8, 9, 10]
dbs_scores = []

for n_clusters in n_clusters_values:
    # Use the same n_init / max_iter as the 5-cluster model fitted earlier so the
    # scores are comparable with it, and fix random_state so the sweep returns
    # the same numbers on every run.
    cluster = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, random_state=42)
    labels = cluster.fit_predict(df)

    dbs_scores.append(davies_bouldin_score(df, labels))

dbs_scores

[1.0391965514303874,
 1.2545411432274742,
 1.3085912730409592,
 1.1814548335015707,
 1.184322140429467,
 1.1576124924580937,
 1.0555675370551052,
 0.996261465760402,
 1.0551524484114183]