# Clustering Approaches to Consumer Credit Card Data #

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from numpy import unique

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score

from kneed import KneeLocator

%matplotlib inline

## Notebook Objectives: ##
* Short EDA for consumer credit card data
* KMeans clustering with several metrics and graphs for interpretation
* DBSCAN clustering and graphs for interpretation
* Hierarchical (Agglomerative) clustering and graphs for interpretation
* Comparing approaches using several metrics: which approach is the best?

# EDA #

In [None]:
# The data must first be downloaded from Kaggle and saved locally as CC_GENERAL.csv
# https://www.kaggle.com/datasets/arjunbhasin2013/ccdata/download?datasetVersionNumber=1
# The file is then read into a pandas DataFrame (path is relative to the working directory).

with open(r"CC_GENERAL.csv") as csv_file:
    df_cc = pd.read_csv(csv_file)



In [None]:
# Preview the first rows to see what the data looks like

df_cc.head()

In [None]:
# Inspect column dtypes and non-null counts.
# What data types are we dealing with? Which preprocessing steps might be necessary?

df_cc.info()

In [None]:
# Count the missing values in each column

df_cc.isna().sum()

We see that MINIMUM_PAYMENTS and CREDIT_LIMIT have missing values. What do we do with these? Drop? Impute? 


If imputing missing values, it is important to look at things such as skewness of the data. 

In [None]:
# Check for skewness by plotting the distribution of every numeric column.
# The numeric columns are selected up front (instead of skipping non-numeric
# columns inside the loop), so the subplot grid has no empty slot and string
# columns are excluded whether pandas stores them as object or as a string dtype.

numeric_columns = df_cc.select_dtypes(include='number').columns
n_grid_cols = 2
n_grid_rows = max(1, -(-len(numeric_columns) // n_grid_cols))  # ceiling division

plt.figure(figsize=(20,35))
for position, column in enumerate(numeric_columns, start=1):
    ax = plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.histplot(df_cc[column], ax=ax)
    ax.set_xlabel(column)

plt.show()

In the overall data set, MINIMUM_PAYMENTS is heavily skewed with high kurtosis. This means many outliers. In cases such as this, it is best to use median. Consider mode if results are not good.

Other key points are that the number of missings is less than 5%, so going with median should not create other problems. 

The median value should be generated from the entire data set. 

https://medium.com/analytics-vidhya/feature-engineering-part-1-mean-median-imputation-761043b95379

In [None]:
# Impute missing MINIMUM_PAYMENTS with the column median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on an intermediate object and does not update df_cc under pandas Copy-on-Write.
df_cc['MINIMUM_PAYMENTS'] = df_cc['MINIMUM_PAYMENTS'].fillna(df_cc['MINIMUM_PAYMENTS'].median())

In [None]:
# Drop the record with the missing credit limit? Alternatively, one could impute with mean or median.
# Mean value is $4495, median $3000.
# In terms of time spent, the fastest choice would be to just drop the record.

credit_limit = df_cc['CREDIT_LIMIT']
print("Median:", credit_limit.median())
print("Mean:", credit_limit.mean())

In [None]:
# My impulse is to go with the median, but you can try out the mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on an intermediate object and does not update df_cc under pandas Copy-on-Write.

df_cc['CREDIT_LIMIT'] = df_cc['CREDIT_LIMIT'].fillna(df_cc['CREDIT_LIMIT'].median())

In [None]:
# Check again for missing values after imputation

df_cc.isna().sum()

In [None]:
# Check correlations: this is potentially important when clustering.
# df_cc still contains the string column CUST_ID at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0,
# so the frame is restricted to its numeric columns first.

plt.figure(figsize=(9,16))
sns.heatmap(df_cc.select_dtypes(include='number').corr(), annot=True)
plt.show()

High correlations exist between many variables

PURCHASES, INSTALLMENTS_PURCHASES, ONEOFF_PURCHASES, ONEOFF_PURCHASES_FREQUENCY, PURCHASES_TRX, CREDIT_LIMIT, MINIMUM_PAYMENTS

BALANCE, CASH_ADVANCE, CREDIT_LIMIT

 ONEOFF_PURCHASES_FREQUENCY



PURCHASES_INSTALLMENTS, PURCHASES_INSTALLMENTS_FREQUENCY,  
CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX

The discussions I have seen indicate that for clustering, PCA is the best approach in this case. Before doing this, get rid of any columns we don't need.

In [None]:
# CUST_ID is dropped because of its high cardinality

df_cc_drop = df_cc.drop(columns=['CUST_ID'])

In [None]:
# Save the DataFrame to a feather file as a checkpoint in case of crashes, etc.

df_cc_drop.to_feather(r"df_cc_drop.ftr")

In [None]:
# Read the checkpointed dataset back in if needed (e.g. after a kernel restart).
# Note: this replaces df_cc_drop with the contents of the feather file.

with open(r"df_cc_drop.ftr", "rb") as feather_file:
    df_cc_drop = pd.read_feather(feather_file)

In [None]:
# Scaling the data is important for clustering: fit the scaler on the
# feature frame, then apply it to the same frame.

scaler = StandardScaler().fit(df_cc_drop)

df_cc_scaled_pre_pca = scaler.transform(df_cc_drop)

In [None]:
# Do PCA, as this tends to help the respective clustering algorithms and is a best practice.
# First fit PCA on the scaled data without limiting the number of components,
# so that the explained variance of every component can be inspected.

pca = PCA()

pca.fit(df_cc_scaled_pre_pca)

In [None]:
# As a step in deciding how many components to keep, read the explained variance ratio
# of each principal component from the fitted PCA.

explained_variance_pca = pca.explained_variance_ratio_
print(explained_variance_pca)

In [None]:
# To get the values into a form that is useful for plotting, take the
# cumulative sum of the explained variance ratios.

cumulative_sum_eigenvalues = explained_variance_pca.cumsum()

In [None]:
# Plot the cumulative sum of the explained variances.
# The x axis is labelled "Number of Components", so it must start at 1:
# the first cumulative value is the variance explained by ONE component.
# (Starting the axis at 0 shifts every reading by one component.)

component_counts = range(1, len(cumulative_sum_eigenvalues) + 1)

plt.figure(figsize = (10,8))

plt.plot(component_counts, cumulative_sum_eigenvalues, marker = 'x', linestyle = '--')
plt.title('Explained Variance by Components')
plt.axhline(y=.85, linestyle='dashdot', color='r')  # 85% variance threshold
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

For PCA, a good rule of thumb is to preserve around 85% of the variance. This would mean keeping 7 components.

In [None]:
# Refit PCA on the scaled data, keeping only 7 principal components
pca = PCA(n_components=7)
pca.fit(df_cc_scaled_pre_pca)

In [None]:
# Project the scaled data onto the retained principal components; df_cc_pca is the input for all clustering below
df_cc_pca = pca.transform(df_cc_scaled_pre_pca)

 # K-MEANS #

In [None]:
# To draw an elbow graph for k-means on the transformed data, first run k-means for a range of cluster counts.
# The other validation metrics are collected in the same pass for comparison.

wcss_values = []
silhouette_list = []
davies_bouldin_list = []
calinski_harabasz_list = []

for number_of_clusters in range(2,9):
    kmeans_pca = KMeans(n_clusters=number_of_clusters, init='k-means++', n_init=10, random_state=42)
    # Fit exactly once per cluster count: fit_predict both fits the model and
    # returns the labels, so a separate fit() call would only repeat the work.
    labels = kmeans_pca.fit_predict(df_cc_pca)

    wcss_values.append(kmeans_pca.inertia_)  # within-cluster sum of squares
    silhouette_list.append(silhouette_score(df_cc_pca, labels))
    davies_bouldin_list.append(davies_bouldin_score(df_cc_pca, labels))
    calinski_harabasz_list.append(calinski_harabasz_score(df_cc_pca, labels))



In [None]:
# Elbow plot: within-cluster sum of squares (KMeans inertia) against the number of clusters
plt.figure(figsize=(10,8))

plt.plot(range(2,9), wcss_values, marker='x', linestyle='-')
plt.title('Elbow Curve of Scores by Cluster')
plt.xlabel('Number of Clusters')
# The plotted quantity is the WCSS collected from KMeans.inertia_, so label the axis accordingly
plt.ylabel('WCSS (inertia)')
plt.show()

The elbow plot suggests 3, 4 or 5. But let's check other measures.

In [None]:

# Plot the silhouette and Davies-Bouldin scores for each cluster count.
# Both curves share one axis, so each gets a legend entry; the two metrics
# are read in opposite directions, which the labels spell out.
x_values = range(2,9)

plt.plot(x_values, silhouette_list, marker='X', label='Silhouette (higher is better)')
plt.plot(x_values, davies_bouldin_list, marker='o', label='Davies-Bouldin (lower is better)')
plt.title('Silhouette and Davies-Bouldin Scores by Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.legend()
plt.show()

Davies Bouldin indicates 7, which is not so clearly optimal given that we only have 7 principal components. Silhouette indicates 3, but the differences are rather small. 

In [None]:
# Pair every cluster count with its Calinski-Harabasz score, then look up
# the cluster count belonging to the highest score.
x_values = range(2,9)
df_cali_har = pd.DataFrame(x_values,columns=['x_values'])
df_cali_har['cali_har'] = calinski_harabasz_list

best_score = np.max(calinski_harabasz_list)
is_best = df_cali_har['cali_har'] == best_score
max_x = df_cali_har.loc[is_best, 'x_values'].iloc[0]  # first cluster count reaching the maximum

plt.plot(x_values,calinski_harabasz_list, marker='X')
plt.show()

print("Calinski Harabasz Score indicates best number of clusters is", max_x)

In [None]:
# There is a numerical way to find the elbow of the WCSS curve, using the KneeLocator class of the kneed package.
# The WCSS values for 2..8 clusters were already collected in wcss_values by the
# metric loop above (same KMeans parameters and random_state), so they are reused
# here instead of fitting every model a second time.

wcss = list(wcss_values)
kneedle = KneeLocator(np.arange(2,9), wcss, S=1.0, curve='convex', direction='decreasing')

# KneeLocator leaves .knee as None when it cannot detect a knee
if kneedle.knee is None:
    print('Kneedle could not identify an optimum number of clusters')
else:
    print(f'Kneedle indicates that {kneedle.knee} clusters is the optimum')

Looking at different metrics, for K-means we will go with 4 clusters.

In [None]:
# Run K-means with the chosen 4 clusters.
# Note: this rebinds kmeans_pca, which previously held the last model of the metric loop.

kmeans_pca = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
kmeans_pca.fit(df_cc_pca)

In [None]:
# Prepare the original (unscaled) feature set to be joined with the cluster results;
# the index is reset so that rows line up positionally with the label array.

df_clustered_kmeans = df_cc_drop.reset_index(drop=True)

In [None]:
# Add the K-means cluster label of every row as a new column

df_clustered_kmeans['Segment KMeans'] = kmeans_pca.labels_


In [None]:
# Check the last rows to confirm the segment column was added

df_clustered_kmeans.tail()

In [None]:
# Plot a histogram of every feature, split by K-means segment, to see what
# separation the clustering achieves and to help with interpretation.
# The segment column itself is skipped: faceting it by its own value only
# produces a single uninformative bar per panel.

feature_columns = df_clustered_kmeans.columns.drop('Segment KMeans')

for c in feature_columns:
    grid= sns.FacetGrid(df_clustered_kmeans, col='Segment KMeans')
    grid.map(plt.hist, c)

# DBSCAN #

Time to check out DBSCAN. This approach differs from K-means in that the number of clusters is not specified up front; instead, two other parameters are set. These are epsilon (the radius of the neighborhood around a point) and the minimum number of points that must lie within that radius for a point to count as a core point of a cluster. 

Before carrying out the clustering, we will walk through what is needed to find good epsilon (eps) and min_points.

We can use the PCA DataFrame for DBSCAN clustering. 

In [None]:
# Parameter tuning for eps. This involves another form of elbow plot:
# the sorted distance of every point to one of its nearest neighbours.

nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(df_cc_pca)
distances, indices = neighbors.kneighbors(df_cc_pca)
# Keep only the last (index 10) neighbour distance of each point, sorted ascending
distances = np.sort(distances[:,10], axis=0)

i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')

# KneeLocator leaves .knee as None when no knee is detected; fail with a clear
# message instead of an obscure indexing/plotting error further down.
if knee.knee is None:
    raise ValueError("KneeLocator found no knee in the sorted neighbour-distance curve")

optimal_epsilon = distances[knee.knee]

# plot_knee opens its own figure, so the size is passed to it directly;
# creating a figure beforehand would only leave an empty extra figure behind.
knee.plot_knee(figsize=(5, 5))
plt.axhline(optimal_epsilon, label='optimal epsilon',linestyle='dashdot', color='r')
plt.xlabel("Points")
plt.ylabel("Distance")
plt.legend()  # redraw the legend so the 'optimal epsilon' line is listed

plt.show()

print("The optimal epsilon is the red dash dot line with a value of ", optimal_epsilon)

In [None]:
# To find the optimum number of min_points, search for the highest silhouette score over a range of values.
# epsilon is the value printed by the knee detection on the sorted neighbour distances.

epsilon = 1.5125394356711748

min_samples= range(3,10)
for min_sample in min_samples:
    dbs=DBSCAN(eps=epsilon, min_samples=min_sample)
    dbs.fit(df_cc_pca)
    # silhouette_score needs at least two distinct labels and raises otherwise,
    # so a degenerate DBSCAN result is reported and skipped instead of aborting the search
    if len(set(dbs.labels_)) < 2:
        print(f"Silhouette Coefficient for epsilon {epsilon} and minimum number of points {min_sample} is undefined (only one label found)")
        continue
    print(f"Silhouette Coefficient for epsilon {epsilon} and minimum number of points {min_sample} is",silhouette_score(df_cc_pca, dbs.labels_))

In [None]:
# Take epsilon 1.5125394356711748 and number of samples 5 as optimum.
# The epsilon variable set for the min_samples search is reused rather than
# re-typing the literal, so the two cells cannot drift apart.

dbs=DBSCAN(eps=epsilon, min_samples=5)
dbs.fit(df_cc_pca)

In [None]:
# Prepare the original (unscaled) feature set to be merged with the cluster labels;
# the index is reset so that rows line up positionally with the label array.

df_clustered_dbscan = df_cc_drop.reset_index(drop=True)

In [None]:
# Add the DBSCAN cluster label of every row as a new column

df_clustered_dbscan['Segment DBSCAN'] = dbs.labels_

In [None]:
# Plot a histogram of every feature, split by DBSCAN segment, for interpretation.
# What do you think about the effectiveness of DBSCAN?
# Are the clusters well-separated? What information do they yield about the credit card users?
# The segment column itself is skipped: faceting it by its own value only
# produces a single uninformative bar per panel.

feature_columns = df_clustered_dbscan.columns.drop('Segment DBSCAN')

for c in feature_columns:
    grid= sns.FacetGrid(df_clustered_dbscan, col='Segment DBSCAN')
    grid.map(plt.hist, c)

# Hierarchical (Agglomerative) Clustering #

The third kind of clustering algorithm is a form of hierarchical clustering. There are two forms of hierarchical clustering--agglomerative and divisive--, but we will only look at agglomerative clustering. 

Hierarchical clustering works by starting with each data point as its own cluster, and then repeatedly joining the clusters that are nearest to each other. This generates a dendrogram that in itself works to group points with commonalities. 

Working with such a dendrogram, it is possible to identify an ideal number of clusters by finding the highest horizontal line that cuts across the verticals at the same level. 

In [None]:
# This code builds a Ward linkage on the PCA data and draws the resulting dendrogram.
# Run this cell once as is. If it would be helpful to see where the horizontal cut
# can be made, run it again with the axhline line below uncommented.

hierarchical_cluster = sch.linkage(df_cc_pca, method = 'ward')
plt.title('Dendrogram', fontsize = 20)
plt.ylabel('Euclidean  Distance')

# Optional: uncomment the following line to draw the horizontal cut at height 122
# plt.axhline(y=122, color='r', linestyle='--')

# Only the top 7 levels of the tree are drawn (truncate_mode="level", p=7), without leaf labels
sch.dendrogram(hierarchical_cluster, truncate_mode = "level", p = 7, show_leaf_counts = False, no_labels = True)
plt.show()

The line indicated by this technique gives us 4 verticals, or four clusters to start with. 

In [None]:
# Working with 4 clusters, Ward linkage and Euclidean distance.
# Note: y_hc holds the return value of fit(); its labels_ attribute is read
# when the labels are attached to the data, i.e. y_hc is not a label array itself.

hc = AgglomerativeClustering(n_clusters = 4, metric = 'euclidean', linkage = 'ward')
y_hc = hc.fit(df_cc_pca)

In [None]:
# Prepare the original (unscaled) feature set to receive the HC clustering results;
# the index is reset so that rows line up positionally with the label array.

df_clustered_hc = df_cc_drop.reset_index(drop=True)

In [None]:
# Add the hierarchical-clustering label of every row as a new column

df_clustered_hc['Segment HC'] = y_hc.labels_

In [None]:
# Plot a histogram of every feature, split by HC segment, to check the clustering and interpret the results.
# The segment column itself is skipped: faceting it by its own value only
# produces a single uninformative bar per panel.

feature_columns = df_clustered_hc.columns.drop('Segment HC')

for c in feature_columns:
    grid= sns.FacetGrid(df_clustered_hc, col='Segment HC')
    grid.map(plt.hist, c)

A final section looks at the respective metrics for the various algorithms. 

In [None]:
# K-means evaluation by score

# Build the 4-cluster model and cluster the PCA data in one step;
# the returned array holds the label assigned to each row
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(df_cc_pca)

# Cluster validation metrics, all computed from the same label array
silhouette_kmeans = silhouette_score(df_cc_pca, y_predict, metric='euclidean')
cali_har_kmeans = calinski_harabasz_score(df_cc_pca, y_predict)
davies_bouldin_kmeans = davies_bouldin_score(df_cc_pca, y_predict)

for template, score in (
    ('Silhouette Score: %.2f', silhouette_kmeans),
    ('Calinski Harabasz Score: %.2f', cali_har_kmeans),
    ('Davies Bouldin Score: %.2f', davies_bouldin_kmeans),
):
    print(template % score)

In [None]:
# DBSCAN evaluation by score (same eps and min_samples as the model fitted earlier)

model = DBSCAN(eps=1.5125394356711748, min_samples= 5)

# Fit the model, then read the cluster label of every row from it
model.fit(df_cc_pca)
y_predict = model.labels_

# Distinct labels that were assigned
clusters = unique(y_predict)

# Cluster validation metrics
silhouette_dbscan = silhouette_score(df_cc_pca, y_predict, metric='euclidean')
cali_har_dbscan = calinski_harabasz_score(df_cc_pca, y_predict)
davies_bouldin_dbscan = davies_bouldin_score(df_cc_pca, y_predict)

for template, score in (
    ('Silhouette Score DBSCAN: %.2f', silhouette_dbscan),
    ('Calinski Harabasz Score DBSCAN : %.2f', cali_har_dbscan),
    ('Davies Bouldin Score DBSCAN: %.2f', davies_bouldin_dbscan),
):
    print(template % score)


In [None]:
# Agglomerative clustering evaluation by score

model = AgglomerativeClustering(n_clusters=4)

# Fit the model ONCE. y_predict keeps referring to the fitted model and
# y_predict_2 to its label array, as before, but the labels are now read from
# the fitted model instead of running a second, identical fit via fit_predict.
y_predict = model.fit(df_cc_pca)
y_predict_2 = y_predict.labels_

# Retrieve the unique cluster labels (taken from the label array, not from the model object)
clusters = unique(y_predict_2)

# Calculate cluster validation metrics, all from the same label array

silhouette_HC = silhouette_score(df_cc_pca, y_predict_2, metric='euclidean')
cali_har_HC = calinski_harabasz_score(df_cc_pca, y_predict_2)
davies_bouldin_HC = davies_bouldin_score(df_cc_pca, y_predict_2)

print('Silhouette Score for HC: %.2f' % silhouette_HC)
print('Calinski Harabasz Score for HC: %.2f' % cali_har_HC)
print('Davies Bouldin Score for HC: %.2f' % davies_bouldin_HC)