Clustering

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
#df = pd.read_csv('./data/aggregated_reduced_data.txt')
#df.head()

In [2]:
# High-dimensional sample: draw a reproducible random subset of the raw
# aggregated data (without the customer_id column) and use every numeric
# column of that subset as the feature matrix X.
n_samples = 1000
df = (
    pd.read_csv('./data/aggregated_raw_data.txt')
    .drop(columns=['customer_id'])
    .sample(n=n_samples, random_state=42)
)
# Alternative hand-picked feature subset (disabled):
#X=df.copy()[['customer_home_x_coord','customer_home_y_coord','customer_income_level','payment_mean','BV_very_early','BV_early','BV_later','BV_late']]
# NOTE(review): X is passed to the clustering cells unscaled; columns such as
# customer_income_level are orders of magnitude larger than the coordinate
# columns, so distance-based methods will be dominated by them -- confirm
# this is intended or standardise first.
X = df.select_dtypes(include='number')
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 57898 to 697
Data columns (total 40 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   customer_main_branch_x_coord          1000 non-null   float64
 1   customer_main_branch_y_coord          1000 non-null   float64
 2   customer_home_x_coord                 1000 non-null   float64
 3   customer_home_y_coord                 1000 non-null   float64
 4   customer_income_level                 1000 non-null   float64
 5   customer_age                          1000 non-null   int64  
 6   akbank_banking_age                    1000 non-null   float64
 7   1)RISKSIZ                             1000 non-null   float64
 8   2)GECIKME 1-15 GUN                    1000 non-null   float64
 9   3)GECIKME 16-29 GUN                   1000 non-null   float64
 10  4)GECIKME 30-59 GUN                   1000 non-null   float64
 11  5)GECIKME 60+ 

K-Means

In [3]:
from sklearn.cluster import KMeans

# Number of clusters; the spectral-clustering cell further down reuses this k.
k = 5
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for k-means++), so leaving it implicit makes the labels
# and the silhouette score depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# Alternatives (disabled):
#kmeans = KMeans(n_clusters=k, init="k-means++", random_state=42) #K++, default
#kmeans = KMeans(algorithm="elkan", random_state=42).fit(X) #Accelerated
#kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10, random_state=42)
#    If the data doesn't fit in memory, there is a memmap function in HW to deal with this
y_pred = kmeans.fit_predict(X)
# y_pred and kmeans.labels_ hold the same cluster assignments.
print('k-means:', silhouette_score(X, kmeans.labels_), k)

k-means: 0.6852395089384182 5


# K-Means Testing

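The next three cells (currently commented out) test a range of K values: they fit one model per K, draw an elbow plot of the inertia, and plot the silhouette score for each K.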

In [4]:
# Disabled experiment: fit one K-Means model per k in 1..9 and collect each
# model's inertia. The elbow-plot cell reads `inertias` and the silhouette
# cell reads `kmeans_per_k`, so re-enable this cell before either of them.
#kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
#                for k in range(1, 10)]
#inertias = [model.inertia_ for model in kmeans_per_k]

In [5]:
# Disabled: elbow plot of inertia against k (needs `inertias` from the K sweep).
# NOTE(review): save_fig is not defined anywhere in the visible notebook --
# define or import it before re-enabling that line.
#plt.figure(figsize=(8, 3.5))
#plt.plot(range(1, 10), inertias, "bo-")
#plt.xlabel("$k$", fontsize=14)
#plt.ylabel("Inertia", fontsize=14)
##plt.axis([1, 8.5, 0, 1300])
##save_fig("inertia_vs_k_plot")
#plt.show()

In [6]:
# Disabled: silhouette score for each k (needs `kmeans_per_k` from the K sweep).
# kmeans_per_k[1:] skips the k=1 model because the silhouette score is only
# defined for at least two clusters; the x-axis therefore starts at k=2.
#silhouette_scores = [silhouette_score(X, model.labels_)
#                     for model in kmeans_per_k[1:]]
#plt.figure(figsize=(8, 3))
#plt.plot(range(2, 10), silhouette_scores, "bo-")
#plt.xlabel("$k$", fontsize=14)
#plt.ylabel("Silhouette score", fontsize=14)
##plt.axis([1.8, 8.5, 0.55, 0.7])
##save_fig("silhouette_score_vs_k_plot")
#plt.show()

# DBSCAN

In [7]:
from sklearn.cluster import DBSCAN

# eps         - maximum distance between samples to be considered in the same neighbourhood
# min_samples - minimum samples in a neighbourhood for its centre to be a core point
db_eps = 100
db_min_samp = 2
# Full signature for reference (disabled):
#dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
dbscan = DBSCAN(min_samples=db_min_samp, eps=db_eps).fit(X)
# NOTE(review): DBSCAN marks noise points with the label -1, and the score
# below treats -1 as one more cluster -- confirm that is the intended metric.
print('DBScan:', silhouette_score(X, dbscan.labels_))

DBScan: -0.5497216455906998


# Spectral Clustering

In [8]:
from sklearn.cluster import SpectralClustering

# n_clusters - number of clusters; reuses k from the K-Means cell
# gamma      - kernel coefficient of the affinity used to build the similarity graph
spec_gamma = 100
spectral = SpectralClustering(
    random_state=42,
    gamma=spec_gamma,
    n_clusters=k,
).fit(X)
print('Spectral:', silhouette_score(X, spectral.labels_), spec_gamma)



Spectral: -0.5822866165025242 100


In [9]:
# Disabled: silhouette score of spectral clustering for each k in 2..9.
# The sweep starts at k=2 so it lines up with the plot's x-axis, and each
# score uses that model's own labels rather than the single `spectral` fit.
#spectral_per_k = [SpectralClustering(n_clusters=k, gamma=1000,random_state=42).fit(X)
#                for k in range(2, 10)]
#silhouette_scores = [silhouette_score(X, model.labels_) for model in spectral_per_k]
#plt.figure(figsize=(8, 3))
#plt.plot(range(2, 10), silhouette_scores, "bo-")
#plt.xlabel("$k$", fontsize=14)
#plt.ylabel("Silhouette score", fontsize=14)
##plt.axis([1.8, 8.5, 0.55, 0.7])
##save_fig("silhouette_score_vs_k_plot")
#plt.show()

# Agglomerative

In [10]:
from sklearn.cluster import AgglomerativeClustering

# n_clusters - taken from the shared k (set in the K-Means cell) instead of a
#              second hard-coded 5, so every method is scored at the same
#              cluster count when k is tuned.
# linkage    - "ward" merges the pair of clusters that minimises the variance
#              of the merged cluster.
agglom = AgglomerativeClustering(n_clusters=k, linkage="ward").fit(X)
print('Agglomerative:', silhouette_score(X, agglom.labels_))

Agglomerative: 0.7492758263428944


In [11]:
# Disabled helper: lists the attributes of an estimator whose names end with
# "_" and do not start with "_", then inspects agglom.children_.
#def learned_parameters(estimator):
#    return [attrib for attrib in dir(estimator)
#            if attrib.endswith("_") and not attrib.startswith("_")]
#learned_parameters(agglom)
#agglom.children_

# Silhouette Score

In [12]:
from sklearn.metrics import silhouette_score

# Side-by-side silhouette scores of the four fitted models on the same X.
print('sample count:', n_samples)
# Each entry: (printed label, fitted model, settings echoed after the score).
fitted_models = [
    ('k-means:', kmeans, (k,)),
    ('DBScan:', dbscan, (db_eps, db_min_samp)),
    ('Spectral:', spectral, (spec_gamma,)),
    ('Agglomerative:', agglom, ()),
]
for label, model, settings in fitted_models:
    print(label, silhouette_score(X, model.labels_), *settings)

sample count: 1000
k-means: 0.6852395089384182 5
DBScan: -0.5497216455906998 100 2
Spectral: -0.5822866165025242 100
Agglomerative: 0.7492758263428944


Cluster Comparison

In [13]:
# Attach each model's cluster assignment to a copy of the feature matrix,
# one label column per method; X itself is left unchanged.
X_labelled = X.assign(
    kmeans_clusters=kmeans.labels_.tolist(),
    dbscan_clusters=dbscan.labels_.tolist(),
    spectral_clusters=spectral.labels_.tolist(),
    agglom_clusters=agglom.labels_.tolist(),
)
X_labelled

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,kmeans_clusters,dbscan_clusters,spectral_clusters,agglom_clusters
57898,40.985102,29.230434,40.821400,29.317590,2000.0,47,8.0,0.0,0.0,0.0,...,1.333333,0.750000,0.250000,0.166667,599.563333,287.335824,0,-1,2,0
53619,41.103030,28.897520,41.096826,28.893722,2000.0,39,6.0,0.0,0.0,0.0,...,0.166667,0.416667,0.000000,0.000000,661.776667,355.847658,0,-1,3,0
14664,40.992145,28.846807,41.012080,28.822070,1600.0,34,2.0,0.0,0.0,0.0,...,1.000000,1.500000,0.333333,0.583333,526.022500,123.071620,0,-1,2,0
49309,41.022043,29.044177,40.961020,29.250310,1420.0,39,4.0,0.0,0.0,0.0,...,13.833333,2.500000,1.750000,0.333333,418.934000,258.720361,0,-1,2,0
14212,41.077030,28.946960,41.072810,28.927330,2000.0,43,6.0,0.0,0.0,0.0,...,0.250000,0.166667,0.000000,0.000000,163.600000,566.727024,0,-1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6501,40.353461,27.970340,41.048689,28.998694,12903.0,54,21.0,0.0,0.0,0.0,...,8.166667,14.000000,3.666667,1.833333,8727.210000,2673.859966,4,-1,2,2
34306,41.009379,29.213000,41.013580,29.215460,1000.0,33,8.0,0.0,0.0,0.0,...,3.416667,3.666667,1.000000,0.166667,476.622500,180.167120,0,-1,2,0
1736,40.991200,29.026540,40.973200,29.106930,1000.0,38,3.0,0.0,0.0,0.0,...,0.000000,0.250000,0.000000,0.000000,354.533333,595.898871,0,-1,2,0
47806,41.063666,28.687843,41.058900,28.729490,1249.0,29,2.0,0.0,0.0,0.0,...,1.416667,3.000000,0.250000,1.000000,223.799167,177.508945,0,-1,2,0


In [18]:
# Label column to inspect; the following cells filter X_labelled on it.
method = 'agglom_clusters'
# Number of samples assigned to each cluster of that method.
X_labelled.loc[:, method].value_counts()

0    937
2     32
4     16
1     14
3      1
Name: agglom_clusters, dtype: int64

In [19]:
# The two cluster ids (values of X_labelled[method]) compared in the cells below.
cluster_number_1, cluster_number_2 = 0, 2

In [22]:
# First rows of the samples that `method` assigned to cluster_number_1.
X_labelled.loc[X_labelled[method].eq(cluster_number_1)].head()

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,kmeans_clusters,dbscan_clusters,spectral_clusters,agglom_clusters
57898,40.985102,29.230434,40.8214,29.31759,2000.0,47,8.0,0.0,0.0,0.0,...,1.333333,0.75,0.25,0.166667,599.563333,287.335824,0,-1,2,0
53619,41.10303,28.89752,41.096826,28.893722,2000.0,39,6.0,0.0,0.0,0.0,...,0.166667,0.416667,0.0,0.0,661.776667,355.847658,0,-1,3,0
14664,40.992145,28.846807,41.01208,28.82207,1600.0,34,2.0,0.0,0.0,0.0,...,1.0,1.5,0.333333,0.583333,526.0225,123.07162,0,-1,2,0
49309,41.022043,29.044177,40.96102,29.25031,1420.0,39,4.0,0.0,0.0,0.0,...,13.833333,2.5,1.75,0.333333,418.934,258.720361,0,-1,2,0
14212,41.07703,28.94696,41.07281,28.92733,2000.0,43,6.0,0.0,0.0,0.0,...,0.25,0.166667,0.0,0.0,163.6,566.727024,0,-1,2,0


In [23]:
# First rows of the samples that `method` assigned to cluster_number_2.
X_labelled.loc[X_labelled[method].eq(cluster_number_2)].head()

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,kmeans_clusters,dbscan_clusters,spectral_clusters,agglom_clusters
11708,40.876417,29.233722,40.87517,29.25199,13000.0,45,11.0,0.0,0.0,0.0,...,0.083333,1.0,0.083333,0.916667,676.805,546.298648,4,-1,2,2
64377,41.111072,29.130485,41.05106,29.059994,15000.0,40,11.0,0.0,0.0,0.0,...,3.333333,7.5,5.166667,0.75,3820.325333,2075.557048,4,-1,2,2
45739,40.958791,29.094943,40.86199,29.28775,10000.0,44,2.0,0.0,0.0,0.0,...,2.0,5.75,0.25,0.25,8917.135833,2026.196482,3,-1,2,2
57264,41.032217,28.900861,41.05475,28.71692,22500.0,42,11.0,0.0,0.0,0.0,...,20.166667,25.916667,10.0,4.0,10963.366667,7042.194703,3,-1,2,2
57080,41.032217,28.900861,41.05164,28.86387,20000.0,28,5.0,0.0,0.0,0.0,...,1.416667,5.583333,0.166667,0.583333,1590.948333,554.586776,4,-1,2,2


In [25]:
def summarize_cluster(labelled, label_col, cluster_id, feature_cols):
    """Per-feature mean and standard deviation of one cluster.

    Parameters
    ----------
    labelled : pd.DataFrame
        Feature matrix with cluster-label columns attached.
    label_col : str
        Name of the label column to filter on.
    cluster_id : int
        Cluster whose members are summarised.
    feature_cols : list-like of str
        Columns to summarise; pass only real features so that the label
        columns themselves are not averaged.

    Returns
    -------
    pd.DataFrame
        Indexed by feature, with columns '<cluster_id>_mean' and
        '<cluster_id>_std' (pandas' default sample standard deviation).
    """
    members = labelled.loc[labelled[label_col] == cluster_id, feature_cols]
    return pd.DataFrame({
        str(cluster_id) + '_mean': members.mean(),
        str(cluster_id) + '_std': members.std(),
    })


# Get mean/std for each feature of the two clusters to compare. Only the
# columns of X are summarised: the mean of a cluster-id column is meaningless.
cluster_summary_1 = summarize_cluster(X_labelled, method, cluster_number_1, X.columns)
cluster_summary_2 = summarize_cluster(X_labelled, method, cluster_number_2, X.columns)
cluster_comparison = pd.concat([cluster_summary_1, cluster_summary_2], axis=1)
# Calculate the difference between the mean/std of each cluster. The columns
# are addressed by name and the subtraction uses the unrounded statistics, so
# rounding error is not carried into the differences.
cluster_comparison['mean_dif'] = (
    cluster_summary_1[str(cluster_number_1) + '_mean']
    - cluster_summary_2[str(cluster_number_2) + '_mean']
)
cluster_comparison['std_dif'] = (
    cluster_summary_1[str(cluster_number_1) + '_std']
    - cluster_summary_2[str(cluster_number_2) + '_std']
)
# Round once, at the end, for display.
cluster_comparison = cluster_comparison.round(3)
# Display
cluster_comparison

Unnamed: 0,0_mean,0_std,2_mean,2_std,mean_dif,std_dif
customer_main_branch_x_coord,40.931,0.535,40.898,0.567,0.033,-0.032
customer_main_branch_y_coord,29.154,1.377,29.337,2.171,-0.183,-0.794
customer_home_x_coord,40.981,0.358,41.037,0.073,-0.056,0.285
customer_home_y_coord,29.115,1.216,28.956,0.178,0.159,1.038
customer_income_level,2519.044,1990.865,16201.844,4222.144,-13682.8,-2231.279
customer_age,39.395,8.857,40.125,9.414,-0.73,-0.557
akbank_banking_age,7.619,4.919,10.031,6.606,-2.412,-1.687
1)RISKSIZ,0.0,0.0,0.0,0.0,0.0,0.0
2)GECIKME 1-15 GUN,0.0,0.0,0.0,0.0,0.0,0.0
3)GECIKME 16-29 GUN,0.0,0.0,0.0,0.0,0.0,0.0
