# Clustering

First, we'll read in our two encoded datasets.

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

X = pd.read_csv('data/encoded_females.csv')
Y = pd.read_csv('data/encoded_males.csv')

In this section, we have four couples that we're assuming are married.  Each pair lives as the same X/Y coordinate, has been a bank customer for the same amount of time, and are in a similar age range.  None of this is in any way definitive, but it's at least something to test our model against.  To fill out the data set, we drew 6 male and 6 female customer_ids at random from the encoded data - these are hard-encoded below.

In [3]:
#Get 6 random males/females.
test_data_random_1 = Y['customer_id'].sample(n=6, random_state=42).to_frame()
test_data_random_2 = X['customer_id'].sample(n=6, random_state=42).to_frame()

#df_random = pd.DataFrame(columns = ['male_id', 'female_id', 'assumed_status'])
#df_random['male_id']=test_data_random_1
#df_random['female_id']=test_data_random_2
#df_random['assumed_status']='nm'

#Assumed Married Couples
test_data = [[19054837,18713562,'m'],[20216301,20478819,'m'],[10380055,10194865,'m'],[14101839,14784272,'m'],
[15888653,5012852,'nm'],[15783617,11331741,'nm'],[8767345,16304349,'nm'],[20683498,26593265,'nm'],
[20001707,23280462,'nm'],[7648453,26306037,'nm']]
df_test_couples = pd.DataFrame(test_data, columns = ['male_id', 'female_id','assumed_status'])

df_test_couples

Unnamed: 0,male_id,female_id,assumed_status
0,19054837,18713562,m
1,20216301,20478819,m
2,10380055,10194865,m
3,14101839,14784272,m
4,15888653,5012852,nm
5,15783617,11331741,nm
6,8767345,16304349,nm
7,20683498,26593265,nm
8,20001707,23280462,nm
9,7648453,26306037,nm


Here, we check to make sure the format of our two encoded datasets match.

In [4]:
X.head()

Unnamed: 0,customer_id,0,1,2,3,4,5,6,7,8,9
0,9875403,2.983229,27.176733,16.333344,18.159441,22.485016,10.867768,15.43889,17.659956,16.022356,11.103373
1,20022095,4.848162,5.223834,7.458213,3.717671,3.993921,7.437561,8.309194,5.767206,10.405681,8.021971
2,18751734,8.573576,7.342382,6.673953,1.585556,4.933287,9.890267,6.875749,6.549332,8.229789,4.978735
3,14288103,7.144671,5.393974,7.112313,1.338546,4.279624,10.714273,8.189869,7.604131,10.508555,6.972306
4,20018440,5.003764,14.045266,7.3834,5.952122,2.138889,9.260857,11.649364,6.544845,10.652727,10.809715


In [5]:
Y.head()

Unnamed: 0,customer_id,0,1,2,3,4,5,6,7,8,9
0,25139695,27.851864,17.014044,0.0,50.125134,0.0,78.02402,18.281296,29.809046,57.001293,0.0
1,23187800,11.500882,11.49221,0.0,18.809597,0.0,41.201447,12.54253,12.504235,16.854506,0.0
2,16400475,19.934437,23.69683,0.0,47.04694,0.0,45.805378,54.14596,22.674173,31.922752,0.0
3,17303354,18.51626,18.397793,0.0,52.92986,0.0,72.32252,18.89935,29.891552,52.21884,0.0
4,15522570,13.837995,9.308175,0.0,19.172619,0.0,41.492718,12.527002,12.661451,17.845081,0.0


In [5]:
#for eqch row
#customerx data - customer y data

#cluster on above

K-Means

In [6]:
from sklearn.cluster import KMeans
k = 6
kmeans = KMeans(n_clusters=k, random_state=42)
#kmeans = KMeans(n_clusters=k, init="k-means++", random_state=42) #K++, default
#kmeans = KMeans(algorithm="elkan", random_state=42).fit(X) #Accelerated
#kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10, random_state=42)
    #If data doesn't fit in memory, there is a memmap function in HW to deal with this
y_pred = kmeans.fit_predict(X)
#(y_pred and kmeans.labels_ are the same thing)
print('k-means:', silhouette_score(X, kmeans.labels_), k)

k-means: 0.5908353762506297 6


# DB Scan

DBScan typically returned 1-2 highly-imbalanced groups, regardless of hyperparameter settings.  After many attempts, we decided to abandon it for this project.

In [8]:
from sklearn.cluster import DBSCAN
#Best so far: eps 15, min_samp 10
db_eps = 2
db_min_samp = 2
dbscan = DBSCAN(eps=db_eps, min_samples=db_min_samp)
#dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
    #eps - maximum distance between samples to be considered in same neighborhood
    #min_samples - minimum samples in a neighborhood for the center to be considered a core point
dbscan.fit(X)
print('DBScan:', silhouette_score(X, dbscan.labels_))
print(pd.DataFrame(dbscan.labels_.tolist()).value_counts())

DBScan: -0.023007285526956536
-1    13370
 0        2
dtype: int64


In [13]:
from sklearn.cluster import DBSCAN
#Best so far: eps 15, min_samp 10
db_eps = 100
db_min_samp = 5
dbscan = DBSCAN(eps=db_eps, min_samples=db_min_samp)
#dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
    #eps - maximum distance between samples to be considered in same neighborhood
    #min_samples - minimum samples in a neighborhood for the center to be considered a core point
dbscan.fit(X)
print('DBScan:', silhouette_score(X, dbscan.labels_))
print(pd.DataFrame(dbscan.labels_.tolist()).value_counts())

DBScan: -0.21457023728269894
-1    13328
 0       11
 2        8
 1        5
 3        5
 4        5
 5        5
 6        5
dtype: int64


In [34]:
from sklearn.cluster import DBSCAN
#Best so far: eps 15, min_samp 10
db_eps = 10000
db_min_samp = 10
dbscan = DBSCAN(eps=db_eps, min_samples=db_min_samp)
#dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
    #eps - maximum distance between samples to be considered in same neighborhood
    #min_samples - minimum samples in a neighborhood for the center to be considered a core point
dbscan.fit(X)
print('DBScan:', silhouette_score(X, dbscan.labels_))
print(pd.DataFrame(dbscan.labels_.tolist()).value_counts())

DBScan: 0.27131862741118795
-1      1841
 19     1514
 16     1379
 13      314
 38      300
        ... 
 249       6
 251       6
 253       4
 235       3
 252       1
Length: 257, dtype: int64


In [35]:
len(pd.DataFrame(dbscan.labels_.tolist()).value_counts())

257

# Spectral Clustering

We tried tweaking gamme & n_clusters repeatedly, but this algorithm, for our data, either returned extremely poor results (heavily imbalanced groups at low silhouette), or failed to run.  Research indicated this should be a good algorithm for high-dimensional data, so further investigation should be conducted.

In [17]:
k=5
spec_gamma = 10
from sklearn.cluster import SpectralClustering
spectral = SpectralClustering(n_clusters=k, gamma=spec_gamma, random_state=42)
    #n_clusters - number of clusters
    #lower gamma = less broadly defined clusters
spectral.fit(X)
print('Spectral:', silhouette_score(X, spectral.labels_), spec_gamma, )

  x = um.multiply(x, x, out=x)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  elkan_iter(
  ret = a @ b
  distances += XX
  est = KMeans(


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

# Agglomerative

This algorithm returned similar (but consistently lower) silhouette scores than kmeans, but with more consistent group sizes.  

In [39]:
from sklearn.cluster import AgglomerativeClustering
agglom = AgglomerativeClustering(n_clusters = 10,linkage="ward").fit(X)
    #n_clusters
    #linkage
print('Agglomerative:', silhouette_score(X, agglom.labels_))
print(agglom.labels_.value_counts())

Agglomerative: 0.50659191552705


AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [37]:
def learned_parameters(estimator):
    return [attrib for attrib in dir(estimator)
            if attrib.endswith("_") and not attrib.startswith("_")]
learned_parameters(agglom)

['children_',
 'feature_names_in_',
 'labels_',
 'n_clusters_',
 'n_connected_components_',
 'n_features_in_',
 'n_leaves_']

# Silhouette Score

In [11]:
from sklearn.metrics import silhouette_score
#print('sample count:',n_samples)
print('k-means:', silhouette_score(X, kmeans.labels_), k)
#print('DBScan:', silhouette_score(X, dbscan.labels_),db_eps,db_min_samp)
#print('Spectral:', silhouette_score(X, spectral.labels_), spec_gamma, )
print('Agglomerative:', silhouette_score(X, agglom.labels_))

k-means: 0.5908353762506297 6
Agglomerative: 0.50659191552705


This section is intended to review the count & distribution of clusters for kmeans/agglom.

In [12]:
pd.DataFrame(agglom.labels_.tolist()).value_counts()

4    2378
3    1795
2    1718
0    1563
5    1355
8    1126
7    1120
6     838
1     765
9     714
dtype: int64

# Cluster Comparison

This section makes a labelled copy of the X dataframe, and labels each customer_id with its closest kmeans and agglomerative clusters. 

In [13]:
X_labelled = pd.DataFrame(X).copy()
X_labelled['kmeans_clusters']=kmeans.labels_.tolist()
#X_labelled['dbscan_clusters']=dbscan.labels_.tolist()
#X_labelled['spectral_clusters']=spectral.labels_.tolist()
X_labelled['agglom_clusters']=agglom.labels_.tolist()
X_labelled

Unnamed: 0,customer_id,0,1,2,3,4,5,6,7,8,9,kmeans_clusters,agglom_clusters
0,9875403,2.983229,27.176733,16.333344,18.159441,22.485016,10.867768,15.438890,17.659956,16.022356,11.103373,5,0
1,20022095,4.848162,5.223834,7.458213,3.717671,3.993921,7.437561,8.309194,5.767206,10.405681,8.021971,3,8
2,18751734,8.573576,7.342382,6.673953,1.585556,4.933287,9.890267,6.875749,6.549332,8.229789,4.978735,4,2
3,14288103,7.144671,5.393974,7.112313,1.338546,4.279624,10.714273,8.189869,7.604131,10.508555,6.972306,1,9
4,20018440,5.003764,14.045266,7.383400,5.952122,2.138889,9.260857,11.649364,6.544845,10.652727,10.809715,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13367,12300473,8.281458,11.826859,13.410042,3.344865,13.114079,16.232502,13.898013,14.057336,13.430591,2.763047,1,0
13368,4321625,13.159775,18.255754,12.769427,5.808396,13.522941,14.917950,10.934726,14.259798,11.882689,0.000000,2,1
13369,25864288,10.005547,9.823379,7.123112,3.168521,4.650004,10.174408,6.625361,4.974505,9.212149,4.874309,0,4
13370,27493768,11.931265,15.226109,9.878992,7.368333,5.427123,7.282018,4.881672,7.036744,9.671045,7.383472,0,3


In [14]:
method = 'agglom_clusters'
X_labelled[method].value_counts()

4    2378
3    1795
2    1718
0    1563
5    1355
8    1126
7    1120
6     838
1     765
9     714
Name: agglom_clusters, dtype: int64

In [15]:
cluster_number_1 = 6
cluster_number_2 = 3

In [16]:
X_labelled[X_labelled[method]==cluster_number_1].head()

Unnamed: 0,customer_id,0,1,2,3,4,5,6,7,8,9,kmeans_clusters,agglom_clusters
17,9233948,13.287117,15.756793,7.073366,4.706351,5.206967,9.174794,7.483405,8.892846,7.702393,5.759586,5,6
22,8232208,7.527152,11.466265,7.039898,3.543985,3.448131,9.333062,8.639911,6.481568,8.451571,6.835232,5,6
23,8491075,8.490011,10.432808,6.986469,3.056189,4.187491,9.727957,9.709045,5.843378,9.003953,8.182604,5,6
24,6861292,9.129262,8.493911,7.740866,2.562065,4.898296,10.908092,9.097775,9.083282,10.706307,8.165275,2,6
25,7028373,9.103627,8.896014,7.77382,2.529035,5.214057,10.099566,10.771051,9.567203,9.93206,9.768019,2,6


In [17]:
X_labelled[X_labelled[method]==cluster_number_2].head()

Unnamed: 0,customer_id,0,1,2,3,4,5,6,7,8,9,kmeans_clusters,agglom_clusters
38,26659024,6.158948,7.503264,9.110242,3.615126,5.101214,7.648323,10.951666,5.385929,9.481445,11.862264,0,3
40,27675689,11.375908,11.608958,4.996815,4.049199,5.002155,8.435238,6.146498,5.99554,7.03269,3.324679,0,3
41,26950245,5.118316,5.085758,7.384338,0.823586,4.016681,10.778385,9.570455,2.096991,9.597076,7.24195,0,3
42,27400215,9.771663,7.798669,8.274205,2.324288,4.827536,12.599514,7.386721,5.248628,12.277944,6.875361,0,3
45,27731345,1.116381,11.758346,7.499646,8.373816,16.388126,9.735472,5.898198,7.865475,11.389377,0.0,0,3


In [18]:
#Get mean/std for each column of the two clusters to compare.
cluster_summary_1 = pd.merge(X_labelled[X_labelled[method]==cluster_number_1].mean().to_frame(name=str(cluster_number_1)+'_mean'),X_labelled[X_labelled[method]==cluster_number_1].std().to_frame(name=str(cluster_number_1)+'_std'),left_index=True, right_index=True)
cluster_summary_2 = pd.merge(X_labelled[X_labelled[method]==cluster_number_2].mean().to_frame(name=str(cluster_number_2)+'_mean'),X_labelled[X_labelled[method]==cluster_number_2].std().to_frame(name=str(cluster_number_2)+'_std'),left_index=True, right_index=True)
cluster_comparison = pd.merge(cluster_summary_1,cluster_summary_2,left_index=True,right_index=True).round(3)
#Calculate the difference between the mean/std of each cluster.
cluster_comparison['mean_dif']=cluster_comparison.iloc[:, 0] - cluster_comparison.iloc[:, 2]
cluster_comparison['std_dif']=cluster_comparison.iloc[:, 1] - cluster_comparison.iloc[:, 3]
#Display
cluster_comparison

Unnamed: 0,6_mean,6_std,3_mean,3_std,mean_dif,std_dif
customer_id,8182306.204,801866.897,27007050.0,485857.217,-18824750.0,316009.68
0,8.617,2.586,8.106,2.589,0.511,-0.003
1,11.948,6.338,6.324,3.579,5.624,2.759
2,7.761,3.69,6.975,1.891,0.786,1.799
3,4.068,3.285,2.265,2.002,1.803,1.283
4,5.096,3.167,4.891,1.859,0.205,1.308
5,8.189,2.744,10.19,2.047,-2.001,0.697
6,9.158,4.006,7.071,2.197,2.087,1.809
7,7.589,3.589,6.0,2.611,1.589,0.978
8,8.237,3.226,9.88,2.078,-1.643,1.148


# Clustering Results for test customers

### Y Training

In [19]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(Y)
print('k-means:', silhouette_score(Y, kmeans.labels_), k)

k-means: 0.5592727973169372 6


In [20]:
from sklearn.cluster import AgglomerativeClustering
agglom = AgglomerativeClustering(n_clusters = 10,linkage="ward").fit(Y)
print('Agglomerative:', silhouette_score(Y, agglom.labels_))

KeyboardInterrupt: 

In [None]:
Y_labelled = pd.DataFrame(Y)
Y_labelled['kmeans_clusters']=kmeans.labels_.tolist()
Y_labelled['agglom_clusters']=agglom.labels_.tolist()
Y_labelled

In [None]:
df_test_couples_copy = df_test_couples.copy()
#Get female cluster labels
df_test_couples_copy = df_test_couples_copy.merge(X_labelled,left_on='female_id', right_on='customer_id', how='left')
df_test_couples_copy = df_test_couples_copy.drop(['customer_id','0','1','2','3','4','5','6','7','8','9'],axis=1)
df_test_couples_copy.columns=['male_id','female_id','assumed_status','f_kmeans_clusters','f_agglom_clusters']
df_test_couples_copy
#Get male cluster labels
df_test_couples_copy = df_test_couples_copy.merge(Y_labelled,left_on='male_id', right_on='customer_id', how='left')
df_test_couples_copy = df_test_couples_copy.drop(['customer_id','0','1','2','3','4','5','6','7','8','9'],axis=1)
df_test_couples_copy.columns=['male_id','female_id','assumed_status','f_kmeans_clusters','f_agglom_clusters','m_kmeans_clusters','m_agglom_clusters']
df_test_couples_copy