In [3]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from global_variables import *
from clustering import *
import os

# Clustering

Clustering on the encoded data and projecting the resulting labels onto the original data allows us to gauge which similarity measure was discovered and used for comparison.

In [30]:
def _read_by_customer(filename):
    """Read `filename` from `data_path` as a DataFrame indexed by `customer_id`."""
    return pd.read_csv(os.path.join(data_path, filename), index_col='customer_id')


# Encoded datasets: females and males for each of the `sep` and `joint` variants
encoded_females_sep = _read_by_customer('encoded_females_sep.csv')
encoded_males_sep = _read_by_customer('encoded_males_sep.csv')
encoded_females_joint = _read_by_customer('encoded_females_joint.csv')
encoded_males_joint = _read_by_customer('encoded_males_joint.csv')

# Full-columned datasets, indexed by customer_id like the encoded ones
females = _read_by_customer('females.csv')
males = _read_by_customer('males.csv')

# Sampled matches for each variant; no index column is set for these
matches_joint = pd.read_csv(os.path.join(data_path, 'matches_sample_joint.csv'))
matches_sep = pd.read_csv(os.path.join(data_path, 'matches_sample_sep.csv'))

# Datasets capturing the differences within pairs, used to determine types of couples.
# Each call receives (males, females, matches) in that order.
encoded_diff_joint = match_difference(encoded_males_joint, encoded_females_joint, matches_joint)
encoded_diff_sep = match_difference(encoded_males_sep, encoded_females_sep, matches_sep)
diff_joint = match_difference(males, females, matches_joint)
diff_sep = match_difference(males, females, matches_sep)

In [5]:
from sklearn.cluster import KMeans

# K-means configuration: the cluster count and an integer seed for the estimator.
k, random_state = 6, 42
kmeans = KMeans(n_clusters=k, random_state=random_state)

In [6]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative configuration: the cluster count and the linkage criterion.
n_clusters, linkage = 10, 'ward'
agglom = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)

# Training

In [61]:
# Individuals: run cluster() with the k-means estimator on each encoded dataset,
# paired with the matching full-columned dataset. Calls run in the listed order;
# each one prints the estimator and a number (see the cell output).
(
    kmeans_female_sep_centroids,
    kmeans_male_sep_centroids,
    kmeans_female_joint_centroids,
    kmeans_male_joint_centroids,
) = [
    cluster(kmeans, encoded, full)
    for encoded, full in (
        (encoded_females_sep, females),
        (encoded_males_sep, males),
        (encoded_females_joint, females),
        (encoded_males_joint, males),
    )
]

# Couples: the same call on the within-pair difference datasets
kmeans_diff_joint_centroids = cluster(kmeans, encoded_diff_joint, diff_joint)
kmeans_diff_sep_centroids = cluster(kmeans, encoded_diff_sep, diff_sep)

KMeans(n_clusters=6, random_state=42) :
	 0.1685218761957051
KMeans(n_clusters=6, random_state=42) :
	 0.2343569652565167
KMeans(n_clusters=6, random_state=42) :
	 0.2496968779053669
KMeans(n_clusters=6, random_state=42) :
	 0.2572236367155692
KMeans(n_clusters=6, random_state=42) :
	 0.21057332679094903
KMeans(n_clusters=6, random_state=42) :
	 0.14408024789279164


In [None]:
# Individuals: run cluster() with the agglomerative estimator on each encoded
# dataset, paired with the matching full-columned dataset, in the listed order.
(
    agglom_female_sep_centroids,
    agglom_male_sep_centroids,
    agglom_female_joint_centroids,
    agglom_male_joint_centroids,
) = [
    cluster(agglom, encoded, full)
    for encoded, full in (
        (encoded_females_sep, females),
        (encoded_males_sep, males),
        (encoded_females_joint, females),
        (encoded_males_joint, males),
    )
]

# Couples: the same call on the within-pair difference datasets
agglom_diff_joint_centroids = cluster(agglom, encoded_diff_joint, diff_joint)
agglom_diff_sep_centroids = cluster(agglom, encoded_diff_sep, diff_sep)

# Persona

## Isolated Perspective

In [65]:
# K-means result for females, `sep` encoding. The rendered table marks some cells
# 'U' or 'L' (presumably upper/lower extremes -- confirm in cluster_extrema).
cluster_extrema(kmeans_female_sep_centroids)

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,customer_education_level,customer_job_status
0,U,L,0.086689,L,-0.017099,U,0.204142,0,0,0,...,-0.026016,-0.02955,0.093939,-0.04078,-0.082134,-0.100909,-0.230272,-0.245095,L,8.44008
1,U,L,U,L,L,L,L,0,0,0,...,L,L,L,L,L,L,L,L,U,U
2,L,0.051372,U,0.043374,U,U,U,0,0,0,...,U,U,U,U,U,U,U,U,U,L
3,L,U,L,U,U,-0.069166,0.326506,0,0,0,...,-0.081696,-0.063678,0.05596,-0.029222,-0.014051,0.099114,-0.048077,-0.032508,5.641026,8.556777
4,0.089468,U,L,U,L,L,L,0,0,0,...,L,L,L,L,L,L,L,L,L,L
5,0.060867,0.026465,0.057289,0.012456,0.01319,0.156475,U,0,0,0,...,U,U,U,U,U,U,U,U,5.895062,U


In [66]:
# K-means result for males, `sep` encoding. The rendered table marks some cells
# 'U' or 'L' (presumably upper/lower extremes -- confirm in cluster_extrema).
cluster_extrema(kmeans_male_sep_centroids)

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,customer_education_level,customer_job_status
0,U,-0.129487,0.101023,-0.087659,0.039428,0.088891,0.342827,0,0,0,...,0.164536,0.346466,0.338355,0.478858,0.406463,0.395012,0.383733,0.434119,U,7.83522
1,L,U,L,U,U,U,U,0,0,0,...,U,U,U,U,U,U,U,U,5.407975,L
2,U,L,0.097299,L,L,U,0.043305,0,0,0,...,L,L,L,L,L,L,L,L,L,7.627874
3,L,U,L,U,-0.006938,L,L,0,0,0,...,-0.052,0.005831,0.058153,0.028133,0.042442,0.081453,-0.120192,-0.112871,5.4375,U
4,0.100687,L,U,L,L,L,L,0,0,0,...,L,L,L,L,L,L,L,L,L,U
5,0.056263,-0.106881,U,-0.089068,U,0.384718,U,0,0,0,...,U,U,U,U,U,U,U,U,U,L


In [67]:
# K-means result for females, `joint` encoding. The rendered table marks some cells
# 'U' or 'L' (presumably upper/lower extremes -- confirm in cluster_extrema).
cluster_extrema(kmeans_female_joint_centroids)

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,customer_education_level,customer_job_status
0,0.036823,-0.098827,0.04307,L,L,0.00534,L,0,0,0,...,L,L,L,L,L,L,L,L,L,U
1,U,L,U,-0.036704,-0.005907,0.087582,0.353897,0,0,0,...,0.113654,U,U,U,U,U,0.437919,0.363416,5.529818,8.609419
2,L,U,L,U,-0.004797,L,0.355048,0,0,0,...,0.012325,-0.025587,0.065157,-0.026732,-0.034154,0.035881,0.040285,0.163856,5.576923,8.60989
3,-0.326792,-0.063265,-0.262467,0.019352,U,U,U,0,0,0,...,U,U,U,U,U,U,U,U,U,L
4,U,L,U,L,L,L,L,0,0,0,...,L,L,L,L,L,L,L,L,L,U
5,L,U,L,U,U,U,U,0,0,0,...,U,0.032412,0.050374,0.269813,0.067497,0.190432,U,U,U,L


In [68]:
# K-means result for males, `joint` encoding. The rendered table marks some cells
# 'U' or 'L' (presumably upper/lower extremes -- confirm in cluster_extrema).
cluster_extrema(kmeans_male_joint_centroids)

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,customer_education_level,customer_job_status
0,0.009071,-0.138201,0.071129,-0.082182,0.11445,U,U,0,0,0,...,U,U,U,U,U,U,U,U,U,L
1,U,L,U,L,L,0.09749,L,0,0,0,...,L,L,L,L,L,L,L,L,L,U
2,L,U,L,U,U,L,L,0,0,0,...,L,L,L,L,L,L,L,L,U,U
3,U,L,U,L,L,L,0.039407,0,0,0,...,-0.009947,0.114157,0.178221,0.094628,0.03394,0.082554,-0.089439,-0.13882,L,8.315735
4,L,U,L,U,0.016824,0.00655,0.205023,0,0,0,...,-0.008215,0.079535,0.153432,0.067022,0.116836,0.152791,-0.061836,-0.052332,5.364758,8.154185
5,-0.042804,-0.043229,0.020909,-0.099971,U,U,U,0,0,0,...,U,U,U,U,U,U,U,U,5.396197,L


## Relative to Partner Perspective

In [63]:
# K-means result for the within-pair differences, `joint` matches. The rendered table
# marks some cells 'U' or 'L' (presumably upper/lower extremes -- confirm in cluster_extrema).
cluster_extrema(kmeans_diff_joint_centroids)

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,customer_education_level,customer_job_status
0,0.075878,-0.025374,L,U,U,U,U,0,0,0,...,U,U,U,U,U,U,U,U,U,L
1,U,-0.106364,0.006305,L,L,L,L,0,0,0,...,L,L,L,L,L,L,L,L,L,U
2,L,U,0.032846,-0.051502,-0.019231,U,-0.133309,0,0,0,...,-0.140048,L,L,-0.364846,-0.301985,L,-0.294086,-0.185675,-0.150997,L
3,U,L,U,L,L,-0.254017,L,0,0,0,...,L,-0.149704,-0.328737,L,L,-0.043361,L,L,0.153846,U
4,0.052692,L,U,-0.072926,0.001078,L,0.115286,0,0,0,...,0.079818,U,U,U,U,U,U,0.251366,L,-0.385057
5,L,U,L,U,U,-0.230181,U,0,0,0,...,U,-0.16919,0.020892,-0.031702,0.014083,-0.120474,0.221644,U,U,-0.4375


In [64]:
# K-means result for the within-pair differences, `sep` matches. The rendered table
# marks some cells 'U' or 'L' (presumably upper/lower extremes -- confirm in cluster_extrema).
cluster_extrema(kmeans_diff_sep_centroids)

Unnamed: 0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,...,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std,customer_education_level,customer_job_status
0,U,-0.126871,L,U,-0.006343,L,L,0,0,0,...,L,L,L,L,L,L,L,L,L,U
1,L,-0.113213,0.010554,-0.02452,L,L,-0.069946,0,0,0,...,0.048273,U,0.110317,U,0.197875,0.209717,U,0.158456,U,U
2,L,U,L,U,U,-0.199639,U,0,0,0,...,U,L,U,-0.080178,U,U,-0.005472,U,-0.272727,L
3,0.024476,L,U,L,U,U,U,0,0,0,...,U,U,U,U,U,U,U,U,U,L
4,0.01857,U,0.04786,-0.052748,-0.005702,U,0.168177,0,0,0,...,-0.055066,-0.006147,-0.076235,-0.120067,-0.096854,-0.143963,-0.032918,-0.036364,L,-1.169192
5,U,L,U,L,L,-0.250493,L,0,0,0,...,L,-0.1248,L,L,L,L,L,L,-0.315789,-1.052632
