# K-Means Clustering 



# Setup 


In [0]:
%%capture dependency_log
!pip install jellyfish tqdm matplotlib

In [0]:
import numpy as np
import pandas as pd
import tqdm

import uuid

import jellyfish as jf
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import HTML, display, clear_output

In [0]:
# Authenticate to Google Cloud.
# First try the Colab helper; if it cannot be imported or fails, fall back to
# google.auth.default(). If that fails too, stop with a clear message.
# `except Exception` (not a bare `except`) lets KeyboardInterrupt / SystemExit
# propagate instead of being silently treated as an authentication failure.
try:
    from google.colab import auth
    auth.authenticate_user()
except Exception:
    try:
        import google.auth
        credentials, project = google.auth.default()
    except Exception as auth_error:
        # Chain the original error so the root cause stays visible in the traceback.
        raise Exception('Google cloud authentication required!') from auth_error

In [0]:
%env GOOGLE_CLOUD_PROJECT=anz-pso-nfaggian

project_id = 'anz-pso-nfaggian'

env: GOOGLE_CLOUD_PROJECT=anz-pso-nfaggian


# Feature Generation 

We can use simple letter counting as a way to represent names and addresses. The query to create such a dataset would look like this:


**length(REGEXP_REPLACE(concat(IFNULL(first_name,""), IFNULL(last_name,"")), '[^a]', '')) a_num,**

The SQL snippet concatenates first_name with last_name and then removes every character except 'a', so the length of what remains is the number of times 'a' occurs. We would add 26 such expressions - one for each letter.

The address fields are handled in the same way:

**length(REGEXP_REPLACE(concat(IFNULL(address_1," "), IFNULL(address_2, " ")), '[^a]', '')) a_num_addr,**


We have already created a processed table for this task. The table is named `donors_features` and can be created in less than 5 minutes of query execution time.

In [33]:
# Pull every row and column of the pre-computed feature table from BigQuery.
query = """
SELECT
  * 
FROM
  record_link.donors_features 
"""

# Read the query result into a DataFrame and key it by donor_id in one chain.
donors = (
    pd.read_gbq(query, project_id=project_id, dialect='standard')
    .set_index("donor_id")
)
donors.head()

Unnamed: 0_level_0,a_num,b_num,c_num,d_num,e_num,f_num,g_num,h_num,i_num,j_num,...,q_num_addr,r_num_addr,s_num_addr,t_num_addr,u_num_addr,v_num_addr,w_num_addr,x_num_addr,y_num_addr,z_num_addr
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
560876,2,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,1,0,0,0
539573,2,0,2,0,1,0,0,0,1,0,...,0,2,0,0,0,1,0,1,0,0
697327,2,0,0,1,1,0,0,0,2,0,...,0,2,0,0,0,0,1,0,2,0
592777,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
593596,0,1,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:

from sklearn.cluster import KMeans

# Fit only on the feature columns. If this cell is re-run, `donors` already
# carries a `labels` column from the previous run; dropping it keeps the old
# cluster assignments from being fed back in as a feature.
donor_features = donors.drop(columns='labels', errors='ignore')

# Fixed seed so the cluster assignments are reproducible across runs.
kmeans = KMeans(n_clusters=20, random_state=42)
labels = kmeans.fit_predict(donor_features)
donors['labels'] = labels

donors.head()

Unnamed: 0_level_0,a_num,b_num,c_num,d_num,e_num,f_num,g_num,h_num,i_num,j_num,...,r_num_addr,s_num_addr,t_num_addr,u_num_addr,v_num_addr,w_num_addr,x_num_addr,y_num_addr,z_num_addr,labels
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
560876,2,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,10
539573,2,0,2,0,1,0,0,0,1,0,...,2,0,0,0,1,0,1,0,0,18
697327,2,0,0,1,1,0,0,0,2,0,...,2,0,0,0,0,1,0,2,0,18
592777,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
593596,0,1,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,18
