## K-Means Machine Learning Algorithm
This notebook uses the K-Means algorithm to create personas based on person and household attributes from PUMS data. This is an unsupervised learning technique because the personas are not labeled in advance. The attributes chosen are ones that work well with clustering and are easy to communicate. We will then be able to use the data to describe meaningful attributes of real types of transportation users. 

In [92]:
import math
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

In [82]:
# Read the preprocessed data; the first CSV column becomes the row index
df_data = pd.read_csv(
    'pums_processed.csv',
    index_col=0,
)

In [83]:
# Drop the characteristics that are excluded from the clustering features.
# Note: this rebinds df_data, so running the cell a second time without reloading
# the CSV raises KeyError (the labels are already gone).
df_data = df_data.drop(columns=[
    'AGEP',
    'WHITE_ALONE', 'BLACK_ALONE', 'NATIVE_INDIAN', 'ASIAN_ALONE',
    'PACIFIC_ISLANDER', 'OTHER_RACE', 'HISP',
    'DOWNTOWN', 'NORTHEAST', 'NORTHWEST', 'SOUTHEAST', 'WEST',
])

In [84]:
# Prepare the K-Means input: keep only complete rows, then z-score every column.
# index_nomissing holds the SERIALNO of each complete row; df_nonan is the same
# set of rows keyed by SERIALNO.
index_nomissing = df_data.dropna()['SERIALNO']
df_nonan = df_data.dropna().set_index('SERIALNO')

# Standardize column-wise: subtract the column mean, divide by the column std.
# NOTE(review): every remaining column is standardized here, WGTP included, so the
# weight column is part of whatever is fitted on normalized_cluster - confirm intended.
normalized_cluster = df_nonan.sub(df_nonan.mean()).div(df_nonan.std())

In [85]:
# Preview the first five standardized rows (head() defaults to n=5)
normalized_cluster.head()

Unnamed: 0_level_0,HINCP,NP,WGTP,DDRS,ENG,DEAR,DEYE,DOUT,MIG,JWMNP,BIKING,DRIVING,TRANSIT,WALKING,MALE,FEMALE,SCHL
SERIALNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
127,-0.860447,-0.11914,0.004158,0.156246,2.87048,0.216603,0.14193,0.240739,-0.502748,-1.031821,-1.53168e-15,-2.085899e-14,4.998803e-16,0.374925,-1.410049,1.410049,-0.857605
747,0.921545,1.507844,-0.654642,0.156246,-1.240797e-14,0.216603,0.14193,0.240739,-0.502748,-0.085906,-0.2731775,0.9734808,-0.692119,-0.523713,0.739851,-0.739851,-0.330261
1984,-0.564809,0.694352,-0.723267,0.156246,-1.240797e-14,0.216603,0.14193,0.240739,2.22114,-0.506312,-0.2731775,0.9734808,-0.692119,-0.523713,0.500973,-0.500973,-0.593933
2319,-0.687311,0.694352,-0.448767,0.156246,-0.5607805,0.216603,-2.91813,0.240739,2.22114,-0.506312,-0.2731775,0.9734808,-0.692119,-0.523713,0.500973,-0.500973,-1.297059
2975,-0.385957,0.694352,-0.613467,0.156246,-1.240797e-14,0.216603,0.14193,0.240739,2.22114,0.334501,-0.2731775,0.9734808,-0.692119,-0.523713,-1.410049,1.410049,-0.418152


In [86]:
# K-Means clustering with n_clusters=5
CLUSTER = 'cluster'

# n_init is pinned to 10 (the long-standing scikit-learn default) so the fit does not
# depend on the installed scikit-learn version; the default became 'auto' in 1.4.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(normalized_cluster)

# Cluster labels indexed by the SERIALNO values of the complete rows
clustered = pd.DataFrame({CLUSTER: kmeans.labels_}, index=index_nomissing)

# kmeans.labels_ follows the row order of normalized_cluster, which was derived
# row-for-row from df_nonan, so the labels are attached positionally. An index join
# on SERIALNO would multiply rows if SERIALNO were ever non-unique.
df_final = df_nonan.assign(**{CLUSTER: kmeans.labels_})

In [87]:
# Print the number of rows assigned to each cluster label 0..k-1, one count per line
n_labels = len(df_final['cluster'].unique())
for label in range(n_labels):
    members = df_final['cluster'] == label
    print(int(members.sum()))

110
462
509
1038
462


In [88]:
# Aggregate to the cluster level: unweighted mean of each listed attribute per cluster
cluster_summary = df_final.groupby('cluster')[[
    'DDRS', 'ENG', 'DEAR', 'DEYE', 'DOUT',
    'MIG', 'JWMNP',
    'BIKING', 'DRIVING', 'TRANSIT', 'WALKING',
    'MALE', 'FEMALE', 'SCHL',
]].mean()

# NOTE(review): d / df are not used by any active code visible in this notebook, and
# the labels 1-5 do not match the 0-4 labels in the 'cluster' column - confirm whether
# they are still needed.
d = dict(cluster=list(range(1, 6)))
df = pd.DataFrame(d)

In [89]:
# Show the per-cluster attribute means (head() defaults to n=5)
cluster_summary.head()

Unnamed: 0_level_0,DDRS,ENG,DEAR,DEYE,DOUT,MIG,JWMNP,BIKING,DRIVING,TRANSIT,WALKING,MALE,FEMALE,SCHL
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.588788,0.662804,0.811818,0.88,0.153788,0.357576,2.708333,0.044924,0.637093,0.208932,0.213477,0.315303,0.684697,1.74197
1,1.0,0.524235,0.931818,0.98088,0.99531,0.30772,14.785354,0.033413,0.693655,0.102413,0.215328,0.006494,0.993506,3.431457
2,0.999018,0.51536,0.959725,0.982318,0.994761,0.473772,13.916143,0.045769,0.584971,0.098731,0.315168,0.93019,0.06981,3.062901
3,0.996112,0.523349,0.982473,0.995182,0.986293,0.256707,13.677198,0.057392,0.835082,0.076182,0.052746,0.511814,0.488186,3.015745
4,0.997707,0.501725,0.99446,0.991625,0.986111,0.570648,29.917754,0.018499,0.107465,0.856043,0.018813,0.491736,0.508264,3.468831


### Applying Survey Weights to Each Cluster Summary

In [90]:
def multiply_weights(column, weights):
    """Return ``column`` scaled by ``weights`` using the ``*`` operator.

    Both arguments are passed straight to ``*``, so any alignment or
    broadcasting is whatever their types define for multiplication.
    """
    weighted = column * weights
    return weighted

def divide_weights(column, weights):
    """Return ``column`` divided by ``weights`` using the ``/`` operator.

    Both arguments are passed straight to ``/``, so any alignment or
    broadcasting is whatever their types define for true division.
    """
    unweighted = column / weights
    return unweighted

def cluster_summary_with_weights(clustered_dat, weight_col='WGTP', cluster_col='cluster'):
    """Compute weighted per-cluster means plus an overall weighted baseline row.

    For every feature column the weighted mean is ``sum(value * weight) / sum(weight)``,
    computed once per cluster and once over all rows.

    Parameters
    ----------
    clustered_dat : pandas.DataFrame
        One row per observation. Must contain the feature columns listed in
        ``feature_cols`` below, the weight column and the cluster-label column.
    weight_col : str, default 'WGTP'
        Name of the column holding each row's (raw, un-normalized) weight.
    cluster_col : str, default 'cluster'
        Name of the column holding each row's cluster label.

    Returns
    -------
    pandas.DataFrame
        One row per cluster label (indexed by the label), followed by a final row
        labelled ``'neutral'`` with the weighted mean over all rows. Columns are
        the feature columns; the weight and cluster columns are not included.

    Raises
    ------
    KeyError
        If any required column is missing from ``clustered_dat``.

    Notes
    -----
    If the weights of a cluster (or of the whole frame) sum to zero, the
    corresponding row contains NaN/inf rather than raising.
    """
    feature_cols = ['ENG', 'DEYE', 'DOUT', 'MIG', 'JWMNP',
                    'BIKING', 'DRIVING', 'TRANSIT',
                    'WALKING', 'MALE', 'FEMALE', 'SCHL']

    # Use the weights carried by the argument itself (not a z-scored copy from the
    # notebook's global state, which can be negative). Plain arrays keep everything
    # positional, so a non-unique row index cannot cause misalignment.
    weights = clustered_dat[weight_col].to_numpy()

    # value * weight for every feature; the weight column itself is kept unscaled
    # so that its per-cluster sum is the denominator of the weighted mean.
    weighted_dat = clustered_dat[feature_cols].mul(weights, axis=0)
    weighted_dat[weight_col] = weights
    weighted_dat[cluster_col] = clustered_dat[cluster_col].to_numpy()

    cluster_sums = weighted_dat.groupby(cluster_col).sum()
    weighted_cluster_means = cluster_sums[feature_cols].div(cluster_sums[weight_col], axis=0)

    # Neutral element: the weighted mean over all rows (baseline for the personas).
    # The cluster label is deliberately excluded from this sum.
    totals = weighted_dat[feature_cols].sum()
    weighted_neutral = (totals / weights.sum()).rename('neutral')

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([weighted_cluster_means, weighted_neutral.to_frame().T])

In [93]:
#weighted_cluster_summary = cluster_summary_with_weights(df_final)
#weighted_cluster_summary