# Clustering Validation

### IMPORT LIBRARIES

In [39]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
import pickle
from sklearn.metrics import normalized_mutual_info_score as norm_mutual
from sklearn.metrics import adjusted_mutual_info_score as adj_mutual

#### Define the parameters to select the correct area and time period

In [40]:
# Parameters identifying the dataset to load. They are kept as strings because
# they are concatenated directly into directory and file names.
stop = '5'       # appears in the 'Traj<stop>min' directory name
id_area = '2'    # area identifier
month = '9'      # first month of the period
n_months = '2'   # number of consecutive months in the period
week = '0'       # week selector used in the file names

# Month code: the first month followed by each subsequent month number,
# joined with "_" (month '9' with n_months '2' gives '9_10').
following_months = [str(int(month) + offset) for offset in range(1, int(n_months))]
month_code = "_".join([month] + following_months)

#### Open the dataframe of the location features

In [41]:
# Directory selected by `stop`, and the file names derived from the
# area / month / week selection (input and output share the same tag).
path = '../../../datasets/out/Traj' + stop + 'min/'
selection_tag = '_area' + id_area + '_month' + month_code + '_week' + week
file_name_in = 'loc_feat' + selection_tag + '_compl_log_norm.csv'
file_name_out = selection_tag + '_log'

# Load the location-feature table.
df = pd.read_csv(path + file_name_in)

#### Remove the attributes that are highly correlated with another attribute

For some groups of correlated columns we replace the group with its mean; when the information is simply redundant, we drop the attribute.

In [42]:
def _append_pair_means(frame, pairs):
    """Return `frame` with one new column per entry of `pairs`, holding the mean
    of its two source columns, and with every source column removed.

    `pairs` maps new column name -> (first source column, second source column).
    """
    means = {new: (frame[first] + frame[second]) / 2
             for new, (first, second) in pairs.items()}
    sources = [col for pair in pairs.values() for col in pair]
    return frame.assign(**means).drop(columns=sources)


def _move_to_end(frame, names):
    """Return `frame` with the columns in `names` (those present) moved last,
    in the order given by `names`."""
    leading = [col for col in frame if col not in names]
    trailing = [col for col in names if col in frame]
    return frame[leading + trailing]


# Start from the loaded table without the identifier columns and "support".
df_corr = df.drop(columns=['vehicle', 'loc_id']).drop(columns=["support"])

# Day/night stay statistics are collapsed into a single mean per day type.
stay_pairs = {
    "avg_stay_weekday": ("avg_stay_weekday_day", "avg_stay_weekday_night"),
    "avg_stay_weekend": ("avg_stay_weekend_day", "avg_stay_weekend_night"),
    "std_stay_weekday": ("std_stay_weekday_day", "std_stay_weekday_night"),
    "std_stay_weekend": ("std_stay_weekend_day", "std_stay_weekend_night"),
}
df_corr = _append_pair_means(df_corr, stay_pairs)

# Leave/arrive times are collapsed into a single mean time per day type and period.
time_pairs = {
    "avg_time_weekday_day": ("avg_leave_weekday_day", "avg_arrive_weekday_day"),
    "avg_time_weekend_day": ("avg_leave_weekend_day", "avg_arrive_weekend_day"),
    "avg_time_weekday_night": ("avg_leave_weekday_night", "avg_arrive_weekday_night"),
    "avg_time_weekend_night": ("avg_leave_weekend_night", "avg_arrive_weekend_night"),
}
df_corr = _append_pair_means(df_corr, time_pairs)

# These attributes are dropped outright rather than averaged.
df_corr = df_corr.drop(columns=["avg_leave_mov_duration", "avg_arrive_mov_duration",
                                "std_leave_mov_duration", "std_arrive_mov_duration"])
df_corr = df_corr.drop(columns=["centrality5K", "rev_centrality3", "rev_centrality8", "rev_centrality10"])

# move the collective features as the last columns of the dataframe
columns_df_c = ["exclusivity", "centrality1K", "centrality15K", "rev_centrality1", "rev_centrality5", "rev_centrality20"]
df_corr = _move_to_end(df_corr, columns_df_c)

# move the geographical features as the last columns of the dataframe
categories = ["gas", "parking", "pier", "hotel", "food", "leisure", "shop", "service", "supermarket"]
columns_df_g = [prefix + c for prefix in ("n_", "k_", "d_") for c in categories]
df_corr = _move_to_end(df_corr, columns_df_g)

# Temporal Clustering

#### Select only the stay-time features and the averaged leave/arrive time features

In [43]:
# Temporal view: the averaged stay statistics followed by the averaged
# leave/arrive time features.
temporal_columns = [
    'avg_stay_weekday', 'avg_stay_weekend',
    'std_stay_weekday', 'std_stay_weekend',
    'avg_time_weekday_day', 'avg_time_weekend_day',
    'avg_time_weekday_night', 'avg_time_weekend_night',
]
df_temp = df_corr[temporal_columns]

In [44]:
# Display summary statistics of the temporal feature columns.
df_temp.describe()

Unnamed: 0,avg_stay_weekday,avg_stay_weekend,std_stay_weekday,std_stay_weekend,avg_time_weekday_day,avg_time_weekend_day,avg_time_weekday_night,avg_time_weekend_night
count,109335.0,109335.0,109335.0,109335.0,109335.0,109335.0,109335.0,109335.0
mean,0.194373,0.033277,0.157336,0.041639,0.515622,0.087821,0.115397,0.038945
std,0.225787,0.122519,0.215122,0.09981,0.271083,0.266313,0.288635,0.175829
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.050459,0.0,0.351111,0.0,0.0,0.0
50%,0.197314,0.0,0.050459,0.0,0.584108,0.0,0.0,0.0
75%,0.271173,0.0,0.214116,0.053577,0.715958,0.0,0.0,0.0
max,0.999863,1.0,1.0,1.0,0.999693,0.999776,0.999881,0.999815


#### Run k-means with k=6

In [45]:
# Fit k-means on the temporal features: 6 clusters, k-means++ initialisation,
# n_init=10 and a fixed random_state so the run is repeatable.
kmeans_t = KMeans(n_clusters=6, init='k-means++', n_init=10,
                  max_iter=300, random_state=123).fit(df_temp)

# Keep the per-row cluster labels and the fitted centroids.
labels_kmeans_t = kmeans_t.labels_
centroids_kmeans_t = kmeans_t.cluster_centers_

In [46]:
# Number of rows assigned to each temporal cluster label.
cluster_ids, cluster_sizes = np.unique(labels_kmeans_t, return_counts=True)
dict(zip(cluster_ids, cluster_sizes))

{0: 25812, 1: 11106, 2: 8087, 3: 10095, 4: 3300, 5: 50935}

# Spatial Clustering

#### Select only the location prototype coordinates

In [47]:
# Spatial view: only the two location-prototype coordinate columns.
spatial_columns = ['loc_proto_lat', 'loc_proto_lon']
df_spatial = df_corr[spatial_columns]

In [48]:
# Display summary statistics of the two coordinate columns.
df_spatial.describe()

Unnamed: 0,loc_proto_lat,loc_proto_lon
count,109335.0,109335.0
mean,0.479416,0.749837
std,0.061089,0.055686
min,0.0,0.0
25%,0.449828,0.736013
50%,0.472772,0.753163
75%,0.505175,0.772169
max,1.0,1.0


#### Run k-means with k=6

In [49]:
# Fit k-means on the coordinates: 6 clusters, k-means++ initialisation,
# n_init=10 and a fixed random_state so the run is repeatable.
kmeans_s = KMeans(n_clusters=6, init='k-means++', n_init=10,
                  max_iter=300, random_state=123).fit(df_spatial)

# Keep the per-row cluster labels and the fitted centroids.
labels_kmeans_s = kmeans_s.labels_
centroids_kmeans_s = kmeans_s.cluster_centers_

In [50]:
# Number of rows assigned to each spatial cluster label.
cluster_ids, cluster_sizes = np.unique(labels_kmeans_s, return_counts=True)
dict(zip(cluster_ids, cluster_sizes))

{0: 53260, 1: 3106, 2: 33146, 3: 1742, 4: 12151, 5: 5930}

# Semantic Clustering

#### Retrieve the result using our clustering technique

In [51]:
# Load the pickled clustering result and keep its "link_cluster" entry.
# NOTE(review): pickle.load can execute arbitrary code; only open files that
# this project produced itself.
link_cluster_file = path + "link_cluster" + file_name_out + '.pickle'
with open(link_cluster_file, 'rb') as fp:
    df_link = pickle.load(fp)
link_cluster = df_link["link_cluster"]

In [52]:
# Number of entries carrying each semantic (link) cluster label.
cluster_ids, cluster_sizes = np.unique(link_cluster, return_counts=True)
dict(zip(cluster_ids, cluster_sizes))

{1: 6470, 2: 9122, 3: 6563, 4: 14035, 5: 28748, 6: 44397}

# Compare Methods

#### Compute an n_locs × n_locs matrix, where c_ij = 1 if loc_i and loc_j are in the same cluster and 0 otherwise

In [None]:
# # temporal clustering
# M_t = []
# for i in labels_kmeans_t:
#     row = []
#     for j in labels_kmeans_t:
#         if i == j:
#             row.append(1)
#         else:
#             row.append(0)
#     M_t.append(row)

In [45]:
# with open(path+"matrix_t"+file_name_out+'.pickle', 'wb') as fp:
#     pickle.dump(M_t, fp)

In [None]:
# # spatial clustering
# M_s = []
# for i in labels_kmeans_s:
#     row = []
#     for j in labels_kmeans_s:
#         if i == j:
#             row.append(1)
#         else:
#             row.append(0)
#     M_s.append(row)

In [None]:
# with open(path+"matrix_s"+file_name_out+'.pickle', 'wb') as fp:
#     pickle.dump(M_s, fp)

In [None]:
# # semantic clustering
# M = []
# for i in link_cluster:
#     row = []
#     for j in link_cluster:
#         if i == j:
#             row.append(1)
#         else:
#             row.append(0)
#     M.append(row)

In [None]:
# M.size()

#### (METHOD 1) Count how many pairs of locations are clustered together by the different methods

#### (METHOD 2) Using sklearn normalized_mutual_info_score

In [53]:
# Normalized mutual information for each pair of labelings, in the order:
# semantic vs temporal, semantic vs spatial, spatial vs temporal.
nmi_pairs = [
    (link_cluster, labels_kmeans_t),
    (link_cluster, labels_kmeans_s),
    (labels_kmeans_s, labels_kmeans_t),
]
nmi_M_Mt, nmi_M_Ms, nmi_Ms_Mt = (
    norm_mutual(first, second, average_method="arithmetic")
    for first, second in nmi_pairs
)
print(nmi_M_Mt, nmi_M_Ms, nmi_Ms_Mt)

0.15799538753521084 0.11864595623337867 0.0035988934447113226


#### (METHOD 3) Using sklearn adjusted_mutual_info_score

In [54]:
# Adjusted mutual information for each pair of labelings, in the order:
# semantic vs temporal, semantic vs spatial, spatial vs temporal.
ami_pairs = [
    (link_cluster, labels_kmeans_t),
    (link_cluster, labels_kmeans_s),
    (labels_kmeans_s, labels_kmeans_t),
]
ami_M_Mt, ami_M_Ms, ami_Ms_Mt = (
    adj_mutual(first, second, average_method="arithmetic")
    for first, second in ami_pairs
)
print(ami_M_Mt, ami_M_Ms, ami_Ms_Mt)

0.15793056959534332 0.11857408767818622 0.0035153439511819754
