# SEMANTIC CLUSTERING OF THE LOCATIONS
# (USING THE ZSCORE DATAFRAME)

### IMPORT LIBRARIES

In [None]:
from collections import defaultdict
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm as cm
import matplotlib.patches as mpatches
import numpy as np
import seaborn as sns
from sklearn.metrics import *
from sklearn.cluster import KMeans
from pandas.plotting import parallel_coordinates
import pickle
import folium
from folium import plugins
from folium.plugins import HeatMap
import selenium.webdriver

# Use a serif font family for all the figures drawn in this notebook.
plt.rcParams["font.family"] = 'serif'

#### Define the parameters to select the correct area and time period

In [None]:
# Selection parameters; they are kept as strings because they are
# concatenated into the dataset path and file names further down.
stop = '5'
id_area = '11'
month = '9'
n_months = '2'
week = '0'

# Month code: the starting month followed by every consecutive month of the
# period, joined by "_" (just the starting month when n_months is "1").
following_months = [] if n_months == "1" else [str(int(month) + m) for m in range(1, int(n_months))]
month_code = "_".join([month] + following_months)

#### Open the dataframe of the location features

In [None]:
# Input folder and file names derived from the selection parameters.
path = f'../../../datasets/out/Traj{stop}min/'
file_name_in = f'loc_feat_area{id_area}_month{month_code}_week{week}_compl_zscore.csv'
file_name_out = f'_area{id_area}_month{month_code}_week{week}_zscore'

# Location features, one row per location.
df = pd.read_csv(path + file_name_in)

print("the number of different vehicles is", df["vehicle"].nunique(dropna=False))
print("the total number of locations is", len(df))

In [None]:
# Summary statistics of the loaded dataframe (displayed as the cell output).
df.describe()

#### Remove the columns for the vehicle and the location id that are not relevant right now

In [None]:
# Feature-only frame: `df` itself keeps the vehicle and location identifiers.
df_corr = df.drop(columns=['vehicle', 'loc_id'])

#### We plot the distribution of the variables

In [None]:
# Draw the distribution of the attributes, one small plot per column.
fig = plt.figure(figsize=(100, 100))
fig_dims = (8, 9)  # (rows, columns) of the subplot grid

# Plot type of each column: the first column is drawn as a pie, the next two
# are skipped and every other column is drawn as a sorted line.  A "bar"
# (histogram) type is also supported below and can be assigned by hand.
plot_type = ["pie", "ignore", "ignore"] + ["line"] * max(len(df_corr.columns) - 3, 0)

plt.rcParams["font.size"] = 13

# Number of skipped columns so far, so that the drawn plots fill the grid
# without leaving holes.
skip = 0

# NOTE(review): as in the original loop bounds, the last column is not drawn;
# confirm this is intended.
for i, k in enumerate(df_corr.columns[:-1]):
    t = plot_type[i]

    if t == "ignore":
        # Skip before creating the axes: recent matplotlib releases no longer
        # remove an overlapped axes, so an empty one created here would stay
        # visible underneath the next plot drawn in the same grid cell.
        skip += 1
        continue

    grid_row, grid_col = divmod(i - skip, fig_dims[1])
    ax = plt.subplot2grid(fig_dims, (grid_row, grid_col))

    if t == "pie":
        # NOTE(review): value_counts() orders the slices by frequency, so the
        # fixed labels assume 'Not Regular' is the most frequent value - confirm.
        labels = 'Not Regular', 'Regular'
        sizes = df_corr[k].value_counts()
        explode = (0, 0.05)
        c = ["#97c170", "#dde37a", "#e1bd66", "#EAC435"]
        _, texts, ltexts = ax.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
                                  shadow=True, startangle=50, colors=c)
        for label_text, pct_text in zip(texts, ltexts):
            label_text.set_fontsize(12)
            pct_text.set_fontsize(12)
        ax.axis('equal')
        plt.title(k)

    elif t == "line":
        # sorted values of the attribute over all the locations
        x = range(0, len(df_corr))
        y = sorted(df_corr[k])
        plt.plot(x, y, color='#EAC435', linewidth=2.5)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.xlabel("locations", fontsize=13)
        plt.ylabel(k, fontsize=13)
        plt.grid(True)
        plt.title(k)

    elif t == "bar":
        # histogram of the attribute with 20 bins
        plt.hist(df_corr[k], 20, color='#97c170', ec='#FFFFFF')
        ax.set_xlabel(k, fontsize=13)
        ax.set_ylabel("number of locations", fontsize=13)
        plt.title(k)

plt.savefig('../../../thesis/images/distribution'+file_name_out+'_minmax.png', format='png', bbox_inches='tight')
plt.close(fig)

#### Draw the correlation plot of the individual, collective and geographical features to understand if some attributes are redundant

In [None]:
# Color map from purple to orange.  pyplot.get_cmap is used because
# matplotlib.cm.get_cmap is no longer available in recent matplotlib releases.
cmap = plt.get_cmap('PuOr')

plt.rcParams["font.size"] = '16'

# Draw the heatmap first for the individual features, and then for the
# collective and the geographical ones: the correlation between the individual
# features and the others is almost none, and this way we work with 2 smaller
# matrices.  Each entry is (columns to correlate, suffix of the output file).
heatmap_parts = [
    (df_corr.iloc[:, :35], '_indiv.png'),     # first part of the dataset
    (df_corr.iloc[:, 35:], '_coll_geo.png'),  # second and third part of the dataset
]

for part, suffix in heatmap_parts:
    fig = plt.figure()
    fig.set_size_inches(40, 30)

    correlati = part.corr().round(2)
    ax = sns.heatmap(correlati, cmap=cmap, vmin=-1, vmax=1, annot=True, linewidths=.4)

    # little trick to solve the bug that the heatmap is cut:
    # widen the y limits by half a cell on both sides
    b, t = plt.ylim()
    b += 0.5
    t -= 0.5
    plt.ylim(b, t)

    plt.xticks(rotation=90, fontsize=16)
    plt.yticks(fontsize=16)

    plt.savefig('../../../thesis/images/corr'+file_name_out+suffix, format='png', bbox_inches='tight')
    plt.close(fig)

#### We can remove the attributes that have a high correlation with another

In some cases we replace the correlated columns with their mean; in other cases, when the information is simply redundant, we just remove the attribute.

In [None]:
# "support" is dropped without a replacement.
df_corr = df_corr.drop(columns=["support"])

# Pairs of correlated columns that are replaced by their mean:
# new column name -> the two source columns.  The new columns are appended
# in this order.
mean_of_pairs = {
    "avg_stay_weekday": ("avg_stay_weekday_day", "avg_stay_weekday_night"),
    "avg_stay_weekend": ("avg_stay_weekend_day", "avg_stay_weekend_night"),
    "std_stay_weekday": ("std_stay_weekday_day", "std_stay_weekday_night"),
    "std_stay_weekend": ("std_stay_weekend_day", "std_stay_weekend_night"),
    "avg_time_weekday_day": ("avg_leave_weekday_day", "avg_arrive_weekday_day"),
    "avg_time_weekend_day": ("avg_leave_weekend_day", "avg_arrive_weekend_day"),
    "avg_time_weekday_night": ("avg_leave_weekday_night", "avg_arrive_weekday_night"),
    "avg_time_weekend_night": ("avg_leave_weekend_night", "avg_arrive_weekend_night"),
}
for merged_name, (first_col, second_col) in mean_of_pairs.items():
    df_corr[merged_name] = (df_corr[first_col] + df_corr[second_col]) / 2

# Remove the source columns of the means together with the attributes that
# are discarded without a replacement.
columns_to_remove = [name for pair in mean_of_pairs.values() for name in pair]
columns_to_remove += ["avg_leave_mov_duration", "avg_arrive_mov_duration",
                      "std_leave_mov_duration", "std_arrive_mov_duration"]
columns_to_remove += ["centrality5K", "rev_centrality3", "rev_centrality8", "rev_centrality10"]
df_corr = df_corr.drop(columns=columns_to_remove)

# Move the geographical features to the last columns of the dataframe.
categories = ["gas", "parking", "pier", "hotel", "food", "leisure", "shop", "service", "supermarket"]
columns_df_g = [prefix + c for prefix in ("n_", "k_", "d_") for c in categories]
other_columns = [c for c in df_corr.columns if c not in columns_df_g]
geo_columns = [c for c in columns_df_g if c in df_corr.columns]
df_corr = df_corr[other_columns + geo_columns]

In [None]:
# Summary statistics of the transformed feature frame (displayed as the cell output).
df_corr.describe()

#### We can compute the correlation matrix again after the variable transformation (in this case we draw only one heatmap)

In [None]:
# Correlation matrix of all the attributes left after the transformation.

# pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
# available in recent matplotlib releases.
cmap = plt.get_cmap('PuOr')

fig = plt.figure()
fig.set_size_inches(60, 50)

correlati = df_corr.corr()
correlati = correlati.round(2)
ax = sns.heatmap(correlati, cmap=cmap, vmin=-1, vmax=1, annot=True, linewidths=.4)

# widen the y limits by half a cell on both sides so that the first and the
# last row of the heatmap are not cut
b, t = plt.ylim()  # discover the values for bottom and top
b += 0.5  # Add 0.5 to the bottom
t -= 0.5  # Subtract 0.5 from the top
plt.ylim(b, t)  # update the ylim(bottom, top) values

plt.xticks(rotation=90, fontsize=18)
plt.yticks(fontsize=18)

plt.savefig('../../../thesis/images/corr'+file_name_out+'_after.png', format='png', bbox_inches='tight')
plt.close(fig)

## K-MEANS CLUSTERING OF THE FEATURES

#### Compute the sse and the silhouette for k in the range from 2 to 1000

In [None]:
############################ DO NOT RUN AGAIN, TOO LONG ############################
# SSE (inertia) and silhouette of k-means for an increasing number of clusters.
# The sweep is split in three ranges; after each range the lists accumulated
# so far are dumped to the same pickle file as a checkpoint (the first dump
# creates the file, the following ones append to it).
sse_list = []
sil_list = []

checkpoint_file = path+"sse_silouette"+file_name_out+'.pickle'
sweeps = [
    ("range 2-10", range(2, 10), 'wb'),
    ("range 10-200, step 5", range(10, 200, 5), 'ab'),
    ("range 200-1000, step 100", range(200, 1000, 100), 'ab'),
]

for banner, k_values, file_mode in sweeps:
    print(banner)
    for k in k_values:
        kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100).fit(df_corr)
        sse_list.append(kmeans.inertia_)
        sil_list.append(silhouette_score(df_corr, kmeans.labels_))

    with open(checkpoint_file, file_mode) as fp:
        for values in (sse_list, sil_list):
            pickle.dump(values, fp)

#### Read the pickle files containing the sse and silhouette values

In [None]:
# The file holds three consecutive (sse, silhouette) checkpoints; the lists
# are cumulative, so the last pair read covers the whole sweep.
with open(path+"sse_silouette"+file_name_out+'.pickle', 'rb') as fp:
    sse_list1, sil_list1 = pickle.load(fp), pickle.load(fp)
    sse_list2, sil_list2 = pickle.load(fp), pickle.load(fp)
    sse_list, sil_list = pickle.load(fp), pickle.load(fp)

#### Draw the sse and the silhouette values obtained

In [None]:
# SSE curve over the explored values of k, with a dot on the selected k
fig = plt.figure()
fig.set_size_inches(20, 12)
plt.rcParams["font.size"] = '16'

# values of k explored by the sweep, in the same order as the stored lists
x = [*range(2, 10), *range(10, 200, 5), *range(200, 1000, 100)]

k_marked = 105 ## area 11
#k_marked = 160 ## area 2

plt.plot(x, sse_list, color='#A8201A', linewidth=2.5)
plt.plot(k_marked, sse_list[x.index(k_marked)], "o", color='#A8201A', markersize=10)

plt.xticks(np.arange(0, 1000, 50))
plt.yticks(np.arange(300000, 2000000, 150000)) ## area 11
#plt.yticks(np.arange(1000000, 5700000, 250000)) ## area 2
plt.xlabel("k", fontsize=19)
plt.ylabel("sse", fontsize=19)
plt.grid(True)

plt.savefig('../../../thesis/images/sse'+file_name_out+'.png', format='png', bbox_inches='tight')
plt.close(fig)

In [None]:
# silhouette curve over the explored values of k, with a dot on the selected k
fig = plt.figure()
fig.set_size_inches(20, 12)
plt.rcParams["font.size"] = '16'

# values of k explored by the sweep, in the same order as the stored lists
x = [*range(2, 10), *range(10, 200, 5), *range(200, 1000, 100)]

k_marked = 105 ## area 11
#k_marked = 160 ## area 2

plt.plot(x, sil_list, color='#143642', linewidth=2.5)
plt.plot(k_marked, sil_list[x.index(k_marked)], "o", color='#143642', markersize=10)

plt.xticks(np.arange(0, 1000, 50))
plt.yticks(np.arange(0.04, 0.3, 0.02)) ## area 11
#plt.yticks(np.arange(0.03, 0.31, 0.02)) ## area 2

plt.xlabel("k", fontsize=19)
plt.ylabel("silhouette", fontsize=19)
plt.grid(True)

plt.savefig('../../../thesis/images/sil'+file_name_out+'.png', format='png', bbox_inches='tight')
plt.close(fig)

#### Choose the best k for the kmeans clustering

In [None]:
# Number of k-means clusters used from here on; it is the same k that is
# highlighted on the SSE and silhouette plots.  Switch to the commented line
# when working on area 2.
k_best = 105 ## area 11
#k_best = 160 ## area 2

#### Run the kmeans again with the chosen k to compute the centroids and the dict from cluster to number of locations

In [None]:
############################ DO NOT RUN AGAIN, TOO LONG ############################
# Final k-means run with the selected k and a fixed random seed.
kmeans = KMeans(n_clusters=k_best, init='k-means++', n_init=10, max_iter=300, random_state=123).fit(df_corr)

# centroids of the clusters and cluster label of every location
centroids_kmeans, labels_kmeans = kmeans.cluster_centers_, kmeans.labels_

# store both in the same pickle file, centroids first
with open(path+"centroids_kmeans"+file_name_out+'.pickle', 'wb') as fp:
    for stored in (centroids_kmeans, labels_kmeans):
        pickle.dump(stored, fp)

In [None]:
# Reload the centroids and the per-location labels stored by the final
# k-means run (centroids first, then labels).
with open(path+"centroids_kmeans"+file_name_out+'.pickle', 'rb') as fp:
    centroids_kmeans = pickle.load(fp)
    labels_kmeans = pickle.load(fp)

# Number of locations per k-means cluster.  bincount with minlength counts
# every centroid id, so a cluster without any location gets 0 instead of
# shifting the counts of the following clusters.
hist = np.bincount(labels_kmeans, minlength=len(centroids_kmeans))
# dict from cluster id to number of locs in cluster
kmeans_cluster_size = dict(enumerate(hist))

## COMPUTE THE HIERARCHICAL CLUSTERING ON THE CENTROIDS

In [None]:
def compute_linkage_matrix(centroids_kmeans):
    """Return the Ward linkage matrix of the k-means centroids.

    The pairwise Euclidean distances between the centroids are computed and
    passed to the hierarchical clustering.  As a side effect, the link color
    palette used by scipy dendrograms is set to 7 colors sampled from
    ``gist_rainbow``.
    """
    # palette for the dendrogram links, as hex strings without the alpha channel
    rainbow = cm.gist_rainbow(np.linspace(0.1, 1, 7))
    palette = [mpl.colors.rgb2hex(rgba[:3]) for rgba in rainbow]
    hierarchy.set_link_color_palette(palette)

    # condensed distance matrix of the centroids, then the linkage on top of it
    condensed_distances = pdist(centroids_kmeans, metric='euclidean')
    return linkage(condensed_distances, method='ward', metric='euclidean')

#### Create a dict from cluster labels to the points in that cluster

In [None]:
def points_labels_to_clusters(points, labels):
    """Group the points by cluster label.

    ``labels[i]`` is the label of ``points[i]``.  Returns a ``defaultdict``
    mapping each label to the list of its points, in their original order.
    """
    clusters = defaultdict(list)
    for position, point in enumerate(points):
        clusters[labels[position]].append(point)
    return clusters

#### Compute a set of dictionaries useful for computing measures

In [None]:
def clusters_dict(link_matrix, centroids_kmeans, labels_kmeans, kmeans_cluster_size, cut_dist=None):
    """Cut the dendrogram and summarize the resulting linkage clusters.

    Parameters
    ----------
    link_matrix : linkage matrix computed on the k-means centroids.
    centroids_kmeans : k-means centroids; kept for interface compatibility,
        the summaries below do not need it.
    labels_kmeans : k-means cluster id of every location.
    kmeans_cluster_size : dict from k-means cluster id to number of locations.
    cut_dist : distance at which the dendrogram is cut.  When omitted, the
        module-level ``cut_dist`` is used, as existing callers expect.

    Returns
    -------
    linkage_labels : linkage cluster id of every k-means centroid.
    link_cluster_to_n_location : dict from linkage cluster id to number of locations.
    link_cluster_to_n_kcluster : dict from linkage cluster id to number of k-means clusters.
    kcluster_to_link_cluster : dict from k-means cluster id to linkage cluster id.

    Raises
    ------
    NameError : if ``cut_dist`` is neither passed nor defined at module level.
    """
    if cut_dist is None:
        # Backward compatibility: existing callers set a module-level
        # ``cut_dist`` right before calling this function.
        try:
            cut_dist = globals()["cut_dist"]
        except KeyError:
            raise NameError("name 'cut_dist' is not defined") from None

    # linkage cluster id of each k-means centroid (position i = k-means cluster i)
    linkage_labels = fcluster(link_matrix, cut_dist, 'distance')

    link_ids = np.unique(linkage_labels)
    link_cluster_to_n_location = dict.fromkeys(link_ids, 0)
    link_cluster_to_n_kcluster = dict.fromkeys(link_ids, 0)
    kcluster_to_link_cluster = dict.fromkeys(np.unique(labels_kmeans), 0)

    for kmeans_id, link_id in enumerate(linkage_labels):
        link_cluster_to_n_location[link_id] += kmeans_cluster_size[kmeans_id]
        link_cluster_to_n_kcluster[link_id] += 1
        kcluster_to_link_cluster[kmeans_id] = link_id

    return linkage_labels, link_cluster_to_n_location, link_cluster_to_n_kcluster, kcluster_to_link_cluster

#### Define a function to draw the dendrogram of the hierarchical clustering

In [None]:
def draw_dendro(k, link_matrix, cut_dist, link_cluster_to_n_location):
    """Draw the dendrogram of the linkage clustering and save it as a png.

    A red horizontal line marks ``cut_dist``; links above it are grey.  The
    legend lists every linkage cluster with its number of locations, taken
    from ``link_cluster_to_n_location`` (keys 1..n).  ``k + 2`` and the
    module-level ``id_area`` are used in the output file name.
    """
    fig = plt.figure(figsize=(20, 10))

    dendrogram(link_matrix, color_threshold=cut_dist, above_threshold_color='grey', no_labels=True)
    plt.axhline(y=cut_dist, c='r')

    # ten ticks between 0 and the (rounded) height of the last merge
    top_distance = round(link_matrix[-1][2])
    plt.yticks(np.arange(0, top_distance, top_distance/10), fontsize=16)

    # one legend patch per linkage cluster, cycling over the 7 palette colors
    palette = cm.gist_rainbow(np.linspace(0.1, 1, 7))
    legend_handles = [
        mpatches.Patch(color=palette[(cluster - 1) % 7],
                       label='C'+str(cluster)+', n_locs ='+str(link_cluster_to_n_location[cluster]))
        for cluster in range(1, len(link_cluster_to_n_location) + 1)
    ]
    plt.legend(handles=legend_handles, loc=1)

    plt.savefig('../../../thesis/images/dentro_'+id_area+'_cluster_'+str(k+2)+'.png', format='png', bbox_inches='tight')
    plt.close(fig)

#### Create a dataframe containing the kmeans centroids and linkage clustering

In [None]:
def create_df_centroids(centroids_kmeans, kcluster_to_link_cluster, columns=None):
    """Build the dataframes of the k-means centroids and of the linkage clusters.

    Parameters
    ----------
    centroids_kmeans : k-means centroids, one row per k-means cluster.
    kcluster_to_link_cluster : dict from k-means cluster id (the row position
        of the centroid) to linkage cluster id; linkage ids are expected to be
        1..n, as produced by ``fcluster``.
    columns : names of the centroid features.  When omitted, the columns of
        the module-level ``df_corr`` are used, as existing callers expect.

    Returns
    -------
    df_centroids : the centroids plus a "link_cluster" column.
    df_par : one row per linkage cluster with the mean of its centroids
        (the "link_cluster" column is averaged too, giving the cluster id).
    link_centroids_std : for each linkage cluster, the list of the standard
        deviations of its centroids, in the column order of ``df_centroids``.
    """
    if columns is None:
        # Backward compatibility: fall back to the module-level feature frame.
        columns = df_corr.columns

    df_centroids = pd.DataFrame(centroids_kmeans, columns=columns)
    # Look the linkage id up by centroid position rather than relying on the
    # iteration order of the dict.
    df_centroids["link_cluster"] = [kcluster_to_link_cluster[i] for i in range(len(df_centroids))]

    # The number of linkage clusters is derived from the mapping itself, so
    # the function does not depend on any other module-level dictionary.
    n_link_clusters = df_centroids["link_cluster"].nunique()

    # for each linkage cluster extract all its centroids and compute mean and std
    link_centroids = []
    link_centroids_std = []
    for link_id in range(1, n_link_clusters + 1):
        members = df_centroids[df_centroids["link_cluster"] == link_id]
        link_centroids.append(list(members.mean(axis=0)))
        link_centroids_std.append(list(members.std(axis=0)))

    df_par = pd.DataFrame(link_centroids, columns=df_centroids.columns)

    return df_centroids, df_par, link_centroids_std

#### Draw the parallel coordinates of the cluster obtained

In [None]:
################# USING PANDAS LIBRARY
def draw_par_coords(df_par, link_cluster_to_n_location, cluster_id):
    """Draw the parallel coordinates of the linkage clusters and save them as a png.

    ``df_par`` holds one row per linkage cluster and a "link_cluster" column
    used as the class of each line.  The legend lists every cluster with its
    number of locations (``link_cluster_to_n_location``, keys 1..n).
    ``cluster_id + 2`` and the module-level ``id_area`` are used in the
    output file name.
    """
    fig = plt.figure(figsize=(35, 12))

    palette = cm.gist_rainbow(np.linspace(0.1, 1, 7))

    vertical_line_style = {"linewidth": 0.5, "color": "k"}
    parallel_coordinates(df_par, 'link_cluster', color=palette, linewidth=3,
                         axvlines=True, axvlines_kwds=vertical_line_style)
    plt.xticks(rotation=90, fontsize=16)
    plt.yticks(np.arange(0, 1.01, 0.1), fontsize=16)

    # one legend patch per linkage cluster, cycling over the 7 palette colors
    legend_handles = [
        mpatches.Patch(color=palette[(cluster - 1) % 7],
                       label='C'+str(cluster)+', n_locs ='+str(link_cluster_to_n_location[cluster]))
        for cluster in range(1, len(link_cluster_to_n_location) + 1)
    ]
    plt.legend(handles=legend_handles, loc=1)

    plt.grid(False)

    plt.savefig('../../../thesis/images/parallel_coord_'+id_area+'_cluster_'+str(cluster_id+2)+'.png', format='png', bbox_inches='tight')
    plt.close(fig)

In [None]:
############### USING ERRORBAR
def draw_par_coords_error(df_par, link_centroids_std, link_cluster_to_n_location, cluster_id):
    """Draw the parallel coordinates of the linkage clusters with error bars.

    ``df_par`` holds one row per linkage cluster; its last column is left out
    of the plot.  ``link_centroids_std[i]`` holds the error of row ``i`` of
    ``df_par`` (last element left out as well).  The legend lists every
    cluster with its number of locations (``link_cluster_to_n_location``,
    keys 1..n).  ``cluster_id + 2`` and the module-level ``id_area`` are used
    in the output file name.
    """
    fig, ax = plt.subplots(1, 1, figsize=(35, 12))
    cmap = cm.gist_rainbow(np.linspace(0.1, 1, 7))
    feature_names = df_par.keys()[:-1]
    l = len(feature_names)

    # Defined up front so that the ticks can still be set when df_par is empty.
    x = list(range(l))

    for i, row in df_par.iterrows():
        # shift each cluster slightly to the right so that the bars do not overlap
        x = [position + 0.05 * i for position in range(l)]
        y = row[:-1]
        yerr = link_centroids_std[i][:-1]

        # the color index wraps around the palette, as in the other plots,
        # so that more clusters than colors do not raise an IndexError
        (_, caps, _) = plt.errorbar(x, y, yerr=yerr, color=cmap[i % len(cmap)], linewidth=5, barsabove=True,
                                    elinewidth=2, uplims=True, lolims=True, label='uplims=True, lolims=True')

        # draw the limits of the bars as short horizontal caps
        for cap in caps:
            cap.set_marker("_")
            cap.set_markersize(10)
            cap.set_markeredgewidth(3)

    # the ticks are placed on the positions of the last cluster drawn
    ax.set_xticks(x)
    ax.set_xticklabels(feature_names, rotation='vertical', fontsize=16)
    plt.yticks(np.arange(-0.2, 1.3, 0.1), fontsize=16)

    legend_labels = [
        'C'+str(cluster)+', n_locs ='+str(link_cluster_to_n_location[cluster])
        for cluster in range(1, len(link_cluster_to_n_location) + 1)
    ]
    plt.legend(legend_labels, loc=1)

    plt.savefig('../../../thesis/images/parallel_coord_'+id_area+'_cluster_'+str(cluster_id+2)+'_error.png', format='png', bbox_inches='tight')
    plt.close(fig)

#### Compute the clusters splitting recursively according to the dendrogram

In [None]:
# Linkage matrix of the k-means centroids, computed once and reused for every cut.
link_matrix = compute_linkage_matrix(centroids_kmeans)

# Cut the dendrogram just below each of the last five merges in turn, then
# draw the dendrogram and the parallel coordinates of the resulting clusters.
for i in range(5):
    # Module-level cut distance, 0.1 below the height of the (i+1)-th last
    # merge; clusters_dict is called without it and picks up this value.
    cut_dist = link_matrix[-i-1][2] - 0.1
    
    # NOTE: the returned dictionaries are module-level names; later cells
    # read kcluster_to_link_cluster as left by the last iteration.
    linkage_labels, link_cluster_to_n_location, link_cluster_to_n_kcluster, kcluster_to_link_cluster = clusters_dict(
                        link_matrix, centroids_kmeans, labels_kmeans, kmeans_cluster_size)
    
    draw_dendro(i, link_matrix, cut_dist, link_cluster_to_n_location)    
    
    df_centroids, df_par, link_centroids_std = create_df_centroids(centroids_kmeans, kcluster_to_link_cluster)
    
    draw_par_coords_error(df_par, link_centroids_std, link_cluster_to_n_location, i)
    draw_par_coords(df_par, link_cluster_to_n_location, i)

## DRAW THE HEATMAP OF THE LOCATIONS COMPOSING THE CLUSTERS

#### Use selenium to transform an html map into a png image

In [None]:
# Browser driver used further down to open the saved folium html maps and
# take a screenshot of them.
# NOTE(review): PhantomJS support was removed in Selenium 4 - confirm that the
# installed Selenium version still provides this driver.
driver = selenium.webdriver.PhantomJS()

#### Get the dataframe not normalized

In [None]:
# Load the location features that are not normalized (same area and period).
path = f'../../../datasets/out/Traj{stop}min/'
file_name_in = f'loc_feat_area{id_area}_month{month_code}_week{week}_complete.csv'

df_denorm = pd.read_csv(path + file_name_in)

In [None]:
# Keep only the two coordinate columns of each location.
df_denorm = df_denorm[["loc_proto_lat", "loc_proto_lon"]]

#### Compute the array of linkage cluster label for each location and assign the linkage cluster to the dataset of the locations

In [None]:
# linkage cluster of every location, obtained through its k-means cluster
link_cluster = [kcluster_to_link_cluster[kmeans_label] for kmeans_label in labels_kmeans]

# coordinates of the locations together with their linkage cluster
df_locs = df_denorm.assign(link_cluster=link_cluster)

#### Extract all the points of the locations of a linkage cluster

In [None]:
# list with one element per linkage cluster (ids 1..k), each element being
# the list of the [lat, lon] points of the locations in that cluster
n_link_clusters = len(np.unique(link_cluster))
link_points = []
for i in range(1, n_link_clusters + 1):
    df_i = df_locs[df_locs["link_cluster"] == i]
    cluster_points = [[lat, lon] for lat, lon in zip(df_i["loc_proto_lat"], df_i["loc_proto_lon"])]
    link_points.append(cluster_points)

#### Draw the heatmap for each cluster

In [None]:
# One folium heatmap per linkage cluster: saved as html, then opened with the
# browser driver and captured as a png.
for cluster_id, cluster_points in enumerate(link_points):
    cluster_label = str(cluster_id + 1)
    # common part of the html and png file names
    out_base = '../../../thesis/images/heatmap_area_' + id_area + '_cluster_' + cluster_label

    #m = folium.Map(location=[38, 23], zoom_start=10) ## area 2
    m = folium.Map(location=[38, 23.68], zoom_start=12) ## area 11

    # heat layer with the points of the cluster
    HeatMap(cluster_points).add_to(m)

    # big text label with the cluster number, placed on the map
    title_icon = folium.features.DivIcon(icon_size=(500,40), icon_anchor=(0,0),
                                         html='<div style="font-size: 56pt">CLUSTER ' + cluster_label + '</div>')
    folium.map.Marker([38.2, 23.68], ## area 11 [38.2, 23.68] ## area 2 [38.8, 22.9]
                      icon=title_icon).add_to(m)

    m.save(out_base + '.html')

    driver.set_window_size(2500, 1800)
    driver.get(out_base + '.html')
    driver.save_screenshot(out_base + '.png')