# generals

In [None]:
# general libraries
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import time
import requests
import json
import io

# analysis libraries
from sklearn.cluster import AgglomerativeClustering
from validclust import dunn
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, cophenet
from sklearn.metrics import silhouette_score, davies_bouldin_score

# visualization libraries
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point

In [None]:
import plotly.io as pio
# make 'notebook_connected' the default plotly renderer for every figure shown in this notebook
pio.renderers.default = 'notebook_connected'

In [None]:
# directories
# base folder (relative to the working directory) that is prepended to the input file names read below
datasets_dir = "datasets/"

# datasets

In [None]:
# OSM data: point features identified by 'node_id'
# TODO(review): the original note says the coordinate reference system needs fixing,
# but no reprojection is applied in this cell - confirm the CRS of the file.
data = gpd.read_file(datasets_dir + "geopoints_data.geojson")
# a point geometry is (x, y): y is the latitude/northing and x the longitude/easting
# (the two were swapped before)
data['lat_m'] = data.geometry.y
data['lon_m'] = data.geometry.x
# node ids are kept as strings so they can be matched with the ids of the distance matrices
data['node_id'] = data['node_id'].astype('string')

# RASTER DATA: polygon geometries, one per 'id'
polygons = gpd.read_file(datasets_dir + "bologna_polygons.geojson")
#polygons = polygons.to_crs('EPSG:32632')
# ids are compared against string polygon ids ('0', '1', ...) further down
polygons['id'] = polygons['id'].astype(str)

# CLUSTERING RESULTS DATA: rows are filtered by 'polygon_id' and 'buffer', and 'dist' is read from them
results = pd.read_csv(datasets_dir + "pareto_hac.csv")
results.polygon_id = results.polygon_id.astype(str)

print("all datasets loaded")

In [None]:
# display the loaded clustering-results table as the cell output
results

# functions setting

In [None]:
def dist_matrix_extraction(setting, polygon):
    """Read the distance matrix stored for one polygon under one setting.

    Parameters
    ----------
    setting : str
        Name of the sub-folder of ``datasets/matrix_computation/`` to read from.
    polygon : object
        Polygon identifier; its string form completes the file name
        ``polygon<polygon>.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV content with the first column used as index and the column
        labels converted to integers.
    """
    csv_path = "".join(
        ["datasets/matrix_computation/", setting, "/polygon", str(polygon), ".csv"]
    )

    distances = pd.read_csv(csv_path, index_col=0)
    # headers are read as text: cast them to int
    distances.columns = distances.columns.astype(int)
    return distances

In [None]:
def hac_clustering(dist, matrix, dataset, grid, setting):
    """Cluster the nodes of a precomputed distance matrix with average-linkage HAC.

    Parameters
    ----------
    dist : float
        Value passed to ``AgglomerativeClustering`` as ``distance_threshold``
        (``n_clusters`` is left to ``None``).
    matrix : pandas.DataFrame
        Precomputed distance matrix the model is fitted on; its index holds the
        node identifiers, which must be convertible to ``int``.
    dataset : pandas.DataFrame or geopandas.GeoDataFrame
        Table with a 'node_id' column; it is matched against the string form of
        the matrix ids, so 'node_id' is expected to be string-typed.
    grid, setting
        Not used by this function; kept so that existing callers keep working.

    Returns
    -------
    ``dataset`` left-merged with the cluster labels: the columns 'id' and
    'clusters' are added, and every cluster made of a single row is relabelled -1.
    """
    hac = AgglomerativeClustering(
        n_clusters=None,
        metric='precomputed',
        linkage='average',
        distance_threshold=dist,
    )

    # fit on the distance matrix and read one label per matrix row
    labels = hac.fit(matrix).labels_

    # ids go through int first, then to string to be comparable with 'node_id'
    cluster_df = pd.DataFrame({'id': [int(el) for el in matrix.index], "clusters": labels})
    cluster_df['id'] = cluster_df['id'].astype("string")
    complete_subset = dataset.merge(cluster_df, left_on='node_id', right_on="id", how='left')

    # re-classification of outliers: a cluster with one member is treated as noise.
    # AgglomerativeClustering labels every sample with a non-negative cluster index,
    # so -1 is free to be used as the noise marker.
    cluster_sizes = complete_subset['clusters'].value_counts()
    singleton_clusters = cluster_sizes[cluster_sizes == 1].index
    complete_subset.loc[complete_subset['clusters'].isin(singleton_clusters), 'clusters'] = -1

    return complete_subset

    

In [None]:
def plot_clusters(plot_data, polygon_gdf, dist, polygon_type):
    """Show the clustered points and the polygon outline on top of a basemap.

    Parameters
    ----------
    plot_data : geopandas.GeoDataFrame
        Points to draw; must have a 'clusters' column, used to colour them.
    polygon_gdf : geopandas.GeoDataFrame
        Polygon whose boundary is drawn; its CRS is set to EPSG:4326
        (overriding any existing one) before being reprojected.
    dist
        Distance threshold, printed in the bottom-left corner of the figure.
    polygon_type
        Text inserted in the figure title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    target_epsg = 3857

    # reprojected points; labels become strings so they are drawn as categories
    points = plot_data.to_crs(epsg=target_epsg)
    labels_as_text = points.clusters.astype(str)
    points['clusters'] = labels_as_text

    # polygon outline in the same CRS as the points
    outline = polygon_gdf.set_crs(epsg=4326, allow_override=True).to_crs(epsg=target_epsg)

    _fig, ax = plt.subplots(figsize=(16, 8))

    outline.boundary.plot(ax=ax, color='black', linewidth=1)

    point_style = {
        'column': 'clusters',
        'categorical': True,
        'cmap': 'tab20',
        'legend': True,
        'markersize': 40,
        'alpha': 0.8,
    }
    points.plot(ax=ax, **point_style)

    ax.set_aspect('equal')

    ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron, crs=points.crs)

    # annotations
    footnote = f"distance threshold={dist}"
    ax.text(0.01, 0.02, footnote, transform=ax.transAxes, fontsize=9, color='gray')
    ax.set_title(f"Clustering results for polygon with {polygon_type} boundary", fontsize=14)

    ax.axis('off')
    plt.tight_layout()
    plt.show()


In [None]:
def clustering_pipeline(dataset, polygons_gdf, clustering_results, polygon_id, setting):
    """Re-run and plot every clustering listed for one polygon and one setting.

    Parameters
    ----------
    dataset : geopandas.GeoDataFrame
        Points with a 'node_id' column; only the nodes found in the distance
        matrix of the polygon are kept.
    polygons_gdf : geopandas.GeoDataFrame
        Polygons with an 'id' column; the one equal to ``polygon_id`` is drawn.
    clustering_results : pandas.DataFrame
        Table with 'polygon_id', 'buffer' and 'dist' columns; each row matching
        ``polygon_id`` and ``setting`` produces one clustering and one plot.
    polygon_id
        Identifier of the polygon to process.
    setting
        Name of the matrix setting, compared with the 'buffer' column.
    """
    matrix = dist_matrix_extraction(setting, polygon_id)

    # nodes of the dataset that belong to this distance matrix
    node_ids = list(map(str, matrix.index))
    subset = dataset[dataset.node_id.isin(node_ids)]

    polygon = polygons_gdf[polygons_gdf.id == polygon_id]

    # rows of the results table for this polygon and this setting
    same_polygon = clustering_results.polygon_id == polygon_id
    same_setting = clustering_results.buffer == setting
    selected = clustering_results[same_polygon & same_setting]

    for _, row in selected.iterrows():
        threshold = row.dist
        clustered = hac_clustering(threshold, matrix, subset, polygon_id, setting)
        plot_clusters(clustered, polygon, threshold, setting)



## polygon 0

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '0' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '0',
                    'original')

# DISCUSSION:
# two clusters are well defined with similar sizes -> dist=2.3
# only the first clustering appears to capture significant results

### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '0' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '0',
                    'buffer_100')

# DISCUSSION: similar results to before
# two clusters are well defined with similar sizes -> dist=2.3

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '0' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '0',
                    'buffer_200')

# DISCUSSION:
# similar results to before -> the clusters at dist=2.3 aggregate the outliers
# dist=1.5 is also interesting -> it identifies a decent number of clusters with different sizes


## polygon 1

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '1' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '1',
                    'original')

# DISCUSSION: many clusterings identified
# 0.6 and 0.7 identify many small clusters
# 0.8 and 0.9 identify fewer, slightly bigger clusters
# 1.8 and 2.0 identify very big clusters


### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '1' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '1',
                    'buffer_100')

# DISCUSSION: similarities with previous implementations
# 0.6 and 0.7 identify many small clusters
# 1.1 identifies fewer, slightly bigger clusters
# 1.4 and 1.7 identify fewer clusters with generally similar sizes
# 1.9 and 2.5 identify very big clusters

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '1' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '1',
                    'buffer_200')

# DISCUSSION: similarities with previous implementations
# 0.6 and 0.7 identify many small clusters
# 0.9 and 1.1 identify fewer, slightly bigger clusters
# 1.4, 1.5 and 1.7 identify fewer clusters with generally similar sizes
# 2.0 and 2.5 identify very big clusters

## polygon 2

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '2' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '2',
                    'original')

# DISCUSSION: results too differentiated
# dist=1.6 identifies the most significant partition

### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '2' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '2',
                    'buffer_100')
# DISCUSSION: results are consistent with the previous polygon
# dist=1.6/1.9

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '2' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '2',
                    'buffer_200')

# DISCUSSION: results are consistent with the previous polygon

## polygon 3

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '3' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '3',
                    'original')

# DISCUSSION:
# only the clusters on the left side appear to be well separated and to have shapes consistent with an ideal distribution


### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '3' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '3',
                    'buffer_100')

# DISCUSSION: rapid change in the clustering when enlarging the area
# dist=1.6

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '3' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '3',
                    'buffer_200')

# DISCUSSION: similar results to previous polygon
# dist=1.7

## polygon 4

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '4' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '4',
                    'original')

# DISCUSSION: different types of clusterings are identified
# 0.8 identifies many small clusters with similar sizes
# 0.5 and 0.6 identify too many small clusters


### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '4' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '4',
                    'buffer_100')


# DISCUSSION: similar results to previous polygon
# 0.9 identifies many small clusters with similar sizes
# 0.5 identifies too many small clusters
# 1.7 and 2.1 identify a moderate number of clusters with similar sizes

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '4' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '4',
                    'buffer_200')

# DISCUSSION: similar results to previous polygon
# 0.8 identifies many small clusters with similar sizes
# 0.5 identifies too many small clusters
# 1.8 and 2.2 identify a moderate number of clusters with similar sizes

## polygon 5

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '5' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '5',
                    'original')

# DISCUSSION: very different results
# dist=1.6 is not significant
# dist=0.8 identifies many small clusters with different sizes
# dist=0.5 identifies many singletons

### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '5' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '5',
                    'buffer_100')

# DISCUSSION: similar results to previous polygons
# dist=0.5/0.6/0.8 identify many clusters, distributed with different shapes and sizes (visually preferred: 0.8)

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '5' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '5',
                    'buffer_200')

# DISCUSSION: similar results to previous polygons
# dist=0.5/0.6 identify many clusters, distributed with different shapes and sizes
# visually preferred: dist=0.8, since it identifies fewer clusters and has fewer singletons

## polygon 6

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '6' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '6',
                    'original')

# DISCUSSION: very different results
# dist=1.4 -> only two non-proportional clusters identified

### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '6' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '6',
                    'buffer_100')


# DISCUSSION: new combination available
# preferred dist=0.7 -> clusters are more defined and visually separated

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '6' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '6',
                    'buffer_200')

# DISCUSSION: more results available
# dist=0.5 -> identifies many small clusters with different sizes and shapes
# dist=.5 has similar results to the previous implementation
# dist=1. presents more defined clusters
# NOTE(review): the last two thresholds ("dist=.5", "dist=1.") look truncated in the original note - confirm the values

## polygon 7

### ORIGINAL DATA

In [None]:
# ORIGINAL POLYGON: plots each clustering listed in `results` for polygon '7' with the 'original' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '7',
                    'original')

# DISCUSSION: only one significant result

### 100 BUFFER DATA

In [None]:
# 100 BUFFER POLYGON: plots each clustering listed in `results` for polygon '7' with the 'buffer_100' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '7',
                    'buffer_100')


# DISCUSSION: results consistent with previous implementation

### 200 BUFFER DATA

In [None]:
# 200 BUFFER POLYGON: plots each clustering listed in `results` for polygon '7' with the 'buffer_200' setting
clustering_pipeline(data, 
                    polygons,
                    results,
                    '7',
                    'buffer_200')

# DISCUSSION: new results with respect to previous implementations
# all the available combinations present interesting configurations, even if the clusters are too small