# general setup

In [None]:
# general libraries
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import time
import requests
import json
import io

# analysis libraries
import networkx as nx
from networkx.algorithms.community import louvain_communities

# visualization libraries
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point

In [None]:
# make 'notebook_connected' the default plotly renderer for this session
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

In [None]:
# directories
# base folder of the input files read by the loading cell below
# (relative path, so it depends on the notebook's working directory)
datasets_dir = "../../datasets/"

# datasets

In [None]:
# OSM POINT DATA
# NOTE(review): no reprojection is performed in this cell; if the coordinate
# reference system of the file still has to be fixed, it must be done explicitly.
data = gpd.read_file(datasets_dir + "geopoints_data.geojson")
# geometry.y is the vertical coordinate (latitude / northing) and geometry.x the
# horizontal one (longitude / easting), so each goes into the matching column.
# The "_m" suffix presumably means metres -- TODO confirm against the file's CRS.
data['lat_m'] = data.geometry.y
data['lon_m'] = data.geometry.x
# node ids are kept as pandas strings so they can be matched with string ids later
data['node_id'] = data['node_id'].astype('string')

# RASTER DATA:
polygons = gpd.read_file(datasets_dir + "bologna_polygons.geojson")
# reprojection left disabled on purpose (kept for reference):
#polygons = polygons.to_crs('EPSG:32632')
polygons['id'] = polygons['id'].astype(str)

# CLUSTERING RESULTS DATA
results = pd.read_csv(datasets_dir + "pareto_louvain.csv")
# polygon ids are compared against string ids such as '0', hence the cast
results.polygon_id = results.polygon_id.astype(str)

print("all datasets loaded")

In [None]:
# display the clustering results table loaded above
results

# function definitions

In [None]:
def dist_matrix_extraction(setting, polygon):
    """Load the stored distance matrix of one polygon for a given setting.

    The file read is
    ``../../datasets/matrix_computation/<setting>/polygon<polygon>.csv``.
    Its first column is used as the index and its column labels are cast
    to ``int``.

    Args:
        setting: name of the sub-folder that holds the matrix (a string).
        polygon: polygon identifier; converted with ``str`` to build the
            file name.

    Returns:
        pandas.DataFrame: the matrix with integer column labels.
    """
    file_name = "polygon" + str(polygon) + ".csv"
    csv_path = "../../datasets/matrix_computation/" + setting + "/" + file_name

    loaded = pd.read_csv(csv_path, index_col=0)
    # CSV headers are parsed as text; turn them back into integer labels
    loaded.columns = loaded.columns.astype(int)
    return loaded

In [None]:
def graph_construction(matrix, max_dist):
    """Build an undirected weighted graph from a pairwise distance matrix.

    Every index label of ``matrix`` (cast to ``int``) becomes a node. Two
    nodes are connected when the distance stored above the diagonal is
    ``<= max_dist``; the edge weight is the similarity
    ``1 / (1 + distance)``.

    Only the upper triangle is read, so the matrix is treated as symmetric
    with rows and columns in the same node order (assumption -- TODO confirm
    against the code that writes the matrices).

    Args:
        matrix: pandas.DataFrame of pairwise distances.
        max_dist: largest distance for which an edge is created.

    Returns:
        networkx.Graph: nodes are the integer ids, edges carry a ``weight``
        attribute.
    """
    ids = [int(el) for el in matrix.index]
    n = len(ids)

    G = nx.Graph()
    G.add_nodes_from(ids)

    # Work on a plain float array: per-cell ``.iloc`` lookups inside a double
    # Python loop are much slower than one vectorised comparison.
    distances = matrix.to_numpy(dtype=float)[:n, :n]

    # Strict upper triangle of the threshold mask. ``argwhere`` returns the
    # selected cells in row-major order, i.e. the same (i, j) order a nested
    # ``for i ... for j > i`` loop visits, so edges are inserted in that order.
    # NaN distances compare False and therefore never create an edge.
    linked = np.argwhere(np.triu(distances <= max_dist, k=1))

    G.add_weighted_edges_from(
        (ids[i], ids[j], 1.0 / (1.0 + distances[i, j]))
        for i, j in linked.tolist()
    )
    return G

In [None]:
def louvain_clustering(G, matrix, dataset, grid, setting, max_dist, resolution):
    """Run Louvain community detection on ``G`` and join the labels to ``dataset``.

    Args:
        G: weighted graph; its nodes must include every id of ``matrix.index``
            (cast to ``int``).
        matrix: distance matrix; only its index is read, to list the node ids.
        dataset: table with a ``node_id`` column the labels are merged onto.
        grid: not used inside this function.
        setting: not used inside this function.
        max_dist: not used inside this function.
        resolution: forwarded to ``louvain_communities``.

    Returns:
        ``dataset`` left-merged with two extra columns, ``id`` and
        ``clusters``. Labels that occur exactly once in the merged table are
        replaced by ``-1``.
    """
    # fixed seed so that repeated runs give the same partition
    partition = louvain_communities(
        G,
        weight='weight',
        resolution=resolution,
        seed=42,
    )

    # node id -> position of its community in the returned partition
    membership = {
        member: label
        for label, members in enumerate(partition)
        for member in members
    }

    node_ids = [int(el) for el in matrix.index]
    labels = pd.DataFrame({
        'id': node_ids,
        'clusters': [membership[node] for node in node_ids],
    })
    # same string dtype as the key it is merged against
    labels['id'] = labels['id'].astype("string")

    merged = dataset.merge(labels, how='left', left_on='node_id', right_on="id")

    # communities made of a single point are relabelled -1
    # (for comparison with noise points)
    sizes = merged['clusters'].value_counts()
    singleton_labels = sizes.index[(sizes == 1).to_numpy()]
    is_singleton = merged['clusters'].isin(singleton_labels)
    merged.loc[is_singleton, 'clusters'] = -1

    return merged

In [None]:
def plot_clusters(plot_data, polygon_gdf, dist, resolution, polygon_type):
    """Draw the clustered points over a basemap together with the polygon outline.

    Args:
        plot_data: GeoDataFrame of points with a ``clusters`` column.
        polygon_gdf: GeoDataFrame holding the polygon whose boundary is drawn.
        dist: distance threshold, shown in the caption only.
        resolution: Louvain resolution, shown in the caption only.
        polygon_type: label inserted in the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # points: reproject to EPSG:3857 and turn labels into strings so that they
    # are drawn as categories
    points = plot_data.to_crs(epsg=3857)
    points['clusters'] = points['clusters'].astype(str)

    # polygon: its CRS is declared as EPSG:4326 (overriding any existing one)
    # before reprojecting to EPSG:3857
    outline = polygon_gdf.set_crs(epsg=4326, allow_override=True).to_crs(epsg=3857)

    fig, ax = plt.subplots(figsize=(16, 8))

    # boundary first, points on top of it
    outline.boundary.plot(ax=ax, color='black', linewidth=1)
    points.plot(
        ax=ax,
        column='clusters',
        categorical=True,
        cmap='tab20',
        legend=True,
        markersize=40,
        alpha=0.8,
    )
    ax.set_aspect('equal')

    ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron, crs=points.crs)

    # parameters used for this run, bottom-left corner of the axes
    caption = f"max_distance={dist} ; resolution={resolution}"
    ax.text(0.01, 0.02, caption, transform=ax.transAxes, fontsize=9, color='gray')

    heading = f"Clustering results for polygon with {polygon_type} boundary"
    ax.set_title(heading, fontsize=14)

    ax.axis('off')
    plt.tight_layout()
    plt.show()


In [None]:
def clustering_pipeline(dataset, polygons_gdf, clustering_results, polygon_id, setting):
    """Re-run and plot every stored parameter combination of one polygon.

    The distance matrix of ``polygon_id`` under ``setting`` is loaded, then
    for each row of ``clustering_results`` that matches both values a graph
    is built with the row's ``dist``, clustered with the row's ``res`` and
    plotted.

    Args:
        dataset: point table with a ``node_id`` column.
        polygons_gdf: polygon table with an ``id`` column.
        clustering_results: table with ``polygon_id``, ``buffer``, ``dist``
            and ``res`` columns.
        polygon_id: identifier of the polygon to process.
        setting: matrix/buffer setting name; it selects both the matrix file
            and the rows of ``clustering_results`` and appears in the title.

    Returns:
        None. One figure is shown per matching row.
    """
    matrix = dist_matrix_extraction(setting, polygon_id)

    # keep only the points that appear in the distance matrix
    node_ids = [str(el) for el in matrix.index]
    subset = dataset[dataset['node_id'].isin(node_ids)]

    polygon = polygons_gdf[polygons_gdf['id'] == polygon_id]

    # bracket access: ``.buffer`` would resolve to a method on a GeoDataFrame
    wanted = (
        (clustering_results['polygon_id'] == polygon_id)
        & (clustering_results['buffer'] == setting)
    )
    subset_results = clustering_results[wanted]

    # The graph depends only on the matrix and on ``dist``; reuse it while
    # consecutive rows share the same threshold instead of rebuilding it.
    graph = None
    graph_dist = None

    for _, row in subset_results.iterrows():
        dist = row['dist']
        res = row['res']

        if graph is None or dist != graph_dist:
            graph = graph_construction(matrix, dist)
            graph_dist = dist

        clustered = louvain_clustering(graph, matrix, subset, polygon_id, setting, dist, res)
        plot_clusters(clustered, polygon, dist, res, setting)


## polygon 0

### ORIGINAL DATA

In [None]:
# polygon '0' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='0',
                    setting='original')

# DISCUSSION:
# PREFERRED dist=1.2 + res=0.4

### 100 BUFFER DATA

In [None]:
# polygon '0' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='0',
                    setting='buffer_100')

# PREFERRED dist=1.0 + res=0.4 -> clusters are dense

### 200 BUFFER DATA

In [None]:
# polygon '0' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='0',
                    setting='buffer_200')

# PREFERRED dist=1.0 + res=0.6/0.8 -> clusters are evenly distributed

## polygon 1

### ORIGINAL DATA

In [None]:
# polygon '1' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='1',
                    setting='original')

# DISCUSSION
# the first clusterings are probably too detailed -> each data point is treated as its own community
# PREFERRED dist=0.6/0.7 + res=0.8 -> clusters are well distributed

### 100 BUFFER DATA

In [None]:
# polygon '1' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='1',
                    setting='buffer_100')

# DISCUSSION: results similar to the previous polygon
# PREFERRED dist=0.7 + res=0.8/0.4 -> clusters are well distributed
# also interesting: dist=0.6 + res=1.0

### 200 BUFFER DATA

In [None]:
# polygon '1' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='1',
                    setting='buffer_200')

# DISCUSSION: results similar to the previous polygon
# PREFERRED dist=0.7 + res=1.0 -> clusters are well distributed
# also interesting: dist=0.6 + res=1.6

## polygon 2

### ORIGINAL DATA

In [None]:
# polygon '2' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='2',
                    setting='original')

# DISCUSSION: results similar to the previous polygon
# PREFERRED dist=1.2 + res=0.4/0.8 -> clusters are well separated

### 100 BUFFER DATA

In [None]:
# polygon '2' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='2',
                    setting='buffer_100')

# DISCUSSION: results similar to the previous polygon
# PREFERRED dist=1.2 + res=0.4 -> clusters are well separated
# also interesting: dist=1.1 + res=0.6

### 200 BUFFER DATA

In [None]:
# polygon '2' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='2',
                    setting='buffer_200')

# DISCUSSION: results similar to the previous polygon
# PREFERRED dist=1.2/1.0 + res=0.4 -> clusters are well separated

## polygon 3

### ORIGINAL DATA

In [None]:
# polygon '3' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='3',
                    setting='original')

# DISCUSSION
# PREFERRED dist=0.7 + res=0.4 -> clusters are well separated

### 100 BUFFER DATA

In [None]:
# polygon '3' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='3',
                    setting='buffer_100')

# DISCUSSION: results similar to the previous ones
# PREFERRED dist=0.9 + res=0.6 -> clusters are well separated
# interesting for comparison: dist=1.0 + res=0.4

### 200 BUFFER DATA

In [None]:
# polygon '3' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='3',
                    setting='buffer_200')

# DISCUSSION: results similar to the previous ones
# PREFERRED dist=0.8 + res=0.6 -> clusters are well separated
# interesting for comparison: dist=0.8/1.0 + res=0.4

## polygon 4

### ORIGINAL DATA

In [None]:
# polygon '4' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='4',
                    setting='original')

# DISCUSSION: interesting results
# dist=0.6 + res=1.2/1.6 -> many clusters, but they are well separated
# dist=0.7 + res=1.6/0.6 -> few clusters with different shapes, but with an interesting spatial distribution
# dist=0.8 + res=1.0/2.0 -> similar to the previous one

### 100 BUFFER DATA

In [None]:
# polygon '4' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='4',
                    setting='buffer_100')

# DISCUSSION: results similar to the previous ones
# dist=0.6 + res=1.2 -> many clusters, but they are well separated
# dist=0.7 + res=0.4/0.8 -> few clusters with different shapes, but with an interesting spatial distribution
# dist=0.8 + res=1.2 -> similar to the previous one
# dist=1.2 + res=0.8 -> bigger clusters

### 200 BUFFER DATA

In [None]:
# polygon '4' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='4',
                    setting='buffer_200')

# DISCUSSION: results similar to the previous ones
# dist=0.7 + res=0.4/0.8 -> few clusters with different shapes, but with an interesting spatial distribution
# dist=0.8 + res=1.8 -> similar to the previous one
# dist=1.1/1.2 + res=0.8 -> bigger clusters

## polygon 5

### ORIGINAL DATA

In [None]:
# polygon '5' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='5',
                    setting='original')

# DISCUSSION:
# dist=0.4 + res=0.8/1.6 -> interesting clusters, similar in size
# dist=0.5 + res=0.8/0.4 -> fewer but bigger clusters
# dist=0.6 + res=1.0/0.4 -> similar to the previous one
# dist=1.1/1.2 + res=0.8 -> bigger clusters

### 100 BUFFER DATA

In [None]:
# polygon '5' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='5',
                    setting='buffer_100')

# DISCUSSION:
# dist=0.4 + res=0.8/1.6 -> interesting clusters, similar in size
# dist=0.5 + res=1.6/1.8 -> fewer but bigger clusters
# dist=0.6 + res=0.4/0.8 -> similar to the previous one

### 200 BUFFER DATA

In [None]:
# polygon '5' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='5',
                    setting='buffer_200')

# DISCUSSION:
# dist=0.4 + res=0.8 -> interesting clusters, beyond and across the boundaries
# dist=0.5 + res=0.8/0.4 -> fewer but bigger clusters
# dist=0.6 + res=0.4/1.0 -> similar to the previous one

## polygon 6

### ORIGINAL DATA

In [None]:
# polygon '6' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='6',
                    setting='original')

# DISCUSSION: very different results across parameters
# dist=0.4 + res=0.8/0.4 -> interesting clusters of medium-to-large size
# dist=0.5/0.9 + res=0.4 -> fewer but bigger clusters

### 100 BUFFER DATA

In [None]:
# polygon '6' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='6',
                    setting='buffer_100')

# DISCUSSION: results similar to the previous implementation
# dist=0.4 + res=0.4 -> interesting clusters of medium-to-large size
# dist=0.5 + res=0.6 -> similar to the previous one
# dist=0.5/0.7 + res=0.4 -> fewer but bigger clusters

### 200 BUFFER DATA

In [None]:
# polygon '6' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='6',
                    setting='buffer_200')

# DISCUSSION: results similar to the previous implementation
# dist=0.5 + res=0.4 -> few but bigger clusters, distributed across the boundaries
# dist=0.5 + res=0.8 -> more clusters, slightly smaller
# dist=0.7 + res=0.6/0.8 -> only a few, larger clusters

## polygon 7

### ORIGINAL DATA

In [None]:
# polygon '7' - 'original' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='7',
                    setting='original')

# DISCUSSION
# dist=0.5 + res=0.8/1.2 -> many clusters with different shapes and sizes
# dist=0.7 + res=0.6/0.8/1.4 -> larger clusters, though size differences are still present

### 100 BUFFER DATA

In [None]:
# polygon '7' - 'buffer_100' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='7',
                    setting='buffer_100')

# DISCUSSION: results similar to the previous ones
# dist=0.5 + res=1.8/1.0 -> many clusters with different shapes and sizes
# dist=0.6 + res=0.4/0.8 -> larger clusters, though size differences are still present

### 200 BUFFER DATA

In [None]:
# polygon '7' - 'buffer_200' setting
clustering_pipeline(dataset=data,
                    polygons_gdf=polygons,
                    clustering_results=results,
                    polygon_id='7',
                    setting='buffer_200')

# DISCUSSION: results similar to the previous ones
# dist=0.5 + res=1.6/1.0/2.0 -> many clusters with different shapes and sizes
# dist=0.6 + res=0.8 -> larger clusters, though size differences are still present