# generals

In [None]:
# general libraries
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import time
import requests
import json
import io

# analysis libraries
from sklearn.cluster import DBSCAN

# visualization libraries
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point

In [None]:
import plotly.io as pio
# make 'notebook_connected' the default renderer for every plotly figure shown below
pio.renderers.default = 'notebook_connected'

In [None]:
# directories
# relative path of the folder holding the input files; it is prepended to every file name read below
datasets_dir = "../../datasets/"

# datasets

In [None]:
# OSM DATA: point dataset.
# The geometry coordinates are exposed as plain columns because the clustering
# step reads 'lat_m' / 'lon_m' directly.
# NOTE(review): no reprojection happens in this cell; the '_m' suffix suggests
# the file is already stored in a metric (projected) CRS -- confirm.
data = gpd.read_file(datasets_dir + "geopoints_data.geojson")
# x is the easting/longitude axis and y the northing/latitude axis, so the
# latitude-like column must come from y (the two were previously swapped).
data['lat_m'] = data.geometry.y
data['lon_m'] = data.geometry.x
# identifiers are compared against string ids later on, so store them as strings
data['node_id'] = data['node_id'].astype(str)
data['polygon_id'] = data['polygon_id'].astype(str)

# RASTER DATA: polygon outlines, selected later by their string 'id'
polygons = gpd.read_file(datasets_dir + "bologna_polygons.geojson")
#polygons = polygons.to_crs('EPSG:32632')
polygons['id'] = polygons['id'].astype(str)

# CLUSTERING RESULTS DATA: one row per parameter combination to replay
results = pd.read_csv(datasets_dir + "pareto_dbscaneu.csv")
results.polygon_id = results.polygon_id.astype(str)

print("all datasets loaded")

In [None]:
# display the loaded point dataset (last expression of the cell)
data

# functions setting

In [None]:
def get_ids(setting, polygon, matrices_dir="../../datasets/matrix_computation/"):
    """Return the node ids stored as row labels of a polygon's matrix file.

    The file ``<matrices_dir>/<setting>/polygon<polygon>.csv`` is read with its
    first column as the index; the index values are the node ids.

    Parameters
    ----------
    setting : str
        Name of the sub-folder to read from, e.g. ``'original'`` or
        ``'buffer_100'``.
    polygon : str or int
        Polygon identifier used to build the file name.
    matrices_dir : str, optional
        Folder that contains one sub-folder per setting. Defaults to the
        location that used to be hard-coded.

    Returns
    -------
    list of str
        The row labels of the matrix, converted to strings, in file order.
    """
    matrix_path = os.path.join(matrices_dir, setting, "polygon" + str(polygon) + ".csv")

    # Only the row labels are needed: the column labels are left untouched, so a
    # file whose header is not integer-like no longer raises for no reason.
    current_matrix = pd.read_csv(matrix_path, index_col=0)

    return [str(node_id) for node_id in current_matrix.index]

In [None]:
def dbscan_clustering(eps_value, el_value, dataset, grid, setting):
    """Cluster the points of ``dataset`` with DBSCAN and attach the labels.

    Parameters
    ----------
    eps_value : float
        Neighbourhood radius. It is multiplied by 1000 before being handed to
        DBSCAN (presumably kilometres -> metres; confirm against the unit of
        the coordinate columns).
    el_value : int
        Value passed to DBSCAN as ``min_samples``.
    dataset : DataFrame or GeoDataFrame
        Must contain the columns ``node_id``, ``lat_m`` and ``lon_m``.
    grid, setting
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    DataFrame or GeoDataFrame
        A copy of ``dataset`` with a fresh 0..n-1 index and two extra columns:
        ``id`` (``node_id`` as pandas ``string`` dtype) and ``clusters`` (the
        DBSCAN label of each row, ``-1`` marking noise).
    """
    # Euclidean DBSCAN fitted directly on the two coordinate columns
    dbscan = DBSCAN(eps=eps_value * 1000,
                    min_samples=el_value,
                    algorithm='auto',
                    metric='euclidean')
    labels = dbscan.fit(dataset[['lat_m', 'lon_m']]).labels_

    # labels_ follows the order of the input rows, so the labels are attached
    # positionally. Merging them back on node_id would multiply rows (and mix
    # labels) whenever a node id occurs more than once in `dataset`.
    complete_subset = dataset.reset_index(drop=True)
    complete_subset['id'] = complete_subset['node_id'].astype("string")
    complete_subset['clusters'] = labels

    return complete_subset
    

In [None]:
def plot_clusters(plot_data, polygon_gdf, eps, el, polygon_type):
    """Draw clustered points and the polygon outline on top of a basemap.

    Parameters
    ----------
    plot_data : GeoDataFrame
        Points with a ``clusters`` column; one colour per distinct label.
    polygon_gdf : GeoDataFrame
        Polygon(s) whose boundary is drawn in black.
    eps, el
        Parameter values printed in the caption of the figure.
    polygon_type
        Text inserted in the figure title.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    web_mercator = 3857

    # points: reproject, then turn the labels into strings so they are
    # plotted as categories
    points = plot_data.to_crs(epsg=web_mercator)
    points["clusters"] = points["clusters"].astype(str)

    # polygon: declare EPSG:4326 (overriding any CRS already set), then
    # reproject to the same CRS as the points
    outline = polygon_gdf.set_crs(epsg=4326, allow_override=True).to_crs(epsg=web_mercator)

    _fig, ax = plt.subplots(figsize=(16, 8))

    # outline first, points on top of it
    outline.boundary.plot(ax=ax, color='black', linewidth=1)
    point_style = dict(
        column='clusters',
        categorical=True,
        cmap='tab20',
        legend=True,
        markersize=40,
        alpha=0.8,
    )
    points.plot(ax=ax, **point_style)

    ax.set_aspect('equal')

    ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron, crs=points.crs)

    # caption in the lower-left corner (axes coordinates) and title
    caption = f"epsilon={eps} ; min elements={el}"
    ax.text(0.01, 0.02, caption,
            transform=ax.transAxes, fontsize=9, color='gray')
    ax.set_title(f"Clustering results for polygon with {polygon_type} boundary", fontsize=14)

    ax.axis('off')
    plt.tight_layout()
    plt.show()


In [None]:
def clustering_pipeline(dataset, polygons_gdf, clustering_results, polygon_id, setting):
    """Replay and plot every stored parameter combination for one polygon.

    The nodes of the polygon are looked up with ``get_ids``; each row of
    ``clustering_results`` whose ``polygon_id`` equals ``polygon_id`` and whose
    ``buffer`` equals ``setting`` provides an ``eps`` / ``el`` pair that is
    clustered with ``dbscan_clustering`` and drawn with ``plot_clusters``.
    """
    # points that belong to the requested polygon / setting
    node_ids = get_ids(setting, polygon_id)
    nodes_in_polygon = dataset[dataset["node_id"].isin(node_ids)]

    # outline of the requested polygon
    polygon_outline = polygons_gdf[polygons_gdf["id"] == polygon_id]

    # parameter combinations stored for this polygon and setting
    same_polygon = clustering_results["polygon_id"] == polygon_id
    same_setting = clustering_results["buffer"] == setting
    parameter_rows = clustering_results[same_polygon & same_setting]

    for _, params in parameter_rows.iterrows():
        eps, min_element = params["eps"], params["el"]
        labelled = dbscan_clustering(eps, min_element, nodes_in_polygon, polygon_id, setting)
        plot_clusters(labelled, polygon_outline, eps, min_element, setting)



## polygon 0

### ORIGINAL DATA

In [None]:
# Polygon 0, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '0',
                    'original')

# DISCUSSION:
# two clusters are well defined, while one cluster is smaller and often identified as noise
# preferred: eps=0.7 and min_el=2 or eps=0.8 and min_el=2

### 100 BUFFER DATA

In [None]:
# Polygon 0, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '0',
                    'buffer_100')

# DISCUSSION: similar results to before, but no small central cluster is identified
# preferred: eps=0.9/1.0 + el=2 ; eps=1.1 + el=5 -> differences in cluster distribution

### 200 BUFFER DATA

In [None]:
# Polygon 0, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '0',
                    'buffer_200')

# DISCUSSION: differences between the combinations are more accentuated, but overall the noise points are the same
# preferred: eps=0.8 + el=2 identifies two big and two small clusters -> eps=0.8 + el=3 doesn't capture the small ones
# eps=0.8 + el=5/6 identifies three medium/big clusters


## polygon 1

### ORIGINAL DATA

In [None]:
# Polygon 1, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '1',
                    'original')

# DISCUSSION

### 100 BUFFER DATA

In [None]:
# Polygon 1, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '1',
                    'buffer_100')

# DISCUSSION:
# similar to the previous implementations

### 200 BUFFER DATA

In [None]:
# Polygon 1, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '1',
                    'buffer_200')

# DISCUSSION: almost identical results to the previous implementation

## polygon 2

### ORIGINAL DATA

In [None]:
# Polygon 2, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '2',
                    'original')

# DISCUSSION:

### 100 BUFFER DATA

In [None]:
# Polygon 2, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '2',
                    'buffer_100')
# DISCUSSION: results are consistent with the previous polygon


### 200 BUFFER DATA

In [None]:
# Polygon 2, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '2',
                    'buffer_200')

# DISCUSSION: results are consistent with the previous polygon

## polygon 3

### ORIGINAL DATA

In [None]:
# Polygon 3, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '3',
                    'original')

# DISCUSSION:

### 100 BUFFER DATA

In [None]:
# Polygon 3, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '3',
                    'buffer_100')

# DISCUSSION: a rapid change of eps allows new clusters to be identified
# PREFERRED: eps=1.0 + el=2 identifies more clusters than any other combination and has a lower noise ratio

### 200 BUFFER DATA

In [None]:
# Polygon 3, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '3',
                    'buffer_200')

# DISCUSSION: similar results to the previous polygon
# PREFERRED: eps=1.0 + el=2/4

## polygon 4

### ORIGINAL DATA

In [None]:
# Polygon 4, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '4',
                    'original')

# DISCUSSION

### 100 BUFFER DATA

In [None]:
# Polygon 4, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '4',
                    'buffer_100')


# DISCUSSION: similar results to the previous polygon

### 200 BUFFER DATA

In [None]:
# Polygon 4, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '4',
                    'buffer_200')

# DISCUSSION: similar results to the previous polygon
# eps=0.7 + el=7 identifies few clusters, some small and some big -> more noise data
# eps=0.6 + el=3/4 identifies many clusters, some small and some big -> less noise data
# eps=0.8 + el=8/9 gives a similar result to eps=0.7

## polygon 5

### ORIGINAL DATA

In [None]:
# Polygon 5, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '5',
                    'original')

# DISCUSSION

### 100 BUFFER DATA

In [None]:
# Polygon 5, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '5',
                    'buffer_100')

# DISCUSSION: similar results to the previous polygons
# PREFERRED: eps=0.3 + el=3/2 -> clusters appear more defined than in the other clusterings

### 200 BUFFER DATA

In [None]:
# Polygon 5, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '5',
                    'buffer_200')

# DISCUSSION: results are consistent with the previous results
# PREFERRED: eps=0.3 + el=3/2 -> clusters appear more defined than in the other clusterings

## polygon 6

### ORIGINAL DATA

In [None]:
# Polygon 6, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '6',
                    'original')

# DISCUSSION: very different results
# eps=0.4 + el=7 -> few clusters, not proportional
# eps=0.3 + el=4 -> similar to the previous combination
# eps=0.2 + el=2/3 -> identifies a lot of clusters, but very small ones

### 100 BUFFER DATA

In [None]:
# Polygon 6, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '6',
                    'buffer_100')


# DISCUSSION: similar to the previous implementation

### 200 BUFFER DATA

In [None]:
# Polygon 6, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '6',
                    'buffer_200')

# DISCUSSION: similar to the previous implementation
# eps=0.4 + el=7/10 -> few clusters, not proportional
# eps=0.2 + el=2/3 -> identifies a lot of clusters, but very small ones

## polygon 7

### ORIGINAL DATA

In [None]:
# Polygon 7, 'original' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '7',
                    'original')

# DISCUSSION

### 100 BUFFER DATA

In [None]:
# Polygon 7, 'buffer_100' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '7',
                    'buffer_100')


# DISCUSSION: similar results to the previous implementation

### 200 BUFFER DATA

In [None]:
# Polygon 7, 'buffer_200' setting: cluster and plot once per matching row of `results`
clustering_pipeline(data, 
                    polygons,
                    results,
                    '7',
                    'buffer_200')

# DISCUSSION: similar results to the previous implementation