# import the libraries

In [1]:
#standard libraries
import pandas as pd
import numpy as np

# statistic libraries
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.metrics import davies_bouldin_score
#from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

# load the dataset

In [2]:
df = pd.read_json('/Users/majadallacqua/Desktop/università/II_sem/CSS/urban analysis/data/trento_poi_dataset.json')
df.head()

Unnamed: 0,id,lat,lon,name,class
0,265590160,46.047611,11.12602,Volksbank,economic
1,269193707,46.074491,11.124553,Cassa di Trento,economic
2,269193731,46.064718,11.123213,Cassa di Trento,economic
3,288910331,46.065631,11.154994,Cassa di Trento,economic
4,292015813,46.076473,11.141931,Cassa di Trento,economic


# set the environment

In [3]:
kms_per_radian = 6371.0088

# 12 is the number of districts in the city of Trento
min_neigh = 12 - 5 # value for the city of Trento
max_neigh = 12 + 5

# subset used for each analysis
df_eco = df[df['class']=='economic']
df_edu = df[df['class']=='education']
df_hea = df[df['class']=='health']
df_cat = df[df['class']=='catering']
df_shop = df[df['class']=='shopping']
df_tou = df[df['class']=='tourism']

# DBSCAN algorithm
we don't have many data points in this dataset.
We don't expect many banks being very close, this may happen in the city's center but not in all the municipality.

In [4]:
# let's the value we want to try
eps_params = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7] # metric is in km
min_params = [2, 3, 4, 5, 6, 7, 8, 9] # min number of elements we want in each cluster

## economic dataset

In [17]:
df_eco_rad = np.radians(df_eco[['lat','lon']])
scores_eco = []

for i in eps_params:
    
    epsilon = i / kms_per_radian
    
    for j in min_params:
        
        # general parameters for the DBSCAN algorithm
        dbscan = DBSCAN(eps=epsilon, 
                min_samples=j,
                algorithm = 'ball_tree',
                metric='haversine')
        
        # fit to economic dataset
        db_eco = dbscan.fit(df_eco_rad)
        clusters_eco = db_eco.labels_
        num_clusters_eco = len(set(clusters_eco)) - 1 # to esclude the noise cluster
        
        if num_clusters_eco in range(min_neigh, max_neigh + 1):
            sil_score = metrics.silhouette_score(df_eco_rad, clusters_eco)
            davies_score = davies_bouldin_score(df_eco_rad, clusters_eco)

            score_valid = (i, j, num_clusters_eco, sil_score, davies_score)
            scores_eco.append(score_valid)
        
scores_eco.sort(key=lambda tuple: tuple[2], reverse=True)     

# let's print the results to compare them:
print(f"the dbscan found {len(scores_eco)} valid results") # use this to check how many items have been saved    
for s in scores_eco:
    print(f"the result is:{s}")

the dbscan found 4 valid results
the result is:(0.2, 2, 12, 0.24726358874225482, 3.621104992985061)
the result is:(0.1, 2, 9, -0.14614024445658866, 3.6028257040145943)
the result is:(0.3, 2, 9, 0.32082015139421943, 2.998499146462305)
the result is:(0.3, 3, 7, 0.2204072771380088, 6.049514547338348)


## education dataset

In [18]:
df_edu_rad = np.radians(df_edu[['lat','lon']])
scores_edu = []

for i in eps_params:
    
    epsilon = i / kms_per_radian
    
    for j in min_params:
        
        # general parameters for the DBSCAN algorithm
        dbscan = DBSCAN(eps=epsilon, 
                min_samples=j,
                algorithm = 'ball_tree',
                metric='haversine')
        
        # fit to education dataset
        db_edu = dbscan.fit(df_edu_rad)
        clusters_edu = db_edu.labels_
        num_clusters_edu = len(set(clusters_edu)) - 1
        
        if num_clusters_edu in range(min_neigh, max_neigh + 1):
            sil_score = metrics.silhouette_score(df_edu_rad, clusters_edu)
            davies_score = davies_bouldin_score(df_edu_rad, clusters_edu)

            score_valid = (i, j, num_clusters_edu, sil_score, davies_score)
            scores_edu.append(score_valid)
        
scores_edu.sort(key=lambda tuple: tuple[2], reverse=True)     

# let's print the results to compare them:
print(f"the dbscan found {len(scores_edu)} valid results") # use this to check how many items have been saved    
for s in scores_edu:
    print(f"the result is:{s}")

the dbscan found 5 valid results
the result is:(0.2, 2, 9, -0.052260926345964416, 7.037638332931813)
the result is:(0.3, 2, 8, 0.07659342420441541, 4.76102488311645)
the result is:(0.5, 2, 8, 0.2791108468328062, 2.471357161117928)
the result is:(0.3, 3, 7, 0.0441339731907586, 7.046563074574154)
the result is:(0.6, 2, 7, 0.2703153552098291, 2.8328851628044274)


## health dataset

In [14]:
df_hea_rad = np.radians(df_hea[['lat','lon']])
scores_hea = []

for i in eps_params:
    
    epsilon = i / kms_per_radian
    
    for j in min_params:
        
        # general parameters for the DBSCAN algorithm
        dbscan = DBSCAN(eps=epsilon, 
                min_samples=j,
                algorithm = 'ball_tree',
                metric='haversine')
        
        # fit to health dataset
        db_hea = dbscan.fit(df_hea_rad)
        clusters_hea = db_hea.labels_
        num_clusters_hea = len(set(clusters_hea)) - 1
        
        if num_clusters_hea in range(min_neigh - 4, max_neigh + 4):
            sil_score = metrics.silhouette_score(df_hea_rad, clusters_hea)
            davies_score = davies_bouldin_score(df_hea_rad, clusters_hea)
                                     
            score_valid = (i, j, num_clusters_hea, sil_score, davies_score)
            scores_hea.append(score_valid)
        
scores_hea.sort(key=lambda tuple: tuple[2], reverse=True)     

# let's print the results to compare them:
print(f"the dbscan found {len(scores_hea)} valid results") # use this to check how many items have been saved 
for s in scores_hea:
    print(f"the result is:{s}")

the dbscan found 3 valid results
the result is:(0.4, 2, 3, -0.05649395027515204, 5.599380761918972)
the result is:(0.6, 2, 3, 0.22780805424024034, 4.747780158289724)
the result is:(0.7, 2, 3, 0.2403984775267705, 5.292712890204045)


## catering dataset

In [19]:
df_cat_rad = np.radians(df_cat[['lat','lon']])
scores_cat = []

for i in eps_params:
    
    epsilon = i / kms_per_radian
    
    for j in min_params:
        
        # general parameters for the DBSCAN algorithm
        dbscan = DBSCAN(eps=epsilon, 
                min_samples=j,
                algorithm = 'ball_tree',
                metric='haversine')
        
        # fit to catering dataset
        db_cat = dbscan.fit(df_cat_rad)
        clusters_cat = db_cat.labels_
        num_clusters_cat = len(set(clusters_cat)) - 1
        
        if num_clusters_cat in range(min_neigh, max_neigh + 1):
            sil_score = metrics.silhouette_score(df_cat_rad, clusters_cat)
            davies_score = davies_bouldin_score(df_cat_rad, clusters_cat)

            score_valid = (i, j, num_clusters_cat, sil_score, davies_score)
            scores_cat.append(score_valid)
        
scores_cat.sort(key=lambda tuple: tuple[2], reverse=True)     

# let's print the results to compare them:
print(f"the dbscan found {len(scores_cat)} valid results") # use this to check how many items have been saved    
for s in scores_cat:
    print(f"the result is:{s}")

the dbscan found 18 valid results
the result is:(0.2, 3, 17, 0.11005478139924929, 2.854424399773766)
the result is:(0.3, 3, 17, 0.24788966815184907, 2.402072306180986)
the result is:(0.4, 3, 16, 0.3240079446943718, 2.0540389448973624)
the result is:(0.1, 3, 14, -0.09674402633281363, 5.336195091189466)
the result is:(0.7, 2, 14, 0.23760706795339004, 1.7039293686965915)
the result is:(0.5, 3, 13, 0.379215225382933, 1.9324076556519234)
the result is:(0.6, 3, 12, 0.39180705994585097, 2.6867603348261224)
the result is:(0.4, 4, 10, 0.2773263779642477, 3.9727119705462184)
the result is:(0.7, 3, 10, 0.37181588287959577, 2.2583013472566744)
the result is:(0.1, 4, 9, -0.1487552407689662, 16.689508895997825)
the result is:(0.2, 4, 9, 0.08738413435015968, 6.005604105829745)
the result is:(0.5, 4, 8, 0.35566582914876355, 3.9993265546474976)
the result is:(0.2, 5, 7, 0.022409370187647708, 13.825475632429491)
the result is:(0.2, 6, 7, 0.0088780890362644, 11.154171198529616)
the result is:(0.3, 4, 7, 

## shopping dataset

In [20]:
df_shop_rad = np.radians(df_shop[['lat','lon']])
scores_shop = []

for i in eps_params:
    
    epsilon = i / kms_per_radian
    
    for j in min_params:
        
        # general parameters for the DBSCAN algorithm
        dbscan = DBSCAN(eps=epsilon, 
                min_samples=j,
                algorithm = 'ball_tree',
                metric='haversine')
        
        # fit to shopnomic dataset
        db_shop = dbscan.fit(df_shop_rad)
        clusters_shop = db_shop.labels_
        num_clusters_shop = len(set(clusters_shop)) - 1
        
        if num_clusters_shop in range(min_neigh, max_neigh +1):
            sil_score = metrics.silhouette_score(df_shop_rad, clusters_shop)
            davies_score = davies_bouldin_score(df_shop_rad, clusters_shop)

            score_valid = (i, j, num_clusters_shop, sil_score, davies_score)
            scores_shop.append(score_valid)
        
scores_shop.sort(key=lambda tuple: tuple[2], reverse=True)     

# let's print the results to compare them:
print(f"the dbscan found {len(scores_shop)} valid results") # use this to check how many items have been saved    
for s in scores_shop:
    print(f"the result is:{s}")

the dbscan found 2 valid results
the result is:(0.4, 2, 8, -0.017548109117090132, 3.7247715031057576)
the result is:(0.5, 2, 8, 0.04463992489948266, 3.1825381201422194)


## tourism dataset

In [22]:
df_tou_rad = np.radians(df_tou[['lat','lon']])
scores_tou = []

for i in eps_params:
    
    epsilon = i / kms_per_radian
    
    for j in min_params:
        
        # general parameters for the DBSCAN algorithm
        dbscan = DBSCAN(eps=epsilon, 
                min_samples=j,
                algorithm = 'ball_tree',
                metric='haversine')
        
        # fit to tounomic dataset
        db_tou = dbscan.fit(df_tou_rad)
        clusters_tou = db_tou.labels_
        num_clusters_tou = len(set(clusters_tou)) - 1
        
        if num_clusters_tou in range(min_neigh, max_neigh + 1):
            sil_score = metrics.silhouette_score(df_tou_rad, clusters_tou)
            davies_score = davies_bouldin_score(df_tou_rad, clusters_tou)


            score_valid = (i, j, num_clusters_tou, sil_score, davies_score)
            scores_tou.append(score_valid)
        
scores_tou.sort(key=lambda tuple: tuple[2], reverse=True)     

# let's print the results to compare them:
print(f"the dbscan found {len(scores_tou)} valid results") # use this to check how many items have been saved    
for s in scores_tou:
    print(f"the result is:{s}")

the dbscan found 6 valid results
the result is:(0.2, 2, 12, 0.1074743667167245, 1.6418871658521113)
the result is:(0.3, 2, 8, 0.17172268032513446, 1.5047089773721372)
the result is:(0.6, 2, 8, 0.4629136719343527, 1.2447038136381723)
the result is:(0.1, 2, 7, -0.17275846970609915, 2.819913143845101)
the result is:(0.5, 2, 7, 0.4599598040559525, 1.0966628542874715)
the result is:(0.7, 2, 7, 0.47275006069814923, 1.2034767990991755)
