In [1]:
# https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude
# https://www.kdnuggets.com/2020/04/dbscan-clustering-algorithm-machine-learning.html
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
# https://docs.kepler.gl/docs/keplergl-jupyter

In [2]:

import numpy as np

import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

from sklearn.cluster import DBSCAN
from geopy import distance
import csv

In [3]:
def load_carvansaras(path):
    carvansaras = list()
    with open(path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        FIRST_LINE = True
        for row in csv_reader:
            if FIRST_LINE:
                FIRST_LINE = False
                continue
            carvansaras.append([row[0], row[1], row[2]])

    return carvansaras

In [4]:
carvansaras = load_carvansaras("carvansara.csv")

In [5]:
names = []
latitudes = []
longitudes = []
for carvansara in carvansaras:
#     try:
    names.append(carvansara[1])
    latitude, longitude = carvansara[2].replace("-", ",").split(",")
    latitudes.append(float(latitude))
    longitudes.append(float(longitude))
#     except:
#         pass

In [6]:
df = pd.DataFrame(list(zip(names, latitudes, longitudes)),
               columns =['Name', 'Latitude', 'Longitude'])

In [7]:
print(len(df))

203


In [8]:
df.head()

Unnamed: 0,Name,Latitude,Longitude
0,کاروانسرای,38.344328,45.834966
1,کاروانسرای خواجه نظر,38.977452,45.577038
2,کاروانسرای جمال آباد,37.271923,47.843075
3,کاروانسرای رباط شرق,36.26649,60.655253
4,کاروانسرای منظریه,34.891223,50.819861


In [9]:
df.tail()

Unnamed: 0,Name,Latitude,Longitude
198,کاروانسرا_غیب_الله,27.302168,54.472015
199,BaqerAbad_Caravansary,34.930295,50.823524
200,هتل_کاروانسرای_شمسی,32.105542,54.118904
201,کاروانسرای_تاریخی_بلاد_شاپور,30.788056,50.561667
202,کاروانسرا_برکه_سلطان,27.242924,55.510821


In [10]:
def geo_distance(coordinates_from, coordinates_to):
    return distance.distance(coordinates_from, coordinates_to).km

In [11]:
locations = df[["Latitude", "Longitude"]].to_numpy()

In [34]:
# Compute DBSCAN
db = DBSCAN(eps=.5, min_samples=2, metric=geo_distance).fit(locations)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

In [40]:
print('Estimated number of clusters: %d' % n_clusters)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 2
Estimated number of noise points: 199


In [41]:
labels
# Noisy samples are given the label -1

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1])

In [42]:
df["label"] = labels

In [43]:
n_clusters

2

In [44]:
cluster_num = -1

In [47]:

cluster_num+=1
print("@", cluster_num)
check_loc = df[df["label"] == cluster_num]
check_loc

@ 2


Unnamed: 0,Name,Latitude,Longitude,label


In [51]:
df[["Name","Latitude","Longitude"]].to_csv("carvansaras_checked.csv", index=False)