In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import seaborn as sns
import time
import matplotlib as mpl


In [122]:
# set matplotlib defaults
%matplotlib inline
sns.set()
plt.rcParams["figure.figsize"] = (15,6)
mpl.rc('axes', labelsize=18)
mpl.rc('xtick', labelsize=16)
mpl.rc('ytick', labelsize=16)
plt.rc('figure', titlesize=18)
plt.style.use('seaborn-darkgrid')

In [123]:
# define the number of kilometers in one radian
kms_per_radian = 6371.0088

In [124]:
# load the data set
df = pd.read_csv('../data/accident_clean.csv')
df.head(2) 

Unnamed: 0,x,y,accident_id,year,address,severity,accident_type,severity_numeric,borough_geo,timestamp,date,hour,month_name,month,day
0,-74.105296,4.509792,4437952,2016,CL 80A-KR 1 SE 02,Injury,Crash,8,USME,2016-02-27 16:20:00+00:00,2016-02-27,16,Feb,2,Sat
1,-74.167225,4.631051,4472304,2017,AV AVENIDA CIUDAD DE CALI-CL 42 S 02,Injury,Run over,9,KENNEDY,2017-02-09 16:45:00+00:00,2017-02-09,16,Feb,2,Thu


In [125]:
YEAR = 2015

In [126]:
# df_sample = df[(df.year == 2019) | (df.year == 2018) | (df.year == 2017) | (df.year == 2016) | (df.year == 2015)]
df_sample = df[(df.year == YEAR)]

In [127]:
coords = df_sample[['y', 'x']].values
coords = np.radians(coords)
# define epsilon as 100 meters, converted to radians for use by haversine
epsilon = 0.075 / kms_per_radian

In [128]:
coords.shape

(30429, 2)

In [129]:
start_time = time.time()
db = DBSCAN(eps=epsilon, min_samples=25, algorithm='ball_tree', metric='haversine').fit(coords)
cluster_labels = db.labels_
elapsed = time.time()-start_time

In [130]:
# get the number of clusters
num_clusters = len(set(cluster_labels))

# all done, print the outcome
print(f'Clustered {len(df_sample)} points down to {num_clusters} clusters, for {round(1 - float(num_clusters) / len(df_sample),2)*100}% compression in {round(elapsed,2)} seconds')
# print(f'Silhouette coefficient: {metrics.silhouette_score(coords,cluster_labels)}')

Clustered 30429 points down to 105 clusters, for 100.0% compression in 5.37 seconds


In [131]:
df_sample['cluster_id'] = cluster_labels
df_sample.head()

Unnamed: 0,x,y,accident_id,year,address,severity,accident_type,severity_numeric,borough_geo,timestamp,date,hour,month_name,month,day,cluster_id
5,-74.023482,4.766878,4426757,2015,KR 3-CL 189C 02,Injury,Run over,9,USAQUEN,2015-10-17 21:00:00+00:00,2015-10-17,21,Oct,10,Sat,-1
6,-74.072893,4.598232,4404972,2015,KR 5-CL 12 28,Injury,Crash,8,CANDELARIA,2015-02-17 08:10:00+00:00,2015-02-17,8,Feb,2,Tue,-1
7,-74.15818,4.639044,4412227,2015,AV AVENIDA CIUDAD DE CALI-CL 2 S 02,Material damage,Crash,1,KENNEDY,2015-05-07 05:50:00+00:00,2015-05-07,5,May,5,Thu,-1
20,-74.088759,4.591854,4407122,2015,AV AVENIDA CARACAS-CL 1 02,Material damage,Crash,1,SANTA FE,2015-03-08 01:50:00+00:00,2015-03-08,1,Mar,3,Sun,0
21,-74.058111,4.650699,4427675,2015,KR 7-CL 67 02,Injury,Crash,8,CHAPINERO,2015-10-26 15:10:00+00:00,2015-10-26,15,Oct,10,Mon,-1


In [132]:
df_sample[df_sample.cluster_id == -1].shape

(26727, 16)

In [133]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    # centermost_point = np.degrees(np.array(centroid))
    centermost_point = np.degrees(centermost_point)
    return centermost_point

In [134]:
clusters = pd.DataFrame([coords[cluster_labels==n] for n in range(num_clusters)])
clusters.drop(clusters.tail(1).index, inplace=True)

In [135]:
clusters.applymap(lambda x: len(x)).sort_values(by=0)

Unnamed: 0,0
51,25
63,25
39,25
78,25
85,25
...,...
7,68
23,68
10,69
15,84


In [136]:
clustered_points = clusters.explode(0)
clustered_points = clustered_points.applymap(np.degrees)
clustered_points['y'] = clustered_points[0].apply(lambda x: x[0])
clustered_points['x'] = clustered_points[0].apply(lambda x: x[1])
clustered_points.shape

(3702, 3)

In [137]:
centroids = clusters.applymap(get_centermost_point)

In [138]:
centroids['y'] = centroids[0].apply(lambda x: x[0])
centroids['x'] = centroids[0].apply(lambda x: x[1])
centroids.shape

(104, 3)

In [139]:
centroids

Unnamed: 0,0,y,x
0,"[4.59184233000002, -74.08878166499993]",4.591842,-74.088782
1,"[4.760559846000035, -74.06575966999998]",4.760560,-74.065760
2,"[4.648264478000044, -74.10723968499997]",4.648264,-74.107240
3,"[4.59516491100004, -74.14530483699998]",4.595165,-74.145305
4,"[4.679129501000034, -74.119425515]",4.679130,-74.119426
...,...,...,...
99,"[4.628263501000049, -74.17076367399994]",4.628264,-74.170764
100,"[4.66420211600007, -74.04761288599997]",4.664202,-74.047613
101,"[4.6804475150000275, -74.09967961099994]",4.680448,-74.099680
102,"[4.726337779000062, -74.12478046799998]",4.726338,-74.124780


In [140]:
#clustered_points = clustered_points.sample(frac=0.5)

In [141]:
# import folium  
# from folium.plugins import HeatMap
# folium_map = folium.Map(location=[4.654335, -74.083644],
#                         zoom_start=14,
#                         tiles="openstreetmap")

# for row in clustered_points.iterrows():
#     marker = folium.CircleMarker(location=[row[1]['y'],row[1]['x']], radius=2, color="black", fill=True)
#     marker.add_to(folium_map)


# for row in centroids.iterrows():
#     marker = folium.CircleMarker(location=[row[1]['y'],row[1]['x']], radius=7, color='#3186cc',fill=True, fill_color='#3186cc')
#     marker.add_to(folium_map)

# id = 296
# marker = folium.CircleMarker(location=[centroids.iloc[id]['y'],centroids.iloc[id]['x']], radius=25, color='#3186cc',fill=True, fill_color='#3186cc')
# marker.add_to(folium_map)
# for point in clusters.iloc[id].values[0]:
#     marker = folium.CircleMarker(location=[np.degrees(point)[0],np.degrees(point)[1]], radius=2, color="black", fill=True)
#     marker.add_to(folium_map)

In [142]:
# folium_map

In [143]:
df_sample.head(2)

Unnamed: 0,x,y,accident_id,year,address,severity,accident_type,severity_numeric,borough_geo,timestamp,date,hour,month_name,month,day,cluster_id
5,-74.023482,4.766878,4426757,2015,KR 3-CL 189C 02,Injury,Run over,9,USAQUEN,2015-10-17 21:00:00+00:00,2015-10-17,21,Oct,10,Sat,-1
6,-74.072893,4.598232,4404972,2015,KR 5-CL 12 28,Injury,Crash,8,CANDELARIA,2015-02-17 08:10:00+00:00,2015-02-17,8,Feb,2,Tue,-1


In [144]:
centroids.reset_index(inplace=True)
centroids.drop(columns=[0], inplace=True)
centroids.rename(columns={'index': 'cluster_id'}, inplace=True)

In [145]:
centroids['year'] = YEAR

In [146]:
df_sample.to_csv(f'../data/{YEAR}_dataset_clusters.csv', index=None)

In [147]:
centroids.to_csv(f'../data/{YEAR}_centroids.csv',index=None)

In [148]:
clustered_points.reset_index(inplace=True)
clustered_points.drop(columns=[0], inplace=True)
clustered_points.rename(columns={'index': 'cluster_id'}, inplace=True)

In [149]:
clustered_points['year'] = YEAR

In [150]:
clustered_points.to_csv(f'../data/{YEAR}_clustered_points.csv', index=None)