In [1]:
# import the libraries
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
# Send Bokeh figure output to the notebook's output cells.
plotting.output_notebook()

# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

# Problem definition

Cluster regions of Montréal based on the locations of reported crimes.

Data source (City of Montréal open data, "actes criminels"): http://donnees.ville.montreal.qc.ca/dataset/actes-criminels

# Load the data

In [8]:
# input: read the crime CSV (latin-1 encoded) and parse DATE into datetimes
crime_csv_path = 'data/interventionscitoyendo.csv'
df = (
    pd.read_csv(crime_csv_path, encoding='latin_1')
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
)

# quick look at the schema and at how many rows each category has
print(df.columns)
print(df['CATEGORIE'].value_counts())
df.head()

Index(['CATEGORIE', 'DATE', 'QUART', 'PDQ', 'X', 'Y', 'LAT', 'LONG'], dtype='object')
Introduction                        30472
Vol dans / sur véhicule à moteur    24465
Méfait                              20702
Vol de véhicule à moteur            12453
Vols qualifiés                       4913
Infractions entrainant la mort         73
Name: CATEGORIE, dtype: int64


Unnamed: 0,CATEGORIE,DATE,QUART,PDQ,X,Y,LAT,LONG
0,Introduction,2015-01-01,jour,23,302375.197993,5.046522e+06,45.558606,-73.531060
1,Introduction,2015-01-01,jour,23,302375.197993,5.046522e+06,45.558606,-73.531060
2,Introduction,2015-01-01,soir,13,295850.656000,5.031730e+06,45.425443,-73.614364
3,Introduction,2015-01-01,nuit,8,289215.072000,5.036423e+06,45.467564,-73.699308
4,Introduction,2015-01-01,soir,44,298915.433995,5.046912e+06,45.562090,-73.575381
5,Introduction,2015-01-01,soir,15,297494.204005,5.034926e+06,45.454221,-73.593408
6,Introduction,2015-01-01,jour,3,276368.349000,5.041011e+06,45.508444,-73.863862
7,Introduction,2015-01-01,jour,15,299192.515999,5.038122e+06,45.482999,-73.571731
8,Introduction,2015-01-01,jour,42,297210.375006,5.050180e+06,45.591480,-73.597273
9,Introduction,2015-01-01,jour,49,305327.121993,5.059317e+06,45.673742,-73.493234


# Feature Engineering 

In [3]:
# feature engineering
#
# NOTE: this cell overwrites `df` (it ends with only the X/Y columns), so it
# cannot be re-run without first re-running the data-loading cell.

# Keep only rows that are:
#   * dated 2018-01-01 or later (select a period),
#   * in the 'Vols qualifiés' category (select the categories),
#   * carrying strictly positive X and Y (remove lines with no location).
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning).
row_filter = (
    (df['DATE'] >= '2018-01-01')
    & (df['CATEGORIE'] == u'Vols qualifiés')
    & (df['X'] > 0)
    & (df['Y'] > 0)
)
df = df[row_filter].copy()

# adapt X and Y to the visualization: project LONG/LAT (EPSG:4326) to
# Web Mercator (EPSG:3857). The projections are built once and the whole
# column is transformed in a single call instead of twice per row.
wgs84 = Proj(init='epsg:4326')
web_mercator = Proj(init='epsg:3857')
easting, northing = transform(wgs84, web_mercator, df['LONG'].values, df['LAT'].values)

# NOTE: the naming is swapped on purpose to stay compatible with the cells
# below, which read 'X' as the vertical coordinate and 'Y' as the horizontal
# one: 'X' holds the Mercator northing and 'Y' the Mercator easting.
df['X'] = northing
df['Y'] = easting

X_columns = ['X', 'Y']
df = df[X_columns]

# Model Training

In [6]:
# Density-based clustering of the projected point coordinates.
# X/Y are EPSG:3857 coordinates (see the feature-engineering cell), so eps is
# expressed in projected metres; a cluster needs at least 3 neighbouring points.
model = DBSCAN(eps=1000.0, min_samples=3)
model.fit(df[['X', 'Y']])

cluster_labels = model.labels_

# DBSCAN assigns the label -1 to noise points; noise is not a cluster, so it
# must not be counted. Real cluster labels run from 0 to n_clusters - 1.
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(collections.Counter(cluster_labels))

# Attach each point's cluster label (-1 = noise) for the plotting/evaluation cells.
df['cluster'] = cluster_labels

Counter({1: 84, -1: 81, 4: 11, 0: 7, 2: 7, 3: 7, 13: 7, 17: 7, 25: 7, 14: 6, 6: 6, 15: 6, 7: 5, 10: 5, 18: 5, 19: 5, 29: 4, 5: 4, 8: 4, 9: 4, 23: 4, 24: 4, 20: 4, 26: 4, 30: 3, 11: 3, 12: 3, 27: 3, 28: 3, 16: 3, 21: 3, 22: 3})


In [7]:
# Map of the clustered points on top of the CartoDB Positron tiles.
# Ranges are Web Mercator (EPSG:3857) coordinates and are given as (low, high):
# a range whose start is greater than its end makes Bokeh draw a reversed axis,
# which would mirror the map west/east.
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8252883, -8152883))
p.add_tile(CARTODBPOSITRON)

# Keep only points that belong to a cluster (label -1 is DBSCAN noise).
clustered = df[df['cluster'] > -1]

# The feature-engineering cell stores the Mercator northing in 'X' and the
# easting in 'Y', hence the apparent swap here.
latitude = list(clustered['X'].values)
longitude = list(clustered['Y'].values)

# One colour per cluster label; labels index directly into the palette.
colormap = list(bokeh.palettes.viridis(n_clusters))
colors = [colormap[label] for label in clustered['cluster']]

# Drive the glyph from the data source (previously built but never used).
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude, color=colors))
p.circle(x='longitude', y='latitude', color='color', source=source, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [None]:
# Cluster-quality metrics, computed on clustered points only.
# DBSCAN labels noise points -1; they do not form a cluster, so they are
# excluded from centroids and from every metric below.
clustered = df[df['cluster'] != -1]

# One centroid (mean X, mean Y) per cluster, indexed by cluster label.
centroids = clustered.groupby('cluster')[X_columns].mean()

# Inter-Cluster: mean 2-D Euclidean distance between distinct cluster centroids.
# Only the upper triangle of the pairwise matrix is used, so each pair is
# counted once and the zero self-distances on the diagonal do not dilute the mean.
if len(centroids) > 1:
    centroid_distances = euclidean_distances(centroids.values)
    pair_index = np.triu_indices(len(centroids), k=1)
    inter_cluster_distance = centroid_distances[pair_index].mean()
else:
    # Fewer than two clusters: there is no pair to measure.
    inter_cluster_distance = float('nan')
print('Inter Cluster distance', inter_cluster_distance)

# Offset of every clustered point from the centroid of its own cluster
# (one row per point, columns in X_columns order).
own_centroids = centroids.loc[clustered['cluster'].values, X_columns].values
offsets = clustered[X_columns].values - own_centroids
squared_distances = (offsets ** 2).sum(axis=1)

# Intra-Cluster: mean 2-D Euclidean distance from a point to its cluster centroid.
if len(squared_distances) > 0:
    intra_cluster_distance = np.sqrt(squared_distances).mean()
else:
    intra_cluster_distance = float('nan')
print('Intra Cluster distance', intra_cluster_distance)

# Inertia: sum of squared point-to-centroid distances.
print('Inertia', squared_distances.sum())