In [1]:
# algotritm implemented according to: http://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

# k-means is not an ideal algorithm for latitude-longitude spatial data because it minimizes variance, 
# not geodetic distance

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

In [38]:
# coordinates for the training dataset
df = pd.read_csv("location_csv/train.csv")
coords1 = df.as_matrix(columns=['latitude', 'longitude'])

In [27]:
# coordinates for the test dataset
df2 = pd.read_csv("location_csv/test.csv")
coords2 = df2.as_matrix(columns = ['latitude','longitude'])

In [40]:
# all coordinates from both training and testing
coords = np.concatenate((coords1,coords2))

In [114]:
# DBSCAN algorithm
# two variables that can be varied:  epsilon and min_samples
kms_per_radian = 6371.0088
epsilon = 0.05/ kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 1095


In [139]:
# clusters # call clusters if you want to check out coordinates for each cluster

0       [[40.7513, -73.9722], [40.7513, -73.9721], [40...
1       [[40.7575, -73.9625], [40.7575, -73.9625], [40...
2       [[40.7439, -73.9743], [40.7434, -73.9746], [40...
3       [[40.7348, -73.9865], [40.7348, -73.9865], [40...
4       [[40.7302, -73.9826], [40.7319, -73.9817], [40...
5       [[40.7769, -73.9467], [40.7723, -73.951], [40....
6       [[40.7346, -73.9811], [40.7352, -73.9832], [40...
7       [[40.699, -73.9943], [40.6985, -73.9937], [40....
8       [[40.753, -73.9959], [40.753, -73.9958], [40.7...
9       [[40.761, -73.999], [40.7585, -73.9913], [40.7...
10      [[40.7277, -74.0], [40.7278, -73.9999], [40.72...
11      [[40.7633, -73.9596], [40.7641, -73.9592], [40...
12      [[40.7073, -73.9665], [40.7073, -73.9665], [40...
13      [[40.7528, -73.9709], [40.7528, -73.9709], [40...
14      [[40.736, -73.986], [40.7358, -73.9859], [40.7...
15      [[40.783, -73.9828], [40.7824, -73.9841], [40....
16      [[40.746, -73.9754], [40.7398, -73.9811], [40....
17      [[40.7

In [115]:
# compute legth of each clusters
map(lambda(x): len(x), clusters)

[195,
 180,
 1041,
 344,
 5039,
 8692,
 450,
 61,
 373,
 4754,
 714,
 545,
 14,
 195,
 152,
 201,
 5983,
 74,
 26,
 4219,
 5176,
 54,
 960,
 134,
 800,
 239,
 20,
 133,
 25,
 34,
 14,
 87,
 304,
 199,
 276,
 15,
 326,
 317,
 308,
 19,
 151,
 204,
 199,
 73,
 669,
 151,
 413,
 277,
 332,
 220,
 271,
 977,
 88,
 668,
 108,
 127,
 301,
 149,
 558,
 205,
 119,
 258,
 90,
 16,
 66,
 149,
 278,
 16,
 335,
 271,
 383,
 68,
 38,
 46,
 313,
 25,
 877,
 476,
 11,
 33,
 103,
 349,
 17,
 45,
 354,
 67,
 436,
 21,
 99,
 33,
 274,
 282,
 923,
 122,
 47,
 189,
 1318,
 40,
 141,
 416,
 14,
 104,
 55,
 44,
 94,
 139,
 101,
 286,
 19,
 160,
 74,
 306,
 141,
 37,
 278,
 71,
 485,
 73,
 330,
 81,
 327,
 37,
 481,
 18,
 35,
 90,
 34,
 37,
 531,
 51,
 116,
 23,
 12,
 66,
 142,
 35,
 21,
 41,
 62,
 115,
 29,
 37,
 30,
 41,
 164,
 17,
 14,
 43,
 32,
 734,
 10,
 15,
 74,
 226,
 175,
 129,
 244,
 277,
 77,
 800,
 24,
 104,
 205,
 33,
 55,
 86,
 59,
 162,
 62,
 86,
 328,
 12,
 226,
 56,
 423,
 212,
 42,
 69,
 35

In [135]:
# Save clusters for training
train_cluster = cluster_labels[0:49352]
df['cluster'] = train_cluster # add to original trianing dataset
train = df[['aptID','cluster']]
# write csv
train.to_csv('location_csv/train_cluster.csv',sep=',',index_label=False)

array([ -1,   0,   1, ..., 264, 608, 418])

In [171]:
# Save clusters for test dataset 
test_cluster = cluster_labels[49352:124012]
df2['cluster']=test_cluster # add to original test dataset
test = df2[['aptID','cluster']]
test.to_csv('location_csv/test_cluster.csv',sep=',',index_label=False)

array([25, 10, 74, ..., 16, 16,  2])

In [None]:
# calculate centermost points for each cluster
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)
