In [21]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
import geoplotlib as glp
from geoplotlib.utils import BoundingBox, DataAccessObject
from sklearn import cluster
import geoplotlib as gpl

%matplotlib inline
pd.set_option('display.max_columns', None)

In [28]:
# Load the collision records and derive a YEAR column holding the third
# '/'-separated field of DATE (kept as a string, e.g. '2015').
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
collisions = pd.read_csv(filePath)
date_fields = collisions['DATE'].str.split('/')
collisions['YEAR'] = date_fields.str[2]

In [76]:
def kmean(k, dataset, colums):
    """Cluster the selected columns of ``dataset`` with k-means.

    Args:
        k: Number of clusters to fit.
        dataset: Table that can be indexed with a list of column labels
            (a pandas DataFrame at the call site in this notebook).
        colums: Column labels used as the clustering features.

    Returns:
        A ``(labels, centers)`` tuple: the cluster predicted for each row of
        the selected columns, and the fitted model's ``cluster_centers_``.
    """
    features = dataset[colums]
    model = cluster.KMeans(n_clusters=k)
    model.fit(features)
    return model.predict(features), model.cluster_centers_


def encode_column(df, target_column):
    """Add an integer-coded version of ``target_column`` to a copy of ``df``.

    Each distinct value is assigned its position in
    ``df[target_column].unique()``; the codes are written to a new column
    named ``target_column + "_encoded"``. The frame passed in is not modified.

    Args:
        df: Source DataFrame.
        target_column: Label of the column to encode.

    Returns:
        ``(encoded_frame, targets)`` where ``targets`` is a Series of the
        distinct values, so ``targets[code]`` gives back the original value.
    """
    encoded = df.copy()
    source = encoded[target_column]
    targets = pd.Series(source.unique())
    code_by_value = dict((value, code) for code, value in enumerate(targets))
    encoded[target_column + "_encoded"] = source.replace(code_by_value)
    return encoded, targets

# Keep only 2015 collisions that have both a location and a temperature reading.
mask = (
    pd.notnull(collisions.LOCATION)
    & (collisions.YEAR == str(2015))
    & pd.notnull(collisions.TemperatureC)
)
data = collisions.loc[mask]
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Data size: %s" % len(data.index))

# Integer-code the two categorical features. encode_column works on a copy,
# so `collisions` itself is left unchanged.
data, _ = encode_column(data, 'CONTRIBUTING FACTOR VEHICLE 1')
data, _ = encode_column(data, 'Conditions')

data['TemperatureC'] = data['TemperatureC'].astype('float64')

k = 30  # number of k-means clusters

# `data` is already restricted to `mask`, so it is used directly instead of
# being filtered (and copied) again.
cluster_features = ['Conditions_encoded', 'CONTRIBUTING FACTOR VEHICLE 1_encoded', 'TemperatureC']
kmeans, centoid = kmean(k, data, cluster_features)

# Independent copy of the clustered rows, with each row's cluster label attached.
result = data.copy()
result['K-mean'] = kmeans

Data size: 182788


In [77]:
def get_spaced_colors(n):
    """Return exactly ``n`` evenly spaced RGBA colors.

    Colors are sampled at a fixed interval from the packed 24-bit RGB range
    ``[0, 255**3)``, starting at black, and unpacked into
    ``[red, green, blue, 255]`` lists (alpha is always 255).

    Args:
        n: Number of colors to generate.

    Returns:
        A list of ``n`` ``[r, g, b, a]`` lists; an empty list when ``n <= 0``.

    Raises:
        ValueError: If ``n`` is larger than the sampled range, so no positive
            integer spacing exists.
    """
    count = int(n)
    if count <= 0:
        return []

    max_value = 16581375  # 255**3
    interval = max_value // count
    if interval == 0:
        raise ValueError("n is too large: at most %d colors can be generated" % max_value)

    colors = []
    # Build the samples from an index rather than range(0, max_value, interval):
    # that range yields one extra color whenever max_value is not divisible by n.
    for index in range(count):
        packed = index * interval
        colors.append([(packed >> 16) & 0xFF, (packed >> 8) & 0xFF, packed & 0xFF, 255])
    return colors

def coords(k):
    """Return the coordinates of the rows of ``result`` assigned to cluster ``k``.

    Reads the module-level ``result`` frame and selects the rows whose
    ``K-mean`` value equals ``k``.

    Returns:
        ``(lat, lon)``: the ``LATITUDE`` and ``LONGITUDE`` values of the
        selected rows, as arrays.
    """
    members = result[result['K-mean'] == k]
    return members.LATITUDE.values, members.LONGITUDE.values


# Draw one dot layer per cluster, each in its own color.
colormap = get_spaced_colors(k)
for cluster_id in range(k):
    lat, lon = coords(cluster_id)

    # Use a dedicated name for the layer payload so the `data` DataFrame built
    # in the clustering cell is not overwritten by a dict.
    cluster_points = {'lon': lon, 'lat': lat}
    gpl.dot(cluster_points, color=colormap[cluster_id])

gpl.inline()

In [78]:
# Preview the first rows only instead of rendering the whole clustered frame.
result.head(10)

Unnamed: 0.1,Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,Conditions,Precipitationmm,TemperatureC,VisibilityKm,YEAR,CONTRIBUTING FACTOR VEHICLE 1_encoded,Conditions_encoded,K-mean
41375,41375,12/31/2015,5:30,QUEENS,11420,40.681006,-73.812561,"(40.6810063, -73.812561)",LINDEN BOULEVARD,128 STREET,,0,0,0,0,0,0,0,0,Unspecified,Unspecified,Unspecified,Unspecified,,3363204,SPORT UTILITY / STATION WAGON,PASSENGER VEHICLE,PASSENGER VEHICLE,PASSENGER VEHICLE,,Mostly Cloudy,-,8.0,16.1,2015,0,0,3
41376,41376,12/31/2015,5:40,,,40.822145,-73.887813,"(40.8221447, -73.8878132)",,,,0,0,0,0,0,0,0,0,Outside Car Distraction,Pavement Slippery,,,,3363688,PASSENGER VEHICLE,UNKNOWN,,,,Mostly Cloudy,-,8.0,16.1,2015,1,0,3
41377,41377,12/31/2015,5:50,,,40.754901,-73.745477,"(40.7549013, -73.7454772)",,,,0,0,0,0,0,0,0,0,Outside Car Distraction,,,,,3363266,PASSENGER VEHICLE,,,,,Mostly Cloudy,-,8.0,16.1,2015,1,0,3
41378,41378,12/31/2015,5:55,BRONX,10463,40.875683,-73.908378,"(40.8756831, -73.9083783)",BROADWAY,WEST 228 STREET,,0,0,0,0,0,0,0,0,Other Vehicular,Other Vehicular,Traffic Control Disregarded,,,3363003,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,UNKNOWN,,,Mostly Cloudy,-,8.0,16.1,2015,2,0,3
41379,41379,12/31/2015,6:45,,,40.754151,-73.722875,"(40.7541513, -73.7228746)",,,,1,0,0,0,0,0,1,0,Traffic Control Disregarded,Unspecified,Unspecified,Unspecified,,3363196,PASSENGER VEHICLE,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,SPORT UTILITY / STATION WAGON,,Mostly Cloudy,-,7.8,16.1,2015,3,0,27
41380,41380,12/31/2015,7:00,MANHATTAN,10037,40.808905,-73.938337,"(40.8089051, -73.9383371)",EAST 130 STREET,MADISON AVENUE,,0,0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3362880,PASSENGER VEHICLE,UNKNOWN,,,,Mostly Cloudy,-,8.9,16.1,2015,0,0,3
41383,41383,12/31/2015,8:00,BRONX,10470,40.907219,-73.850401,"(40.9072194, -73.8504009)",EAST 242 STREET,ROBERTSON STREET,,0,0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3362979,OTHER,PASSENGER VEHICLE,,,,Mostly Cloudy,-,10.6,16.1,2015,0,0,3
41384,41384,12/31/2015,8:00,QUEENS,11106,40.755821,-73.932579,"(40.7558214, -73.9325786)",29 STREET,37 AVENUE,,0,0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3364431,PASSENGER VEHICLE,UNKNOWN,,,,Mostly Cloudy,-,10.6,16.1,2015,0,0,3
41385,41385,12/31/2015,8:00,QUEENS,11356,40.784918,-73.842128,"(40.7849178, -73.8421283)",15 AVENUE,126 STREET,,0,0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3363257,PASSENGER VEHICLE,UNKNOWN,,,,Mostly Cloudy,-,10.6,16.1,2015,0,0,3
41387,41387,12/31/2015,8:15,BROOKLYN,11214,40.614167,-74.000396,"(40.614167, -74.0003962)",NEW UTRECHT AVENUE,77 STREET,,0,0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3363760,PASSENGER VEHICLE,PASSENGER VEHICLE,,,,Mostly Cloudy,-,10.6,16.1,2015,0,0,3
