## Visualizing geodata

* *Using Python, create a JSON file containing the data you wish to visualize.*
>The data points you should use are the same as for week 5, i.e. all crimes with type = PROSTITUTION.
Note: Think carefully about how you can minimize the size of the file containing the data. My file is around 700 KB. Why is the size of the file important?

In [23]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans

In [2]:
# Read the SFPD incidents CSV (path is relative to this notebook) into a
# pandas DataFrame and preview its first five rows
path = '../../data/sfpd_incidents.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160


In [20]:
# Build the clustering input in one pass:
#   1. keep only the rows whose Category is PROSTITUTION
#   2. keep only the X and Y coordinate columns
#   3. drop the outlier rows, i.e. those whose Y value is exactly 90
#   4. renumber the remaining rows from 0 (the old index is discarded)
df_kmeans = (
    df[df['Category'] == 'PROSTITUTION']
    .filter(items=['X', 'Y'])
    .loc[lambda coords: coords['Y'] != 90]
    .reset_index(drop=True)
)
df_kmeans.head()

Unnamed: 0,X,Y
0,-122.485039,37.761291
1,-122.466205,37.772541
2,-122.403405,37.775421
3,-122.409661,37.786439
4,-122.417956,37.75788


In [21]:
# Feature matrix handed to K-means: one row per incident, columns X and Y
X = np.array(df_kmeans.loc[:, ['X', 'Y']])

# Initialize the KMeans algorithm and fit
def k_means_fit(n_clusters, features, n_init=10):
    """Fit a K-means model and return the fitted estimator.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to form.
    features : array-like
        Samples to cluster; passed unchanged to ``KMeans.fit``.
    n_init : int, optional
        Number of centroid initialisations to try. Pinned to 10 by default
        because scikit-learn changed its own default from 10 to ``'auto'``
        in version 1.4; leaving it unset would make the resulting labels and
        centroids depend on the installed version.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator (exposes ``labels_`` and ``cluster_centers_``).
    """
    # random_state is fixed so that repeated calls with the same arguments
    # produce the same clustering; other cells in this notebook fit once for
    # the labels and again for the centroids and need both fits to agree.
    clf = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=0)
    return clf.fit(features)

In [22]:
# For every K from 2 to 6, fit a K-cluster model on X and store the label it
# assigned to each row in a new column named '<K>_clusters'
for k in range(2, 7):
    labels = k_means_fit(k, X).labels_
    df_kmeans['{}_clusters'.format(k)] = pd.Series(labels, index=df_kmeans.index)

df_kmeans.head()

Unnamed: 0,X,Y,2_clusters,3_clusters,4_clusters,5_clusters,6_clusters
0,-122.485039,37.761291,1,2,3,3,4
1,-122.466205,37.772541,1,2,3,3,4
2,-122.403405,37.775421,0,1,0,4,0
3,-122.409661,37.786439,0,1,0,4,0
4,-122.417956,37.75788,1,0,1,0,1


In [32]:
# Create a centroid reference for each K (2..6).
# Maps '<K>_centroids' to the list of cluster centres of the K-cluster fit;
# each centre is a two-element [X, Y] list.
# ndarray.tolist() converts the centres to nested lists of native Python
# floats, so the dict holds no NumPy scalar objects: it serialises with
# json.dump whatever the array dtype is and displays as plain numbers.
centroids = {
    '{}_centroids'.format(k): k_means_fit(k, X).cluster_centers_.tolist()
    for k in range(2, 7)
}
centroids

{'2_centroids': [[-122.41721258127922, 37.787394262218022],
  [-122.41924311718914, 37.760004216652128]],
 '3_centroids': [[-122.41582476469686, 37.761346056903406],
  [-122.41709742374232, 37.787424549878409],
  [-122.47811474903897, 37.738906485698408]],
 '4_centroids': [[-122.41708247002195, 37.787427118841762],
  [-122.41579332831969, 37.761446811162173],
  [-122.46632498052548, 37.718814247089576],
  [-122.48639782848089, 37.758572304670537]],
 '5_centroids': [[-122.41584224261476, 37.761425698684391],
  [-122.41876997704011, 37.787654471039687],
  [-122.46632498052548, 37.718814247089576],
  [-122.48639782848089, 37.758572304670537],
  [-122.4045346858759, 37.785530686729118]],
 '6_centroids': [[-122.4045346858759, 37.785530686729118],
  [-122.41599755987995, 37.761710403228989],
  [-122.46952143243395, 37.719173519869869],
  [-122.41876997704011, 37.787654471039687],
  [-122.48636572534035, 37.758689247617198],
  [-122.40540320357452, 37.727577617551638]]}

## Save files to work with D3

In [34]:
# Serialise the centroid dictionary to JSON text and write it out
# (the target file carries a .txt extension but its content is JSON)
with open('../../docs/angelos/blocks_map/centroids.txt', 'w') as outfile:
    outfile.write(json.dumps(centroids))

# Write the coordinates together with the per-K cluster labels as CSV,
# leaving out the DataFrame's row index
df_kmeans.to_csv('../../docs/angelos/blocks_map/clusters.csv', index=False)