# Comparison of San Francisco, New York and Toronto

In this notebook we compare similiarity of San Francisco, New York and Toronto. For each city we create neighbourhood data and then search for venues in the neighbourhoods. For each city neighbourhoods are culstered using k-means algorithm. We then compare centroids between each city, based on that we calculate the similarity beteen cities.

In [61]:
import pandas as pd
import folium
import requests
import numpy as np

import matplotlib.cm as cm
import matplotlib.colors as colors

import json

# import k-means from clustering stage
from sklearn.cluster import KMeans

### 1. Get the data for San Francisco

In [9]:
data = pd.read_html("http://www.healthysf.org/bdi/outcomes/zipmap.htm")

In [10]:
sf_neighbourhoods = data[3]

In [11]:
sf_neighbourhoods.columns = ['ZipCode','Neighborhood','Population']
sf_neighbourhoods.drop([0,22], axis = 0, inplace = True)

In [12]:
sf_neighbourhoods

Unnamed: 0,ZipCode,Neighborhood,Population
1,94102,Hayes Valley/Tenderloin/North of Market,28991
2,94103,South of Market,23016
3,94107,Potrero Hill,17368
4,94108,Chinatown,13716
5,94109,Polk/Russian Hill (Nob Hill),56322
6,94110,Inner Mission/Bernal Heights,74633
7,94112,Ingelside-Excelsior/Crocker-Amazon,73104
8,94114,Castro/Noe Valley,30574
9,94115,Western Addition/Japantown,33115
10,94116,Parkside/Forest Hill,42958


In [13]:
sf_ll = pd.read_html('https://www.geonames.org/postal-codes/US/CA/075/san-francisco.html')

In [17]:
data = sf_ll[2]

In [19]:
data.drop(100, axis = 0, inplace = True)

In [20]:
data

Unnamed: 0.1,Unnamed: 0,Place,Code,Country,Admin1,Admin2,Admin3
0,1.0,San Francisco,94102,United States,California,City and County of San Francisco,
1,,37.781/-122.417,37.781/-122.417,37.781/-122.417,37.781/-122.417,37.781/-122.417,37.781/-122.417
2,2.0,San Francisco,94103,United States,California,City and County of San Francisco,
3,,37.773/-122.415,37.773/-122.415,37.773/-122.415,37.773/-122.415,37.773/-122.415,37.773/-122.415
4,3.0,San Francisco,94105,United States,California,City and County of San Francisco,
...,...,...,...,...,...,...,...
95,,37.775/-122.419,37.775/-122.419,37.775/-122.419,37.775/-122.419,37.775/-122.419,37.775/-122.419
96,49.0,San Francisco,94188,United States,California,City and County of San Francisco,
97,,37.775/-122.419,37.775/-122.419,37.775/-122.419,37.775/-122.419,37.775/-122.419,37.775/-122.419
98,50.0,San Francisco,94158,United States,California,City and County of San Francisco,


In [22]:
ll_data = data.loc[1:100:2,'Place']

In [23]:
data.drop(range(1,100,2), inplace = True)

In [40]:
ll_df_data = ll_data.str.split("/").apply(lambda x: pd.Series({'latitude':float(x[0]),'longitude':float(x[1])}))

In [50]:
data.reset_index(inplace = True, drop = True)

In [51]:
ll_df_data.reset_index(inplace = True, drop = True)

In [52]:
sf_data_final = pd.concat([data,ll_df_data], axis = 1)

In [54]:
sf_data_final = sf_data_final[['Code','latitude','longitude']]

In [56]:
sf_data = sf_neighbourhoods.merge(sf_data_final, left_on = 'ZipCode', right_on = 'Code')

In [59]:
sf_data.drop(['Code'], axis = 1, inplace = True)

In [60]:
sf_data

Unnamed: 0,ZipCode,Neighborhood,Population,latitude,longitude
0,94102,Hayes Valley/Tenderloin/North of Market,28991,37.781,-122.417
1,94103,South of Market,23016,37.773,-122.415
2,94107,Potrero Hill,17368,37.762,-122.397
3,94108,Chinatown,13716,37.793,-122.408
4,94109,Polk/Russian Hill (Nob Hill),56322,37.792,-122.419
5,94110,Inner Mission/Bernal Heights,74633,37.751,-122.415
6,94112,Ingelside-Excelsior/Crocker-Amazon,73104,37.72,-122.441
7,94114,Castro/Noe Valley,30574,37.759,-122.433
8,94115,Western Addition/Japantown,33115,37.786,-122.436
9,94116,Parkside/Forest Hill,42958,37.744,-122.486


### 2. Get New York data

In [63]:
with open('../newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [64]:
neighborhoods_data = newyork_data['features']

In [65]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [66]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [68]:
ny_data = neighborhoods

In [69]:
ny_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
...,...,...,...,...
301,Manhattan,Hudson Yards,40.756658,-74.000111
302,Queens,Hammels,40.587338,-73.805530
303,Queens,Bayswater,40.611322,-73.765968
304,Queens,Queensbridge,40.756091,-73.945631


### 3. Get Toronto Data

In [70]:
data = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df = data[0]

In [71]:
df.columns = ["PostalCode", "Borough", "Neighborhood"]

In [72]:
df = df.loc[df["Borough"] != "Not assigned", :]

In [73]:
df.loc[df['Neighborhood'] == "Not assigned", 'Neighborhood'] = df.loc[df['Neighborhood'] == "Not assigned", 'Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [74]:
geo_data = pd.read_csv('Geospatial_Coordinates.csv')

In [75]:
df = df.merge(geo_data, left_on = 'PostalCode', right_on = 'Postal Code')

In [76]:
df.drop('Postal Code', axis = 1, inplace = True)

In [77]:
toronto_data = df 

In [78]:
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### 4. Standardize data

In [79]:
sf_data_standard = sf_data[['Neighborhood','latitude','longitude']]
sf_data_standard.columns = ['Neighborhood','Latitude','Longitude']

In [80]:
ny_data_standard = ny_data[['Neighborhood','Latitude','Longitude']]

In [81]:
toronto_data_standard = toronto_data[['Neighborhood','Latitude','Longitude']]

In [82]:
ny_latitude, ny_longitude = 40.7127281, -74.0060152
sf_latitude, sf_longitude = 37.773972, -122.431297
to_latitude, to_longitude = 43.651070, -79.347015

### 5. Visualize Data

#### Toronto

In [83]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[to_latitude, to_longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(toronto_data_standard['Latitude'], toronto_data_standard['Longitude'], toronto_data_standard['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### New York

In [84]:
# create map of New York using latitude and longitude values
map_ny = folium.Map(location=[ny_latitude, ny_longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(ny_data_standard['Latitude'], ny_data_standard['Longitude'], ny_data_standard['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny

#### San Francisco

In [85]:
# create map of New York using latitude and longitude values
map_sf = folium.Map(location=[sf_latitude, sf_longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(sf_data_standard['Latitude'], sf_data_standard['Longitude'], sf_data_standard['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sf)  
    
map_sf