The first step will be to set up two dataframes, one for Toronto and one for New York, containing the different neighborhoods and their correspinding latitudes and longitudes. This is done in the cells below:

In [2]:
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)
df = dfs[0]
#Drop all rows where the borough is "Not assigned"
newDf1 = df[df.Borough != 'Not assigned']
#Use the given csv file to find lat/long values for each neighborhood
zipData = pd.read_csv('Geospatial_Coordinates.csv')
torontoDf = newDf1.join(zipData.set_index('Postal Code'), on='Postal Code')
torontoDf.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [22]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

import wget
!wget -q -O 'newyork_data.json' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json
print('Data downloaded!')

url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json'
newyork_data_load = wget.download(url)

#print(newyork_data)

with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

neighborhoods_data = newyork_data['features']
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

    
print('The New York dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

Libraries imported.
zsh:1: command not found: wget
Data downloaded!
The New York dataframe has 5 boroughs and 306 neighborhoods.


Next I will get the latitude and longitude values of New York and Toronto:

In [23]:
address_ny = 'New York City, NY'

geolocator_ny = Nominatim(user_agent="ny_explorer")
location_ny = geolocator_ny.geocode(address_ny)
latitude_ny = location_ny.latitude
longitude_ny = location_ny.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude_ny, longitude_ny))

address_tor = 'Toronto, ON'

geolocator_tor = Nominatim(user_agent="toronto_explorer")
location_tor = geolocator_tor.geocode(address_tor)
latitude_tor = location_tor.latitude
longitude_tor = location_tor.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude_tor, longitude_tor))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.
The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Next I will define a method to return nearby venues in a specified location using the foursquare API. I will then use this function to get find all the venues associated with each neighborhood in each city:

In [26]:
CLIENT_ID = 'U0SZXBBND2X35RL0IBT40C2FYAF3PLQ2Z5O3VXMM5CUKQGPB' # your Foursquare ID
CLIENT_SECRET = 'IDECMZSDK10RPXCZFWLDDXXLKLWIXBUTVYYJJFQ4KGYBX4OX' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

newYork_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                  latitudes=neighborhoods['Latitude'],
                                  longitudes=neighborhoods['Longitude'])

toronto_venues = getNearbyVenues(names=torontoDf['Neighbourhood'],
                                  latitudes=torontoDf['Latitude'],
                                  longitudes=torontoDf['Longitude'])

print('Venues Found')
print(newYork_venues.shape)
print(toronto_venues.shape)

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

The next step is to combine the two dataframes, perform one hot encoding on the combined data and sort the resulting venue data: 

In [37]:
dataFrames = [newYork_venues,toronto_venues]
combinedData = pd.concat(dataFrames)
combinedData.groupby('Neighborhood').count()

# one hot encoding
combined_onehot = pd.get_dummies(combinedData[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
combined_onehot['Neighborhood'] = combinedData['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [combined_onehot.columns[-1]] + list(combined_onehot.columns[:-1])
combined_onehot = combined_onehot[fixed_columns]

print(combined_onehot.shape)
combined_onehot.head()
combined_grouped = combined_onehot.groupby('Neighborhood').mean().reset_index()
print(combined_grouped.shape)
combined_grouped.head()

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = combined_grouped['Neighborhood']

for ind in np.arange(combined_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(combined_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

(12257, 461)
(396, 461)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Breakfast Spot,Lounge,Latin American Restaurant,Clothing Store,Skating Rink,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Empanada Restaurant
1,"Alderwood, Long Branch",Pizza Place,Pub,Coffee Shop,Gym,Sandwich Place,Skating Rink,Pharmacy,Dry Cleaner,Dumpling Restaurant,Duty-free Shop
2,Allerton,Pizza Place,Deli / Bodega,Chinese Restaurant,Supermarket,Cosmetics Shop,Discount Store,Donut Shop,Martial Arts School,Gas Station,Bus Station
3,Annadale,Pizza Place,Restaurant,Park,Diner,Train Station,Pharmacy,Bakery,Sushi Restaurant,American Restaurant,Liquor Store
4,Arden Heights,Pharmacy,Deli / Bodega,Pizza Place,Business Service,Coffee Shop,Women's Store,Event Space,Eastern European Restaurant,Egyptian Restaurant,Electronics Store


The final step is to perform k-means clustering with k=5 on the combined dataset. This will give 5 different groups containing neighborhoods from both cities. This is potenitally usedful because similar neighborhoods will be grouped together, so a user will be able to find neighborhoods similar to their own in their own city or a different city. The resulting dataframe contains the neighborhoods with a "Cluster label" corresponding to the cluster that neighborhood belongs to. 

In [43]:
# set number of clusters
kclusters = 5

combined_grouped_clustering = combined_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(combined_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

combined_merged = combinedData

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
combined_merged = combined_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

neighborhoods_venues_sorted.head() # check the last columns!

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Agincourt,Breakfast Spot,Lounge,Latin American Restaurant,Clothing Store,Skating Rink,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Empanada Restaurant
1,4,"Alderwood, Long Branch",Pizza Place,Pub,Coffee Shop,Gym,Sandwich Place,Skating Rink,Pharmacy,Dry Cleaner,Dumpling Restaurant,Duty-free Shop
2,4,Allerton,Pizza Place,Deli / Bodega,Chinese Restaurant,Supermarket,Cosmetics Shop,Discount Store,Donut Shop,Martial Arts School,Gas Station,Bus Station
3,4,Annadale,Pizza Place,Restaurant,Park,Diner,Train Station,Pharmacy,Bakery,Sushi Restaurant,American Restaurant,Liquor Store
4,4,Arden Heights,Pharmacy,Deli / Bodega,Pizza Place,Business Service,Coffee Shop,Women's Store,Event Space,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
