# Segmenting and Clustering Neighborhoods in Toronto 

## PART 1. Toronto Neighborhood with Postal Codes

In [1]:
#Load libraries
import requests
import pandas as pd
import numpy as np
import lxml

from bs4 import BeautifulSoup

### Scrape data into dataframe

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

table = soup.find('table', class_='wikitable sortable')
table_rows = table.find_all('tr')

data = []

for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    data.append(row)

df = pd.DataFrame(data, columns=["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


### Cleanup data

In [3]:
df=df.replace('\n','', regex=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove 'Not assigned'

In [4]:
df = df[df.Borough != 'Not assigned']
df.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue


### Replace if 'Neighborhood' is 'Not assigned' with 'Bourough'

In [5]:
df[df['Neighborhood']=='Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [6]:
df['Neighborhood'] = df.apply(lambda row: row['Borough'] if row['Neighborhood'] == 'Not assigned' else row['Neighborhood'], axis=1)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Groupby Postalcode

In [7]:
df = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade
1,M7A,Queen's Park,Queen's Park
2,M4X,Downtown Toronto,"Cabbagetown, St. James Town"
3,M2L,North York,"Silver Hills, York Mills"
4,M4R,Central Toronto,North Toronto West


### Number of Rows & Columns

In [8]:
df.shape

(103, 3)

### (End of Part 1)

## PART 2.  Join Geospacial Coordinates

In [9]:
# Read csv into dataframe
url="http://cocl.us/Geospatial_data"
geo_coords = pd.read_csv(url) #, sep = ',' 
geo_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
print(list(df))
print(list(geo_coords))

['PostalCode', 'Borough', 'Neighborhood']
['Postal Code', 'Latitude', 'Longitude']


In [11]:
df2 = df.set_index('PostalCode').join(geo_coords.set_index('Postal Code'))
df2 = df2.sample(frac=1).reset_index(drop=True) #False
df2.head(10)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
1,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
4,North York,"Maple Leaf Park, North Park, Upwood Park",43.713756,-79.490074
5,Downtown Toronto,St. James Town,43.651494,-79.375418
6,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
7,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
8,York,Caledonia-Fairbanks,43.689026,-79.453512
9,North York,Hillcrest Village,43.803762,-79.363452


### (End of Part 2)

## Part 3. Explore and cluster the neighborhoods in Toronto

In [12]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if folium is not installed yet

import folium

### Toronto's Coordinates

In [13]:
address = 'Toronto'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [14]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Select Borough with "Toronto" only

In [17]:
df2_to = df2[df2['Borough'].str.contains('Toronto')]
df2_to = df2_to.reset_index(drop=True)
df2_to.head()

#manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
#manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
1,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
2,Downtown Toronto,St. James Town,43.651494,-79.375418
3,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
4,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188


In [18]:
df2_to.shape

(38, 4)

In [19]:
# create map of Manhattan using latitude and longitude values
map_to = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df2_to['Latitude'], df2_to['Longitude'], df2_to['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)  
    
map_to

## Explore Toronto Neighborhood

#### Define Foursquare Credentials and Version

In [20]:
# @hidden cell
CLIENT_ID = '44DQ1SLY3HIZY3ONMVLRTQ0FHYXO4XOT34OZQRHSYKQHASA5' # your Foursquare ID
CLIENT_SECRET = 'Z0VA21AJMN4HF4PZDOLOADWPDQOKHC4C54OUJ4QQ5YCH3FYD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 44DQ1SLY3HIZY3ONMVLRTQ0FHYXO4XOT34OZQRHSYKQHASA5
CLIENT_SECRET:Z0VA21AJMN4HF4PZDOLOADWPDQOKHC4C54OUJ4QQ5YCH3FYD


#### Let's explore the first neighborhood in our dataframe.

In [26]:
neighborhood_latitude = df2_to.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df2_to.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df2_to.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Harbord, University of Toronto are 43.6626956, -79.4000493.


In [27]:
df2_to.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
1,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
2,Downtown Toronto,St. James Town,43.651494,-79.375418
3,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
4,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188


#### Now, let's get the top 100 venues that are in Harbord, University of Toronto within a radius of 500 meters.

First, let's create the GET request URL. Name your URL **url**.

In [29]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=44DQ1SLY3HIZY3ONMVLRTQ0FHYXO4XOT34OZQRHSYKQHASA5&client_secret=Z0VA21AJMN4HF4PZDOLOADWPDQOKHC4C54OUJ4QQ5YCH3FYD&v=20180605&ll=43.6626956,-79.4000493&radius=500&limit=100'

Send the GET request and examine the resutls

In [30]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c447d7cdd5797602b88f82e'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-5362c366498e602fbe1db395-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/japanese_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d111941735',
         'name': 'Japanese Restaurant',
         'pluralName': 'Japanese Restaurants',
         'primary': True,
         'shortName': 'Japanese'}],
       'id': '5362c366498e602fbe1db395',
       'location': {'address': '81 Harbord St.',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'distance': 255,
        'formattedAddress': ['81 Harbord St.', 'Toronto ON M5S 1G4', 'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.66283719650635,
  

Let's borrow the **get_category_type** function from the Foursquare lab.

In [31]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [33]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Yasu,Japanese Restaurant,43.662837,-79.403217
1,Rasa,Restaurant,43.662757,-79.403988
2,Piano Piano,Italian Restaurant,43.662949,-79.402898
3,Cafe Cancan,French Restaurant,43.662735,-79.403447
4,Almond Butterfly,Bakery,43.662836,-79.403365


Venues returned by Foursquare

In [35]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

34 venues were returned by Foursquare.


The end. Thank you.