# Segmenting and Clustering Neighborhoods in Toronto 

## PART 1. Toronto Neighborhood with Postal Codes

In [3]:
#Load libraries
import requests
import pandas as pd
import numpy as np
import lxml

from bs4 import BeautifulSoup

### Scrape data into dataframe

In [4]:
# Download the Wikipedia page listing the "M" postal codes and parse it with lxml.
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

# The postal codes live in the page's sortable wikitable; collect all of its rows.
table = soup.find('table', class_='wikitable sortable')
table_rows = table.find_all('tr')

# One list of cell texts per table row. A row without any <td> cells yields an
# empty list, which pandas turns into a row of missing values.
data = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows
]

df = pd.DataFrame(data, columns=["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


### Cleanup data

In [5]:
# Strip newline characters from every cell, drop rows that contain missing
# values, and renumber the index from zero.
df = (
    df.replace('\n', '', regex=True)
      .dropna()
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove 'Not assigned'

In [6]:
# Keep only the rows whose borough has actually been assigned.
borough_assigned = df['Borough'] != 'Not assigned'
df = df.loc[borough_assigned]
df.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue


### Replace 'Not assigned' Neighborhood values with the Borough name

In [7]:
# List the rows that still have no neighborhood name.
neighborhood_unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[neighborhood_unassigned]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [8]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# Series.mask does the replacement column-wise (no per-row Python lambda), and
# DataFrame.assign returns a new frame instead of writing into a filtered one,
# which avoids pandas' SettingWithCopyWarning.
neighborhood_unassigned = df['Neighborhood'] == 'Not assigned'
df = df.assign(
    Neighborhood=df['Neighborhood'].mask(neighborhood_unassigned, df['Borough'])
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Groupby Postalcode

In [9]:
# Collapse to one row per (PostalCode, Borough) pair, joining the neighborhood
# names of that pair into a single comma-separated string.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
      .apply(lambda x: ", ".join(x.astype(str)))
      .reset_index()
)

# Shuffle the rows. The fixed random_state makes the shuffled order repeatable
# for the same input, so later cells that read "row 0" do not change from one
# run to the next.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9W,Etobicoke,Northwest
1,M5A,Downtown Toronto,"Harbourfront, Regent Park"
2,M1S,Scarborough,Agincourt
3,M4P,Central Toronto,Davisville North
4,M3M,North York,Downsview Central


### Number of Rows & Columns

In [10]:
# Dimensions of the grouped dataframe as (rows, columns)
df.shape

(103, 3)

### (End of Part 1)

## PART 2.  Join Geospatial Coordinates

In [11]:
# Load the postal-code coordinates table (a CSV file) into a dataframe.
url = "http://cocl.us/Geospatial_data"
geo_coords = pd.read_csv(filepath_or_buffer=url)
geo_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Print the column names of both frames to find the join key in each.
for frame in (df, geo_coords):
    print(frame.columns.tolist())

['PostalCode', 'Borough', 'Neighborhood']
['Postal Code', 'Latitude', 'Longitude']


In [13]:
# Attach latitude/longitude to each row by joining on the postal code
# ('PostalCode' on the left, 'Postal Code' on the right).
df2 = df.set_index('PostalCode').join(geo_coords.set_index('Postal Code'))

# Shuffle with a fixed random_state so the order is repeatable for the same
# input order. Note: reset_index(drop=True) discards the postal-code index, so
# df2 no longer carries the postal code.
df2 = df2.sample(frac=1, random_state=42).reset_index(drop=True)
df2.head(10)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Queen's Park,Queen's Park,43.662301,-79.389494
1,North York,Willowdale West,43.782736,-79.442259
2,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
3,Scarborough,Upper Rouge,43.836125,-79.205636
4,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
5,Central Toronto,North Toronto West,43.715383,-79.405678
6,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.75741,-79.273304
7,Scarborough,"Clarks Corners, Sullivan, Tam O'Shanter",43.781638,-79.304302
8,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577
9,West Toronto,"Runnymede, Swansea",43.651571,-79.48445


### (End of Part 2)

## Part 3. Explore and cluster the neighborhoods in Toronto

In [15]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # installs folium 0.5.0; comment this line out if folium is already installed

import folium

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  54.31 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  33.86 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  39.85 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.80 MB/s


### Toronto's Coordinates

In [16]:
# Look up the latitude/longitude of the city with the Nominatim geocoder.
address = 'Toronto'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the lines below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude

# Report the place that was actually geocoded rather than a hard-coded city name.
print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of New York City are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [17]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One small circle marker per row of df2, with a "<neighborhood>, <borough>" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df2[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Select Borough with "Toronto" only

In [18]:
# Keep only the boroughs whose name contains "Toronto" and renumber the rows from zero.
is_toronto_borough = df2['Borough'].str.contains('Toronto')
df2_to = df2.loc[is_toronto_borough].reset_index(drop=True)
df2_to.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
1,Central Toronto,North Toronto West,43.715383,-79.405678
2,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
3,West Toronto,"High Park, The Junction South",43.661608,-79.464763
4,Central Toronto,Roselawn,43.711695,-79.416936


In [19]:
# Dimensions of the Toronto-only dataframe as (rows, columns)
df2_to.shape

(38, 4)

In [20]:
# Base map for the Toronto-only boroughs, centred on the same geocoded coordinates.
map_to = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of df2_to, with the neighborhood name as its popup.
marker_columns = ['Latitude', 'Longitude', 'Neighborhood']
for lat, lng, neighborhood in df2_to[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_to)

map_to

## Explore Toronto Neighborhood

#### Define Foursquare Credentials and Version

In [28]:
# The code was removed by Watson Studio for sharing.

#### Let's explore the first neighborhood in our dataframe.

In [22]:
# Read the coordinates and the name of the row labelled 0 in the Toronto-only dataframe.
neighborhood_latitude = df2_to.at[0, 'Latitude']
neighborhood_longitude = df2_to.at[0, 'Longitude']
neighborhood_name = df2_to.at[0, 'Neighborhood']

summary = 'Latitude and longitude values of {} are {}, {}.'
print(summary.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara are 43.6289467, -79.3944199.


In [27]:
# Preview the first rows of the Toronto-only dataframe.
# NOTE(review): this cell carries a later execution count than the cells around
# it, so its saved output may come from a different run than theirs; re-run the
# notebook top to bottom to confirm.
df2_to.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
1,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
2,Downtown Toronto,St. James Town,43.651494,-79.375418
3,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
4,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188


#### Now, let's get the top 100 venues that are within a radius of 500 meters of the first neighborhood in our dataframe.

First, let's create the GET request URL. Name your URL **url**.

In [23]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# URL template for the Foursquare "explore" endpoint; placeholders are, in order:
# client id, client secret, API version, latitude, longitude, radius, limit.
url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'

# create URL (contains the real credentials - used for the request only)
url = url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

# Display the URL with the credentials masked, so the saved notebook output
# does not leak CLIENT_ID / CLIENT_SECRET.
url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.6289467,-79.3944199&radius=500&limit=100'

Send the GET request and examine the results.

In [24]:
# Call the Foursquare endpoint and decode the JSON body into Python objects.
response = requests.get(url)
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5c45694bdb04f57d55311719'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4ad8df54f964a520881521e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/travel/airport_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1ed931735',
         'name': 'Airport',
         'pluralName': 'Airports',
         'primary': True,
         'shortName': 'Airport'}],
       'id': '4ad8df54f964a520881521e3',
       'location': {'address': 'Toronto Island',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'distance': 309,
        'formattedAddress': ['Toronto Island', 'Toronto ON M5J 1B7', 'Canada'],
        'lat': 43.631584545678436,
        'lng': -79.39564308312414,
        'postalCode': 'M5J 1B7',
       

Let's borrow the **get_category_type** function from the Foursquare lab.

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must provide the list of category dicts under the key 'categories'
        or, failing that, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category list
    is empty or None.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; any other error
        # is a real problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [26]:
# json_normalize flattens nested JSON records into a flat dataframe.
# pandas.io.json.json_normalize is deprecated since pandas 1.0 in favour of the
# top-level function, so prefer that and fall back for older pandas releases.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# The venue records sit in the first group of the Foursquare response.
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row (replace the category list by the first category's name)
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the part after the last dot, e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Billy Bishop Toronto City Airport (YTZ) (Billy...,Airport,43.631585,-79.395643
1,Porter Lounge,Airport Lounge,43.63068,-79.395756
2,Toronto Harbour,Harbor / Marina,43.633045,-79.396484
3,Billy Bishop Café,Airport Food Court,43.631132,-79.396139
4,Air Canada Check-In Counter,Airport Terminal,43.631226,-79.395987


Venues returned by Foursquare

In [27]:
# Report how many venue rows the request produced.
venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))

14 venues were returned by Foursquare.


The end. Thank you.