# Segmenting and Clustering Neighborhoods in Toronto 
Applied Data Science Capstone, Week 3 peer-graded assignment

Neville Yoon

In [1]:
import io

import pandas as pd
import numpy as np
import requests
#!conda install -c conda-forge beautifulsoup4 --yes 
from bs4 import BeautifulSoup
#!conda install -c anaconda lxml --yes

# Section 1: Scrape the table from the Wikipedia page into a DataFrame

Use BeautifulSoup to scrape the web page and convert the table into a pandas DataFrame.

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data
response.raise_for_status()
page = response.text
doc = BeautifulSoup(page,'lxml')

# Extract the first table on the page and convert it to a DataFrame
tables = doc.find('table')
if tables is None:
    raise ValueError('No <table> element found at {}'.format(url))
# Wrap the markup in StringIO: passing a literal HTML string to read_html
# is deprecated in recent pandas releases
df = pd.read_html(io.StringIO(str(tables)))[0]
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Inspect and clean the data

In [3]:
# Count the rows whose Borough reads 'Not assigned', ignoring letter case
int(df['Borough'].str.upper().eq('NOT ASSIGNED').sum())

77

In [4]:
# Keep only the rows that have a real Borough, then renumber the index from 0
df_filtered = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df_filtered

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Count rows of the filtered table whose Neighborhood reads 'Not assigned', ignoring letter case
int(df_filtered['Neighborhood'].str.upper().eq('NOT ASSIGNED').sum())

0

In [6]:
# Count rows of the filtered table whose Neighborhood is missing (null)
int(df_filtered['Neighborhood'].isna().sum())

0

## Answer #1:

In [7]:
# Report the (rows, columns) dimensions of the cleaned table
(len(df_filtered.index), len(df_filtered.columns))

(103, 3)

# Section 2: Geocode Postal Codes

First, let's attempt to get latitude and longitude coordinates for the postal codes using the geocoder package.

In [8]:
import io
import geocoder # import geocoder

def get_coords(postal_code, max_tries=10, delay=1.0):
    '''Get latitude and longitude coordinates for a Toronto Postal Code.

    Queries the OSM provider of the geocoder package with
    "<postal_code>, Toronto, Ontario" and retries while no coordinates
    come back, printing one "." per attempt as a progress indicator.

    Parameters
    ----------
    postal_code : str
        Postal code to look up (e.g. 'M3A').
    max_tries : int, optional
        Maximum number of lookups before giving up (default 10).
    delay : float, optional
        Seconds to wait between failed attempts (default 1.0), so that
        retries are not fired at the geocoding service back-to-back.
        Use 0 to disable the pause.

    Returns
    -------
    list or None
        The `latlng` value reported by geocoder, or None when every
        attempt failed.
    '''
    import time  # local import keeps this cell self-contained

    print("attempting to get coordinates for {}".format(postal_code),end="")
    lat_lng_coords = None
    for attempt in range(1, max_tries + 1):
        print(".",end="")
        g = geocoder.osm('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
        # Pause before the next retry, but not after the final attempt
        if attempt < max_tries and delay > 0:
            time.sleep(delay)
    else:
        # The loop finished without a break: no attempt produced coordinates
        print("Failed to get coordinates",end="")
    print("")
    return lat_lng_coords

In [9]:
# Geocode every postal code in the cleaned table; a code that could not be
# resolved contributes None to both coordinate lists
postal_codes = df_filtered['Postal Code'].to_list()

latitudes, longitudes = [], []
for pc in postal_codes:
    coords = get_coords(pc)
    latitudes.append(coords[0] if coords else None)
    longitudes.append(coords[1] if coords else None)

attempting to get coordinates for M3A.
attempting to get coordinates for M4A..........Failed to get coordinates
attempting to get coordinates for M5A..........Failed to get coordinates
attempting to get coordinates for M6A..........Failed to get coordinates
attempting to get coordinates for M7A.
attempting to get coordinates for M9A..........Failed to get coordinates
attempting to get coordinates for M1B.
attempting to get coordinates for M3B..........Failed to get coordinates
attempting to get coordinates for M4B..........Failed to get coordinates
attempting to get coordinates for M5B..........Failed to get coordinates
attempting to get coordinates for M6B..........Failed to get coordinates
attempting to get coordinates for M9B.
attempting to get coordinates for M1C.
attempting to get coordinates for M3C.
attempting to get coordinates for M4C..........Failed to get coordinates
attempting to get coordinates for M5C..........Failed to get coordinates
attempting to get coordinates for M6

In [10]:
# Pair each postal code with the coordinates collected above, then attach
# them to the cleaned table by joining on 'Postal Code'
df_lat_lon = pd.DataFrame(
    {
        'Postal Code': postal_codes,
        'Latitude': latitudes,
        'Longitude': longitudes,
    }
)
df_combined = df_filtered.merge(df_lat_lon, on='Postal Code')
df_combined

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.653482,-79.383935
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.653482,-79.383935
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",,
99,M4Y,Downtown Toronto,Church and Wellesley,,
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",,
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",,


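That didn't work well: most lookups failed after ten attempts, and the ones that did succeed (e.g. M3A and M7A) returned identical coordinates. So let's load the postal-code coordinates from a prepared CSV file on the web instead.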

In [11]:
# Download the CSV of postal-code coordinates
url = 'https://cocl.us/Geospatial_data'
response = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than parsing an error page as CSV
response.raise_for_status()
csv = response.content
df_lat_lon = pd.read_csv(io.StringIO(csv.decode('utf-8')))

# Attach the coordinates to the cleaned table by joining on 'Postal Code',
# then rename the key column to 'PostalCode'
df_combined = pd.merge(df_filtered,df_lat_lon, on='Postal Code')
df_combined = df_combined.rename(columns={'Postal Code':'PostalCode'})

## Answer #2

In [12]:
# Display the cleaned table together with its Latitude/Longitude columns
df_combined

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
