# Segmenting and Clustering Neighborhoods in Toronto

1. Use the notebook to build the code that scrapes the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data in the table of postal codes and to transform that data into a pandas dataframe like the one shown below:

In [1]:

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# URL of the Wikipedia page that lists the "M" (Toronto) postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fetch the page; the timeout keeps the cell from hanging on a stalled connection
wiki_page = requests.get(url, timeout=30)
wiki_page.raise_for_status() # stop here on an HTTP error instead of parsing an error page
data = BeautifulSoup(wiki_page.text, 'lxml')
# The postal-code table carries the "wikitable" class (visible in the printed
# output); selecting by class avoids grabbing an unrelated table that may
# come first in the page.
table = data.find('table', class_='wikitable')
if table is None:
    raise ValueError('No wikitable found at {}'.format(url))
print(table) # show the raw HTML so the row/cell structure to parse is visible

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [3]:
PostCode = [] # postal codes that have an assigned borough, in first-seen order
Borough = [] # borough belonging to each entry of PostCode
Neighborhood = [] # comma-separated neighborhood names for each entry of PostCode
row_of_postcode = {} # postal code -> its position in the three lists above
# each data row of the table holds three <td> cells: postal code, borough, neighborhood
for tr in table.find_all('tr'):
    # strip every cell so trailing newlines never break the comparisons below
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) < 3: # header row (only <th> cells) or an incomplete row
        continue
    post, borough, neighborhood = cells[:3] # any extra cells are ignored
    if borough == 'Not assigned': # postal codes without a borough are dropped
        continue
    if neighborhood == 'Not assigned': # an unnamed neighborhood takes the borough name
        neighborhood = borough
    if post in row_of_postcode:
        # Merge into the entry that owns this postal code. Looking the position
        # up (rather than assuming it is the last entry) keeps the merge
        # correct even when rows for one postal code are not adjacent.
        Neighborhood[row_of_postcode[post]] += ', ' + neighborhood
    else:
        row_of_postcode[post] = len(PostCode)
        PostCode.append(post)
        Borough.append(borough)
        Neighborhood.append(neighborhood)

In [4]:
# Assemble the three parallel lists into one frame (one row per postal code)
# and preview the first ten rows.
df = pd.DataFrame({
    'PostalCode': PostCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})
df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [5]:
# The frame has one row per postal code, so its row count is the number of areas
print("There are {} unique postal code areas".format(len(df.index)))

There are 103 unique postal code areas


In [6]:
# Install the geocoder package that get_coordinates() relies on.
# NOTE(review): the install is unpinned, so a re-run may pull a different
# version; `!pip` also runs in the shell's environment, which is presumably
# (but not necessarily) the same one the kernel uses — confirm.
! pip install geocoder


[33mYou are using pip version 18.1, however version 19.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [7]:
import geocoder

In [8]:
def get_coordinates(postcode, max_attempts=10, delay=1.0):
    """Geocode a Toronto postal code and return its coordinates.

    The query sent to the geocoder is '<postcode>, Toronto, Ontario'. The
    lookup is retried while the geocoder gives no coordinates, but only up
    to ``max_attempts`` times so an unavailable service cannot hang the
    notebook forever.

    Parameters
    ----------
    postcode : str
        Postal code to look up, e.g. 'M5A'.
    max_attempts : int, optional
        Maximum number of geocoder calls before giving up.
    delay : float, optional
        Seconds to wait between two attempts; 0 disables the pause.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder result.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` calls.
    """
    import time # local import: only needed for the pause between retries

    query = '{}, Toronto, Ontario'.format(postcode)
    for attempt in range(max_attempts):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng
        # Treat both None and an empty result as "no answer yet"; indexing an
        # empty list below would otherwise raise IndexError.
        if lat_lng_coords:
            return lat_lng_coords[0], lat_lng_coords[1]
        if delay and attempt + 1 < max_attempts:
            time.sleep(delay)
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(postcode, max_attempts)
    )

In [41]:
# Read the postal-code -> coordinates table from the local CSV file
# and preview its first five rows.
df_location = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df_location.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
# Sanity check: show the row label(s) of df_location whose postal code is 'M3A'
print(df_location[df_location['Postal Code'] == 'M3A'].index)

Int64Index([25], dtype='int64')


In [43]:
# Look up the coordinates of every scraped postal code, in PostCode order.
# Indexing the coordinate table by postal code replaces a full scan of
# df_location per code with a single label-based selection.
coords_by_code = (
    df_location
    .drop_duplicates(subset='Postal Code') # keep the first row per code
    .set_index('Postal Code')
)
# Fail with the offending codes named rather than with a bare IndexError
missing_codes = [code for code in PostCode if code not in coords_by_code.index]
if missing_codes:
    raise KeyError('No coordinates found for postal codes: {}'.format(missing_codes))
matched_coords = coords_by_code.loc[PostCode]
latitude = matched_coords['Latitude'].tolist()
longitude = matched_coords['Longitude'].tolist()

M3A
25
M4A
34
M5A
53
M6A
71
M7A
85
M9A
93
M1B
0
M3B
26
M4B
35
M5B
54
M6B
72
M9B
94
M1C
1
M3C
27
M4C
36
M5C
55
M6C
73
M9C
95
M1E
2
M4E
37
M5E
56
M6E
74
M1G
3
M4G
38
M5G
57
M6G
75
M1H
4
M2H
17
M3H
28
M4H
39
M5H
58
M6H
76
M1J
5
M2J
18
M3J
29
M4J
40
M5J
59
M6J
77
M1K
6
M2K
19
M3K
30
M4K
41
M5K
60
M6K
78
M1L
7
M2L
20
M3L
31
M4L
42
M5L
61
M6L
79
M9L
96
M1M
8
M2M
21
M3M
32
M4M
43
M5M
62
M6M
80
M9M
97
M1N
9
M2N
22
M3N
33
M4N
44
M5N
63
M6N
81
M9N
98
M1P
10
M2P
23
M4P
45
M5P
64
M6P
82
M9P
99
M1R
11
M2R
24
M4R
46
M5R
65
M6R
83
M7R
86
M9R
100
M1S
12
M4S
47
M5S
66
M6S
84
M1T
13
M4T
48
M5T
67
M1V
14
M4V
49
M5V
68
M8V
88
M9V
101
M1W
15
M4W
50
M5W
69
M8W
89
M9W
102
M1X
16
M4X
51
M5X
70
M8X
90
M4Y
52
M7Y
87
M8Y
91
M8Z
92


In [44]:
# Combine the scraped columns with the looked-up coordinates into one frame
# and preview the first ten rows.
df_2 = pd.DataFrame({
    'PostalCode': PostCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
    'Latitude': latitude,
    'Longitude': longitude,
})
df_2.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [47]:
# Persist the combined frame to disk; the row index is not written out
df_2.to_csv(path_or_buf='toronto.csv', index=False)