# Segmenting and Clustering Neighborhoods in Toronto
## Part 2

#### Scrape the Wikipedia page.

In [1]:
# Import libraries.

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# Fail early on an HTTP error instead of trying to parse an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = bs(source, 'lxml')

# Use tag 'table' and class 'wikitable sortable'.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the page.")

# Group by tag <tr>, within this tag group by <th> for the table headers and by <td> for the table entries.
postcodes = table.find_all('tr')

# Collect the header names and all data rows first, then build the dataframe once
# (growing a dataframe row by row is slow and unnecessary).
columns = [column_name.text.strip('\n') for column_name in postcodes[0].find_all('th')]
columns[0] = 'PostalCode'
rows = []
for postcode in postcodes[1:]:
    cells = [value.text.strip('\n') for value in postcode.find_all('td')]
    if cells:  # skip rows that carry no data cells
        rows.append(cells)
df = pd.DataFrame(rows, columns=columns)

# Ignore all cells with the borough that is 'Not assigned'.
# .copy() makes the filtered frame independent so the assignment below is unambiguous.
df = df[df['Borough'] != 'Not assigned'].copy()

# Rename 'Not assigned' neighborhoods with the corresponding borough names.
# Assign through .loc on the frame itself; an in-place fillna on a column selection
# is not guaranteed to write back to the dataframe.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

# Combine neighborhoods with the same postal codes.
df = df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].apply(sorted).apply(', '.join).reset_index()

# Print the number of rows of the dataframe.
print('The shape of the dataframe: %s.' % str(df.shape))
df.head(10)

The shape of the dataframe: (103, 3).


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Get the latitude and the longitude coordinates of each neighborhood.

In [3]:
# Import libraries and create a geopy agent.

import time
from geopy.geocoders import Nominatim
# Nominatim geocoder client used by the geocoding cells; user_agent names this application to the service.
geolocator = Nominatim(user_agent="toronto_agent")

Let's try to retrieve the coordinates of a postal code in Toronto.

In [4]:
# Geocode one sample address to check that the geocoder works.
address = 'M1B, Malvern, Rouge, Toronto, Canada'

location = geolocator.geocode(address)
# geocode() returns None when no match is found; report that clearly
# instead of failing with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('No geocoding result for address: %s' % address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.8091955 -79.2217008


Get the coordinates for all postal codes.

In [5]:
# Add empty (NaN) coordinate columns; they are filled in by the geocoding cells.
df['Latitude'] = df['Longitude'] = np.nan

In [6]:
# Geocode every postal code with Nominatim, allowing up to two attempts per code.
# Rows for which no result is obtained keep their NaN coordinates.
lat_col = df.columns.get_loc('Latitude')
lon_col = df.columns.get_loc('Longitude')

for k, postal_code in enumerate(df['PostalCode']):
    location = None
    tries = 2
    while location is None and tries:
        try:
            # geocode() returns None when the service finds no match.
            location = geolocator.geocode('%s, Toronto, Canada' % postal_code)
        except Exception as error:
            # Report the failure instead of hiding it; Exception (unlike a bare
            # except) still lets KeyboardInterrupt stop the loop.
            print('%s: geocoding attempt failed (%s)' % (postal_code, error))
        tries -= 1
        # Pause after every attempt, successful or not, to space out the requests.
        time.sleep(1)
    if location is not None:
        df.iloc[k, lat_col] = location.latitude
        df.iloc[k, lon_col] = location.longitude

In [7]:
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.653963,-79.387207
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.653963,-79.387207
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,43.760778,-79.223732
4,M1H,Scarborough,Cedarbrae,,
5,M1J,Scarborough,Scarborough Village,,
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",,
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",,
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",,
9,M1N,Scarborough,"Birch Cliff, Cliffside West",,


In [8]:
df['Latitude'].count()

22

This did not work well: only 22 of the 103 postal codes received coordinates.

Let's try to scrape Google.

In [9]:
# Fetch the Google results page for one postal code.
# The timeout keeps the cell from hanging forever if the request stalls.
source = requests.get('http://www.google.com/search?q=M1E+Toronto+coordinates', timeout=10).text

In [10]:
# Walk through the Google Maps links on the results page and stop at the first
# one whose leading 200 characters contain an "ll=<lat>,<lon>" parameter.
# `source` is left untouched so the cell can be re-run, and the loop ends when
# no further link exists (previously it would spin forever in that case).
link_marker = r'href="http://maps.google.com/maps?'
temp = ''
pos = -1
start = source.find(link_marker)
while start != -1:
    temp = source[start + 1:start + 201]
    pos = temp.find(r'll=')
    if pos != -1:
        break
    start = source.find(link_marker, start + 1)

if pos == -1:
    print('No coordinates found in the search results.')
else:
    print(temp[pos+3:].split('&amp;')[0].split(','))

['43.7635726', '-79.1887115']


Let's write a helper function.

In [11]:
def coordinates(request):
    """Look up coordinates for a search query by scraping a Google results page.

    The page is scanned for Google Maps links; the first link whose leading
    200 characters contain a well-formed ``ll=<lat>,<lon>`` parameter wins.

    Parameters
    ----------
    request : str
        Query inserted verbatim after ``q=`` in the search URL, so it must
        already be URL-ready (e.g. words joined with ``+``).

    Returns
    -------
    list of float
        ``[latitude, longitude]``, or ``[nan, nan]`` when no usable
        coordinates are found on the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request itself fails or times out.
    """
    link_marker = r'href="http://maps.google.com/maps?'
    # The timeout prevents a stalled connection from blocking the caller forever.
    source = requests.get('http://www.google.com/search?q=%s' % request, timeout=10).text

    # Search with a moving start offset instead of re-slicing the whole page.
    start = source.find(link_marker)
    while start != -1:
        temp = source[start + 1:start + 201]
        pos = temp.find(r'll=')
        if pos != -1:
            parts = temp[pos+3:].split('&amp;')[0].split(',')
            try:
                values = [float(value) for value in parts]
            except ValueError:
                values = []  # malformed number: fall through to the next link
            # Accept only a proper (latitude, longitude) pair.
            if len(values) == 2:
                return values
        start = source.find(link_marker, start + 1)

    return [np.nan, np.nan]

In [12]:
print(coordinates('M1H+Canada+coordinates'))

[43.773136, -79.2394761]


Get the coordinates for all postal codes.

In [15]:
# Fill in the rows that still lack coordinates, using the Google-based helper.
lat_col = df.columns.get_loc('Latitude')
lon_col = df.columns.get_loc('Longitude')

for row, code in enumerate(df['PostalCode']):
    if np.isfinite(df.iloc[row, lat_col]):
        continue  # this row already has a latitude
    location = coordinates('%s+Canada+postal+code+coordinates' % code)
    print('%s: %s' %(code, location))
    df.iloc[row, lat_col] = location[0]
    df.iloc[row, lon_col] = location[1]
    # Wait a second between lookups.
    time.sleep(1)

M1N: [43.692657, -79.2648481]
M2P: [43.7527583, -79.4000493]
M4A: [43.7258823, -79.3155716]
M4B: [43.7063972, -79.309937]
M4C: [43.6953439, -79.3183887]
M4E: [43.6763574, -79.2930312]
M4G: [43.7090604, -79.3634517]
M4H: [43.7053689, -79.3493719]
M4J: [43.685347, -79.3381065]
M4K: [43.6795571, -79.352188]
M4L: [43.6689985, -79.3155716]
M4M: [43.6595255, -79.340923]
M4N: [43.7280205, -79.3887901]
M4P: [43.7127511, -79.3901975]
M4R: [43.7153834, -79.4056784]
M4S: [43.7043244, -79.3887901]
M4T: [43.6895743, -79.3831599]
M4V: [43.6864123, -79.4000493]
M4W: [43.6795626, -79.3775294]
M4Y: [43.6658599, -79.3831599]
M5A: [43.6542599, -79.3606359]
M5B: [43.6571618, -79.3789371]
M5C: [43.6514939, -79.3754179]
M5G: [43.6579524, -79.3873826]
M5H: [43.6505712, -79.3845675]
M5K: [43.6471768, -79.3815764]
M5L: [43.6481985, -79.3798169]
M5M: [43.7332825, -79.4197497]
M5N: [43.7116948, -79.4169356]
M5P: [43.6969476, -79.4113072]
M5R: [43.6727097, -79.4056784]
M5S: [43.6626956, -79.4000493]
M5T: [43.6532

In [19]:
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.653963,-79.387207
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.653963,-79.387207
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.760778,-79.223732
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [20]:
df['Latitude'].count()

103

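Looks good: all 103 postal codes now have coordinates. Note, however, that M1B and M1C still share identical coordinates (43.653963, -79.387207) from the first geocoding pass, so those two rows should be double-checked.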