# Scrape Toronto Data

In [1]:
# imports
import ast

import numpy as np
import pandas as pd

## Webscrape with BeautifulSoup

In [2]:
from bs4 import BeautifulSoup
import requests

# set url to scrape
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Getting the webpage, creating a Response object.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Stop on any 4xx/5xx status (raises requests.HTTPError) instead of only
# reporting 200 and 404 and then parsing whatever error page came back.
response.raise_for_status()
print('Success!')

# raw bytes of the page body, parsed by the cells that follow
data = response.content

Success!


In [3]:
# parse the downloaded page with the lxml parser
soup = BeautifulSoup(data, features='lxml')

In [22]:
# # examine object
# print(soup.prettify())

In [23]:
# # locate table and examine
# table = soup.find('table')
# print(table.prettify())

In [24]:
# # cell to isolate information
# for row in table.find_all('tr'):
#     try:
#         print(row.find_all('td')[2].text)
#     except:
#         row.find('td') == None

## Create DataFrame from soup object

In [7]:
# Loop through the table and parse one record per row into a dataframe.
# The table is looked up here so this cell works on a fresh kernel
# (Restart & Run All) instead of relying on a leftover `table` variable.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the scraped page.')

zipcodes = []

for row in table.find_all('tr'):
    zps = {}
    cells = row.find_all('td')
    # Rows with fewer than three <td> cells (such as a header row) are kept as
    # empty records on purpose: they become all-NaN rows in the dataframe, so
    # the row labels of the remaining rows do not shift.
    if len(cells) >= 3:
        zps['postalcode'] = cells[0].text
        zps['borough'] = cells[1].text
        zps['neighborhood'] = cells[2].text.strip()

    zipcodes.append(zps)

toronto = pd.DataFrame(zipcodes)

In [8]:
# discard rows that contain missing values
toronto = toronto.dropna()

In [9]:
# keep only the rows whose borough is something other than 'Not assigned'
has_borough = toronto['borough'].ne('Not assigned')
toronto = toronto[has_borough]
toronto.head()

Unnamed: 0,borough,neighborhood,postalcode
3,North York,Parkwoods,M3A
4,North York,Victoria Village,M4A
5,Downtown Toronto,Harbourfront,M5A
6,Downtown Toronto,Regent Park,M5A
7,North York,Lawrence Heights,M6A


In [10]:
# show the rows whose neighborhood is still 'Not assigned'
toronto.loc[toronto['neighborhood'].eq('Not assigned')]

Unnamed: 0,borough,neighborhood,postalcode
9,Queen's Park,Not assigned,M7A


In [11]:
# Reassign the neighborhood name to the borough name wherever the neighborhood
# is 'Not assigned'. A boolean mask replaces the hard-coded row label 9, and
# building the column with Series.mask avoids chained assignment
# (`df[col].loc[i] = ...`), which may write to a temporary copy and leave
# `toronto` unchanged.
unassigned = toronto['neighborhood'] == 'Not assigned'
toronto = toronto.assign(
    neighborhood=toronto['neighborhood'].mask(unassigned, toronto['borough'])
)

In [12]:
# collect the neighborhoods of each (postalcode, borough) pair into one list
by_code = toronto.groupby(['postalcode', 'borough'])
new = by_code['neighborhood'].apply(list)

In [13]:
# convert the grouped Series to a dataframe, then move the group keys
# (postalcode, borough) out of the index into ordinary columns
new = new.to_frame()
new = new.reset_index()
new.head()

Unnamed: 0,postalcode,borough,neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [14]:
# render each list as text, then trim the surrounding square brackets
as_text = new['neighborhood'].astype(str)
new['neighborhood'] = as_text.str.strip('[]')
new.head()

Unnamed: 0,postalcode,borough,neighborhood
0,M1B,Scarborough,"'Rouge', 'Malvern'"
1,M1C,Scarborough,"'Highland Creek', 'Rouge Hill', 'Port Union'"
2,M1E,Scarborough,"'Guildwood', 'Morningside', 'West Hill'"
3,M1G,Scarborough,'Woburn'
4,M1H,Scarborough,'Cedarbrae'


In [15]:
# Remove the quotes that delimit each item of the list repr
# (e.g. "'Rouge', 'Malvern'" -> "Rouge, Malvern"). The repr is parsed rather
# than blanket-replacing "'", which would also delete apostrophes that belong
# to a name (turning "Queen's Park" into "Queens Park").
def _unquote_items(text):
    """Return the items of a bracket-stripped list repr joined by ', '.

    Text that cannot be parsed as a list literal (for example because this
    cell already ran and the delimiting quotes are gone) is returned unchanged,
    so the cell can safely be re-run.
    """
    try:
        items = ast.literal_eval('[' + text + ']')
    except (ValueError, SyntaxError):
        return text
    return ', '.join(str(item) for item in items)


new['neighborhood'] = new['neighborhood'].map(_unquote_items)
new

Unnamed: 0,postalcode,borough,neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Final formatted DataFrame

In [16]:
# strip any double-quote characters left in the neighborhood text
without_dquotes = new['neighborhood'].str.replace('"', '')
new['neighborhood'] = without_dquotes
new

Unnamed: 0,postalcode,borough,neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [17]:
# dimensions of the final dataframe as (rows, columns)
new.shape

(103, 3)

In [20]:
# save dataframe to csv (without the index column) so the file exists and is
# current after a fresh Restart & Run All
new.to_csv('toronto_neighborhoods.csv', index=False)

In [21]:
# check saved file: read the CSV back from the working directory and display it
pd.read_csv('toronto_neighborhoods.csv')

Unnamed: 0,postalcode,borough,neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
