In [1]:
import pandas as pd
from pandas.io.json import json_normalize
pd.set_option('display.max_columns', None) # display all columns of a df inline
pd.set_option('display.max_rows', None) # display all rows of a df inline

import json
import requests

from sklearn.cluster import KMeans

# Folium and geopy handle and display geospatial data
import folium # display data on interactive map
from geopy.geocoders import Nominatim # get latitude and longitude for an address

# Import BeautifulSoup for scraping table from wiki-page
from bs4 import BeautifulSoup

In [None]:
# Download the Wikipedia page listing the "M" postal codes and parse it into a tree.
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' # define wikipedia url
response = requests.get(website_url, timeout=30) # timeout so a stalled connection cannot hang the kernel
response.raise_for_status() # stop here on a 4xx/5xx answer instead of parsing an error page
html_content = response.text # downloaded html as text
# Parse the downloaded markup. Handing BeautifulSoup the URL string would make it
# treat the URL text itself as the document, leaving no <table> to find.
soup = BeautifulSoup(html_content, 'lxml') # create parse tree object
print(soup.prettify()) # display html data with indents

In [41]:
# Locate the first <table> whose class attribute is "wikitable sortable"
# and keep it in 'table' for the row-parsing step.
table = soup.find('table', class_='wikitable sortable')

In [None]:
# Column names used as the dictionary keys of every parsed table row
t_headers = ['PostalCode', 'Borough', 'Neighborhood']

# Following code to be uncommented in case table headers from website should be used
# (t_headers must then start out as an empty list, otherwise the scraped names are
# appended after the three names above)
# for th in table.find_all("th"):
#     # remove any newlines and extra spaces from left and right
#     t_headers.append(th.text.replace('\n', ' ').strip())

# Get all the data rows of table and store in t_data
t_data = []
# Fall back to the table itself if the markup carries no explicit <tbody>
for tr in (table.tbody or table).find_all("tr"):
    cells = tr.find_all("td")
    if not cells:
        # Header rows hold only <th> cells. Skipping every row without <td>
        # avoids assuming that the header is exactly the first row.
        continue
    # Each table row is stored in the form of
    # {'PostalCode': '', 'Borough': '', 'Neighborhood': ''}
    # by pairing the row's td's with t_headers in order
    t_data.append({th: td.text.replace('\n', '').strip() for th, td in zip(t_headers, cells)})

print(t_data)

In [None]:
# Convert t_data into pandas DataFrame.
# The column names are passed explicitly so the frame has the t_headers columns,
# in that order, even when t_data is empty.
toronto_data_full = pd.DataFrame(t_data, columns=t_headers)

In [177]:
# Fill Borough, if Neighborhood is not assigned
def fill_neighborhood(df):
    """Return the Neighborhood column with 'Not assigned' entries replaced by Borough.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a 'Borough' and a 'Neighborhood' column.

    Returns
    -------
    pandas.Series
        One value per row of ``df``, on the same index. ``df`` is not modified.
    """
    neighborhood = df['Neighborhood']
    # Vectorized replacement: keep the value where it differs from the
    # 'Not assigned' marker, otherwise take the Borough of the same row.
    # This also yields a Series for a zero-row frame.
    return neighborhood.where(neighborhood != 'Not assigned', df['Borough'])

# Build the cleaned table in a single pipeline
toronto_data = (
    toronto_data_full
    # Keep only the entries whose Borough is assigned
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    # Use the Borough as Neighborhood wherever no Neighborhood is given
    .assign(Neighborhood=fill_neighborhood)
    # Collapse rows sharing PostalCode and Borough into one comma-separated Neighborhood string
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    # Turn the (PostalCode, Borough) MultiIndex back into ordinary columns
    .reset_index()
)

toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [178]:
# Report the shape tuple of the cleaned frame (rows, columns)
print(f'Shape: {toronto_data.shape}')

Shape: (103, 3)
