In [1]:
!pip install BeautifulSoup4



In [2]:
# import libraries
import urllib.request
from bs4 import BeautifulSoup

In [3]:
import pandas as pd

In [4]:
# URL of the Wikipedia page to scrape.
quote_page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Read the body inside a context manager so the HTTP connection is closed as
# soon as the download finishes; `page` then holds the raw HTML bytes.
with urllib.request.urlopen(quote_page) as response:
    page = response.read()

In [5]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page, features='html.parser')

In [6]:
# Locate the table whose class attribute is "wikitable sortable".
Toronto = soup.find('table', attrs={'class': 'wikitable sortable'})
# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError when the table is used later.
if Toronto is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

In [7]:
# Turn every table row that contains data cells into a list of cell texts.
table_rows = Toronto.find_all('tr')
rows = []
for table_row in table_rows:
    # Strip surrounding whitespace from every cell. This removes the trailing
    # newline of the last cell without blindly cutting off its final character.
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    # Rows without any <td> cells (e.g. a header row) yield an empty list and are skipped.
    if cells:
        rows.append(cells)
Toronto_list = pd.DataFrame(rows, columns=["Postcode", "Borough", "Neighbourhood"])

In [8]:
# Preview the first rows of the scraped table.
Toronto_list.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
# Summarise the row count, column dtypes and non-null counts of the scraped frame.
Toronto_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
Postcode         289 non-null object
Borough          289 non-null object
Neighbourhood    289 non-null object
dtypes: object(3)
memory usage: 6.9+ KB


In [10]:
# Drop postcodes whose borough is 'Not assigned'. The explicit copy makes the
# result an independent frame, so later assignments modify it directly instead
# of a slice of the unfiltered data (which triggers SettingWithCopyWarning).
Toronto_list = Toronto_list[Toronto_list.Borough != 'Not assigned'].copy()

In [11]:
# Show the first ten rows of the current frame.
Toronto_list.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [12]:
# Re-check the row count and non-null counts of the current frame.
Toronto_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 2 to 287
Data columns (total 3 columns):
Postcode         212 non-null object
Borough          212 non-null object
Neighbourhood    212 non-null object
dtypes: object(3)
memory usage: 6.6+ KB


In [13]:
# Rows whose Neighbourhood is still the placeholder 'Not assigned'.
unassigned_neighbourhood = Toronto_list['Neighbourhood'] == 'Not assigned'
example = Toronto_list[unassigned_neighbourhood]
example

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [14]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# Only a single .loc assignment is used: the chained form
# `frame['col'][mask] = ...` may write to a temporary copy
# (SettingWithCopyWarning) and duplicated this assignment anyway.
mask = Toronto_list.Neighbourhood == 'Not assigned'
column_name = 'Neighbourhood'
Toronto_list.loc[mask, column_name] = Toronto_list.loc[mask, 'Borough']

In [15]:
# Rows whose Neighbourhood now reads "Queen's Park".
is_queens_park = Toronto_list['Neighbourhood'] == "Queen's Park"
example = Toronto_list[is_queens_park]
example

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [16]:
# Check the row count and non-null counts after the replacement step.
Toronto_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 2 to 287
Data columns (total 3 columns):
Postcode         212 non-null object
Borough          212 non-null object
Neighbourhood    212 non-null object
dtypes: object(3)
memory usage: 6.6+ KB


In [17]:
# One entry per (Postcode, Borough) pair: join that pair's neighbourhood names with ", ".
grouped_neighbourhoods = Toronto_list.groupby(by=['Postcode', 'Borough'])['Neighbourhood']
Toronto_merge = grouped_neighbourhoods.apply(lambda names: ', '.join(names))

In [18]:
# Display the grouped result, indexed by Postcode and Borough.
Toronto_merge

Postcode  Borough         
M1B       Scarborough                                            Rouge, Malvern
M1C       Scarborough                    Highland Creek, Rouge Hill, Port Union
M1E       Scarborough                         Guildwood, Morningside, West Hill
M1G       Scarborough                                                    Woburn
M1H       Scarborough                                                 Cedarbrae
M1J       Scarborough                                       Scarborough Village
M1K       Scarborough               East Birchmount Park, Ionview, Kennedy Park
M1L       Scarborough                           Clairlea, Golden Mile, Oakridge
M1M       Scarborough           Cliffcrest, Cliffside, Scarborough Village West
M1N       Scarborough                               Birch Cliff, Cliffside West
M1P       Scarborough         Dorset Park, Scarborough Town Centre, Wexford ...
M1R       Scarborough                                         Maryvale, Wexford
M1S       Sca

In [19]:
# Check which pandas type the groupby/apply step returned.
type(Toronto_merge)

pandas.core.series.Series

In [26]:
# Wrap the grouped Series in a one-column DataFrame that keeps the Series' own name.
new_df = Toronto_merge.to_frame(name=Toronto_merge.name)
new_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [27]:
# Build the flat frame straight from the grouped Series so this cell can be
# re-run safely; calling reset_index() on an already-reset `new_df` would add
# an extra 'index' column each time.
new_df = Toronto_merge.reset_index()
# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
new_df.info()
new_df.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB
None


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [28]:
# Inspect the end of the final frame (up to 20 rows).
new_df.tail(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
83,M6R,West Toronto,"Parkdale, Roncesvalles"
84,M6S,West Toronto,"Runnymede, Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto"
89,M8W,Etobicoke,"Alderwood, Long Branch"
90,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
91,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
92,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."


##  Explain my work and any assumptions I make:
1. First, I use BeautifulSoup4 to read the contents of the Wikipedia page, select the table with class 'wikitable sortable', and convert it into a DataFrame.
2. Second, I build a DataFrame that excludes the rows where Toronto_list.Borough == 'Not assigned'.
3. Third, I find the rows where Toronto_list['Neighbourhood'] == 'Not assigned' and set their Neighbourhood to the value of Toronto_list['Borough'].
4. Fourth, I make a Series by using groupby(['Postcode', 'Borough']) and applying ', '.join to ['Neighbourhood'], so that each postcode has a single concatenated string of its neighbourhoods.
5. Fifth, I convert Series to DataFrame and reset index.

In [29]:
# Final dimensions of the result as (rows, columns).
new_df.shape

(103, 3)