In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### 1. Prepare Data
Issue a GET request and locate the first table in the page content.

In [2]:
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() fails fast on an HTTP error instead of
# silently parsing an error page.
req = requests.get(link, timeout=30)
req.raise_for_status()

# Name the parser explicitly: when it is omitted, BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so the parsed tree can
# differ between environments. 'html.parser' ships with Python itself.
page = BeautifulSoup(req.content, 'html.parser').find('table')

# find() returns None when the document has no <table>; report that here
# rather than letting a later cell fail with an opaque AttributeError.
if page is None:
    raise ValueError('No <table> element found at ' + link)

### 2. Filtering Data
Extract just the cell contents from the HTML `<table>` tag, dropping the rows whose borough is __Not assigned__.
Every remaining row has leading and trailing whitespace (including newline characters) stripped from each cell.

In [10]:
# Lazily turn every <tr> of the table into a list holding the stripped text of
# its <td> cells. Rows that contain no <td> cells produce an empty list.
rows = (
    [cell.get_text().strip() for cell in tr.find_all('td')]
    for tr in page.find_all('tr')
)

# Keep only non-empty rows whose second cell (the borough) is not the
# 'Not assigned' placeholder.
data = [row for row in rows if row and row[1] != 'Not assigned']

### 3. Create DataFrame
Use the data list as the source for a DataFrame with the requested column names. Most importantly, groupby() groups all records that share the same PostalCode and Borough values and joins their Neighborhood values with a comma.

Because the grouped aggregation produces a Series object, to_frame() is used to convert the result back to a DataFrame, and reset_index() then creates a new index.

In [None]:
# Load the scraped rows into a frame with the three requested column names.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data, columns=column_names)

# Group rows sharing the same postal code and borough (sort=False keeps the
# order of first appearance) and join their neighbourhoods into one
# comma-separated string per group.
joined = (
    df.groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
    .apply(', '.join)
)

# The aggregation yields a Series keyed by the group columns; convert it back
# to a DataFrame and move the keys into ordinary columns under a fresh index.
df = joined.to_frame().reset_index()

### 4. Removing invalid data
Replace every __Not assigned__ value in the Neighborhood column with the value from the Borough column. Finally, display the first 10 rows.

In [57]:
# Where the neighbourhood is the 'Not assigned' placeholder, fall back to the
# borough of the same row; every other neighbourhood value is kept as it is.
unassigned = df['Neighborhood'] == 'Not assigned'
df['Neighborhood'] = df['Neighborhood'].mask(unassigned, df['Borough'])

# Preview the first ten rows of the result.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### 5. Size
Display the number of rows and columns.

In [56]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(103, 3)