In [0]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

In [0]:
# Wikipedia article "List of postal codes of Canada: M"; this is the page that
# gets downloaded and parsed for its table.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [0]:
# Download the raw HTML. The context manager closes the HTTP response (and its
# underlying socket) as soon as the body has been read. `page` then holds the
# response body as bytes, which BeautifulSoup accepts as markup.
with urllib.request.urlopen(url) as response:
    page = response.read()

In [0]:
# Parse the downloaded markup into a searchable tree using the lxml parser.
soup = BeautifulSoup(markup=page, features="lxml")

In [0]:
# Column buffers filled in parallel: position i in each list comes from the
# same table row.
PostalCode = []
Borough = []
Neighborhood = []

# The first <table> element on the page is the one that gets scraped.
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("No <table> element found in the downloaded page")

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows with fewer than three <td> cells (including rows with none, such as
    # a <th>-only header) cannot supply all three fields; skip them rather
    # than raise IndexError on cells[1] / cells[2].
    if len(cells) < 3:
        continue
    # Cell text keeps everything except trailing newlines.
    PostalCode.append(cells[0].text.rstrip('\n'))
    Borough.append(cells[1].text.rstrip('\n'))
    Neighborhood.append(cells[2].text.rstrip('\n'))

***The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood***

In [0]:
# Map each output column label to the list scraped for it, then build the
# frame in one step; column order follows the mapping's insertion order.
column_data = {
    "Postal Code": PostalCode,
    "Borough": Borough,
    "Neighborhood": Neighborhood,
}
df = pd.DataFrame(data=column_data)

In [14]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


***Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.***

In [0]:
# Boolean mask: True for rows whose borough is anything other than the
# "Not assigned" placeholder.
has_borough = df["Borough"].ne("Not assigned")
# Keep only those rows and renumber the index from zero.
df_dropna = df.loc[has_borough].reset_index(drop=True)

In [18]:
# Preview the first five rows that have an assigned borough.
df_dropna.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


***More than one neighborhood can exist in one postal code area***

In [0]:
# Collapse rows sharing a (postal code, borough) pair into one row whose
# Neighborhood is the comma-joined list of the group's neighborhoods.
# as_index=False keeps the grouping keys as ordinary columns.
df_grp = (
    df_dropna
    .groupby(["Postal Code", "Borough"], as_index=False)
    .agg({"Neighborhood": ",".join})
)

In [21]:
# Preview the first five grouped rows.
df_grp.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


***If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.***

In [0]:
# Fall back to the borough name wherever the neighborhood is unassigned.
#
# This is a single vectorized .loc assignment rather than a loop over
# DataFrame.iterrows(): the rows yielded by iterrows() may be copies, so
# assigning to them is not guaranteed to update df_grp at all.
#
# The comparison ignores case and surrounding whitespace, so both the
# "Not assigned" spelling used by the borough filter and "Not Assigned" match.
neighborhood_unassigned = (
    df_grp["Neighborhood"].str.strip().str.lower() == "not assigned"
)
df_grp.loc[neighborhood_unassigned, "Neighborhood"] = df_grp.loc[
    neighborhood_unassigned, "Borough"
]

In [32]:
# Preview the first five rows after the neighborhood fallback step.
df_grp.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


***In the last cell of your notebook, use the `.shape` attribute to print the number of rows of your dataframe.***

In [33]:
# Displays the (rows, columns) tuple of the grouped frame; the first element
# is the row count.
df_grp.shape

(103, 3)