In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display, HTML

## Parsing the table in the given URL into a dataframe and reading first row as column headers for the table

In [2]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page=requests.get(url)
#soup = BeautifulSoup(page.content, 'html.parser')
soup = BeautifulSoup(page.content, 'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))[0]        # reading html table into a dataframe
df.columns=df.iloc[0]                   # reading first row as column headers
df.drop(df.index[0],inplace=True)       # dropping first row as they are table column headers and not part of real data
print(df.shape)
display(HTML(df.head().to_html()))

(289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


## Ignoring rows with borough containing value 'Not assigned'

In [3]:
df=df[~df['Borough'].str.contains('Not assigned')]
print(df.shape)
display(HTML(df.head().to_html()))
print(df[df['Borough'].str.contains('Not assigned')].count())   # checking if there are rows with borough values 'Not Assigned'

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


0
Postcode         0
Borough          0
Neighbourhood    0
dtype: int64


## Assigning neighborhood containing value 'Not assigned' with borough name

In [4]:
df.Neighbourhood[df['Neighbourhood'].str.contains('Not assigned')]=df.Borough[df['Neighbourhood'].str.contains('Not assigned')]
print(df[df['Postcode'].str.contains('M7A')])     # checking neighborhood for postcode = 'M7A'

0 Postcode       Borough Neighbourhood
9      M7A  Queen's Park  Queen's Park


## Applying groupby function and merging neighborhoods seperated by comma

In [5]:
df_grouped=pd.DataFrame(df.groupby(['Postcode','Borough'],sort=False)['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x))).reset_index()    # Applying groupby and merging neighborhoods seperated by comma

In [6]:
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [7]:
df_grouped.shape

(103, 3)

## Saving the new dataframe df_grouped to a csv file which will be read in the 2nd part of the assignment

In [8]:
df_grouped.to_csv('df_grouped.csv',index=False)