### Import required libraries

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Download the page to scrape

In [4]:
# Fetch the Wikipedia page. The timeout keeps the cell from hanging forever
# if the server never answers.
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Stop here on an HTTP error instead of parsing an error page below.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

#find the table containing the data we need
table = soup.find(class_='wikitable sortable')
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("no element with class 'wikitable sortable' found on the page")
table_row = table.find_all('tr')

### Data Scraping

#### Initialize DataFrame

In [5]:
# Column labels of the frame that will hold the scraped rows.
column_names = ['PostCode', 'Borough', 'Neighborhood']

# Empty frame (no rows yet) that already carries those column labels.
postal = pd.DataFrame(data=None, columns=column_names)
postal

#### Fill DataFrame

In [6]:
# Collect one record per table row first and build the DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration. Rebuilding `postal` from scratch also makes
# this cell safe to re-run without duplicating rows.
records = []
for tr in table_row[1:]:  # the first <tr> is skipped, as before
    cells = [cell.get_text() for cell in tr.find_all('td')]
    if len(cells) < 3:
        # A row without the three expected <td> cells has nothing to unpack;
        # skip it instead of raising IndexError.
        continue
    records.append({'PostCode': cells[0],
                    'Borough': cells[1],
                    'Neighborhood': cells[2]})

# Passing `columns` keeps the expected layout even when no rows were scraped.
postal = pd.DataFrame(records, columns=column_names)

postal.head(10)

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
9,M8A,Not assigned,Not assigned\n


#### Remove the trailing newline from each neighborhood name

In [7]:
# Keep only the text that precedes the first newline in every neighborhood
# value; values without a newline are left unchanged.
first_lines = [text.partition('\n')[0] for text in postal['Neighborhood']]
postal['Neighborhood'] = first_lines
postal.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 1. Ignore cells with a borough that is Not assigned

#### 2. Merge Neighborhoods with same postal code

In [8]:
#Ignore cells with a borough that is Not assigned
postal = postal.loc[postal['Borough'] != 'Not assigned']

#Merge neighborhoods with same postal code
# The previous loop rebuilt the merged string from the unchanged group
# snapshot on every pass, so a postal code with three or more neighborhoods
# ended up as "first,last" and lost the ones in between. Joining every
# neighborhood of a group in a single aggregation keeps them all.
neighborhoods_by_code = postal.groupby('PostCode')['Neighborhood'].agg(','.join)

# Keep the first row of each postal code (same index label, borough and row
# order as before), then replace its neighborhood with the joined string.
dfc = postal.drop_duplicates(subset='PostCode', keep='first').copy()
dfc['Neighborhood'] = dfc['PostCode'].map(neighborhoods_by_code)
    

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough


In [9]:
# Rows that have a borough but whose neighborhood is still 'Not assigned'.
needs_borough_name = (dfc['Neighborhood'] == 'Not assigned') & (dfc['Borough'] != 'Not assigned')

# Write through .loc on the frame itself. Assigning to the row yielded by
# iterrows() is not guaranteed to reach the DataFrame, because the row may be
# a copy, so the replacement could silently have no effect.
dfc.loc[needs_borough_name, 'Neighborhood'] = dfc.loc[needs_borough_name, 'Borough']
    

#### Print shape of final DataFrame

In [11]:
# Display (number of rows, number of columns) of dfc
dfc.shape

(103, 3)