### Import library

In [1]:
import urllib.request
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Load the data from webside

In [2]:
html = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = BeautifulSoup(html,"html.parser")

In [3]:
text = soup.find_all('td')

In [4]:
Toronto_Neigh = []
for i in range(int(len(text)/3)):
    if(text[3*i].text.startswith('M')):
        Toronto_Neigh.append([text[3*i].text,text[3*i+1].text,text[3*i+2].text.split('\n')[0]]) 

### Load data to the dataframe

In [5]:
df_Toronto_Neigh = pd.DataFrame(Toronto_Neigh)

In [6]:
df_Toronto_Neigh.columns = ["Postalcode","Borough","Neighborhood"]

In [7]:
df_Toronto_Neigh.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Data Preprocessing

#### Drop unassigned Borough

In [8]:
df_Toronto_Neigh = df_Toronto_Neigh.drop(df_Toronto_Neigh[df_Toronto_Neigh.Borough == "Not assigned"].index).reset_index(drop=True)

#### Update unassigned Neighborhood

In [9]:
df_Toronto_Neigh['Neighborhood'] = np.where(df_Toronto_Neigh['Neighborhood'] == 'Not assigned',df_Toronto_Neigh['Borough'],df_Toronto_Neigh['Neighborhood'])

In [10]:
df_Toronto_Neigh.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### Compile the duplicated postalcode

In [11]:
df_Toronto_Neigh["is_duplicate"]= df_Toronto_Neigh.duplicated('Postalcode')

In [12]:
df_Toronto_Neigh.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,is_duplicate
0,M3A,North York,Parkwoods,False
1,M4A,North York,Victoria Village,False
2,M5A,Downtown Toronto,Harbourfront,False
3,M5A,Downtown Toronto,Regent Park,True
4,M6A,North York,Lawrence Heights,False


In [13]:
df_duplicate = df_Toronto_Neigh[df_Toronto_Neigh['is_duplicate']==True].reset_index(drop=True)

In [14]:
df_duplicate.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,is_duplicate
0,M5A,Downtown Toronto,Regent Park,True
1,M6A,North York,Lawrence Manor,True
2,M1B,Scarborough,Malvern,True
3,M4B,East York,Parkview Hill,True
4,M5B,Downtown Toronto,Garden District,True


In [15]:
duplicate_pos = list(set(df_duplicate['Postalcode'].tolist()))

In [16]:
df = df_Toronto_Neigh

In [17]:
for pos in duplicate_pos:
    neigh_combin = df[df['Postalcode']==pos]['Neighborhood'].tolist()
    neigh_combin_str = ",".join(str(x) for x in neigh_combin)    
    df.loc[(df['Postalcode'] == pos) & (df['is_duplicate']==False), ['Neighborhood']] = neigh_combin_str

In [18]:
df = df.drop(df[df.is_duplicate == True].index).reset_index(drop=True)

In [19]:
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,is_duplicate
0,M3A,North York,Parkwoods,False
1,M4A,North York,Victoria Village,False
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",False
3,M6A,North York,"Lawrence Heights,Lawrence Manor",False
4,M7A,Queen's Park,Queen's Park,False


### Final result

In [25]:
df.drop(['is_duplicate'],axis = 1)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,is_duplicate
0,M3A,North York,Parkwoods,False
1,M4A,North York,Victoria Village,False
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",False
3,M6A,North York,"Lawrence Heights,Lawrence Manor",False
4,M7A,Queen's Park,Queen's Park,False


In [26]:
df.shape

(103, 4)