In [180]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [181]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = requests.get(url)
html_data = req.text

In [182]:
soup = BeautifulSoup(html_data,'html.parser')

In [183]:
data = []
for tr in soup.table.find_all('tr'): 
    data = np.append(data,tr.text.split('\n'))
data = list(filter(None,data))

In [184]:
data = np.reshape(np.array(data),(-1,3))
data = pd.DataFrame(data[1:,:],columns=data[0,:])
data.rename(columns={'Postcode':'PostalCode'}, inplace=True)
data.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [185]:
data.shape

(287, 3)

In [186]:
np.sum(data[data['Neighbourhood'].isna()])

PostalCode       0.0
Borough          0.0
Neighbourhood    0.0
dtype: float64

In [187]:
print(np.sum(data['PostalCode']=='Not assigned'))
print(np.sum(data['Borough']=='Not assigned'))
print(np.sum(data['Neighbourhood']=='Not assigned'))

0
77
78


### Remove rows where Borough is 'Not assigned'

In [188]:
data_clean = data[data['Borough']!='Not assigned']
data_clean.reset_index(inplace=True)
data_clean.drop('index',axis=1,inplace=True)
data_clean.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [189]:
data_clean.shape

(210, 3)

### Group Neighbourhood with identical PostalCode

In [190]:
#Check if 1 PostalCode can have >1 Boroughs
pc_list = []
for pc in set(data_clean['PostalCode']): 
    if len(set(data_clean[data_clean['PostalCode']==pc]['Borough']))>1: 
        pc_list = np.append(pc_list,set(data_clean[data_clean['PostalCode']==pc]['Borough']))

if len(pc_list)==0: 
    print('No PostalCode can have >1 Boroughs')

No PostalCode can have >1 Boroughs


In [191]:
data_clean = data_clean.groupby('PostalCode',as_index=False).agg({'Borough':'first','Neighbourhood':', '.join})
data_clean.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Assign Borough name to unnamed Neighbourhood

In [192]:
#Check if any unassigned Neighbourhood is merged with other Neighbourhood in previous step 
'Not assigned' in set(data_clean[data_clean['Neighbourhood']!='Not assigned']['Neighbourhood'])

False

In [193]:
#No unassigned Neighbourhood is merged so we only need to consider if a Neighbourhood == 'Not assigned'
data_clean[data_clean['Neighbourhood']=='Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood
93,M9A,Queen's Park,Not assigned


In [194]:
for index,row in data_clean.iterrows(): 
    if row['Neighbourhood'] == 'Not assigned': 
        row['Neighbourhood'] = row['Borough']

In [195]:
data_clean[data_clean['Neighbourhood']=='Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood


### Shape after cleaning

In [196]:
data_clean.shape

(103, 3)

In [199]:
data_clean.to_csv('week3_postalcode.csv')