# Code for Scrapping Data from Wikipedia and Creating Table

Installing necessary libraries and importing table from HTML

In [157]:
import bs4 as bs
import urllib.request
import pandas as pd

source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source,'html.parser')

table = soup.find('table', class_='wikitable sortable')
table_rows = table.find_all('tr')

In [158]:
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=["Postal_Code", "Borough","Neighborhood"])

## DataFrame created

In [159]:
df

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,\n
2,M2A\n,Not assigned\n,\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
...,...,...,...
176,M5Z\n,Not assigned\n,\n
177,M6Z\n,Not assigned\n,\n
178,M7Z\n,Not assigned\n,\n
179,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


### Removing Not Assigned from Borough

In [160]:
df_f = df[df['Borough']!='Not assigned\n']
a = df_f.Neighborhood=="Not assigned\n"
a.any() #checking for any not assigned value in Neighborhood; found None
df_f

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,,,
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
5,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
6,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
...,...,...,...
161,M8X\n,Etobicoke\n,"The Kingsway, Montgomery Road, Old Mill North\n"
166,M4Y\n,Downtown Toronto\n,Church and Wellesley\n
169,M7Y\n,East Toronto\n,Business reply mail Processing Centre\n
170,M8Y\n,Etobicoke\n,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [161]:
df_f.drop([0], inplace = True) #dropping first None values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [162]:
df_f #Checking DataFrame

Unnamed: 0,Postal_Code,Borough,Neighborhood
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
5,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
6,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
7,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"
...,...,...,...
161,M8X\n,Etobicoke\n,"The Kingsway, Montgomery Road, Old Mill North\n"
166,M4Y\n,Downtown Toronto\n,Church and Wellesley\n
169,M7Y\n,East Toronto\n,Business reply mail Processing Centre\n
170,M8Y\n,Etobicoke\n,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Final DataFrame

In [163]:
d = df_f.groupby("Postal_Code")["Neighborhood"].apply(",".join).reset_index() #Joining all same neighbours

In [164]:
d #Final DataFrame

Unnamed: 0,Postal_Code,Neighborhood
0,M1B\n,"Malvern, Rouge\n"
1,M1C\n,"Rouge Hill, Port Union, Highland Creek\n"
2,M1E\n,"Guildwood, Morningside, West Hill\n"
3,M1G\n,Woburn\n
4,M1H\n,Cedarbrae\n
...,...,...
98,M9N\n,Weston\n
99,M9P\n,Westmount\n
100,M9R\n,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V\n,"South Steeles, Silverstone, Humbergate, Jamest..."


In [165]:
d["Borough"] = df_f["Borough"] #Adding Borough

In [166]:
d #Checking again

Unnamed: 0,Postal_Code,Neighborhood,Borough
0,M1B\n,"Malvern, Rouge\n",
1,M1C\n,"Rouge Hill, Port Union, Highland Creek\n",
2,M1E\n,"Guildwood, Morningside, West Hill\n",
3,M1G\n,Woburn\n,North York\n
4,M1H\n,Cedarbrae\n,North York\n
...,...,...,...
98,M9N\n,Weston\n,
99,M9P\n,Westmount\n,York\n
100,M9R\n,"Kingsview Village, St. Phillips, Martin Grove ...",Scarborough\n
101,M9V\n,"South Steeles, Silverstone, Humbergate, Jamest...",North York\n


In [167]:
d.dropna(subset=["Borough"], inplace = True)

In [168]:
d = d[["Postal_Code","Neighborhood","Borough"]] #Rearranging DataFrame

In [169]:
d

Unnamed: 0,Postal_Code,Neighborhood,Borough
3,M1G\n,Woburn\n,North York\n
4,M1H\n,Cedarbrae\n,North York\n
5,M1J\n,Scarborough Village\n,Downtown Toronto\n
6,M1K\n,"Kennedy Park, Ionview, East Birchmount Park\n",North York\n
7,M1L\n,"Golden Mile, Clairlea, Oakridge\n",Downtown Toronto\n
...,...,...,...
95,M9C\n,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",Central Toronto\n
96,M9L\n,Humber Summit\n,York\n
99,M9P\n,Westmount\n,York\n
100,M9R\n,"Kingsview Village, St. Phillips, Martin Grove ...",Scarborough\n


In [170]:
cols_to_check = ["Postal_Code", "Borough","Neighborhood"] #Remoning \n Caharcter
d[cols_to_check] = d[cols_to_check].replace({'\n':''}, regex=True)

## Final Result

In [183]:
e = d.reset_index() #Final Result
e = e.drop("index",axis=1)
e

Unnamed: 0,Postal_Code,Neighborhood,Borough
0,M1G,Woburn,North York
1,M1H,Cedarbrae,North York
2,M1J,Scarborough Village,Downtown Toronto
3,M1K,"Kennedy Park, Ionview, East Birchmount Park",North York
4,M1L,"Golden Mile, Clairlea, Oakridge",Downtown Toronto
...,...,...,...
62,M9C,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",Central Toronto
63,M9L,Humber Summit,York
64,M9P,Westmount,York
65,M9R,"Kingsview Village, St. Phillips, Martin Grove ...",Scarborough


In [184]:
print("Final shape of DataFrame is {}".format(e.shape))

Final shape of DataFrame is (67, 3)
