# Hello! This script scrapes Wikipedia and creates a pandas DataFrame of Postcodes, Boroughs, and Neighbourhoods.



In [204]:
from bs4 import BeautifulSoup
import requests

site = r"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page. The timeout keeps a stalled connection from hanging the
# kernel forever, and raise_for_status() stops here on an HTTP error instead
# of letting an error page be parsed as if it were the postcode table.
r = requests.get(site, timeout=30)
r.raise_for_status()
data = r.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parse tree could differ
# between machines. "html.parser" ships with Python, so nothing extra is needed.
soup = BeautifulSoup(data, "html.parser")

The variable "soup" now contains the parsed HTML contents of the webpage. The relevant table has the following structure:

In [205]:
# Reference only: an excerpt of the table's HTML, held in a bare string literal.
# The string is never assigned or used, so it has no effect on later cells.
"""
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
"""
# NOTE(review): presumably here so the string literal above is not the cell's
# last expression and therefore is not echoed as the cell output - confirm.
print("")




Between each pair of 'tr' and '/tr' tags there are three 'td' items. Let's extract the contents of every 'td' item, row by row, into one flat list.

In [206]:
# Collect the inner HTML of every <td> cell into one flat list, row by row
# (postcode, borough, neighbourhood, postcode, ...). The first <tbody> on the
# page is taken to be the postcode table.
table = soup.find("tbody")
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("no <tbody> element found in the downloaded page")

# decode_contents() returns exactly the markup between the opening and closing
# <td> tags. Slicing str(cell)[4:-5] gives the same result only for a bare
# "<td>" opening tag and leaves attribute fragments behind for <td style="...">.
ugly_contents = [
    cell.decode_contents()
    for row in table.find_all("tr")
    for cell in row.find_all("td")
]

print(ugly_contents[0:20])

['M1A', 'Not assigned', 'Not assigned\n', 'M2A', 'Not assigned', 'Not assigned\n', 'M3A', '<a href="/wiki/North_York" title="North York">North York</a>', '<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>\n', 'M4A', '<a href="/wiki/North_York" title="North York">North York</a>', '<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>\n', 'M5A', '<a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a>', '<a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>\n', 'M5A', '<a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a>', '<a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>\n', 'M6A', '<a href="/wiki/North_York" title="North York">North York</a>']


Every third element ends with a "\n" character, so let's strip it and also replace each link with its title text.

In [207]:
import re

# Normalise every raw cell: trim surrounding whitespace (including the trailing
# newline carried by the last cell of each row) and replace link markup with
# the link's title text.
clean_contents = []
for u in ugly_contents:
    c = u.strip()
    # startswith() is safe on an empty cell, where c[0] would raise IndexError.
    if c.startswith("<"):
        # [^"]* stops at the title's closing quote. A greedy (.*) followed by
        # '">' runs on to the last '">' in the cell, capturing markup instead
        # of the title whenever the cell holds several tags or attributes.
        title = re.search(r'title="([^"]*)"', c)
        if title:
            c = title.group(1)
        else:
            # Markup without a title attribute: fall back to the visible text
            # rather than crashing on None.group().
            c = re.sub(r"<[^>]+>", "", c).strip()

    clean_contents.append(c)

print(clean_contents[0:10])

['M1A', 'Not assigned', 'Not assigned', 'M2A', 'Not assigned', 'Not assigned', 'M3A', 'North York', 'Parkwoods', 'M4A']


Let's now make this list into a dataframe

In [208]:
import pandas as pd

# The flat list repeats in groups of three (postcode, borough, neighbourhood),
# so every third element starting at a column's offset belongs to that column.
column_names = ("Postcode", "Borough", "Neighbourhood")
data_dict = {
    name: clean_contents[offset::3]
    for offset, name in enumerate(column_names)
}

df = pd.DataFrame(data_dict)

preview = df.head()
print(preview.to_string())
# df.info() writes its own report and returns None, which print() then shows
# as a trailing "None" line.
print(df.info())

  Postcode           Borough           Neighbourhood
0      M1A      Not assigned            Not assigned
1      M2A      Not assigned            Not assigned
2      M3A        North York               Parkwoods
3      M4A        North York        Victoria Village
4      M5A  Downtown Toronto  Harbourfront (Toronto)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
Postcode         288 non-null object
Borough          288 non-null object
Neighbourhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB
None


As per the instructions, we will drop every row whose Borough is "Not assigned".

In [209]:
# Keep only the rows with an assigned borough, then renumber the index from 0.
has_borough = df["Borough"].ne("Not assigned")
df = df.loc[has_borough].reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 3 columns):
Postcode         211 non-null object
Borough          211 non-null object
Neighbourhood    211 non-null object
dtypes: object(3)
memory usage: 5.0+ KB
None


Now replace each Neighbourhood that is 'Not assigned' with the name of its borough.

In [210]:
print("Before")
# Boolean mask of the rows whose neighbourhood is still the placeholder text.
not_as_fil = df["Neighbourhood"] == "Not assigned"
print(df[not_as_fil])

print("After")
# One .loc call with (row mask, column label) on each side: this is a plain
# Series-to-Series assignment aligned on the row index. It avoids assigning a
# 1-D Series to a list-of-columns (2-D) selection and avoids chained indexing
# (df.loc[mask]["Borough"]) on the right-hand side.
df.loc[not_as_fil, "Neighbourhood"] = df.loc[not_as_fil, "Borough"]
print(df[not_as_fil])

Before
  Postcode                 Borough Neighbourhood
6      M7A  Queen's Park (Toronto)  Not assigned
After
  Postcode                 Borough           Neighbourhood
6      M7A  Queen's Park (Toronto)  Queen's Park (Toronto)


Let's now group by postal code

In [211]:
# Collapse the rows of each postcode into a single row whose cells hold the
# list of distinct values seen for that postcode.
pst = df.groupby("Postcode")
# dict.fromkeys() de-duplicates while keeping first-seen order. list(set(x))
# orders strings by their hash, which is randomised per interpreter run, so
# the lists would come out in a different order after every kernel restart.
df_grped = pst.aggregate(lambda x: list(dict.fromkeys(x)))
print(df_grped[0:20].to_string())

                         Borough                                      Neighbourhood
Postcode                                                                           
M1B       [Scarborough, Toronto]                 [Malvern, Toronto, Rouge, Toronto]
M1C       [Scarborough, Toronto]  [Highland Creek (Toronto), Port Union, Toronto...
M1E       [Scarborough, Toronto]  [West Hill, Toronto, Morningside, Toronto, Gui...
M1G       [Scarborough, Toronto]                                  [Woburn, Toronto]
M1H       [Scarborough, Toronto]                                        [Cedarbrae]
M1J       [Scarborough, Toronto]                              [Scarborough Village]
M1K       [Scarborough, Toronto]  [Ionview, Kennedy Park, Toronto, East Birchmou...
M1L       [Scarborough, Toronto]  [Oakridge, Toronto, Clairlea, Golden Mile, Tor...
M1M       [Scarborough, Toronto]  [Scarborough Village West, Cliffside, Toronto,...
M1N       [Scarborough, Toronto]                      [Cliffside West, Birch

Now we join with the lat/long data

In [212]:
# Load the coordinates file; its first line is used as the header row.
lat_long = pd.read_csv(
    "Geospatial_Coordinates.csv",
    header=0,
    sep=",",
)
print(lat_long.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB
None


In [213]:
# Inner join: only postcodes present in both frames are kept. The grouped frame
# is matched on "Postcode", the coordinates frame on its "Postal Code" column.
final = df_grped.merge(
    right=lat_long,
    left_on="Postcode",
    right_on="Postal Code",
    how="inner",
)

# Show the first 20 joined rows.
print(final.head(20).to_string())

                   Borough                                      Neighbourhood Postal Code   Latitude  Longitude
0   [Scarborough, Toronto]                 [Malvern, Toronto, Rouge, Toronto]         M1B  43.806686 -79.194353
1   [Scarborough, Toronto]  [Highland Creek (Toronto), Port Union, Toronto...         M1C  43.784535 -79.160497
2   [Scarborough, Toronto]  [West Hill, Toronto, Morningside, Toronto, Gui...         M1E  43.763573 -79.188711
3   [Scarborough, Toronto]                                  [Woburn, Toronto]         M1G  43.770992 -79.216917
4   [Scarborough, Toronto]                                        [Cedarbrae]         M1H  43.773136 -79.239476
5   [Scarborough, Toronto]                              [Scarborough Village]         M1J  43.744734 -79.239476
6   [Scarborough, Toronto]  [Ionview, Kennedy Park, Toronto, East Birchmou...         M1K  43.727929 -79.262029
7   [Scarborough, Toronto]  [Oakridge, Toronto, Clairlea, Golden Mile, Tor...         M1L  43.711112 -79

# Done!

Thank you,

Michael A Greene, PhD. 