In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
import folium # map rendering library
from geopy.geocoders import Nominatim
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Part 1

Scrape the postal-code table from the Wikipedia page and store it in a pandas DataFrame.

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get(url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
# The postal-code table is the first <table> element on the page.
table = soup.find_all('table')[0]
# Wrap the markup in StringIO so read_html receives a file-like object rather
# than a raw HTML string. header=0 takes the column names from the table's
# first row, so the header does not have to be promoted by hand (which would
# mislabel the columns whenever read_html has already detected the header).
df = pd.read_html(StringIO(str(table)), header=0)[0]

Remove the rows whose Borough is "Not assigned".

In [3]:
# Keep only the rows whose Borough holds something other than "Not assigned".
has_borough = df['Borough'] != "Not assigned"
df = df[has_borough]

More than one neighborhood can exist in one postal code area. Rows that share the same postal code and borough are combined into a single row, with their neighborhoods separated by commas.

In [4]:
# One output row per (Postcode, Borough) pair; the values of every other
# column are concatenated within the group using ", " as the separator.
postcode_groups = df.groupby(["Postcode", "Borough"], as_index=False)
df = postcode_groups.agg(", ".join)

If a row has a borough but its neighborhood is "Not assigned", then the neighborhood is set to the same value as the borough.

In [5]:
# Where a row has no neighbourhood assigned, fall back to its borough name.
# This uses a boolean mask with .loc rather than a loop over iterrows():
# the `row` yielded by iterrows() may be a copy, so assigning to it can leave
# df itself unchanged.
unassigned = df["Neighbourhood"] == "Not assigned"
df.loc[unassigned, "Neighbourhood"] = df.loc[unassigned, "Borough"]

In [6]:
# Display the (number of rows, number of columns) of df at this point.
df.shape

(103, 3)

# Part 2

Read the geospatial coordinates CSV from its URL into a pandas DataFrame.

In [7]:
# Load the coordinate table (CSV) directly from its URL.
geospatial_csv_url = "http://cocl.us/Geospatial_data"
coord = pd.read_csv(geospatial_csv_url)

Combine with the first dataframe

In [8]:
# Give coord's key column the same name as df's so the two can be joined on it.
coord = coord.rename(columns={"Postal Code": "Postcode"})
# Left join: every row of df is kept and gains coord's remaining columns.
df = pd.merge(df, coord, how="left", on="Postcode")
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3

In [9]:
#### Use geopy library to get the latitude and longitude values of Toronto

# To create a geocoder instance we must supply a user_agent string.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message here instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [10]:

# Create a map centred on the geocoded coordinates, with the neighborhoods
# superimposed on top.
# NOTE(review): the name `map` shadows Python's built-in map(); it is kept
# because cells outside this view may refer to it — consider renaming.
map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of df, with a popup reading
# "<neighbourhood>, <borough>".
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, so it is no longer passed to CircleMarker,
    # where it had no meaning.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map)
# Last expression of the cell: renders the map as the cell output.
map