# Explore and Cluster the Neighbourhoods in Toronto

This notebook collects the neighbourhood location information for Toronto by scraping the Wikipedia page listing Canadian postal codes, then uses the Foursquare API to retrieve specific information about each neighbourhood.

The List of postal codes of Canada Wikipedia link:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 


In [1]:
import pandas as pd
import numpy as np
import requests
import html5lib
from bs4 import BeautifulSoup# beautifulsoup4
#import geocoder

## Parse the Wikipedia page and find the table

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() fails loudly on an HTTP error instead of
# letting an error page be parsed as if it were the article.
r = requests.get(URL, timeout=30)
r.raise_for_status()

# BeautifulSoup is built on top of HTML parsing libraries such as html5lib;
# the second argument selects which parser to use.
soup = BeautifulSoup(r.content, 'html5lib')

# The postal-code listing is the table carrying the "wikitable sortable" class.
table = soup.find('table', {'class':'wikitable sortable'})
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + URL)


def _cell_text(td):
    """Return the stripped text of the first link in a cell, or of the cell itself if it has no link."""
    links = td.find_all('a')
    source = links[0] if links else td
    return source.text.strip()  # .strip() removes the trailing '\n' left by the HTML


# Collect one record per usable table row and build the DataFrame once at the
# end: growing a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas >= 2.0.
records = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) != 3:
        # Rows without exactly three <td> cells are not data rows; skip them.
        continue
    if tds[1].text.strip() == "Not assigned":
        # Filter out postal codes whose borough is not assigned.
        continue
    records.append({
        "PostalCode": tds[0].text.strip(),
        "Borough": _cell_text(tds[1]),
        "Neighborhood": _cell_text(tds[2]),
    })

# Passing columns= keeps the expected schema even if no rows were collected.
df = pd.DataFrame(records, columns=["PostalCode","Borough","Neighborhood"])

df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## combine by postal code and borough

In [3]:
def _assigned_names(names):
    """Return the neighbourhood names of one group as a list, without "Not assigned" placeholders."""
    return [name for name in names if name != "Not assigned"]


# One row per (PostalCode, Borough) pair; the Neighborhood column holds the
# list of neighbourhood names that share that postal code.
neighborhoods_by_code = df.groupby(["PostalCode","Borough"])["Neighborhood"].apply(_assigned_names)
combined_df = neighborhoods_by_code.to_frame().reset_index()
combined_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [4]:
# Join each list of neighbourhoods into a single string separated by ", ".
# A postal code whose list is empty (no assigned neighbourhood) ends up with an
# empty string, which is then replaced by the name of its borough.
neighborhood_text = combined_df["Neighborhood"].apply(", ".join)
has_no_neighborhood = neighborhood_text == ""
combined_df["Neighborhood"] = neighborhood_text.mask(has_no_neighborhood, combined_df["Borough"])
combined_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


The final shape of the combined DataFrame is (103, 3).

In [5]:
# Report the dimensions of the combined table as a (rows, columns) tuple.
row_count, column_count = combined_df.shape
print((row_count, column_count))

(103, 3)
