# Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Scraping the Wikipedia page

We start by retrieving the necessary data from Wikipedia. Setup:

In [148]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Page to scrape: Wikipedia's "List of postal codes of Canada: M".
webpage = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

Now we can do the scraping:

In [149]:
# Fetch the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url = webpage, timeout = 30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# soup.find returns the first <table> in the document (or None if there is none).
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in " + webpage)

# Gather every data row first and build the DataFrame once at the end,
# instead of growing it one .loc assignment at a time.
rows = []
for tr in table.find_all("tr"):
    cells = tr.find_all("td")
    # Rows without <td> cells (e.g. a header row) or with fewer than the
    # three expected columns carry no usable record.
    if len(cells) < 3:
        continue
    # get_text() still works when a cell wraps its text in another tag (a
    # link, say), where .string would be None. strip() drops surrounding
    # whitespace such as a trailing newline without eating a real character.
    rows.append([cell.get_text().strip() for cell in cells[:3]])

postal_codes = pd.DataFrame(rows, columns = ["Postal Code", "Borough", "Neighbourhood"])

            

In [154]:
postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [151]:
postal_codes.shape

(180, 3)

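We need to clean the data: drop the rows whose borough is "Not assigned", combine the neighbourhoods that share a postal code into a single comma-separated entry, and use the borough name wherever the neighbourhood is "Not assigned":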

In [152]:
# Keep only the rows whose borough is something other than the
# "Not assigned" placeholder.
has_borough = postal_codes["Borough"] != "Not assigned"
postal_codes = postal_codes[has_borough]

def conc_nbhd(nbhds):
    """Join neighbourhood names into one comma-separated string.

    Parameters
    ----------
    nbhds : iterable of str
        The names to combine, e.g. the "Neighbourhood" values of one
        group (a list or a pandas Series both work).

    Returns
    -------
    str
        The names separated by ", ". A single name comes back as that
        plain string rather than as the one-element container, and an
        empty input gives "".
    """
    # str.join covers the empty, single and multi-element cases uniformly,
    # so no special-casing or trailing-separator trimming is needed.
    return ", ".join(nbhds)
    
# Collapse the table to one row per (postal code, borough) pair, merging each
# group's neighbourhood names with conc_nbhd.
by_code_and_borough = postal_codes.groupby(["Postal Code", "Borough"])
postal_codes = by_code_and_borough.agg({"Neighbourhood" : conc_nbhd}).reset_index()

# Wherever the neighbourhood is still the "Not assigned" placeholder, fall
# back to the borough name.
nbhd_missing = postal_codes["Neighbourhood"] == "Not assigned"
postal_codes['Neighbourhood'] = np.where(nbhd_missing,
                                         postal_codes["Borough"],
                                         postal_codes["Neighbourhood"])

The data now looks like this:

In [155]:
postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [145]:
postal_codes.shape

(103, 3)