# Toronto neighborhoods clustering

In [1]:
! pip install lxml



In [2]:
import json
import requests
from lxml import html
import pandas as pd

In [3]:
# Let pandas render up to 200 rows before it truncates a displayed frame.
pd.options.display.max_rows = 200

To reconstruct the table, we need to parse the HTML source of the Wikipedia page.  
This can be done with the `html` module of the `lxml` package, as follows:

In [4]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() stops here on an HTTP error (4xx/5xx) instead of letting
# an error page be parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response bytes into an lxml element tree.
tree = html.fromstring(page.content)

Let's take a look at the source code: the table is identified by the CSS classes `wikitable sortable`.

In [5]:
# Take the first element carrying the "wikitable sortable" classes.
pos = tree.find_class("wikitable sortable")[0]

# Column names are read from the cells of the table's first row.
# text_content() also works when a header cell wraps its label in markup,
# where .text would be None.
headers = [col.text_content().strip() for col in pos[0][0]]

# One string per <td>, searched relative to this table: the leading '.' keeps
# the XPath inside `pos`, whereas a bare '//' starts from the document root and
# matches the cells of every table on the page.  text_content() returns exactly
# one string per cell (even for empty cells or cells containing links), so the
# fixed-width chunking below stays aligned with the columns.
data = [cell.text_content() for cell in pos.xpath('.//tr/td')]
l = len(headers)  # number of columns, i.e. cells per table row

# Cells that do not fill a complete row are reported instead of being turned
# into a short, misaligned record.
leftover = len(data) % l
if leftover:
    print('warning: {} trailing cell(s) do not form a complete row and were skipped'.format(leftover))

# Group the flat cell list into rows of `l` cells; trim surrounding whitespace
# and turn the ' /' separators into commas.
pos_list = [
    [it.strip().replace(' /', ',') for it in data[n:n + l]]
    for n in range(0, len(data) - leftover, l)
]


In [6]:
# Assemble the scraped rows into a frame, using the table headers as column names.
df_poscode = pd.DataFrame(data=pos_list, columns=headers)

# Preview the first five rows.
df_poscode.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data cleaning

In [7]:
# Keep only rows that have a postal code and an assigned borough,
# then renumber the surviving rows from 0.
has_code = df_poscode['Postal code'].ne('')
has_borough = df_poscode['Borough'].ne('Not assigned')
df_poscode = df_poscode.loc[has_code & has_borough].reset_index(drop=True)

### View of the scraped dataframe

In [8]:
# Show the first five rows of the cleaned frame.
df_poscode.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# Report how many rows remain in the frame.
print('N rows in the dataframe: {}'.format(len(df_poscode)))

N rows in the dataframe: 103
