# Segmenting and Clustering Neighborhoods in Toronto  <br>Part I

## Importing the required libraries

In [222]:
import pandas as pd
import requests
import lxml.html as lh

## Convert the HTML table into a dataframe using pandas

In [8]:
# Let pandas parse every <table> on the Wikipedia page into a list of
# dataframes, using the first row of each table as its header.
dfs = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    header=0,
)

In [223]:
# Retrieve the first table found in the web page
table = dfs[0]
# Preview only the first rows rather than dumping the whole frame into the notebook
table.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


## Convert the HTML table into a dataframe with a custom parser

In [224]:
# Download the page; the timeout keeps the cell from hanging indefinitely
# if the server never answers.
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# contained the table.
page.raise_for_status()
# Parse the raw response body into an lxml element tree
doc = lh.fromstring(page.content)
# Collect every <tr> element of the document (rows of any table on the page)
tr_elements = doc.xpath('//tr')

In [225]:
# Build the column registry from the header row: one (name, values) pair per
# header cell, where `values` is the (initially empty) list of that column's data.
col = []
# Count from 1 so the printed positions stay 1-based.
for i, header_cell in enumerate(tr_elements[0], start=1):
    # strip() removes the trailing newline of the last header cell without
    # chopping a real character when no newline is present.
    name = header_cell.text_content().strip()
    print('%d:"%s"' % (i, name))
    col.append((name, []))

1:"Postcode"
2:"Borough"
3:"Neighbourhood"


In [226]:
# Fill the column lists in `col` from the table rows.
# The first row is the header, so data is read from the second row onwards.

# Start from empty columns so that re-running this cell does not append the
# same rows a second time.
for column_name, column_values in col:
    column_values.clear()

for row in tr_elements[1:]:
    # If the row is not of size 3, the //tr data is no longer from our table
    if len(row) != 3:
        break

    # Text of every cell in the row. strip() drops the trailing "\n" of the
    # last cell without cutting a real character when there is no newline.
    cells = [cell.text_content().strip() for cell in row.iterchildren()]

    # The second cell is the Borough; rows without an assigned borough are skipped
    borough = cells[1]
    if borough == 'Not assigned':
        continue

    # Append each value to its column; a 'Not assigned' value (the
    # neighbourhood, in practice) is replaced by the borough name.
    for (column_name, column_values), value in zip(col, cells):
        column_values.append(borough if value == 'Not assigned' else value)

In [227]:
# Turn the list of (column name, values) pairs into a dictionary keyed by column name
Dict = dict(col)
# Build the dataframe from that dictionary, one column per key
df = pd.DataFrame(Dict)

In [228]:
# Preview the first 14 rows of the dataframe built from the parsed table
df.head(14)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [229]:
# Collapse the table to one row per postcode: keep the borough of the first
# row of each group and merge the group's neighbourhoods into a single
# comma-separated string.
df2 = (
    df.groupby(['Postcode'], as_index=False)
      .agg({
          'Borough': 'first',
          'Neighbourhood': lambda names: ', '.join(names),
      })
)

In [230]:
# Preview the first 20 rows of the grouped dataframe (one row per postcode)
df2.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [234]:
# Persist df2 with IPython's %store magic so it can be restored in another notebook with `%store -r df2`
%store df2

Stored 'df2' (DataFrame)


In [233]:
# Show the final shape of the grouped data as (rows, columns)
df2.shape

(103, 3)