# Clustering of Toronto neighborhoods

In [1]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

#### Load wikipedia page with postal codes of Toronto

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
wikipedia_link="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error response instead of
# letting the following cells try to parse an error page.
r = requests.get(wikipedia_link, timeout=30)
r.raise_for_status()
page = r.text

#### Locate the embedded table with postal codes, boroughs and neighbourhoods, and extract it

In [3]:
# Markers that delimit the first sortable wikitable in the page HTML.
table_open = '<table class="wikitable sortable">'
table_close = "</table>"

pos1 = page.find(table_open)
pos2 = page.find(table_close, pos1)
# str.find returns -1 when a marker is absent; slicing with -1 would silently
# produce a wrong fragment, so fail here with a clear message instead.
if pos1 == -1 or pos2 == -1:
    raise ValueError("Postal code table not found in the downloaded page")
# Extend the slice so that it includes the closing tag itself.
table_text = page[pos1:pos2 + len(table_close)]

In [4]:
# Remove every newline character from the extracted HTML fragment,
# then parse the result with BeautifulSoup using the lxml parser.
table_text = "".join(table_text.split("\n"))
soup = BeautifulSoup(table_text, "lxml")

##### Extract column names from table

In [5]:
# NOTE(review): `rowl` and `tablerows` are not read by any other visible cell.
rowl = list()
tablerows = soup.find_all("tr")

# The column names are the text of each header (<th>) cell, in document order.
rowh = soup.find_all("th")
colnames = [header.string for header in rowh]
colnames

['Postcode', 'Borough', 'Neighbourhood']

##### Extract all rows and put them in a pandas DataFrame

In [6]:
rowr = soup.find_all("tr")

# One list of cell texts per <tr>; a row without any <td> cells gives an empty list.
allrows = [[cell.string for cell in table_row.find_all("td")] for table_row in rowr]

# Build the frame, then discard its first row
# (presumably the header <tr>, which has no <td> cells — confirm against the page).
scraped = pd.DataFrame(allrows, columns=colnames)
alldf = scraped.iloc[1:]
alldf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
# Report how many data rows were scraped.
print("Total number of scraped rows except header is: ", len(alldf))

Total number of scraped rows except header is:  289


#####  Drop rows whose Borough is "Not assigned", and set Neighbourhood = Borough where only the Neighbourhood is "Not assigned"

In [8]:
# Keep only the postcodes that have a borough. The .copy() makes the result an
# independent frame, so the assignment below does not write into a slice of the
# scraped data.
alldf = alldf[alldf["Borough"] != "Not assigned"].copy()

# Rows with a known borough but no neighbourhood name.
# `df1` is taken before the fill, so it still shows the "Not assigned" values.
needs_borough_name = alldf["Neighbourhood"] == "Not assigned"
df1 = alldf.loc[needs_borough_name]

# One .loc assignment replaces the chained `alldf.Neighbourhood[index] = ...`,
# which raises SettingWithCopyWarning and has no effect under copy-on-write.
alldf.loc[needs_borough_name, "Neighbourhood"] = alldf.loc[needs_borough_name, "Borough"]

##### Combine rows with the same Postcode into one row, with Neighbourhoods separated by commas

In [9]:
## Create a pivot table with one column per neighbourhood found
# assign() returns a new frame, so the marker column is not written into a
# filtered slice of the scraped data (avoids SettingWithCopyWarning).
alldf = alldf.assign(Val=1)
# "sum" is the string form of the same aggregation; passing np.sum as aggfunc
# is deprecated in recent pandas releases.
tab = pd.pivot_table(
    alldf,
    values="Val",
    columns="Neighbourhood",
    aggfunc="sum",
    index=["Postcode", "Borough"],
)
# Replace the NaN entries left by the pivot with empty strings.
tab = tab.fillna("")
tab.columns

Index([Adelaide, Agincourt, Agincourt North, Albion Gardens, Alderwood,
       Bathurst Manor, Bathurst Quay, Bayview Village, Beaumond Heights,
       Bedford Park,
       ...
       Willowdale South, Willowdale West, Wilson Heights, Woburn,
       Woodbine Gardens, Woodbine Heights, York Mills, York Mills West,
       York University, Yorkville],
      dtype='object', name='Neighbourhood', length=210)

In [10]:
### Function to combine neighbourhoods into one line, separated by commas
def compress_neighbourhoods(itm):
    """Join the keys whose value equals 1.0 into one comma-separated string.

    Parameters
    ----------
    itm : iterable of (key, value) pairs
        For example the result of ``Series.items()`` or ``dict.items()``.
        Keys are expected to be strings.

    Returns
    -------
    str
        The selected keys in iteration order, separated by ",". Returns an
        empty string when no value equals 1.0.
    """
    # Only an exact 1.0 selects a key; any other value (empty string, other
    # counts) is skipped. str.join avoids shadowing the builtin `str` and the
    # manual first-element special case.
    return ",".join(key for key, value in itm if value == 1.0)

In [11]:
### Produce target table with one row per Postcode and comma-separated Neighbourhoods
# Each row of `tab` is keyed by a (Postcode, Borough) pair and its columns are
# the neighbourhood names. The records are collected in a list and turned into
# a DataFrame once; no variable named `dict` is bound, so the builtin stays
# usable in the rest of the kernel session.
records = [
    {
        "Postcode": postcode,
        "Borough": borough,
        "Neighbourhoods": compress_neighbourhoods(row.items()),
    }
    for (postcode, borough), row in tab.iterrows()
]

# Passing `columns` fixes the column order and also yields a well-formed
# (empty) frame when `tab` has no rows.
toronto_neighbourhoods = pd.DataFrame(records, columns=["Postcode", "Borough", "Neighbourhoods"])
toronto_neighbourhoods.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhoods
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Show the last five rows (five is the default for tail()).
toronto_neighbourhoods.tail()

Unnamed: 0,Postcode,Borough,Neighbourhoods
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
102,M9W,Etobicoke,Northwest


In [13]:
# Report the number of rows in the combined table.
print("Number of rows in the dataframe:", len(toronto_neighbourhoods))

Number of rows in the dataframe: 103


### Add Geolocation coordinates to Postal Codes

In [14]:
# Install geocoder from the conda-forge channel (--yes skips the confirmation prompt), then import it.
# NOTE(review): geocoder is not referenced by any other visible cell — confirm it is still needed.
!conda install -c conda-forge geocoder --yes
import geocoder

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geocoder                  1.38.1                     py_0    conda-forge


In [15]:
#Load provided file with geo coordinates, as the provided method of finding coordinates through geocoder seems to be problematic
!wget -q -O "ontario_postalcode_coord.csv" http://cocl.us/Geospatial_data
geodata = pd.read_csv("ontario_postalcode_coord.xls")
geodata.columns=["Postcode", "Latitude", "Longtitude"]
geodata.head()    

Unnamed: 0,Postcode,Latitude,Longtitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# Report the number of rows in the coordinates table.
print("Number of rows in geodata dataframe:", len(geodata))

Number of rows in geodata dataframe: 103


#### Resulting table of neighbourhoods with geolocations

In [17]:
# Columns contributed by `geodata` (everything except the join key).
geo_columns = [col for col in geodata.columns if col != "Postcode"]

# Left-join the coordinates onto the neighbourhood table by Postcode, with the
# result sorted by the join key. Any coordinate columns left over from an
# earlier run of this cell are dropped first, so re-running the cell does not
# produce duplicated "_x"/"_y" columns. The remaining merge arguments are left
# at their defaults.
toronto_neighbourhoods = (
    toronto_neighbourhoods
    .drop(columns=geo_columns, errors="ignore")
    .merge(geodata, how="left", on="Postcode", sort=True)
)

toronto_neighbourhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhoods,Latitude,Longtitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [18]:
# Show full cell contents instead of truncating long neighbourhood lists.
# None is the documented "no limit" value; fall back to the legacy -1 for
# pandas versions that reject None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Spot-check a fixed set of postcodes.
sample_postcodes = ['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A']
toronto_neighbourhoods.loc[toronto_neighbourhoods['Postcode'].isin(sample_postcodes)]

Unnamed: 0,Postcode,Borough,Neighbourhoods,Latitude,Longtitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
11,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849
17,M2H,North York,Hillcrest Village,43.803762,-79.363452
35,M4B,East York,"Parkview Hill,Woodbine Gardens",43.706397,-79.309937
38,M4G,East York,Leaside,43.70906,-79.363452
43,M4M,East Toronto,Studio District,43.659526,-79.340923
53,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
68,M5V,Downtown Toronto,"Bathurst Quay,CN Tower,Harbourfront West,Island airport,King and Spadina,Railway Lands,South Niagara",43.628947,-79.39442
