In this notebook, we will practise segmenting and clustering neighborhoods in Toronto using the Foursquare API.

In [1]:
# Wikipedia page listing the Canadian postal codes that start with "M";
# this is the page the notebook downloads and parses.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
! pip install beautifulsoup4
! pip install lxml



In [4]:
# Download the target page and parse it into a BeautifulSoup tree.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops on an HTTP error (4xx/5xx)
# instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [5]:
# Take the first <table> element of the page and gather all of its <tr> rows.
table = soup.find('table')
table_rows = table.find_all('tr')

# Build one list per row holding the right-stripped text of each <td> cell.
# The first row is skipped (presumably the header row -- it is not data).
canadadata = [
    [cell.text.rstrip() for cell in tr.find_all('td')]
    for tr in table_rows[1:]
]

# Turn the row lists into a DataFrame with the three expected columns
# and preview the first five records.
df_can = pd.DataFrame(
    canadadata,
    columns=['Postal code', 'Borough', 'Neighborhood'],
)
df_can.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [6]:
# Clean the scraped table so that it satisfies the assignment's requirements.

# 1. Drop the rows whose Borough is 'Not assigned'. The explicit .copy()
#    makes the result an independent frame, so the column assignments below
#    do not raise pandas' SettingWithCopyWarning.
df_can = df_can[df_can['Borough'] != 'Not assigned'].copy()

# 2. Replace the slash separating neighborhood names with a comma. Whitespace
#    around the slash is consumed as well, so "A / B" becomes "A, B" rather
#    than "A , B".
df_can['Neighborhood'] = df_can['Neighborhood'].str.replace(r'\s*/\s*', ', ', regex=True)

# 3. Where the neighborhood is empty, use the borough name instead.
missing_neighborhood = df_can['Neighborhood'] == ''
df_can.loc[missing_neighborhood, 'Neighborhood'] = df_can.loc[missing_neighborhood, 'Borough']

# 4. Renumber the rows from 0 now that some rows have been filtered out.
df_can = df_can.reset_index(drop=True)
df_can.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [8]:
# Load the postal-code coordinates (columns: Postal Code, Latitude, Longitude)
# from http://cocl.us/Geospatial_data.
df_laglng = pd.read_csv('http://cocl.us/Geospatial_data')

# Preview the first rows so the cell actually displays the table on a fresh run.
df_laglng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Index the coordinate table by its 'Postal Code' column so it can be used
# as a lookup, then attach Latitude/Longitude to every row of df_can by
# matching df_can's 'Postal code' values against that index.
coords_by_postal_code = df_laglng.set_index('Postal Code')
df_can_joint = df_can.join(coords_by_postal_code, on='Postal code')

# Preview the combined table.
df_can_joint.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.0MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


None
