# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import time
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Get Page

In [2]:
# Wikipedia page that lists every Canadian postal code starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
request = urllib.request.Request(url)

# Read the body inside a context manager so the HTTP connection is released
# as soon as the download finishes, even if decoding raises.
with urllib.request.urlopen(request) as response:
    page = response.read().decode('utf-8')

# Parse the HTML with the lxml backend; `soup` is used by the cells below.
soup = BeautifulSoup(page, 'lxml')

#### find the table and all rows

In [3]:
# Only the first <table> on the page is needed, so stop searching after one
# match; indexing still raises IndexError when the page contains no table.
table = soup.find_all('table', limit=1)[0]

# Collect every <tr> element found under that table.
table_rows = table.find_all('tr')

#### Define the column names and collect the text of every table cell into a list of rows

In [4]:
# Column labels applied when the rows are turned into a dataframe.
cols = ['Postcode', 'Borough', 'Neighbourhood']

# One inner list per <tr>, holding the text of each <td> in that row.
# A row without any <td> cells contributes an empty list.
lst = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows
]

#### Create the dataframe and clean it up

In [5]:
# Build the frame from the scraped rows; rows shorter than `cols` are padded
# with missing values by pandas.
df = pd.DataFrame(lst, columns=cols)

# Turn the exact string 'Not assigned' into NaN, then drop every row that
# has a missing value in any column.
df = df.replace('Not assigned', np.nan)
df = df.dropna()

# Neighbourhood values that read 'Not assigned\n' (with the trailing newline)
# are not matched by the exact-string replace above and so survive it.
# Give those rows their borough name instead. A single .loc assignment writes
# to `df` itself, unlike chained indexing (df.iloc[i]['col'] = ...), which may
# modify a temporary copy, and it does not depend on a hard-coded row position.
unassigned = df['Neighbourhood'] == 'Not assigned\n'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

df = df.reset_index(drop=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n
5,M6A,North York,Lawrence Manor\n
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue\n
8,M1B,Scarborough,Rouge\n
9,M1B,Scarborough,Malvern\n


In [6]:
df.shape

(212, 3)

### Move on to Geocoder 

In [7]:
!pip install geocoder
import geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 7.2MB/s ta 0:00:01
[?25hRequirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement 

#### Use geocoder to look up the latitude and longitude of each Toronto postcode, then save them to the dataframe

In [120]:
MAX_GEOCODE_ATTEMPTS = 5    # give up on a postcode after this many failed lookups
RETRY_DELAY_SECONDS = 1.0   # base pause between retries; grows with each attempt

# Several rows share a postcode, so query each distinct code only once and
# reuse the answer for every row that carries it.
coords_by_postcode = {}

for postcode in df['Postcode'].unique():
    for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
        # A failed lookup yields no coordinates; treat None and an empty
        # result alike so indexing below can never fail.
        if lat_lng_coords:
            break
        # Back off a little longer after every failure instead of
        # hammering the service in a tight loop.
        time.sleep(RETRY_DELAY_SECONDS * attempt)
    else:
        # Every attempt failed: stop with a clear error rather than
        # looping forever.
        raise RuntimeError(
            'Could not geocode postcode {} after {} attempts'.format(
                postcode, MAX_GEOCODE_ATTEMPTS))

    coords_by_postcode[postcode] = lat_lng_coords

# Expand the per-postcode results back to one value per dataframe row.
lat = [coords_by_postcode[postcode][0] for postcode in df['Postcode']]
long = [coords_by_postcode[postcode][1] for postcode in df['Postcode']]

df['Latitude'] = lat
df['Longitude'] = long
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods\n,43.753259,-79.329656
1,M4A,North York,Victoria Village\n,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront\n,43.654260,-79.360636
3,M5A,Downtown Toronto,Regent Park\n,43.654260,-79.360636
4,M6A,North York,Lawrence Heights\n,43.718518,-79.464763
5,M6A,North York,Lawrence Manor\n,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue\n,43.667856,-79.532242
8,M1B,Scarborough,Rouge\n,43.806686,-79.194353
9,M1B,Scarborough,Malvern\n,43.806686,-79.194353
