# Segmenting and Clustering Neighborhoods in Toronto
---------
Applied Data Science Capstone > 
Week 3

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests


### Download wikipedia page and parse columns

In [7]:
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(page, 'html.parser')
post_table = soup.find('table')
#post_table.tr.find_all('th')
column_titles = [i.text.replace('\n','') for i in post_table.tr.find_all('th')]
column_titles

['Postcode', 'Borough', 'Neighbourhood']

### Parse out the rows 

In [8]:
data = []
rows = post_table.find_all('tr') 
for num in range(1, len(rows)):
    temp = rows[num].text.split('\n')
    temp = list(filter(None, temp))  # Filter out the empty elements
    data.append(temp)
    
#rows = soup.find('table').find_all('tr')[1].text.split('\n')
#rows = list(filter(None, rows))
data[:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

### Create a dataframe with the data

In [9]:
df_all = pd.DataFrame(data=data, columns=column_titles)
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove rows with borough that is "Not assigned"

In [10]:
df_all = df_all.loc[df_all['Borough'] != 'Not assigned'] 
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine neighborhoods in the same zipcode

In [11]:
unique_post = df_all['Postcode'].unique() # Get all the unique postcodes
df = pd.DataFrame(columns=column_titles) # create the new dataframe

# Loop through the unique postcodes and insert data
for i, p in enumerate(unique_post):
    hood = df_all['Neighbourhood'].loc[df_all['Postcode'] == p].to_string(index=False).strip().replace('\n',",")
    borough = df_all['Borough'].loc[df_all['Postcode'] == p].head(1).to_string(index=False).strip()
    df.loc[i] = [p, borough, hood]

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Copy the Borough's name to the "Not assigned" Neighborhood's name

In [12]:
for i, row in df.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] =  row['Borough']    

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### The # of rows in the dataframe

In [13]:
df.shape[0]

103

## Geocoder 

In [15]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 13.4MB/s ta 0:00:01
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


I can't get the geocoder to work. Using the CSV

In [29]:
import geocoder
ll_coords = None
pcode = ""
borough = 'Mountain View, CA'

#while(ll_coords is None):
#    g = geocoder.google('{}, {}'.format(pcode, borough))
#    ll_coords = g.latlng

g = geocoder.google('{} {}'.format(pcode, borough))
ll_coords = g.latlng

ll_coords
g.latlng
#print(ll_coords[0],",",ll_coords[1])



In [31]:
df_coords = pd.read_csv("https://cocl.us/Geospatial_data")
df_coords.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [32]:
df_coords = df.merge(df_coords, on=('Postal Code'))
df_coords

KeyError: 'Postal Code'