# Segmenting and Clustering Neighborhoods in Toronto

In [5]:
import numpy as np, pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

## Part 1: Parsing html data and generating dataframe

In [6]:
d1 = pd.read_html(url)

In [7]:
html_table =  d1[0]
html_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Removing areas which do not have Borough assigned to it

In [8]:
html_table = html_table[html_table['Borough']!='Not assigned']
html_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Replacing 'Not assigned' Neighborhoods with Borough names

In [9]:
for ind in np.where(html_table['Neighbourhood']=='Not assigned'):
    html_table.iloc[ind,2] = html_table.iloc[ind,1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Combining Neighborhoods by Postcode

In [10]:
res = []
for postcode in list(set(html_table['Postcode'])):
    postcode_ind = np.where(html_table['Postcode']==postcode)[0]
    if(len(postcode_ind)==1):
        res.append(list(html_table.iloc[postcode_ind,].values[0]))
    else:
        res.append(list(html_table.iloc[postcode_ind[0],:2])+[', '.join(html_table.iloc[postcode_ind,2])])

In [11]:
df = pd.DataFrame(res,columns=html_table.columns,dtype='str')
print('Shape of Resulting Postal Code dataframe:',df.shape)
df.head()

Shape of Resulting Postal Code dataframe: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9W,Etobicoke,Northwest
1,M5S,Downtown Toronto,"Harbord, University of Toronto"
2,M3J,North York,"Northwood Park, York University"
3,M2H,North York,Hillcrest Village
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."


## Part 2: Fetching lat,long values for each Postcode


In [12]:
#!conda install -c conda-forge geopy
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="canada_explorer")

In [25]:
df['Latitude'] = 0
df['Longitude'] = 0
# loop until you get the coordinates
for i,postcode in enumerate(df['Postcode']):
    #print(i,postcode)
    lat_lng_coords = None
    repeater = 0
    while(lat_lng_coords is None and repeater <10):
        lat_lng_coords = geolocator.geocode('{}, Toronto, Ontario'.format(postcode))
        repeater+=1
    if(repeater!=10 and lat_lng_coords is None):
        df.iloc[i,3] = lat_lng_coords[1][0]
        df.iloc[i,4] = lat_lng_coords[1][1]

GeocoderServiceError: [Errno 99] Cannot assign requested address

In [107]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9W,Etobicoke,Northwest,43.706748,-79.594054
1,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
2,M3J,North York,"Northwood Park, York University",43.76798,-79.487262
3,M2H,North York,Hillcrest Village,43.803762,-79.363452
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",43.643515,-79.577201
