# Toronto neighborhoods clustering

## Creating the dataframe

In [1]:
! pip install lxml



In [1]:
import json
import requests
from lxml import html
import pandas as pd

In [2]:
pd.set_option('display.max_rows', 200)

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url)
tree = html.fromstring(page.content)

Let's take a look at the source code: the table is indicated with the class _wikitable sortable_ .

In [4]:
pos = tree.find_class("wikitable sortable")[0]
headers = [col.text.strip('\n') for col in pos[0][0]]
data = pos.xpath('//tbody//tr//td/text()')
l = len(headers)

pos_list = []

for n in range(0,len(data)-1, l):
    try:
        pos_list.append([it.strip('\n').replace(' /',',') for it in data[n:n+l]])
    except:
        print('error in line: {}\n could not add to list'.format((n-n%l)/l))


In [5]:
df_poscode = pd.DataFrame(pos_list, columns=headers)
df_poscode.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Data cleaning

In [6]:
df_poscode = df_poscode[(df_poscode['Postal code'] != '') & (df_poscode.Borough != 'Not assigned')]
df_poscode.reset_index(drop=True, inplace=True)

In [7]:
df_poscode.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
print('N rows in the dataframe: {}'.format(df_poscode.shape[0]))

N rows in the dataframe: 103


## Getting geographical position of neighborhoods

In [9]:
!pip install pgeocode



You should consider upgrading via the 'c:\users\lucagranalli\appdata\local\programs\python\python36\python.exe -m pip install --upgrade pip' command.


In [10]:
import pgeocode
from geopy.geocoders import Nominatim
import folium

In [11]:
import pgeocode
nomi = pgeocode.Nominatim('ca')

def get_geocoding(zip_code, retry=3):
    
    for i in range(retry):
        geo_series = nomi.query_postal_code(zip_code)
        if not geo_series.empty:
            break
            
    geo_series = nomi.query_postal_code(zip_code)
    lat, long = geo_series[['latitude', 'longitude']].values
    
    return lat, long

In [12]:
df_poscode['Latitude'], df_poscode['Longitude'] = zip(*df_poscode['Postal code'].apply(get_geocoding))
df_poscode.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


Check for postal codes with no latitude, longitude

In [13]:
cond = (df_poscode.Latitude.isna()) | (df_poscode.Longitude.isna())
df_poscode.loc[cond, ['Latitude', 'Longitude']]

Unnamed: 0,Latitude,Longitude
76,,


Fillig information with data from http://cocl.us/Geospatial_data

In [14]:
!curl -L http://cocl.us/Geospatial_data > toronto_geo.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
100   524    0   524    0     0    174      0 --:--:--  0:00:03 --:--:--   185

  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0

  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0

  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
100     4    0     4    0     0      0      0 -

In [15]:
df_togeo = pd.read_csv('toronto_geo.csv')
df_togeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
codes = df_poscode.loc[cond, 'Postal code'].values
ll = df_togeo.loc[df_togeo['Postal Code'].isin(codes), ['Latitude', 'Longitude']].values

df_poscode.loc[cond, ['Latitude', 'Longitude']] = ll

Final check to see if the previously missing data have been filled in:

In [17]:
df_poscode[df_poscode['Postal code'].isin(codes)]

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819


### View of the geo-referenced dataframe

The final version of the dataframe looks as follows:

In [64]:
df_poscode.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


Now that the dataset is ready, it is possible to create a first map of neighborhoods within Toronto that is centered on the city center itself

In [18]:
address='Toronto, Ontario'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
to_lat, to_long = location.latitude, location.longitude

map_toronto = folium.Map(location=[to_lat, to_long], zoom_start=11)

In [19]:
# add markers to map
for lat, lng, label in zip(df_poscode['Latitude'], df_poscode['Longitude'], df_poscode['Neighborhood']):
    try:
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    except:
        pass

In [20]:
map_toronto