<h1 align=center><font size = 5>Segmenting and Clustering Neighborhoods in Toronto</font></h1>

## Introduction to this part of the assignment - Geospatial data

In this part of the assignment, after building a dataframe that holds the postal code, borough name, and neighborhood name of each neighborhood, each address (postal code) is converted into its equivalent latitude and longitude values.

## Libraries

In [1]:
import pandas as pd #library for data analysis
import numpy as np #library to handle data in a vectorized manner

#!pip install geopy #instal geopy
from geopy.geocoders import Nominatim #library to convert an address into latitude and longitude values

#!pip install geocoder #instal geocoder
import geocoder #library to handle geospatial data

#!pip install pgeocode
import pgeocode

import requests #library to handle requests
import json #library to handle JSON files
from pandas.io.json import json_normalize #library to transform a JSON file into a pandas dataframe

#Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!pip install folium #instal folium
import folium #map rendering library

import urllib.request #library to use to open URLs

from bs4 import BeautifulSoup #library to import the BeautifulSoup library so we can parse HTML and XML documents

print('Libraries imported.')


Libraries imported.


In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start
# with "M" and parse every HTML table on it into a DataFrame.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the response as a context manager so the HTTP connection is closed
# as soon as the tables have been parsed (it was previously left open).
with urllib.request.urlopen(url) as data_url:
    data_toronto = pd.read_html(data_url)

# The postal-code table is taken to be the first table found on the page.
neighborhood = data_toronto[0]

In [3]:
# Drop the rows whose borough is "Not assigned"; they are not needed for the analysis.
# Filtering by column keeps this cell re-runnable: the earlier version moved
# 'Borough' into the index in place, so a second run failed on `neighborhood.Borough`.
neighborhood = neighborhood[neighborhood['Borough'] != 'Not assigned']

# Group the neighbourhoods that share a postal code and borough, joining their
# names with commas. Grouping on the columns directly yields the same
# ('Postal Code', 'Borough') index as before without mutating `neighborhood`;
# sort=False keeps the groups in their original order of appearance.
merge_neighborhood = (
    neighborhood
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(','.join)
)

In [4]:
# Turn the group keys held in the index back into ordinary columns
# (drop=False keeps them in the frame instead of discarding them).
new_neighborhood = merge_neighborhood.reset_index(drop=False)

In [5]:
# Give a neighbourhood its borough's name when the neighbourhood is "Not assigned".
#
# The previous loop passed the entire Borough column as the replacement value
# (instead of the borough of the matching row) and re-scanned the column for
# every element. A vectorised mask substitutes the borough row by row.
borough_name = new_neighborhood['Borough']
new_name = new_neighborhood['Neighbourhood']

# Where the condition is True the value is taken from `borough_name` at the
# same index label; all other neighbourhood names are left unchanged.
new_neighborhood['Neighbourhood'] = new_name.mask(
    new_name == 'Not assigned', borough_name
)

In [6]:
# Expose the cleaned table under its final name and display it.
neighborhood_all = pd.DataFrame(data=new_neighborhood)
neighborhood_all

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# Preview the first five rows of the cleaned table.
neighborhood_all.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Report the (rows, columns) size of the cleaned table.
table_shape = neighborhood_all.shape
print('Shape is:', table_shape)

Shape is: (103, 3)


## Latitude and Longitude

In [9]:
# Look up the latitude and longitude of every postal code with pgeocode.
geolocator = pgeocode.Nominatim('ca')
postal_codes = neighborhood_all['Postal Code'].tolist()

# Query all codes in a single call. Passing a list is documented to return a
# DataFrame with one row per requested code, in the same order (see the
# pgeocode reference), so the coordinate lists stay aligned with
# `postal_codes`. Codes without known coordinates come back as NaN rather
# than being skipped; the earlier per-code `if not g.empty` check never
# filtered them either, and skipping would have shifted later entries.
geo_lookup = geolocator.query_postal_code(postal_codes)
latitudes = geo_lookup['latitude'].tolist()
longitudes = geo_lookup['longitude'].tolist()

# Guard the alignment that any later positional use of these lists relies on.
assert len(latitudes) == len(longitudes) == len(postal_codes), \
    "coordinate lists are not aligned with postal_codes"

In [10]:
# Show every latitude returned by the lookup (NaN marks a code with no coordinates).
print(str(latitudes))

[43.7545, 43.7276, 43.6555, 43.7223, 43.6641, 43.6662, 43.8113, 43.745, 43.7063, 43.6572, 43.7081, 43.6505, 43.7878, 43.7334, 43.6913, 43.6513, 43.6915, 43.6437, 43.7678, 43.6784, 43.6456, 43.6889, 43.7712, 43.7124, 43.6564, 43.6683, 43.7686, 43.8015, 43.7535, 43.7059, 43.6496, 43.6655, 43.7464, 43.7801, 43.7694, 43.6872, 43.62300000000001, 43.648, 43.7298, 43.7797, 43.739, 43.6803, 43.6469, 43.6383, 43.7122, 43.7547, 43.7334, 43.6693, 43.6492, 43.7137, 43.7598, 43.7247, 43.7915, 43.7319, 43.6561, 43.7335, 43.6934, 43.7366, 43.6952, 43.7673, 43.7568, 43.7301, 43.7113, 43.6748, 43.7068, 43.7612, 43.75, 43.7135, 43.6966, 43.6605, 43.6949, 43.7507, 43.7786, 43.7143, 43.6736, 43.6469, nan, 43.6898, 43.7946, 43.702, 43.6629, 43.6512, 43.7812, 43.6899, 43.6541, 43.8177, 43.6861, 43.6404, 43.6075, 43.7432, 43.8016, 43.6827, 43.6437, 43.6021, 43.7144, 43.834, 43.6684, 43.6492, 43.6518, 43.6656, 43.7804, 43.6325, 43.6256]


In [11]:
# Show every longitude returned by the lookup (NaN marks a code with no coordinates).
print(str(longitudes))

[-79.33, -79.3148, -79.3626, -79.4504, -79.3889, -79.5282, -79.193, -79.359, -79.3094, -79.3783, -79.4479, -79.5517, -79.1564, -79.3329, -79.3116, -79.3756, -79.4307, -79.5767, -79.1866, -79.2941, -79.3754, -79.4507, -79.2144, -79.3644, -79.38600000000002, -79.4205, -79.2389, -79.3577, -79.4472, -79.3464, -79.3833, -79.4378, -79.2323, -79.3479, -79.4921, -79.3368, -79.3936, -79.4177, -79.2639, -79.3813, -79.4692, -79.3538, -79.3823, -79.4301, -79.2843, -79.3764, -79.5116, -79.3155, -79.3823, -79.4869, -79.5565, -79.2312, -79.4103, -79.4928, -79.3406, -79.4177, -79.4857, -79.5401, -79.2646, -79.4111, -79.521, -79.3935, -79.4195, -79.4839, -79.517, -79.2707, -79.3978, -79.3887, -79.41199999999998, -79.4633, -79.5323, -79.3003, -79.445, -79.4065, -79.4035, -79.4521, nan, -79.5582, -79.2644, -79.3853, -79.3987, -79.4828, -79.3036, -79.3853, -79.3978, -79.2819, -79.4025, -79.3995, -79.5013, -79.5876, -79.3216, -79.373, -79.3787, -79.5402, -79.5909, -79.2069, -79.3689, -79.3823, -79.5076, -7

In [12]:
# Show the postal codes in the order they were passed to the lookup.
print(str(postal_codes))

['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B', 'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C', 'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R', 'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S', 'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V', 'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X', 'M4Y', 'M7Y', 'M8Y', 'M8Z']


In [13]:
# Load the hosted CSV of postal-code coordinates and display it.
geo_csv_url = "https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv"
data_geo = pd.read_csv(geo_csv_url)
data_geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [14]:
# Merge neighborhood_all with data_geo on their shared 'Postal Code' column
# (pandas' default inner join), then display the combined table.
merge_columns = neighborhood_all.merge(data_geo, on='Postal Code')
toronto_lat_long = pd.DataFrame(data=merge_columns)
toronto_lat_long

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [15]:
# Preview the first five rows of the merged table.
toronto_lat_long.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [16]:
# Report the (rows, columns) size of the merged table.
merged_shape = toronto_lat_long.shape
print('Shape is:', merged_shape)

Shape is: (103, 5)
