# Segmenting and Clustering Neighborhoods in Toronto
## Peer-graded Assignment: Explore and cluster the neighborhoods in Toronto
### *by Leopoldo Sprandel*

### Importing data from Wikipedia

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limit, so DataFrames are shown without column/row truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle HTTP requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  23.86 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  38.58 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  54.63 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  34.04 MB/s
vincent-0.4.4- 100% |###################

In [2]:
# Download the Wikipedia article that lists the postal codes starting with "M".
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(wikipedia_link, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than handing an error page to the parser below.
response.raise_for_status()
page = response.text

In [3]:
# Parse the postal-code table directly from the downloaded page.
# The `attrs` filter selects the table whose class is "wikitable sortable", so there is no
# need to cut the HTML by hand with str.find (which returns -1 when the text is missing and
# would then silently produce a meaningless slice). If no matching table exists,
# read_html raises instead of returning wrong data.
raw_list = pd.read_html(page, header=0, index_col=None, attrs={"class": "wikitable sortable"})[0]
raw_list.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Part 1
## Wrangling data
(done!) Ignore cells with a borough that is Not assigned  
(done!) Two rows with the same PostalCode will be combined into one row with the neighborhoods separated with a comma  
(done!) If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough  
(done!) In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe

In [4]:
# Rename the column Postcode to PostalCode.
# rename() without inplace returns a new DataFrame, so `raw_list` keeps the table exactly
# as it was parsed and this cell can be re-run safely.
postals = raw_list.rename(columns={'Postcode': 'PostalCode'})
# Check the columns
postals.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Ignore cells with a borough that is Not assigned
postals.replace("Not assigned", np.nan, inplace = True)
postals.dropna(subset=["Borough"], axis=0, inplace = True)
postals.reset_index(drop = True, inplace = True)
postals.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [6]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
postals['Neighbourhood'].replace(np.nan, postals['Borough'], inplace= True)
postals.head(10)   

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [7]:
# Two rows with the same PostalCode will be combined into one row with the neighborhoods separated with a comma
postals=postals.groupby(['PostalCode','Borough'], as_index=False, sort=False).aggregate(lambda x: ', '.join(x))
postals.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [8]:
# In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe
postals.shape

(103, 3)

### Part 2
## Get the latitude and the longitude coordinates of each neighborhood

In [9]:
# Load the latitude/longitude lookup table straight from the CSV URL.
geocode = pd.read_csv('http://cocl.us/Geospatial_data')
geocode.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach Latitude/Longitude to every postal code.
# The lookup table calls its key "Postal Code"; renaming it to "PostalCode" first lets the
# inner join use a single shared key column, so nothing has to be dropped afterwards.
coordinates = geocode.rename(columns={'Postal Code': 'PostalCode'})
postals = postals.merge(coordinates, on='PostalCode', how='inner')

In [11]:
# Preview the first rows after the merge with the coordinates table
postals.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [12]:
# (rows, columns) after the merge
postals.shape

(103, 5)

### Part 3
## Explore and cluster the neighborhoods in Toronto

In [171]:
# Keep only the rows whose Borough name contains the word "Toronto".
dftoronto = postals.loc[postals['Borough'].str.contains('Toronto')]

In [172]:
# Preview the Toronto-only subset (the original row index is retained)
dftoronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [173]:
# (rows, columns) of the Toronto-only subset
dftoronto.shape

(38, 5)

In [176]:
# Look up the coordinates used to centre the map.
address = 'Downtown Toronto, CA'

# Nominatim's usage policy requires an application-specific user agent. Without one, geopy
# emits a deprecation warning in the 1.x series and refuses to build the geocoder in 2.x.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; report that clearly instead
# of failing with an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.655115, -79.380219.


In [183]:
# create map of Toronto using latitude and longitude values
# NOTE(review): the name `map_newyork` is misleading for a Toronto map; it is kept unchanged
# so that any later cell that refers to it keeps working.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map: one circle per row of dftoronto, labelled "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(
        dftoronto['Latitude'],
        dftoronto['Longitude'],
        dftoronto['Borough'],
        dftoronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text, so apostrophes in names
    # (e.g. "Queen's Park") cannot break the generated popup markup
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of Popup only, so it is not passed to CircleMarker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_newyork)


map_newyork