# Clustering Neighbourhoods in Toronto

In [66]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [67]:
import requests
import urllib
from bs4 import BeautifulSoup

### Q2. Use the Notebook to build the code to scrape the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [68]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [69]:
# Download the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() fails fast on an HTTP error response instead
# of letting an error page flow into the parsing cells below.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [70]:
# Parse the downloaded page with BeautifulSoup, using Python's built-in HTML parser
html_content = r.text
soup = BeautifulSoup(markup=html_content, features="html.parser")

In [71]:
#print(soup.prettify())

In [72]:
# Locate the postal-code table and collect the stripped text of each row's <td> cells.
table = soup.find("table", {"class": "wikitable sortable"})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the row loop.
    raise ValueError("No table with class 'wikitable sortable' found on the page")

# One list per <tr>. Blank cell texts are dropped, so a row without any <td>
# cells (e.g. a header row) yields an empty list; it is deliberately kept here
# because a later cell removes that first empty row from the dataframe.
data = [
    [text for text in (cell.text.strip() for cell in row.find_all("td")) if text]
    for row in table.find_all("tr")
]

In [73]:
# Peek at the first five entries of the scraped list to sanity-check the parse
data[:5]

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

##### The list 'data' is converted to a dataframe for manipulation

In [74]:
# Column labels for the three fields scraped from each table row
column_names = ['Postal Code', 'Borough', 'Neighborhood']

# Build the dataframe from the scraped rows
neighborhoods = pd.DataFrame(data=data, columns=column_names)

In [75]:
# Display the first five rows of `neighborhoods`
neighborhoods.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [76]:
# Row 0 holds no values, so remove it and renumber the index from zero
neighborhoods = neighborhoods.drop(index=0).reset_index(drop=True)

In [77]:
# Show the (rows, columns) dimensions of `neighborhoods`
neighborhoods.shape


(288, 3)

In [78]:
# Distinct postal codes in the table. Note: .unique() returns the values
# themselves, not how many there are (use .nunique() for the count).
neighborhoods['Postal Code'].unique()

array(['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A',
       'M1B', 'M2B', 'M3B', 'M4B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B',
       'M1C', 'M2C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C',
       'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E',
       'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G',
       'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M7H', 'M8H', 'M9H',
       'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M7J', 'M8J', 'M9J',
       'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M7K', 'M8K', 'M9K',
       'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M7L', 'M8L', 'M9L',
       'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M7M', 'M8M', 'M9M',
       'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M7N', 'M8N', 'M9N',
       'M1P', 'M2P', 'M3P', 'M4P', 'M5P', 'M6P', 'M7P', 'M8P', 'M9P',
       'M1R', 'M2R', 'M3R', 'M4R', 'M5R', 'M6R', 'M7R', 'M8R', 'M9R',
       'M1S', 'M2S', 'M3S', 'M4S', 'M5S', 'M6S', 'M7S', 'M8S', 'M9S',
       'M1T', 'M2T',

### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.



In [79]:

#neighborhoods.drop(neighborhoods.loc[neighborhoods['Borough']== 'Not assigned'].index, inplace=True)

In [80]:
# Preview the rows whose neighbourhood is still the placeholder 'Not assigned'
neighborhoods[neighborhoods['Neighborhood'].eq('Not assigned')].head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned
13,M2B,Not assigned,Not assigned


In [81]:

# Keep only the rows whose borough is assigned. The explicit .copy() makes the
# result an independent frame, so assigning to its 'Neighborhood' column in a
# later cell does not trigger pandas' SettingWithCopyWarning.
neighborhoods = neighborhoods.loc[neighborhoods['Borough'] != 'Not assigned'].copy()

In [82]:
#neighborhood=neighborhoods.loc[(neighborhoods['Neighborhood'] == 'Not assigned')]

In [83]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name;
# every other neighbourhood value is left untouched
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].mask(
    neighborhoods['Neighborhood'] == 'Not assigned',
    neighborhoods['Borough'],
)

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the 
### Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [84]:
# Display the first ten rows to check the borough filter and neighbourhood fill
neighborhoods.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Combining neighbourhoods with the same postal code into one row. The different neighbourhoods are separated by commas and included in the neighbourhood column.

In [85]:
# Give every row the comma-joined list of all neighbourhoods that share its
# (postal code, borough) pair; the now-identical rows are collapsed in a later cell
neighborhoods['Neighborhood'] = (
    neighborhoods
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .transform(','.join)
)

In [86]:
# Collapse rows that are exact duplicates, keeping the first occurrence
neighborhoods = neighborhoods.drop_duplicates()

In [87]:
# Renumber the index from zero and discard the old index values
neighborhoods = neighborhoods.reset_index(drop=True)

In [89]:
# Display the first ten rows of `neighborhoods` after the preceding cells
neighborhoods.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


#### In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [90]:
# Show the (rows, columns) dimensions of the cleaned `neighborhoods` frame
neighborhoods.shape

(103, 3)

### Fetching latitudes and longitudes  for each postal code from the CSV file provided.

In [92]:
# Read the geospatial CSV straight from its URL into a dataframe
geo_csv_url = 'https://cocl.us/Geospatial_data'
codes = pd.read_csv(geo_csv_url)

In [93]:
# Display the first five rows of the coordinates table
codes.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the neighbourhoods with their latitude and longitude.

In [95]:
# Attach coordinates to each row by matching on 'Postal Code'
# (inner join: only postal codes present in both frames are kept)
n_merge_latlong = neighborhoods.merge(codes, how='inner', on='Postal Code')

In [96]:
# Display the first ten rows of the merged frame
n_merge_latlong.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
