Let's first obtain the table of postal codes from the Wikipedia page and transform it into a pandas dataframe. We will use the Pandas and BeautifulSoup libraries for web scraping.

In [142]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

from io import StringIO  # local import: lets read_html receive a file-like object

# Download the Wikipedia page listing the Canadian postal codes that start with "M".
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
res = requests.get(WIKI_URL, timeout=30)
res.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page

soup = BeautifulSoup(res.content, 'lxml')
# The postal-code table is taken to be the first <table> element on the page.
table = soup.find_all('table')[0]
# Newer pandas deprecates passing a literal HTML string to read_html, so wrap it in StringIO.
df = pd.read_html(StringIO(str(table)))[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Data wrangling: cleaning up the above dataframe by removing the 'Not assigned' values

In [143]:
# Keep only the rows whose Borough is assigned, and renumber the index from 0.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Where the Neighbourhood is 'Not assigned', fall back to the Borough of the same row.
# A single .loc assignment replaces the per-row chained indexing
# (df['Neighbourhood'][index] = ...), which pandas may apply to a temporary copy.
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df.loc[neighbourhood_missing, 'Neighbourhood'] = df.loc[neighbourhood_missing, 'Borough']

# Show a preview rather than the entire frame.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [144]:
# Combine the Neighbourhoods that share a Postcode into one comma-separated string,
# producing one row per Postcode.
# - sort=False keeps postcodes in order of first appearance (same order as .unique()).
# - 'last' keeps the Borough of the last row seen for a postcode, matching the
#   previous row-by-row loop, which overwrote the value on every match.
# groupby replaces the nested loops, which compared every row against every
# postcode and wrote results through chained indexing.
df_toronto = (
    df.groupby('Postcode', sort=False, as_index=False)
      .agg({'Borough': 'last', 'Neighbourhood': ', '.join})
      [['Postcode', 'Borough', 'Neighbourhood']]
)

# Each postcode must now appear exactly once.
assert df_toronto['Postcode'].is_unique

# Show a preview rather than the entire frame.
df_toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


# Number of rows and columns in the dataframe

In [145]:
# (rows, columns) of the combined frame, which holds one row per unique Postcode.
df_toronto.shape

(103, 3)

# 2nd part of assignment starts from here.

In [146]:
# Load the location coordinates into a dataframe.
# The CSV is read straight from the URL, so this cell needs network access on every run.
df_coordinates = pd.read_csv('http://cocl.us/Geospatial_data')

In [147]:
# Give the key column the same name as in df_toronto so both frames share 'Postcode'.
df_coordinates = df_coordinates.rename(columns={'Postal Code': 'Postcode'})
df_coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [148]:
# Attach Latitude/Longitude to each postcode row by joining on the shared 'Postcode' column.
df_toronto = df_toronto.merge(df_coordinates, on='Postcode')
df_toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## 3rd part of assignment starts from here.
### Segmenting and clustering neighbourhoods in Toronto City

Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [149]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' display limits so that every column and every row is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# NOTE(review): this install is active and runs on every execution of the cell;
# the trailing "uncomment this line" remark on it is stale.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas exposes this as pd.json_normalize — confirm against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# NOTE(review): as above, this install is active despite the "uncomment" remark.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [150]:
# Count distinct boroughs in the combined frame, and distinct neighbourhood
# names in the per-neighbourhood frame `df`.
n_boroughs = len(df_toronto['Borough'].unique())
n_neighbourhoods = len(df['Neighbourhood'].unique())

summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(n_boroughs, n_neighbourhoods)
print(summary)

The dataframe has 11 boroughs and 209 neighborhoods.


We will do our analysis with only boroughs that contain the word Toronto.

In [162]:
# Boroughs whose name contains the word 'Toronto'.
# Membership is tested with `in` rather than `str.find(...) > 0`: find() returns 0
# for a match at the very start of the string, so a name beginning with 'Toronto'
# would have been excluded by the `> 0` comparison.
print([item for item in df_toronto['Borough'].unique() if 'Toronto' in item])

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']


Let's visualize the neighbourhoods in `df_toronto` on a map, with labels.

In [171]:
# Fixed map centre; these are the coordinates of the M5A row in df_toronto.
latitude, longitude = 43.654260, -79.360636

# Base map centred on the coordinates above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Place one circle marker per row, with the neighbourhood text as its popup.
marker_rows = zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'])
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

In [172]:
import os        # local imports: the credentials are read from the environment
import warnings

# Foursquare Credentials and Version
# Secrets must not be stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET in the environment before starting the kernel.
# The literal values that used to be written here were exposed in the file and
# should be regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20191002' # Foursquare API version

# Warn instead of raising so the rest of the notebook still runs without credentials.
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'Foursquare credentials are not set; define FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET in the environment before calling the API.'
    )

# K-means clustering of the neighbourhoods

In [179]:
# Number of clusters to fit.
kclusters = 5

# Only the coordinate columns are used as clustering features.
toronto_grouped_clustering = df_toronto[['Latitude', 'Longitude']]

# Build the estimator with a fixed seed, then fit it on the coordinates.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_grouped_clustering)

# Store each row's cluster label and list the rows ordered by cluster.
df_toronto['label'] = kmeans.labels_
df_toronto.sort_values(by='label')

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,label
51,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,0
32,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,0
26,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0
22,M1G,Scarborough,Woburn,43.770992,-79.216917,0
65,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.75741,-79.273304,0
18,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0
71,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849,0
78,M1S,Scarborough,Agincourt,43.7942,-79.262029,0
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0
82,M1T,Scarborough,"Clarks Corners, Sullivan, Tam O'Shanter",43.781638,-79.304302,0


Observation: This clustering divides the city of Toronto into 5 clusters, which could be useful for administrative purposes. Note, however, that the clustering is based purely on location coordinates.