In [1]:
!conda install -c conda-forge beautifulsoup4 --yes

!conda install -c conda-forge geopy --yes

!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    soupsieve-1.9.3            |           py36_0          57 KB  conda-forge
    beautifulsoup4-4.8.0       |           py36_0         144 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         201 KB

The following NEW packages will be INSTALLED:

    soupsieve:      1.9.3-py36_0 conda-forge

The following packages will be UPDATED:

    beautifulsoup4: 4.6.3-py37_0             --> 4.8.0-py36_0 conda-forge


Downloading and Extracting Packages
soupsieve-1.9.3      | 57 KB     |

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from bs4 import BeautifulSoup # library to parse HTML and XML documents

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

Data will be downloaded from the Wikipedia page---- https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [3]:
data_canada = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
data_canada = BeautifulSoup(data_canada, 'html.parser')
data_canada.head()

[<meta charset="utf-8"/>,
 <title>List of postal codes of Canada: M - Wikipedia</title>,
 <script>document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":906439794,"wgRevisionId":906439794,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["

In [4]:
zipcodes = []
for row in data_canada.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        zipcodes.append(cells[0].text)

In [5]:
boroughs = []
for row in data_canada.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        boroughs.append(cells[1].text)

In [6]:
neighborhoods = []
for row in data_canada.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        neighborhoods.append(cells[2].text.rstrip('\n'))

Now we will transform the data into a pandas dataframe. The dataframe will comprise of three columns: PostalCode, Borough, and Neighborhood

In [7]:
toronto_df = pd.DataFrame({"PostalCode": zipcodes,
                           "Borough": boroughs,
                           "Neighborhood": neighborhoods})

toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


We go ahead and process only the cells that have an assigned borough. We shall ignore cells with a borough that is not assigned.

In [8]:
toronto_df_dropna = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df_dropna.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [9]:
toronto_df_grouped = toronto_df_dropna.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [10]:
for index, row in toronto_df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
toronto_df_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [11]:
toronto_df_grouped.shape

(103, 3)

Import Latitude and Longitude for the neighborhoods

In [12]:
!conda install -c conda-forge geocoder --yes

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    click-7.0                  |             py_0          61 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    future-0.17.1              |        py36_1000         701 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         821 KB

The following NEW packages will be INSTALLED:

    future:   0.17.1-py36_1000 conda-forge
    geocoder: 1.38.1-py_1      conda-for

In [13]:
import geocoder # import geocoder

In [14]:
# initialize your variable to None
lat_lng_coords = None

In [15]:
geolocator = Nominatim(user_agent="Segmenting and Clustering Toronto")
location = geolocator.geocode('Toronto, Ontario')
latitude = location.latitude
longitude = location.longitude
print(latitude)
print(longitude)

43.653963
-79.387207


Since this package is unreliable, loading the csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [16]:
coordinates_data_canada = requests.get('http://cocl.us/Geospatial_data').text
#coordinates_data_canada = BeautifulSoup(coordinates_data_canada, 'html.parser')
coordinates_data_canada

'Postal Code,Latitude,Longitude\r\nM1B,43.8066863,-79.1943534\r\nM1C,43.7845351,-79.1604971\r\nM1E,43.7635726,-79.1887115\r\nM1G,43.7709921,-79.2169174\r\nM1H,43.773136,-79.2394761\r\nM1J,43.7447342,-79.2394761\r\nM1K,43.7279292,-79.2620294\r\nM1L,43.7111117,-79.2845772\r\nM1M,43.716316,-79.2394761\r\nM1N,43.692657,-79.2648481\r\nM1P,43.7574096,-79.273304\r\nM1R,43.7500715,-79.2958491\r\nM1S,43.7942003,-79.2620294\r\nM1T,43.7816375,-79.3043021\r\nM1V,43.8152522,-79.2845772\r\nM1W,43.7995252,-79.3183887\r\nM1X,43.8361247,-79.2056361\r\nM2H,43.8037622,-79.3634517\r\nM2J,43.7785175,-79.3465557\r\nM2K,43.7869473,-79.385975\r\nM2L,43.7574902,-79.3747141\r\nM2M,43.789053,-79.4084928\r\nM2N,43.7701199,-79.4084928\r\nM2P,43.7527583,-79.4000493\r\nM2R,43.7827364,-79.4422593\r\nM3A,43.7532586,-79.3296565\r\nM3B,43.7459058,-79.352188\r\nM3C,43.7258997,-79.340923\r\nM3H,43.7543283,-79.4422593\r\nM3J,43.7679803,-79.4872619\r\nM3K,43.7374732,-79.4647633\r\nM3L,43.7390146,-79.5069436\r\nM3M,43.728496

Now we will transform the data into Pandas dataframe

In [17]:
coordinates_data_canada_df=pd.read_csv('http://cocl.us/Geospatial_data', header=0)
coordinates_data_canada_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [18]:
coordinates_data_canada_df.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
coordinates_data_canada_df.head(10)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Now appending the Latitude and Longitudes to the Borough and Neighborhood Dataframes using the Postal Codes

In [19]:
Neighborhood_Coordinates_Df = toronto_df_grouped.merge(coordinates_data_canada_df, on="PostalCode", how="left")
Neighborhood_Coordinates_Df.head(200)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Now we will be exploring and clustering the neighborhoods in Toronto

In [20]:
Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(Neighborhood_Coordinates_Df['Latitude'], Neighborhood_Coordinates_Df['Longitude'], Neighborhood_Coordinates_Df['Borough'], Neighborhood_Coordinates_Df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=1.0).add_to(Toronto_map)  
    
Toronto_map

We will filter the boroughs that has the Neighborhood with the word Village in them

In [21]:
neighborhood_names = list(toronto_df_dropna.Neighborhood.unique())

neighborhood_names_village = []

for x in neighborhood_names:
    if "village" in x.lower():
        neighborhood_names_village.append(x)
        
neighborhood_names_village

['Victoria Village',
 'Hillcrest Village',
 'Dovercourt Village',
 'Scarborough Village',
 'Bayview Village',
 'Parkdale Village',
 'Scarborough Village West',
 'Kingsview Village']

We will create a new dataframe of the respective boroughs and neighborhoods

In [22]:
toronto_df_dropna=toronto_df_dropna[toronto_df_dropna['Neighborhood'].isin(neighborhood_names_village)].reset_index(drop=True)
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4A,North York,Victoria Village
1,M2H,North York,Hillcrest Village
2,M6H,West Toronto,Dovercourt Village
3,M1J,Scarborough,Scarborough Village
4,M2K,North York,Bayview Village


In [23]:
Neighborhood_Coordinates_Df = toronto_df_dropna.merge(coordinates_data_canada_df, on="PostalCode", how="left")
Neighborhood_Coordinates_Df.head(6)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4A,North York,Victoria Village,43.725882,-79.315572
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M6H,West Toronto,Dovercourt Village,43.669005,-79.442259
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M2K,North York,Bayview Village,43.786947,-79.385975
5,M6K,West Toronto,Parkdale Village,43.636847,-79.428191


Foursquare Credentials

In [24]:
CLIENT_ID = '3EI0RVQAPVSNF5JB1NJAJQZNRMV3FCU2QMI5BY4NOCXJ4PFJ' # your Foursquare ID
CLIENT_SECRET = 'BBDAFESWPHRAHWURYKOHXAJ2IWVT4OGC2GMDLSUUPQGYVVMY' # your Foursquare Secret
VERSION = '20180604'
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: 3EI0RVQAPVSNF5JB1NJAJQZNRMV3FCU2QMI5BY4NOCXJ4PFJ
CLIENT_SECRET:BBDAFESWPHRAHWURYKOHXAJ2IWVT4OGC2GMDLSUUPQGYVVMY


In [25]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(Neighborhood_Coordinates_Df['Latitude'], Neighborhood_Coordinates_Df['Longitude'], Neighborhood_Coordinates_Df['PostalCode'], Neighborhood_Coordinates_Df['Borough'], Neighborhood_Coordinates_Df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [26]:
venues_df = pd.DataFrame(venues)
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
1,M4A,North York,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
2,M4A,North York,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
3,M4A,North York,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
4,M4A,North York,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
5,M4A,North York,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
7,M2H,North York,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
8,M2H,North York,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
9,M2H,North York,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run


In [27]:
venues_df.groupby(["PostalCode", "Borough", "Neighborhood"])
venues_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
1,M4A,North York,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
2,M4A,North York,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
3,M4A,North York,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
4,M4A,North York,Victoria Village,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
5,M4A,North York,Victoria Village,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
7,M2H,North York,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
8,M2H,North York,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
9,M2H,North York,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run


Top Toronto Venues across Neighborhoods containing the word Village

In [28]:
toronto = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")
toronto['PostalCode'] = venues_df['PostalCode'] 
toronto['Borough'] = venues_df['Borough'] 
toronto['Neighborhoods'] = venues_df['Neighborhood'] 

toronto_columns = list(toronto.columns[-3:]) + list(toronto.columns[:-3])
toronto = toronto[toronto_columns]

toronto_grouped = toronto.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

toronto_grouped

Unnamed: 0,PostalCode,Borough,Neighborhoods,American Restaurant,Art Gallery,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Brewery,Burrito Place,Bus Line,Café,Caribbean Restaurant,Chinese Restaurant,Climbing Gym,Coffee Shop,Convenience Store,Dog Run,French Restaurant,Furniture / Home Store,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Hockey Arena,Intersection,Italian Restaurant,Japanese Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Motel,Music Venue,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Playground,Pool,Portuguese Restaurant,Restaurant,Stadium,Supermarket
0,M1J,Scarborough,Scarborough Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
1,M1M,Scarborough,Scarborough Village West,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M2H,North York,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
3,M2K,North York,Bayview Village,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4A,North York,Victoria Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0
5,M6H,West Toronto,Dovercourt Village,0.0,0.058824,0.058824,0.117647,0.058824,0.058824,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.058824,0.0,0.0,0.0,0.0,0.0,0.117647
6,M6K,West Toronto,Parkdale Village,0.0,0.0,0.0,0.047619,0.0,0.047619,0.095238,0.0,0.047619,0.0,0.095238,0.047619,0.0,0.047619,0.095238,0.047619,0.0,0.0,0.047619,0.0,0.047619,0.047619,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0
7,M9R,Etobicoke,Kingsview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0


Top Venues across Categories

In [30]:
top_venues = 10

areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(top_venues):
    try:
        freqColumns.append('Most Common Venue {}'.format(ind+1))
    except:
        freqColumns.append('Most Common Venue {}'.format(ind+1))
columns = areaColumns+freqColumns


neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_grouped['Neighborhoods']

for ind in np.arange(toronto_grouped.shape[0]):
    row_categories = toronto_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:top_venues]

neighborhoods_venues_sorted

Unnamed: 0,PostalCode,Borough,Neighborhoods,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10
0,M1J,Scarborough,Scarborough Village,Playground,Grocery Store,Supermarket,Café,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym
1,M1M,Scarborough,Scarborough Village West,American Restaurant,Motel,Grocery Store,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym,Chinese Restaurant
2,M2H,North York,Hillcrest Village,Dog Run,Pool,Mediterranean Restaurant,Golf Course,Supermarket,Café,French Restaurant,Convenience Store,Coffee Shop,Climbing Gym
3,M2K,North York,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Caribbean Restaurant,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop
4,M4A,North York,Victoria Village,French Restaurant,Portuguese Restaurant,Hockey Arena,Pizza Place,Intersection,Coffee Shop,Café,Dog Run,Convenience Store,Climbing Gym
5,M6H,West Toronto,Dovercourt Village,Supermarket,Bakery,Pharmacy,Park,Gym / Fitness Center,Café,Middle Eastern Restaurant,Music Venue,Brewery,Bar
6,M6K,West Toronto,Parkdale Village,Café,Coffee Shop,Breakfast Spot,Gym,Convenience Store,Grocery Store,Stadium,Climbing Gym,Intersection,Italian Restaurant
7,M9R,Etobicoke,Kingsview Village,Mobile Phone Shop,Pizza Place,Park,Bus Line,Supermarket,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym


Clustering and Segmenting the Neighborhoods and Venues

In [31]:
k = 4

toronto_grouped_clustered = toronto_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped_clustered)

kmeans.labels_[0:10]

array([2, 0, 3, 1, 1, 1, 1, 1], dtype=int32)

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [32]:
toronto_merged = Neighborhood_Coordinates_Df.copy()

toronto_merged["Cluster Labels"] = kmeans.labels_

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")

toronto_merged.head(10) 
toronto_merged.sort_values(["Cluster Labels"], inplace=True)
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10
1,M2H,North York,Hillcrest Village,43.803762,-79.363452,0,Dog Run,Pool,Mediterranean Restaurant,Golf Course,Supermarket,Café,French Restaurant,Convenience Store,Coffee Shop,Climbing Gym
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1,Playground,Grocery Store,Supermarket,Café,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym
4,M2K,North York,Bayview Village,43.786947,-79.385975,1,Café,Bank,Japanese Restaurant,Chinese Restaurant,Caribbean Restaurant,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop
5,M6K,West Toronto,Parkdale Village,43.636847,-79.428191,1,Café,Coffee Shop,Breakfast Spot,Gym,Convenience Store,Grocery Store,Stadium,Climbing Gym,Intersection,Italian Restaurant
6,M1M,Scarborough,Scarborough Village West,43.716316,-79.239476,1,American Restaurant,Motel,Grocery Store,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym,Chinese Restaurant
7,M9R,Etobicoke,Kingsview Village,43.688905,-79.554724,1,Mobile Phone Shop,Pizza Place,Park,Bus Line,Supermarket,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym
0,M4A,North York,Victoria Village,43.725882,-79.315572,2,French Restaurant,Portuguese Restaurant,Hockey Arena,Pizza Place,Intersection,Coffee Shop,Café,Dog Run,Convenience Store,Climbing Gym
2,M6H,West Toronto,Dovercourt Village,43.669005,-79.442259,3,Supermarket,Bakery,Pharmacy,Park,Gym / Fitness Center,Café,Middle Eastern Restaurant,Music Venue,Brewery,Bar


In [33]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(k)
ys = [i+x+(i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [34]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10
1,North York,0,Dog Run,Pool,Mediterranean Restaurant,Golf Course,Supermarket,Café,French Restaurant,Convenience Store,Coffee Shop,Climbing Gym


In [35]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10
3,Scarborough,1,Playground,Grocery Store,Supermarket,Café,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym
4,North York,1,Café,Bank,Japanese Restaurant,Chinese Restaurant,Caribbean Restaurant,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop
5,West Toronto,1,Café,Coffee Shop,Breakfast Spot,Gym,Convenience Store,Grocery Store,Stadium,Climbing Gym,Intersection,Italian Restaurant
6,Scarborough,1,American Restaurant,Motel,Grocery Store,Furniture / Home Store,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym,Chinese Restaurant
7,Etobicoke,1,Mobile Phone Shop,Pizza Place,Park,Bus Line,Supermarket,French Restaurant,Dog Run,Convenience Store,Coffee Shop,Climbing Gym


In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10
0,North York,2,French Restaurant,Portuguese Restaurant,Hockey Arena,Pizza Place,Intersection,Coffee Shop,Café,Dog Run,Convenience Store,Climbing Gym


In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10
2,West Toronto,3,Supermarket,Bakery,Pharmacy,Park,Gym / Fitness Center,Café,Middle Eastern Restaurant,Music Venue,Brewery,Bar


THANK YOU