# Part I: Scraping a Wiki Table for the Postal Codes of Canada

In [1]:
# importing the libraries
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
rows = iter(soup.find('table').find_all('tr'))



In [2]:
# skip first row
#create an empty list object
#iterate through each <tr> and grab the data as text from <td> to append as rows
next(rows)


ln = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    ln.append(row)

pd.DataFrame(ln, columns=["Postal Code", "Borough", "Neighbourhood"])


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Queen's Park,Not assigned\n
8,M8A,Not assigned,Not assigned\n
9,M9A,Queen's Park,Queen's Park\n


In [3]:
df = pd.DataFrame(ln, columns=["Postal Code", "Borough", "Neighbourhood"])
#remove \n
df['Neighbourhood'] = df['Neighbourhood'].str.replace(r'\n$', '')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
#drop not assigned Boroughs
df.drop(df[df['Borough'] == "Not assigned" ].index , inplace=True)
df.head(15)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [5]:
#replace assign Borough to Neighbourhood if Neighbourhood is not assigned.
df.loc[df.Neighbourhood == "Not assigned",'Neighbourhood'] = df.Borough

In [6]:
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [7]:
#reset index
df.index = np.arange(0,len(df))
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [8]:
#group by Postcode and Borough and apply as list
df1 = df.groupby(['Postal Code','Borough'])['Neighbourhood'].apply(list)
df1.head()

Postal Code  Borough    
M1B          Scarborough                            [Rouge, Malvern]
M1C          Scarborough    [Highland Creek, Rouge Hill, Port Union]
M1E          Scarborough         [Guildwood, Morningside, West Hill]
M1G          Scarborough                                    [Woburn]
M1H          Scarborough                                 [Cedarbrae]
Name: Neighbourhood, dtype: object

In [9]:
#reset index again to regain all headers
df1 = df1.sample(frac=1).reset_index() 
df1
#join same Neighbourhoods with a comma
df1['Neighbourhood'] = df1['Neighbourhood'].str.join(',')

In [10]:
df1.dtypes
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M7R,Mississauga,Canada Post Gateway Processing Centre
1,M3K,North York,"CFB Toronto,Downsview East"
2,M7A,Queen's Park,Queen's Park
3,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
4,M5B,Downtown Toronto,"Ryerson,Garden District"
5,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
6,M1W,Scarborough,L'Amoreaux West
7,M3A,North York,Parkwoods
8,M5P,Central Toronto,"Forest Hill North,Forest Hill West"
9,M6H,West Toronto,"Dovercourt Village,Dufferin"


In [11]:
df1.shape

(103, 3)

# Part II: Get Longitudes and Latitudes by Postal Code

In [12]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_bc00cad771044b85bc90ca573e543b19 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='wqL-zAX2IiPwcq46fUwgLpSpNNIpmJKZQRx4jIGEQcLX',
    ibm_auth_endpoint="https://iam.eu-de.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

body = client_bc00cad771044b85bc90ca573e543b19.get_object(Bucket='exambeforeexam-donotdelete-pr-bdb2pwhmopsr6z',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
#merge the two dataframes with Postal Code match
merged_left = pd.merge(left=df1,right=df_data_1, how='left', left_on='Postal Code', right_on='Postal Code')
merged_left

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
1,M3K,North York,"CFB Toronto,Downsview East",43.737473,-79.464763
2,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
3,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
6,M1W,Scarborough,L'Amoreaux West,43.799525,-79.318389
7,M3A,North York,Parkwoods,43.753259,-79.329656
8,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307
9,M6H,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259


In [14]:
merged_left.shape

(103, 5)

# Part III: Analysis, Segmenting & Clustering of Toronto Data using Forsquare's API

In [19]:
#boroughs that only contain Toronto
df_toronto = merged_left[merged_left['Borough'].str.contains ('Toronto')].reset_index(drop=True)
df_toronto.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307
2,M6H,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049


In [16]:
df_toronto.shape

(38, 5)

In [17]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    cer

In [18]:
# get the geographical coordinates of Toronto

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [20]:
#create a map of Toronto with the Boroughs that contain the word "Toronto" (like East, Downtown, etc)

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, pst in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood'],df_toronto['Postal Code']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [29]:
#drill down to focus only on downtown_toronto
downtown_toronto = df_toronto[df_toronto['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_toronto.head(18)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
1,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
4,M6G,Downtown Toronto,Christie,43.669542,-79.422564
5,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.38228
6,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
7,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
8,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752
9,M5K,Downtown Toronto,"Design Exchange,Toronto Dominion Centre",43.647177,-79.381576


In [22]:
downtown_toronto.shape

(18, 5)

In [24]:
address = 'Downtown Toronto, TO'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.3808116451341.


In [25]:
#create a map only on borough downtown_toronto
map_downtown_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, pst in zip(downtown_toronto['Latitude'], downtown_toronto['Longitude'], downtown_toronto['Borough'], downtown_toronto['Neighbourhood'],downtown_toronto['Postal Code']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_toronto)  
    
map_downtown_toronto

In [26]:
#Credentials for Foursquare API
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


In [36]:
rosedale = downtown_toronto[downtown_toronto['Neighbourhood'] == 'Rosedale']
rosedale

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
10,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529


In [50]:
# limit of number of venues returned by Foursquare API
LIMIT = 100 
# radius of 1000m
radius = 1000 

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    43.679563, 
    -79.377529, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=YO3RFRVY1PJLKFTWG1IHYN2MRDV00PKOC0H3EJRBPSTRKM0Z&client_secret=PT5NXRR1OA1UJQGB0EVHQWVZLFUVFD3E3DARB2D5NHEWWLNB&v=20180605&ll=43.679563,-79.377529&radius=1000&limit=100'

In [51]:
results_top100 = requests.get(url).json()
results_top100

{'meta': {'code': 200, 'requestId': '5dd1c602e826ac00280af655'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 21,
  'suggestedBounds': {'ne': {'lat': 43.68856300900001,
    'lng': -79.36510776540459},
   'sw': {'lat': 43.67056299099999, 'lng': -79.3899502345954}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4adcb343f964a520e32e21e3',
       'name': 'Summerhill Market',
       'location': {'address': '446 Summerhill Ave',
        'crossStreet': 'btwn. MacLennan Ave. and Glen Rd.',
        'lat': 43.68626482142425,
        'lng': -79.37545823237794,
        

In [52]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [53]:
venues = results_top100['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Summerhill Market,Grocery Store,43.686265,-79.375458
1,Black Camel,BBQ Joint,43.677016,-79.389367
2,Toronto Lawn Tennis Club,Athletics & Sports,43.680667,-79.388559
3,Maison Selby,Bistro,43.671232,-79.376618
4,Tinuno,Filipino Restaurant,43.671281,-79.37492


In [56]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

21 venues were returned by Foursquare.


In [57]:
#get all neighborhood of Downtown Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [58]:
downtown_toronto_venues = getNearbyVenues(names=downtown_toronto['Neighbourhood'],
                                   latitudes=downtown_toronto['Latitude'],
                                   longitudes=downtown_toronto['Longitude']
                                  )

Ryerson,Garden District
Chinatown,Grange Park,Kensington Market
St. James Town
Cabbagetown,St. James Town
Christie
First Canadian Place,Underground city
Harbourfront
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Rosedale
Church and Wellesley
Berczy Park
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Harbord,University of Toronto
Stn A PO Boxes 25 The Esplanade
Commerce Court,Victoria Hotel
Central Bay Street


In [59]:
print(downtown_toronto_venues.shape)
downtown_toronto_venues.head()

(1287, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Ryerson,Garden District",43.657162,-79.378937,UNIQLO ユニクロ,43.65591,-79.380641,Clothing Store
1,"Ryerson,Garden District",43.657162,-79.378937,Blaze Pizza,43.656518,-79.380015,Pizza Place
2,"Ryerson,Garden District",43.657162,-79.378937,Silver Snail Comics,43.657031,-79.381403,Comic Shop
3,"Ryerson,Garden District",43.657162,-79.378937,Yonge-Dundas Square,43.656054,-79.380495,Plaza
4,"Ryerson,Garden District",43.657162,-79.378937,Page One Cafe,43.657772,-79.376073,Café


#### Let's find out how many venues are there for each Neighborhood

In [67]:
downtown_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,56,56,56,56,56,56
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",15,15,15,15,15,15
"Cabbagetown,St. James Town",44,44,44,44,44,44
Central Bay Street,80,80,80,80,80,80
"Chinatown,Grange Park,Kensington Market",96,96,96,96,96,96
Christie,17,17,17,17,17,17
Church and Wellesley,89,89,89,89,89,89
"Commerce Court,Victoria Hotel",100,100,100,100,100,100
"Design Exchange,Toronto Dominion Centre",100,100,100,100,100,100


#### How many unique Categories are there

In [61]:
print('There are {} uniques categories.'.format(len(downtown_toronto_venues['Venue Category'].unique())))

There are 201 uniques categories.


In [65]:
# one hot encoding
downtown_toronto_onehot = pd.get_dummies(downtown_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_toronto_onehot['Neighborhood'] = downtown_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_toronto_onehot.columns[-1]] + list(downtown_toronto_onehot.columns[:-1])
downtown_toronto_onehot = downtown_toronto_onehot[fixed_columns]

downtown_toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
downtown_toronto_onehot.shape

(1287, 201)

#### group rows by neighborhood and by taking the mean of the frequency of occurrence of each category 

In [69]:
downtown_toronto_grouped = downtown_toronto_onehot.groupby('Neighborhood').mean().reset_index()
downtown_toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.2,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0125,...,0.0,0.0,0.0,0.0,0.0125,0.0,0.0,0.0125,0.0,0.0
5,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.041667,0.0,0.052083,0.010417,0.0,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.011236,0.011236,0.0,0.0,0.0,0.0,0.0,0.0,0.011236,...,0.011236,0.0,0.0,0.0,0.0,0.011236,0.011236,0.0,0.011236,0.011236
8,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
9,"Design Exchange,Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0


In [70]:
downtown_toronto_grouped.shape

(18, 201)

#### Let's find out the Top5 Venues per Neigborhood

In [71]:
num_top_venues = 5

for hood in downtown_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = downtown_toronto_grouped[downtown_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
            venue  freq
0     Coffee Shop  0.07
1            Café  0.05
2      Steakhouse  0.04
3             Bar  0.04
4  Cosmetics Shop  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1              Bakery  0.05
2            Beer Bar  0.04
3        Cocktail Bar  0.04
4  Seafood Restaurant  0.04


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0   Airport Service  0.20
1  Airport Terminal  0.13
2    Airport Lounge  0.13
3   Harbor / Marina  0.07
4       Coffee Shop  0.07


----Cabbagetown,St. James Town----
                venue  freq
0          Restaurant  0.07
1         Coffee Shop  0.07
2       Grocery Store  0.05
3  Italian Restaurant  0.05
4              Bakery  0.05


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.14
1                Café  0.05
2      Ice Cream Shop  0.05
3  Italian Restaurant 

#### let's create a dataframe out of it

In [72]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### And do the same for Top10 Most Common Venues

In [76]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_toronto_grouped['Neighborhood']

for ind in np.arange(downtown_toronto.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,Breakfast Spot,Restaurant,Bakery,Asian Restaurant,Thai Restaurant,Sushi Restaurant
1,Berczy Park,Coffee Shop,Bakery,Seafood Restaurant,Beer Bar,Farmers Market,Cocktail Bar,Cheese Shop,Steakhouse,Café,Hotel
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Boutique,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Harbor / Marina
3,"Cabbagetown,St. James Town",Restaurant,Coffee Shop,Grocery Store,Pub,Café,Pizza Place,Bakery,Park,Italian Restaurant,Playground
4,Central Bay Street,Coffee Shop,Café,Ice Cream Shop,Italian Restaurant,Sandwich Place,Burger Joint,Bubble Tea Shop,Bar,Bakery,Chinese Restaurant


## Now, Clustering

Use k-means to cluster the neighborhood into 5 clusters.

In [109]:
# set number of clusters to 5
kclusters = 5

downtown_toronto_grouped_clustering = downtown_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 2, 0, 0, 0, 4, 0, 0, 0], dtype=int32)

Create a dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [110]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

downtown_toronto_merged = downtown_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_toronto_merged = downtown_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

downtown_toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Cosmetics Shop,Bakery,Café,Fast Food Restaurant,Burger Joint,Diner,Pizza Place,Sporting Goods Shop
1,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049,0,Café,Vietnamese Restaurant,Bar,Vegetarian / Vegan Restaurant,Chinese Restaurant,Coffee Shop,Dumpling Restaurant,Mexican Restaurant,Bakery,Caribbean Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Hotel,Restaurant,Italian Restaurant,Cosmetics Shop,Bakery,Breakfast Spot,Gastropub,Cocktail Bar
3,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675,0,Restaurant,Coffee Shop,Grocery Store,Pub,Café,Pizza Place,Bakery,Park,Italian Restaurant,Playground
4,M6G,Downtown Toronto,Christie,43.669542,-79.422564,4,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Diner,Convenience Store,Nightclub,Candy Store,Restaurant


Visualize the Data

In [111]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_toronto_merged['Latitude'], downtown_toronto_merged['Longitude'], downtown_toronto_merged['Neighbourhood'], downtown_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 1

In [112]:
downtown_toronto_merged.loc[downtown_toronto_merged['Cluster Labels'] == 0, downtown_toronto_merged.columns[[1] + list(range(5, downtown_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Clothing Store,Cosmetics Shop,Bakery,Café,Fast Food Restaurant,Burger Joint,Diner,Pizza Place,Sporting Goods Shop
1,Downtown Toronto,0,Café,Vietnamese Restaurant,Bar,Vegetarian / Vegan Restaurant,Chinese Restaurant,Coffee Shop,Dumpling Restaurant,Mexican Restaurant,Bakery,Caribbean Restaurant
2,Downtown Toronto,0,Café,Coffee Shop,Hotel,Restaurant,Italian Restaurant,Cosmetics Shop,Bakery,Breakfast Spot,Gastropub,Cocktail Bar
3,Downtown Toronto,0,Restaurant,Coffee Shop,Grocery Store,Pub,Café,Pizza Place,Bakery,Park,Italian Restaurant,Playground
5,Downtown Toronto,0,Coffee Shop,Café,Steakhouse,Hotel,Restaurant,Gym,Deli / Bodega,Gastropub,Seafood Restaurant,Bar
6,Downtown Toronto,0,Coffee Shop,Park,Bakery,Pub,Café,Breakfast Spot,Theater,Restaurant,Mexican Restaurant,Dessert Shop
7,Downtown Toronto,0,Coffee Shop,Café,Bar,Steakhouse,Breakfast Spot,Restaurant,Bakery,Asian Restaurant,Thai Restaurant,Sushi Restaurant
8,Downtown Toronto,0,Coffee Shop,Hotel,Aquarium,Café,Fried Chicken Joint,Italian Restaurant,Brewery,Scenic Lookout,Restaurant,Pizza Place
9,Downtown Toronto,0,Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Steakhouse,Bar,Gastropub,Seafood Restaurant,Italian Restaurant
11,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Gym,Men's Store,Mediterranean Restaurant,Italian Restaurant,Hotel


### Cluster 2

In [113]:
downtown_toronto_merged.loc[downtown_toronto_merged['Cluster Labels'] == 1, downtown_toronto_merged.columns[[1] + list(range(5, downtown_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,1,Park,Playground,Trail,Dance Studio,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner


### Cluster 3

In [114]:
downtown_toronto_merged.loc[downtown_toronto_merged['Cluster Labels'] == 2, downtown_toronto_merged.columns[[1] + list(range(5, downtown_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,2,Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Boutique,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Harbor / Marina


### Cluster 4

In [115]:
downtown_toronto_merged.loc[downtown_toronto_merged['Cluster Labels'] == 3, downtown_toronto_merged.columns[[1] + list(range(5, downtown_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,3,Café,Restaurant,Bakery,Bar,Bookstore,Sandwich Place,Japanese Restaurant,Italian Restaurant,French Restaurant,Pub


### Cluster 5

In [116]:
downtown_toronto_merged.loc[downtown_toronto_merged['Cluster Labels'] == 4, downtown_toronto_merged.columns[[1] + list(range(5, downtown_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Downtown Toronto,4,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Diner,Convenience Store,Nightclub,Candy Store,Restaurant
