In [52]:
import requests
import json  
import xml
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
import lxml
from bs4 import BeautifulSoup
import geocoder

In [2]:
# Grab html content
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(response.content, features='html.parser')
tbl = str(soup.find('table')) #our table is at the first ocurrence of 'table' element
# Transofrm raw HTML string into Pandas Dataframe object
df = pd.read_html(tbl)
df = df[0] #grab the first element since Pandas.read_html always either returns a list of DataFrames or bails
# Data Prepping

# Rename columns
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
df = df[df['Borough'] != 'Not assigned'] # weed out 'Not assigned' Borough entries

# Assign Borough names on 'Not assigned' Neighbourhoods
for i in range(0, df.shape[0]):
    if df.iloc[i][2] == 'Not assigned':
        df.iloc[i][2] = df.iloc[i][1]
        i = i + 1

# Group Similar Postal codes and concatting Neighbourhoods with ", "
df = df.groupby(['Postalcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [3]:
# postal code to coordinates function
def get_geocodes(postal_code):
    latlong_coords = None
    # loop until you get the coordinates
    while(latlong_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        latlong_coords = g.latlng
    return latlong_coords

In [28]:
# grab our data's coordinates through postal codes and the function we've created

coordinates = []
postalcodes = df['Postalcode'].tolist()
for postalcode in postalcodes:
    coordinates.append(get_geocodes(postalcode))
df_coords = pd.DataFrame(coordinates, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
70,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.64828,-79.381461


In [5]:
geolocator = Nominatim(user_agent="mai0li")
location = geolocator.geocode('Toronto')
toronto_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# build the map
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

toronto_map

In [43]:
CLIENT_ID = 'NUS4ZERSSW0RWRIGFQCNTBZUAEDXCSIUNLZC50O23K2ZAFPY' # your Foursquare ID
CLIENT_SECRET = 'YA3F3FCO1B04GGIHUSIB5TGTO1P35NWX5NX2MRNYLTPFJ5A1' # your Foursquare Secret
VERSION = '20180303' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

neighbourhood = df.loc[70, 'Neighbourhood']
neighbourhood_latitude = df.loc[70, 'Latitude']
neighbourhood_longitude = df.loc[70, 'Longitude']

LIMIT = 10
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)

print(url)

results = requests.get(url).json()
results

Your credentails:
CLIENT_ID: NUS4ZERSSW0RWRIGFQCNTBZUAEDXCSIUNLZC50O23K2ZAFPY
CLIENT_SECRET:YA3F3FCO1B04GGIHUSIB5TGTO1P35NWX5NX2MRNYLTPFJ5A1
https://api.foursquare.com/v2/venues/explore?&client_id=NUS4ZERSSW0RWRIGFQCNTBZUAEDXCSIUNLZC50O23K2ZAFPY&client_secret=YA3F3FCO1B04GGIHUSIB5TGTO1P35NWX5NX2MRNYLTPFJ5A1&v=20180303&ll=43.64828000000006,-79.38146082599997&radius=500&limit=10


{'meta': {'code': 200, 'requestId': '5cdd45b0db04f52f650496dc'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-501ae947e4b0d11883b910a7-0',
      'venue': {'beenHere': {'count': 0,
        'lastCheckinExpiredAt': 0,
        'marked': False,
        'unconfirmedCount': 0},
       'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/building/gym_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d176941735',
         'name': 'Gym',
         'pluralName': 'Gyms',
         'primary': True,
         'shortName': 'Gym'}],
       'contact': {},
       'hereNow': {'count': 0, 'groups': [], 'summary': 'Nobody here'},
       'id': '501ae947e4b0d11883b910a7',
       'location': {'address': '199 Bay St',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossS

In [44]:
results = requests.get(url).json()
results

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [45]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print(nearby_venues.head())
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

                     name            categories        lat        lng
0      Equinox Bay Street                   Gym  43.648100 -79.379989
1                   Canoe            Restaurant  43.647452 -79.381320
2          Mos Mos Coffee                  Café  43.648159 -79.378745
3  Walrus Pub & Beer Hall                   Pub  43.647375 -79.379515
4   Adelaide Club Toronto  Gym / Fitness Center  43.649279 -79.381921
10 venues were returned by Foursquare.


In [46]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [47]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

print(toronto_venues.shape)
toronto_venues.head()

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.811525,-79.195517,Canadian Appliance Source Whitby,43.808353,-79.191331,Home Service
1,"Highland Creek, Rouge Hill, Port Union",43.78573,-79.15875,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.76569,-79.175256,Homestead Roofing Repair,43.76514,-79.178663,Construction & Landscaping
3,"Guildwood, Morningside, West Hill",43.76569,-79.175256,Heron Park Community Centre,43.768867,-79.176958,Gym / Fitness Center
4,"Guildwood, Morningside, West Hill",43.76569,-79.175256,Heron Park,43.769327,-79.177201,Park


Let's check how many venues were returned for each neighborhood.
Also, let's find out how many unique categories can be curated from all the returned venues.

In [50]:
toronto_venues.groupby('Neighborhood').count()
toronto_venues.groupby('Neighborhood').count()


print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 170 unique categories.


In [54]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]



toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

print(toronto_onehot.shape)
toronto_onehot.head()

(694, 170)


Unnamed: 0,Yoga Studio,Airport,American Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,BBQ Joint,...,Thai Restaurant,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Shop,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))



----Adelaide, King, Richmond----
                 venue  freq
0           Restaurant   0.1
1          Coffee Shop   0.1
2  American Restaurant   0.1
3                 Café   0.1
4           Steakhouse   0.1


----Agincourt----
                 venue  freq
0        Shopping Mall   0.2
1                 Pool   0.1
2       Discount Store   0.1
3         Skating Rink   0.1
4  Shanghai Restaurant   0.1


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                venue  freq
0            Pharmacy   1.0
1         Yoga Studio   0.0
2              Museum   0.0
3       Metro Station   0.0
4  Mexican Restaurant   0.0


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0   Fried Chicken Joint   0.1
1         Grocery Store   0.1
2              Pharmacy   0.1
3        Sandwich Place   0.1
4  Fast Food Restaurant   0.1


----Alderwood, Long Branch----
                venue  freq
0  

                venue  freq
0   Convenience Store  0.25
1                Park  0.25
2  Italian Restaurant  0.25
3      Farmers Market  0.25
4              Museum  0.00


----Emery, Humberlea----
                venue  freq
0         Coffee Shop  0.50
1           Nightclub  0.25
2                Park  0.25
3              Museum  0.00
4  Mexican Restaurant  0.00


----Fairview, Henry Farm, Oriole----
               venue  freq
0       Burger Joint   0.1
1           Pharmacy   0.1
2      Shopping Mall   0.1
3  Electronics Store   0.1
4   Toy / Game Store   0.1


----First Canadian Place, Underground city----
                 venue  freq
0                 Café   0.2
1           Restaurant   0.1
2  American Restaurant   0.1
3                  Pub   0.1
4            Gastropub   0.1


----Flemingdon Park, Don Mills South----
           venue  freq
0  Grocery Store  0.33
1   Intersection  0.17
2            Gym  0.17
3    Coffee Shop  0.17
4     Beer Store  0.17


----Forest Hill North, Forest 

               venue  freq
0      Women's Store  0.25
1      Train Station  0.25
2  Indian Restaurant  0.25
3         Restaurant  0.25
4          Pet Store  0.00


----St. James Town----
                venue  freq
0           Gastropub   0.2
1            Creperie   0.1
2  Italian Restaurant   0.1
3          Restaurant   0.1
4         Coffee Shop   0.1


----Stn A PO Boxes 25 The Esplanade----
                  venue  freq
0             Speakeasy   0.1
1          Concert Hall   0.1
2          Burger Joint   0.1
3  Brazilian Restaurant   0.1
4            Steakhouse   0.1


----Studio District----
                       venue  freq
0  Latin American Restaurant   0.1
1                       Café   0.1
2      Vietnamese Restaurant   0.1
3                Pizza Place   0.1
4                    Brewery   0.1


----The Annex, North Midtown, Yorkville----
                venue  freq
0                Café   0.2
1       Historic Site   0.1
2   Indian Restaurant   0.1
3  Mexican Restaurant   0.1
4

In [56]:
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Gastropub,American Restaurant,Gym / Fitness Center,Gym,Seafood Restaurant,Coffee Shop,Café,Restaurant,Bakery,Steakhouse
1,Agincourt,Shopping Mall,Chinese Restaurant,Shanghai Restaurant,Supermarket,Sushi Restaurant,Skating Rink,Discount Store,Bakery,Pool,Electronics Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Pharmacy,Women's Store,Eastern European Restaurant,Flower Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Hardware Store,Japanese Restaurant,Sandwich Place,Fast Food Restaurant,Liquor Store,Pharmacy,Beer Store,Fried Chicken Joint,Pizza Place,Grocery Store
4,"Alderwood, Long Branch",Gym,Athletics & Sports,Pub,Sandwich Place,Women's Store,Dog Run,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant


# 4. k-means cluster

In [60]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 2, 3, 3, 3, 3, 0, 3, 3], dtype=int32)

In [61]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

ValueError: cannot insert Cluster Labels, already exists

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters