In [3]:
import requests
import folium

import numpy as np
import pandas as pd

import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

print("Packages Imported")

Packages Imported


### Gathering data using BeautifulSoup to scrape Wikipedia ###

In [4]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
bsoup = BeautifulSoup(url,'lxml')

In [5]:
table_post = bsoup.find('table')
fields = table_post.find_all('td')

postcode = []
borough = []
neighbourhood = []

for i in range(0, len(fields), 3):
    postcode.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())
        
df_pc = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
df_pc.columns = ['Postcode', 'Borough', 'Neighbourhood']
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Data Cleaning ###

In [9]:
df_pc['Borough'].replace('Not assigned', np.nan, inplace=True)
df_pc.dropna(subset=['Borough'], inplace=True)
df_pcn = df_pc.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df_pcn.columns = ['Postcode', 'Borough', 'Neighbourhood']
df_pcn['Neighbourhood'].replace('Not assigned', "Queen's Park", inplace=True)

In [12]:
# adding latitude and longtitude to the data frame

df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.columns = ['Postcode', 'Latitude', 'Longitude']
df_pos = pd.merge(df_pcn, df_geo, on=['Postcode'], how='inner')
df_tor = df_pos[['Borough', 'Neighbourhood', 'Postcode', 'Latitude', 'Longitude']].copy()

In [13]:
df_tor

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,Scarborough,Malvern / Rouge,M1B,43.806686,-79.194353
1,Scarborough,Rouge Hill / Port Union / Highland Creek,M1C,43.784535,-79.160497
2,Scarborough,Guildwood / Morningside / West Hill,M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
...,...,...,...,...,...
98,York,Weston,M9N,43.706876,-79.518188
99,Etobicoke,Westmount,M9P,43.696319,-79.532242
100,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,M9R,43.688905,-79.554724
101,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,M9V,43.739416,-79.588437


### Visualize The Data ###

In [38]:
latitude = 43.6532
longitude = -79.3832
map_tor = folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, label in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='##FF0001',
        fill_opacity=1,
        parse_html=False).add_to(map_tor)  
map_tor

### Utilize Foursquare's API ###

In [39]:
# get connected!

CLIENT_ID = '4V4HZ4JNKI1MHQLGHKCSVTL14QCOJ0I54K1V15I01OIK5XRP'
CLIENT_SECRET = 'YHLJR5LIEWY4VYZ1CLW0FPTX5CQ1QMPAA0XDE0M2YMOHP3AR'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4V4HZ4JNKI1MHQLGHKCSVTL14QCOJ0I54K1V15I01OIK5XRP
CLIENT_SECRET:YHLJR5LIEWY4VYZ1CLW0FPTX5CQ1QMPAA0XDE0M2YMOHP3AR


In [40]:
LIMIT = 100
radius = 1000

neighbourhood_latitude = 43.6532
neighbourhood_longitude = -79.3832

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=4V4HZ4JNKI1MHQLGHKCSVTL14QCOJ0I54K1V15I01OIK5XRP&client_secret=YHLJR5LIEWY4VYZ1CLW0FPTX5CQ1QMPAA0XDE0M2YMOHP3AR&v=20180605&ll=43.6532,-79.3832&radius=1000&limit=100'

### placeholder

In [73]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [74]:
tor_venues = getNearbyVenues(names=df_tor['Neighbourhood'],
    latitudes=df_tor['Latitude'],
    longitudes=df_tor['Longitude'])

Malvern / Rouge
Rouge Hill / Port Union / Highland Creek
Guildwood / Morningside / West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park / Ionview / East Birchmount Park
Golden Mile / Clairlea / Oakridge
Cliffside / Cliffcrest / Scarborough Village West
Birch Cliff / Cliffside West
Dorset Park / Wexford Heights / Scarborough Town Centre
Wexford / Maryvale
Agincourt
Clarks Corners / Tam O'Shanter / Sullivan
Milliken / Agincourt North / Steeles East / L'Amoreaux East
Steeles West / L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview / Henry Farm / Oriole
Bayview Village
York Mills / Silver Hills
Willowdale / Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor / Wilson Heights / Downsview North
Northwood Park / York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill / Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West / Riverdale
India Bazaar / The Beaches 

In [75]:
print(tor_venues.shape)
tor_venues.head()

(2139, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern / Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Malvern / Rouge,43.806686,-79.194353,T Hamilton & Son Roofing Inc,43.807985,-79.198194,Construction & Landscaping
2,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
4,Guildwood / Morningside / West Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [80]:
tor_venues.groupby(["Neighborhood"]).count()
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

There are 264 uniques categories.


### Processing ###

In [85]:
onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")
onehot['Neighborhoods'] = tor_venues['Neighborhood'] 
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]
grouped = onehot.groupby(["Neighborhoods"]).mean().reset_index()
grouped.head()

Unnamed: 0,Neighborhoods,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alderwood / Long Branch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bathurst Manor / Wilson Heights / Downsview North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bedford Park / Lawrence Manor East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
len(grouped[grouped["Vegetarian / Vegan Restaurant"] > 0])
tor_rest = grouped[["Neighborhoods","Vegetarian / Vegan Restaurant"]]
tor_rest

Unnamed: 0,Neighborhoods,Vegetarian / Vegan Restaurant
0,Agincourt,0.0
1,Alderwood / Long Branch,0.0
2,Bathurst Manor / Wilson Heights / Downsview North,0.0
3,Bayview Village,0.0
4,Bedford Park / Lawrence Manor East,0.0
...,...,...
89,Willowdale,0.0
90,Woburn,0.0
91,Woodbine Heights,0.0
92,York Mills / Silver Hills,0.0


### Implement Kmeans Clustering ###

In [101]:
kclusters = 5

tor_clustering = tor_rest.drop(["Neighborhoods"], 1)
tor_clustering.head()
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=12).fit(tor_clustering)
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 2, 0, 0, 0, 0], dtype=int32)

In [107]:
merged = tor_rest.copy()
merged["Category"] = kmeans.labels_
merged.rename(columns={"Neighborhoods": "Neighbourhood"}, inplace=True)
merged.sort_values(["Category"], inplace=True, ascending=False)
merged

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category
68,Runnymede / Swansea,0.025000,4
72,St. James Town,0.011765,3
83,Toronto Dominion Centre / Design Exchange,0.010000,3
75,Stn A PO Boxes,0.010526,3
64,Richmond / Adelaide / King,0.010309,3
...,...,...,...
31,"Garden District, Ryerson",0.000000,0
30,Forest Hill North & West,0.000000,0
28,Fairview / Henry Farm / Oriole,0.000000,0
27,Eringate / Bloordale Gardens / Old Burnhamthor...,0.000000,0


In [116]:
# this merges our new dataframe with our old df which has latitude and longtitude information
dfmerged = merged.merge(df_tor)
dfmerged.sort_values(["Category"], inplace=True, ascending=False)
dfmerged.head()

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category,Borough,Postcode,Latitude,Longitude
0,Runnymede / Swansea,0.025,4,West Toronto,M6S,43.651571,-79.48445
2,Toronto Dominion Centre / Design Exchange,0.01,3,Downtown Toronto,M5K,43.647177,-79.381576
3,Stn A PO Boxes,0.010526,3,Downtown Toronto,M5W,43.646435,-79.374846
4,Richmond / Adelaide / King,0.010309,3,Downtown Toronto,M5H,43.650571,-79.384568
5,Harbourfront East / Union Station / Toronto Is...,0.01,3,Downtown Toronto,M5J,43.640816,-79.381752


### Visualization ###

In [122]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(dfmerged['Latitude'], dfmerged['Longitude'], dfmerged['Neighbourhood'], dfmerged['Category']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=1).add_to(map_clusters)
map_clusters

### Explore Result ###

In [123]:
merged.loc[merged['Category'] == 0]

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category
56,Northwest,0.0,0
67,Rouge Hill / Port Union / Highland Creek,0.0,0
66,Roselawn,0.0,0
65,Rosedale,0.0,0
53,New Toronto / Mimico South / Humber Bay Shores,0.0,0
...,...,...,...
31,"Garden District, Ryerson",0.0,0
30,Forest Hill North & West,0.0,0
28,Fairview / Henry Farm / Oriole,0.0,0
27,Eringate / Bloordale Gardens / Old Burnhamthor...,0.0,0


In [124]:
merged.loc[merged['Category'] == 1]

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category
78,The Annex / North Midtown / Yorkville,0.045455,1
48,Little Portugal / Trinity,0.046512,1
43,Kensington Market / Chinatown / Grange Park,0.048387,1


In [125]:
merged.loc[merged['Category'] == 2]

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category
13,Central Bay Street,0.015625,2
5,Berczy Park,0.018182,2
18,Commerce Court / Victoria Hotel,0.02,2


In [126]:
merged.loc[merged['Category'] == 3]

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category
72,St. James Town,0.011765,3
83,Toronto Dominion Centre / Design Exchange,0.01,3
75,Stn A PO Boxes,0.010526,3
64,Richmond / Adelaide / King,0.010309,3
35,Harbourfront East / Union Station / Toronto Is...,0.01,3
29,First Canadian Place / Underground city,0.01,3


In [127]:
merged.loc[merged['Category'] == 4]

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category
68,Runnymede / Swansea,0.025,4


In [128]:
merged.loc[merged['Category'] == 5]

Unnamed: 0,Neighbourhood,Vegetarian / Vegan Restaurant,Category


### Results ###

When identifying these categories, it's clear that category 4 (which only contains one neighbourhood) is saturated with vegetarian / vegan restaurants compared to other categories. Categories 0 and 1 have little to know vegan / vegetarian representation. Categories 2 and 3 are where we can see there solid representation, but not oversaturation.

We see from our visualization that categories 2 & 3 are close to the downtown waterfront area. I would advise someone looking to open a vegan / vegetarian restaurant near to the locations of our cluster categories 2 & 3 due to a proven cliental (likely capitalizing on commuter activity from local businesses). There is even a risk but potentially high reward option to choose a category 0 neighbourhood that is very close to the neighbourhoods from categories 2 & 3. Queen's Park or the Garden District are examples of areas that don't have vegan / vegitarian resuarants yet, however they are close enough to areas ares with proven demand that there is likely room to grow in those locations.