In [2]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/4f/86/1ab30184cb60bc2b95deffe2bd86b8ddbab65a4fac9f7313c278c6e8d049/folium-0.9.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 14.1MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.9.1


### load libraries

In [1]:
import os
import re
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

### scraping data

In [2]:
def load_page():
    """Download the Wikipedia list of "M" postal codes and return its table.

    Returns:
        The first element of the page whose class is "wikitable sortable".

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no such table.
    """
    page_url = "https://en.m.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    rs = requests.get(page_url, timeout=30)
    # Fail here on 4xx/5xx instead of parsing an error page further down.
    rs.raise_for_status()
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(rs.text, "html.parser")
    selected = soup.find(class_="wikitable sortable")
    if selected is None:
        raise ValueError('No "wikitable sortable" table found at ' + page_url)
    return selected

### process dataframe
 _1. Drop rows whose Borough is "Not assigned"_<br/>
 _2. Substitute the Borough for any Neighbourhood that is "Not assigned"_<br/>
 _3. Join the Neighbourhoods that share the same Postcode_<br/>

In [3]:
def table_to_dataframe(soup):
    """Convert the scraped postal-code table into a cleaned DataFrame.

    Steps:
      1. Drop rows whose Borough is 'Not assigned'.
      2. Where Neighbourhood is 'Not assigned', use the row's Borough instead.
      3. Collapse rows sharing (Postcode, Borough) into one row whose
         Neighbourhood is the comma-joined list of the original values.

    Args:
        soup: the HTML table (anything whose ``str()`` is the table markup).

    Returns:
        DataFrame with columns Postcode, Borough, Neighbourhood.
    """
    # read_html expects a file-like object; passing literal HTML is deprecated.
    df = pd.read_html(StringIO(str(soup)))[0]
    # .copy() so the assignment below writes to an independent frame, not a
    # view of the parsed table (avoids SettingWithCopyWarning).
    df = df[df.Borough != 'Not assigned'].copy()
    # .loc is the label/mask based setter; .at only addresses a single cell.
    unnamed = df.Neighbourhood == 'Not assigned'
    df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']
    df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()
    return df

### load dataframe

In [85]:
# Scrape the Wikipedia table, then clean it into the postcode frame.
postcode_table = load_page()
df_post = table_to_dataframe(postcode_table)
df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Load Geospatial data

In [86]:
# Read the postcode coordinates and align the key column's name with df_post
# so the two frames can be merged on 'Postcode'.
df_geo = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'Postcode'})
)

### Merge data

In [87]:
# Attach the coordinates to each postcode (default inner join on 'Postcode').
df = pd.merge(df_post, df_geo, on='Postcode')

### Filtering Toronto

In [88]:
# Keep only the rows whose Borough name contains 'Toronto'.
df_toronto = df.loc[df['Borough'].str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Configure Foursquare

In [103]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): keys that were previously committed in this cell should be
# treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')
VERSION = '20180605'  # sent as the `v` parameter of every API request
LIMIT = 100  # sent as the `limit` parameter of every API request

### Explore Toronto Neighbourhood

In [104]:
def getNearbyVenues(name, latitude, longitude, radius=500):
    """Query the Foursquare "explore" endpoint around one point.

    Args:
        name: label copied into every returned record.
        latitude, longitude: centre of the search, sent as the `ll` parameter.
        radius: sent as the `radius` parameter (default 500).

    Returns:
        A list of tuples
        (name, latitude, longitude, venue name, venue lat, venue lng, category),
        where category is the name of the venue's first category, or None when
        the venue lists no categories.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Let requests build and escape the query string instead of formatting it by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(latitude, longitude),
        'radius': radius,
        'limit': LIMIT,
    }
    response = requests.get('https://api.foursquare.com/v2/venues/explore',
                            params=params, timeout=30)
    # Surface HTTP errors here rather than as a KeyError on the JSON below.
    response.raise_for_status()

    # A response without any group simply yields no venues.
    groups = response.json()["response"].get('groups') or [{}]
    results = groups[0].get('items', [])

    # Keep only the relevant information for each nearby venue.
    records = []
    for item in results:
        venue = item['venue']
        # Some venues have an empty category list; indexing [0] would fail on them.
        categories = venue.get('categories') or []
        category = categories[0]['name'] if categories else None
        records.append((name,
                        latitude,
                        longitude,
                        venue['name'],
                        venue['location']['lat'],
                        venue['location']['lng'],
                        category))
    return records

### Get All the Venues in Toronto

In [105]:
# Collect the venue records of every Toronto neighbourhood into one flat list.
venues = []
for row in df_toronto[['Neighbourhood','Latitude','Longitude']].values:
    # extend() appends in place; `venues = venues + v` copied the whole list
    # on every iteration.
    venues.extend(getNearbyVenues(*row))

df_toronto_venues = pd.DataFrame.from_records(venues,columns=['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category'])

### Analyze Neighbourhood Venue Category
_1. Transform the categorical values to one-hot encoding_ <br/>

In [120]:
# One indicator column per venue category, named after the category itself
# (empty prefix and separator).
toronto_onehot = pd.get_dummies(
    df_toronto_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)
# Re-attach the neighbourhood of each venue; this lands as the last column.
toronto_onehot['Neighbourhood'] = df_toronto_venues['Neighbourhood']
# Rotate that last column to the front, keeping the others in their order.
column_order = list(toronto_onehot.columns)
fixed_columns = column_order[-1:] + column_order[:-1]
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
# Average the indicator columns per neighbourhood: each value becomes the
# share of that neighbourhood's venues falling into the category.
toronto_groupped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_groupped.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, skipping its first entry.

    Args:
        row: a Series whose first element is not a frequency (it is dropped)
            and whose remaining elements are sortable values.
        num_top_venues: how many labels to return at most.

    Returns:
        Array of index labels ordered by descending value.
    """
    # Drop the leading element, rank the rest from highest to lowest.
    ranked_categories = row.iloc[1:].sort_values(ascending=False).index
    return ranked_categories.values[:num_top_venues]

### Use the most common venues to describe neighbourhood features

In [140]:
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Create columns according to number of top venues.
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also
    # have hidden any unrelated error raised inside the try block.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Create a new dataframe with one row per neighbourhood.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_groupped['Neighbourhood']

# Fill every row with that neighbourhood's most frequent venue categories.
for ind in np.arange(toronto_groupped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_groupped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Thai Restaurant,Bar,Steakhouse,Burger Joint,Cosmetics Shop,Restaurant,Hotel,American Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Steakhouse,Cheese Shop,Café,Bakery,Farmers Market,Seafood Restaurant,Beer Bar,Restaurant
2,"Brockton,Exhibition Place,Parkdale Village",Breakfast Spot,Café,Coffee Shop,Intersection,Falafel Restaurant,Convenience Store,Burrito Place,Stadium,Caribbean Restaurant,Bar
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Comic Shop,Pizza Place,Restaurant
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Boutique,Plane,Coffee Shop,Sculpture Garden,Boat or Ferry,Bar,Harbor / Marina


### K-means Clustering

In [124]:
def fit_model(k,data):
    """Cluster `data` into `k` groups with KMeans and return the labels.

    The estimator uses k-means++ initialisation and random_state=0.
    """
    clusterer = KMeans(n_clusters=k, init='k-means++', random_state=0)
    clusterer.fit(data)
    return clusterer.labels_

### Visualization
 _1. Use unique borough count in Toronto as the number of clusters_<br/>
 _2. Use the latitude and longitude of Central Toronto as the central point of the map_<br/> 
 _3. Show clusters on the map_<br/>

In [141]:
# Clustering: fit on the per-neighbourhood category frequencies only.
k = 3
# Keyword form: the positional axis argument (`drop(label, 1)`) was removed in pandas 2.0.
data = toronto_groupped.drop(columns='Neighbourhood')

labels = fit_model(k,data)

# Define map central point: coordinates of the first 'Central Toronto' row.
latitude,longitude = df_toronto[df_toronto.Borough=='Central Toronto'][['Latitude','Longitude']].values[0]

# Attach the cluster labels on a copy. Inserting into an alias of
# neighborhoods_venues_sorted mutated that frame, so re-running this cell
# failed because the 'Label' column already existed.
df_sorted = neighborhoods_venues_sorted.copy()
df_sorted.insert(0, 'Label', labels)

# Add the label and top venues to each Toronto postcode row, matched on the
# neighbourhood name.
df_toronto_merged = df_toronto.join(df_sorted.set_index('Neighbourhood'), on='Neighbourhood')

In [142]:
# Create map centred on the point chosen above.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# Create colour palette: one evenly spaced rainbow colour per cluster.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows that found no match in the join have a NaN label (which also turns the
# column into floats); they cannot be coloured, so they are skipped.
labelled = df_toronto_merged.dropna(subset=['Label'])

# Add markers with cluster labels.
for lat, lon, neigh, cluster in zip(labelled['Latitude'], labelled['Longitude'], labelled['Neighbourhood'], labelled['Label']):
    cluster = int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(neigh) + ' Cluster ' + str(cluster), parse_html=True),
        # Labels are 0-based, so they index the palette directly.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

# Show map
map_clusters

### Examine

In [143]:
# First rows of the neighbourhoods assigned to cluster 0.
df_toronto_merged.loc[df_toronto_merged['Label'] == 0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Photography Studio,Park,Swim School,Bus Line,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,0,Summer Camp,Playground,Park,Tennis Court,Yoga Studio,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Building,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
64,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,0,Park,Trail,Sushi Restaurant,Jewelry Store,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


In [146]:
# First rows of the neighbourhoods assigned to cluster 1.
df_toronto_merged.loc[df_toronto_merged['Label'] == 1].head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Health Food Store,Neighborhood,Trail,Pub,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Pizza Place,Bookstore,Brewery,Bubble Tea Shop
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,1,Italian Restaurant,Board Shop,Sushi Restaurant,Sandwich Place,Brewery,Movie Theater,Fish & Chips Shop,Pub,Ice Cream Shop,Fast Food Restaurant
43,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Italian Restaurant,Bakery,American Restaurant,Gastropub,Cheese Shop,Stationery Store,Fish Market,Latin American Restaurant
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Hotel,Breakfast Spot,Gym,Park,Sandwich Place,Clothing Store,Food & Drink Shop,Yoga Studio,Dumpling Restaurant,Dog Run


In [145]:
# First rows of the neighbourhoods assigned to cluster 2.
df_toronto_merged.loc[df_toronto_merged['Label'] == 2].head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2,Garden,Yoga Studio,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


| Cluster   |      Actual Label     |
|----------|:-------------:|
| 0 |  Public Facility |
| 1 |    Food & Drink   |
| 2 | Garden |