In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import geocoder # import geocoder
from geopy.geocoders import Nominatim
import folium
import numpy as np
from sklearn.cluster import KMeans

import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

page = requests.get(" https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
page.status_code
page.content

soup = BeautifulSoup(page.content, 'lxml')
  
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))
#print(df[0].to_json(orient='records'))
df = df[0]



In [2]:
##cleaning data - removing Not assigned boroughs and replacing not assigned neighbourhoods with boroughs.
df.loc[0,:]
df.columns = df.loc[0,:]#['a', 'b']
df.drop([0],axis=0,inplace = True)
df = df[df['Borough']!='Not assigned']  # removing entries with no value for Borough
df = df.reset_index()                   #resetting index after removing
df.drop(['index'],axis=1,inplace = True)
temp = df[df['Neighbourhood']=='Not assigned'].index.tolist() # getting a list of Neighbourhoods with the value "Not assigned."
df.loc[temp,'Neighbourhood'] = df.loc[temp,'Borough'] ## replacing those neighbourhoods with borough name

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [3]:
#finding and grouping Neighborhoods with the same Post code
for index, row in df.iterrows():
      row['Neighbourhood'] =  df[df['Postcode']== row['Postcode']]['Neighbourhood'].str.cat(sep=',')

        
df.drop_duplicates(subset='Postcode', keep='first', inplace=True) #dropping the duplicates
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
4,M6A,North York,"Lawrence Heights,Lawrence Manor"
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,"Rouge,Malvern"
10,M3B,North York,Don Mills North
11,M4B,East York,"Woodbine Gardens,Parkview Hill"
13,M5B,Downtown Toronto,"Ryerson,Garden District"


In [4]:
##geocoder is taking forever to respond. Therefore, continuing with the provided location data.
geodata = pd.read_csv('https://cocl.us/Geospatial_data')


In [5]:
#For each row in geodata frame, check the postal code and add Latitude and Longitude values to the df frame.

for index, row in geodata.iterrows():
    temp = df[df['Postcode'] == row['Postal Code']].index.tolist()
    df.loc[temp,'Latitude'] = row['Latitude']
    df.loc[temp,'Longitude'] = row['Longitude']
    
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
4,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [6]:
Toronto_boroughs = df[df['Borough'].str.contains("Toronto")] #Filtering out Boruoghs that contain the word Toronto
Toronto_boroughs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
13,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
27,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
36,M4E,East Toronto,The Beaches,43.676357,-79.293031
37,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [7]:

address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [8]:
import matplotlib.cm as cm
import matplotlib.colors as colors
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Toronto_boroughs['Latitude'], Toronto_boroughs['Longitude'], Toronto_boroughs['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [9]:
CLIENT_ID = 'DTKGH5IFWTL4ITCLDCUPEFNGXG21CTMOCFQZQXRDDWCK2S0R' # your Foursquare ID
CLIENT_SECRET = '1PJXLH2E4TGBXVYRACMVGXOHZHFCM3D4KQOTCG4Q5NQ34H1Q' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
LIMIT = 100
toronto_venues = getNearbyVenues(names = Toronto_boroughs.loc[:, 'Neighbourhood'], 
                                   latitudes = Toronto_boroughs.loc[:, 'Latitude'],  
                                   longitudes = Toronto_boroughs.loc[:, 'Longitude'], 
                                   radius=500)


Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide,King,Richmond
Dovercourt Village,Dufferin
Harbourfront East,Toronto Islands,Union Station
Little Portugal,Trinity
The Danforth West,Riverdale
Design Exchange,Toronto Dominion Centre
Brockton,Exhibition Place,Parkdale Village
The Beaches West,India Bazaar
Commerce Court,Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North,Forest Hill West
High Park,The Junction South
North Toronto West
The Annex,North Midtown,Yorkville
Parkdale,Roncesvalles
Davisville
Harbord,University of Toronto
Runnymede,Swansea
Moore Park,Summerhill East
Chinatown,Grange Park,Kensington Market
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown,St. James Town
First Canadian Place,Underground city


In [12]:
#Adding neighborhood column and bring it to front
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
neighbourhood = toronto_venues['Neighborhood'] 
toronto_onehot.drop(labels=['Neighborhood'], axis=1,inplace = True)
toronto_onehot.insert(0, 'Neighborhood', neighbourhood)
toronto_onehot.head()


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront,Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [14]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [15]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,American Restaurant,Bar,Cosmetics Shop,Hotel,Restaurant,Gym
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Bakery,Beer Bar,Farmers Market,Steakhouse,Seafood Restaurant,Cheese Shop,Café
2,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Breakfast Spot,Gym,Climbing Gym,Caribbean Restaurant,Burrito Place,Stadium,Furniture / Home Store,Bar
3,Business reply mail Processing Centre969 Eastern,Light Rail Station,Comic Shop,Auto Workshop,Smoke Shop,Brewery,Spa,Farmers Market,Fast Food Restaurant,Burrito Place,Restaurant
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Boutique,Sculpture Garden


In [16]:

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [17]:

toronto_merged = Toronto_boroughs

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,0,Coffee Shop,Bakery,Café,Park,Breakfast Spot,Pub,Theater,Mexican Restaurant,Yoga Studio,Dessert Shop
13,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Bar,Japanese Restaurant,Restaurant,Middle Eastern Restaurant,Ramen Restaurant,Diner
27,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Restaurant,Hotel,Clothing Store,Gastropub,Cosmetics Shop,Bakery,Italian Restaurant,Cocktail Bar
36,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Pub,Yoga Studio,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
37,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Restaurant,Bakery,Beer Bar,Farmers Market,Steakhouse,Seafood Restaurant,Cheese Shop,Café


In [20]:
#Visualizing the clusters on the map

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Observations

Majority of the neighborhoods belong to cluster 0