<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>

<h2>Part 1. Scrape the Wikipedia page.</h2>

<h4>I'm using Beautiful Soup library to scrape wiki html page.</h4>

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

#!conda install -c conda-forge folium --yes
import folium

In [2]:
# Get html data
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_data = requests.get(wiki_url).text

# Find table with Neighborhoods data
soup = BeautifulSoup(html_data, 'html5lib')
table = soup.find("table", class_ = "wikitable sortable")

# Create and fill dataframe
toronto_df = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])
for row in table.tbody.find_all("tr"):  
        col = row.find_all("td")
        if (col != []):
            postal_code = col[0].text.replace("\n", "")
            borough = col[1].text.replace("\n", "")
            neig = col[2].text.replace("\n", "")
            
            # check borough value, skip row if Not assigned
            if (borough == "Not assigned"): 
                continue
            
            # check neighborhood value, assign borough if Not assigned 
            if (neig == "Not assigned"):
                neig = borought
            
            toronto_df = toronto_df.append({"PostalCode":postal_code, "Borough":borough, "Neighborhood":neig}, ignore_index=True)
            
            # split neighborhood values 
            #neig_split = neig.split(',')
            #for neig in neig_split:
            #    toronto_df = toronto_df.append({"PostalCode":postal_code, "Borough":borough, "Neighborhood":neig}, ignore_index=True)

In [3]:
toronto_df.shape

(103, 3)

<h2>Part 2. Get the geographical coordinates of each neighborhood</h2>

<h4>I'm using csv-file to get the geographical coordinates of each postal code</h4>

In [4]:
# Read the csv-file with geo data
csv_url = 'https://cocl.us/Geospatial_data'
geo_df = pd.read_csv(csv_url)

In [5]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename Postal Code column in geo_df and merge it with toronto_df

In [6]:
geo_df.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
toronto_df = pd.merge(toronto_df, geo_df, on="PostalCode")
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Let's check Neighborhoods count for each Borough

In [7]:
toronto_df.groupby("Borough").count()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
Toronto/York,1,1,1,1
West Toronto,6,6,6,6


I'll use North Yort borough to explore and clustering 

In [8]:
northyork_df = toronto_df[toronto_df['Borough'] == 'North York'].reset_index(drop=True)
northyork_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don Mills,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073


<h2>Part 3. Explore and cluster the neighborhoods in Toronto<h2>

<h4>I'm using North York borough to explore and clustering<h4>

In [9]:
# Convert an address North York borough into latitude and longitude values
address = 'North York, Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York, Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York, Toronto are 43.7543263, -79.44911696639593.


Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

In [10]:
CLIENT_ID = 'DHYLYXUPWEPVDCYM2FUHHFV24UKWFL33Z0L30W2FYUIKK3RI' # your Foursquare ID
CLIENT_SECRET = 'HLCH2E1XFPH0RRH5F1ARVXKNCGS2SIWKUXVVVCVO0FTFVH3Q' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value
radius = 500


Define function to get all venues by borough

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Getting all venues by North York borough

In [12]:
northyork_venues = getNearbyVenues(names=northyork_df['Neighborhood'],
                                   latitudes=northyork_df['Latitude'],
                                   longitudes=northyork_df['Longitude']
                                  )


Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


In [13]:
print(northyork_venues.shape)
northyork_venues.head()

(244, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


Let's prepare venues data for neighborhoods clustering

In [14]:
# one hot encoding
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
northyork_onehot['Neighborhood'] = northyork_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [northyork_onehot.columns[-1]] + list(northyork_onehot.columns[:-1])
northyork_onehot = northyork_onehot[fixed_columns]

# let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
northyork_grouped = northyork_onehot.groupby('Neighborhood').mean().reset_index()
northyork_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.041667,0.0,0.041667,0.041667,0.0,0.0,...,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.071429,0.0,0.0,0.0,0.0,0.071429,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h4>Clustering<h4>

In [19]:
from sklearn.cluster import KMeans

cluster_df = northyork_grouped.drop('Neighborhood', axis=1)
kclusters = 5
k_means = KMeans(init = "k-means++", n_clusters = kclusters, n_init = 12)
k_means.fit(cluster_df)
k_means.labels_


array([2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 4, 0, 2, 2, 4, 3],
      dtype=int32)

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [47]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = northyork_grouped['Neighborhood']

for ind in np.arange(northyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Gas Station,Sandwich Place,Ice Cream Shop,Frozen Yogurt Shop,Fried Chicken Joint,Intersection,Diner,Deli / Bodega
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Accessories Store,Lounge,Mobile Phone Shop,Miscellaneous Shop,Middle Eastern Restaurant,Metro Station
2,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Grocery Store,Juice Bar,Comfort Food Restaurant,Pharmacy,Pizza Place,Café,Butcher
3,Don Mills,Gym,Coffee Shop,Restaurant,Beer Store,Bike Shop,Discount Store,Clothing Store,Chinese Restaurant,Caribbean Restaurant,Café
4,Downsview,Grocery Store,Park,Liquor Store,Discount Store,Airport,Athletics & Sports,Gym / Fitness Center,Bank,Baseball Field,Food Truck


Preparing data to visualization on the map

In [48]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', k_means.labels_)

In [49]:
northyork_merged = northyork_df

# merge northyork_grouped with northyork_df to add latitude/longitude for each neighborhood
northyork_merged = northyork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
northyork_merged.dropna(axis=0, inplace=True)
northyork_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4.0,Park,Food & Drink Shop,Jewelry Store,Miscellaneous Shop,Middle Eastern Restaurant,Metro Station,Men's Store,Mediterranean Restaurant,Massage Studio,Luggage Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Coffee Shop,Hockey Arena,French Restaurant,Pizza Place,Intersection,Portuguese Restaurant,Lounge,Miscellaneous Shop,Middle Eastern Restaurant,Metro Station
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2.0,Furniture / Home Store,Clothing Store,Accessories Store,Boutique,Vietnamese Restaurant,Coffee Shop,Women's Store,Bakery,Lounge,Movie Theater
3,M3B,North York,Don Mills,43.745906,-79.352188,2.0,Gym,Coffee Shop,Restaurant,Beer Store,Bike Shop,Discount Store,Clothing Store,Chinese Restaurant,Caribbean Restaurant,Café
4,M6B,North York,Glencairn,43.709577,-79.445073,2.0,Park,Pizza Place,Bakery,Pub,Japanese Restaurant,Liquor Store,Miscellaneous Shop,Middle Eastern Restaurant,Metro Station,Men's Store


Creating the map

In [50]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(northyork_merged['Latitude'], northyork_merged['Longitude'], northyork_merged['Neighborhood'], northyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
           
map_clusters