# Segmenting and Clustering Neighborhoods in Toronto

## Week 3 Assignment

In [1]:
import numpy as np
import pandas as pd
print('Libraries imported!')

Libraries imported!


In [2]:
from bs4 import BeautifulSoup
import requests
print('Libraries imported!')

Libraries imported!


## Part 1

### Load Dataset

In [3]:
# Download the Wikipedia list of "M" postal codes and parse the HTML.
source = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
).text
soup = BeautifulSoup(markup=source, features='lxml')

In [4]:
class Scrapy:
    """Download a web page and convert its HTML tables into pandas DataFrames."""

    def parse_url(self, url):
        """Fetch ``url`` and parse each ``<table class="wikitable sortable">`` on it.

        Returns a list with one DataFrame per matching table, in the order
        ``soup.find_all`` yields them (an empty list when nothing matches).

        A 4xx/5xx response raises ``requests.HTTPError`` and an unresponsive
        server raises a ``requests`` timeout error, rather than an error page
        being parsed silently or the kernel hanging.
        """
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        return [self.parse_html_table(table)
                for table in soup.find_all('table', class_="wikitable sortable")]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        ``table`` must offer ``find_all('tr')``; each row must offer
        ``find_all('td')`` / ``find_all('th')`` and each cell ``get_text()``.

        * Column labels come from the first row that contains ``<th>`` cells.
          Their text is used verbatim (no whitespace stripping), so a label
          may end in a newline. Without any ``<th>`` row the columns are
          numbered 0..n-1.
        * The column count is taken from the first row that has ``<td>`` cells.
        * Every row with at least one ``<td>`` becomes one DataFrame row.
        * Columns whose values all convert to float are converted; the others
          keep their text values.

        Raises ``Exception`` when header labels exist but their count differs
        from the column count.
        """
        rows = table.find_all('tr')

        # First pass: size the frame and pick up the header labels.
        n_columns = 0
        n_rows = 0
        column_names = []
        for row in rows:
            td_tags = row.find_all('td')
            if td_tags:
                n_rows += 1
                if n_columns == 0:
                    n_columns = len(td_tags)

            th_tags = row.find_all('th')
            if th_tags and not column_names:
                column_names = [th.get_text() for th in th_tags]

        if column_names and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if column_names else range(0, n_columns)
        df = pd.DataFrame(columns=columns, index=range(0, n_rows))

        # Second pass: copy the cell text into the pre-sized frame.
        # Rows without <td> cells (e.g. the header row) do not advance the row marker.
        row_marker = 0
        for row in rows:
            cells = row.find_all('td')
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            if cells:
                row_marker += 1

        # Best-effort numeric conversion; text columns are deliberately left alone.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
hp = Scrapy()
# Take the first sortable wikitable on the page and turn newlines inside the
# cell values into spaces (DataFrame.replace acts on values, not column labels).
table = hp.parse_url(url)[0].replace('\n', ' ', regex=True)
table.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Prepare Dataset

Only process the cells that have an assigned borough. Ignore cells whose borough is "Not assigned".

In [7]:
# Keep only the rows whose borough has actually been assigned.
table = table.loc[table['Borough'] != 'Not assigned']
table.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table

In [8]:
# Collapse rows that share a postal code and borough into a single row whose
# neighbourhoods are joined with ", ". The column label still carries the
# trailing newline taken from the scraped header.
df = (
    table.groupby(['Postcode', 'Borough'])['Neighbourhood\n']
    .apply(lambda names: ", ".join(names.astype(str)))
    .reset_index()
)
# Shuffle the rows. The fixed seed makes the order (and every preview below)
# identical on each Restart & Run All.
table2 = df.sample(frac=1, random_state=0).reset_index(drop=True)
table2.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M4N,Central Toronto,Lawrence Park
2,M3L,North York,Downsview West
3,M1S,Scarborough,Agincourt
4,M6B,North York,Glencairn
5,M4M,East Toronto,Studio District
6,M2M,North York,"Newtonbrook , Willowdale"
7,M1H,Scarborough,Cedarbrae
8,M4S,Central Toronto,Davisville
9,M2L,North York,"Silver Hills , York Mills"


In [9]:
# Drop the trailing newline from the scraped column label.
table2 = table2.rename(columns={'Neighbourhood\n': 'Neighbourhood'})
table2.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M4N,Central Toronto,Lawrence Park
2,M3L,North York,Downsview West
3,M1S,Scarborough,Agincourt
4,M6B,North York,Glencairn
5,M4M,East Toronto,Studio District
6,M2M,North York,"Newtonbrook , Willowdale"
7,M1H,Scarborough,Cedarbrae
8,M4S,Central Toronto,Davisville
9,M2L,North York,"Silver Hills , York Mills"


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.


In [10]:
# A postal code with a borough but no assigned neighbourhood takes the borough
# name as its neighbourhood.
# The assignment goes through .loc on table2 itself: writing into a row pulled
# out with .iloc targets a temporary Series and is not guaranteed to reach the
# frame. The scraped text may carry trailing whitespace, so compare stripped.
unassigned = table2['Neighbourhood'].str.strip() == 'Not assigned'
table2.loc[unassigned, 'Neighbourhood'] = table2.loc[unassigned, 'Borough']

table2.tail(50)

Unnamed: 0,Postcode,Borough,Neighbourhood
53,M6J,West Toronto,"Little Portugal , Trinity"
54,M4V,Central Toronto,"Deer Park , Forest Hill SE , Rathnelly , South..."
55,M1T,Scarborough,"Clarks Corners , Sullivan , Tam O'Shanter"
56,M1J,Scarborough,Scarborough Village
57,M4W,Downtown Toronto,Rosedale
58,M4K,East Toronto,"The Danforth West , Riverdale"
59,M5B,Downtown Toronto,"Ryerson , Garden District"
60,M5E,Downtown Toronto,Berczy Park
61,M6N,York,"The Junction North , Runnymede"
62,M4T,Central Toronto,"Moore Park , Summerhill East"


In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [11]:
# (rows, columns) of the cleaned postal-code table.
table2.shape

(103, 3)

***

## Part 2

### Use the Geocoder package or the csv file to create the following dataframe

Given that this package can be very unreliable, in case you are not able to get the geographical coordinates of the neighborhoods using the Geocoder package, here is a link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [12]:
# Load the postal-code coordinates and align the key column name with table2.
geo_data = (
    pd.read_csv("https://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'Postcode'})
)
geo_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Add the latitude and longitude of each postal code to the borough table

In [13]:
# Inner join on the postal code: each row gains its Latitude/Longitude.
table3 = table2.merge(geo_data, on='Postcode')
table3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
2,M3L,North York,Downsview West,43.739015,-79.506944
3,M1S,Scarborough,Agincourt,43.7942,-79.262029
4,M6B,North York,Glencairn,43.709577,-79.445073


In [14]:
# (rows, columns) after adding the coordinates.
table3.shape

(103, 5)

***

## Part 3

Explore and cluster the neighborhoods in Toronto

In [15]:
from geopy.geocoders import Nominatim
address = 'Toronto, CA'

# Geocode the address; the resulting coordinates are used to centre the map.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [16]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported!')

Libraries imported!


### Toronto Map (without clustering)

In [17]:
# Base map centred on the geocoded Toronto coordinates.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal-code row, labelled "<neighbourhoods>, <borough>".
marker_columns = table3[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7)
    marker.add_to(map_Toronto)

map_Toronto

You can decide to work with only boroughs that contain the word _"Scarborough"_ and then replicate the same analysis we did to the New York City data. It is up to you

In [18]:
# Restrict the analysis to the Scarborough borough and renumber the rows.
Scarborough_data = table3.query("Borough == 'Scarborough'").reset_index(drop=True)
Scarborough_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
1,M1S,Scarborough,Agincourt,43.7942,-79.262029
2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
3,M1P,Scarborough,"Dorset Park , Scarborough Town Centre , Wexfor...",43.75741,-79.273304
4,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
5,M1G,Scarborough,Woburn,43.770992,-79.216917
6,M1R,Scarborough,"Maryvale , Wexford",43.750072,-79.295849
7,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497
8,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village W...",43.716316,-79.239476
9,M1K,Scarborough,"East Birchmount Park , Ionview , Kennedy Park",43.727929,-79.262029


In [19]:
address = 'Scarborough, CA'

# Geocode Scarborough. This rebinds latitude/longitude, which the cells
# below use as the map centre and as the Foursquare query point.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 43.773077, -79.257774.


In [20]:
# create map of Scarborough using latitude and longitude values
map_Scarborough = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
# Iterate over Scarborough_data, not table3: table3 holds every Toronto
# borough, so using it would redraw the whole-city map.
for lat, lng, borough, neighborhood in zip(Scarborough_data['Latitude'], Scarborough_data['Longitude'], Scarborough_data['Borough'], Scarborough_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Scarborough)

map_Scarborough

In [21]:
import os

# Credentials are read from the environment so that no secret is stored in the
# notebook source or echoed into its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.')

print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


### Define Foursquare Credentials and Version

In [22]:
import os

# Credentials are read from the environment so that no secret is stored in the
# notebook source or echoed into its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.')

print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


### Now, let's get the top 100 venues that are in Scarborough within a radius of 500 meters.

In [23]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# Explore request around the current latitude/longitude.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
# Display the URL with the credentials masked, so they are not written into
# the saved cell output. `url` itself is left intact for the request.
url.replace(CLIENT_ID, '<CLIENT_ID>').replace(CLIENT_SECRET, '<CLIENT_SECRET>')

'https://api.foursquare.com/v2/venues/explore?&client_id=<redacted>&client_secret=<redacted>&v=20180605&ll=43.773077,-79.257774&radius=500&limit=100'

Send the GET request

In [24]:
# Call the explore endpoint and decode the JSON payload.
response = requests.get(url)
results = response.json()

From the Foursquare lab in the previous module, we know that all the information is in the items key. Before we proceed, let's borrow the get_category_type function from the Foursquare lab.

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record, or None.

    ``row`` is any key-indexable record (a dict or a DataFrame row). The
    category list is read from ``row['categories']`` and, when that key is
    missing, from ``row['venue.categories']``.

    Returns None when the category list is empty or None.
    Raises KeyError when neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [26]:
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON and keep only the venue name, categories and coordinates.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues)[filtered_columns]

# Reduce each category list to the name of its first entry.
nearby_venues = nearby_venues.assign(
    **{'venue.categories': nearby_venues.apply(get_category_type, axis=1)}
)

# Keep only the last dotted component of each column label.
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Disney Store,Toy / Game Store,43.775537,-79.256833
1,American Eagle Outfitters,Clothing Store,43.775908,-79.258352
2,SEPHORA,Cosmetics Shop,43.775017,-79.258109
3,DAVIDsTEA,Tea Room,43.776613,-79.258516
4,Tommy Hilfiger Company Store,Clothing Store,43.776015,-79.257369


And how many venues were returned by Foursquare?

In [27]:
# Number of venue rows in the flattened response.
print('{} venues were returned by Foursquare.'.format(len(nearby_venues)))

45 venues were returned by Foursquare.


### Explore Neighborhoods in Scarborough

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare "explore" venues around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are zipped together.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        With no locations (or no venues) the frame is empty but still has
        these seven columns. A venue without categories gets None as its
        'Venue Category'.

    Raises
    ------
    requests.HTTPError
        When the API answers with a 4xx/5xx status.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each location name as it is queried.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; fail loudly on HTTP errors instead of hitting
        # a KeyError on an error payload, and never wait forever
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the labels to the constructor also works for an empty result,
    # where assigning seven labels to a zero-column frame would raise.
    return pd.DataFrame(rows, columns=columns)

Now write the code to run the above function on each neighborhood and create a new dataframe called scarborough_venues.

In [29]:
# Query the venues around every Scarborough postal-code area.
scarborough_venues = getNearbyVenues(
    Scarborough_data['Neighbourhood'],
    Scarborough_data['Latitude'],
    Scarborough_data['Longitude'],
)


Rouge , Malvern 
Agincourt 
Cedarbrae 
Dorset Park , Scarborough Town Centre , Wexford Heights 
Guildwood , Morningside , West Hill 
Woburn 
Maryvale , Wexford 
Highland Creek , Rouge Hill , Port Union 
Cliffcrest , Cliffside , Scarborough Village West 
East Birchmount Park , Ionview , Kennedy Park 
Agincourt North , L'Amoreaux East , Milliken , Steeles East 
Clarks Corners , Sullivan , Tam O'Shanter 
Scarborough Village 
Upper Rouge 
Clairlea , Golden Mile , Oakridge 
Birch Cliff , Cliffside West 
L'Amoreaux West , Steeles West 


Let's check the size of the resulting dataframe

In [30]:
# (rows, columns) of the combined venue table, then a preview of its first rows.
print(scarborough_venues.shape)
scarborough_venues.head()

(84, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge , Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Agincourt,43.7942,-79.262029,Panagio's Breakfast & Lunch,43.79237,-79.260203,Breakfast Spot
2,Agincourt,43.7942,-79.262029,Subway,43.792823,-79.259681,Sandwich Place
3,Agincourt,43.7942,-79.262029,Twilight,43.791999,-79.258584,Lounge
4,Agincourt,43.7942,-79.262029,Commander Arena,43.794867,-79.267989,Skating Rink


Let's check how many venues were returned for each neighborhood

In [31]:
# Non-null count of every column per neighbourhood label.
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Agincourt North , L'Amoreaux East , Milliken , Steeles East",2,2,2,2,2,2
"Birch Cliff , Cliffside West",4,4,4,4,4,4
Cedarbrae,7,7,7,7,7,7
"Clairlea , Golden Mile , Oakridge",9,9,9,9,9,9
"Clarks Corners , Sullivan , Tam O'Shanter",9,9,9,9,9,9
"Cliffcrest , Cliffside , Scarborough Village West",2,2,2,2,2,2
"Dorset Park , Scarborough Town Centre , Wexford Heights",6,6,6,6,6,6
"East Birchmount Park , Ionview , Kennedy Park",5,5,5,5,5,5
"Guildwood , Morningside , West Hill",8,8,8,8,8,8


Let's find out how many unique categories can be curated from all the returned venues

In [32]:
# Count the distinct venue categories across all returned venues.
n_categories = len(scarborough_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(n_categories))

There are 54 uniques categories.


### Analyze Each Neighborhood

In [33]:
# One indicator column per venue category, labelled with the bare category name.
category_dummies = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the neighbourhood label as the last column ...
scarborough_onehot = category_dummies.assign(Neighborhood=scarborough_venues['Neighborhood'])

# ... then rotate that last column to the front.
ordered = list(scarborough_onehot.columns)
fixed_columns = ordered[-1:] + ordered[:-1]
scarborough_onehot = scarborough_onehot[fixed_columns]

scarborough_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,Bus Station,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,College Stadium,Construction & Landscaping,Department Store,Discount Store,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,General Entertainment,Grocery Store,Hakka Restaurant,History Museum,Hobby Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Lounge,Medical Center,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Motel,Nail Salon,Noodle House,Park,Pet Store,Pharmacy,Pizza Place,Playground,Rental Car Location,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Spa,Thai Restaurant,Vietnamese Restaurant
0,"Rouge , Malvern",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Agincourt,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Agincourt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,Agincourt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Agincourt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


And let's examine the new dataframe size.

In [34]:
# One row per venue; one column per category plus the neighbourhood label.
scarborough_onehot.shape

(84, 55)

Next, let's group the rows by neighborhood and take the mean of the frequency of occurrence of each category

In [35]:
# Mean of each indicator column per neighbourhood = share of that
# neighbourhood's venues that fall in the category.
category_freq = scarborough_onehot.groupby('Neighborhood').mean()
scarborough_grouped = category_freq.reset_index()
scarborough_grouped

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,Bus Station,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,College Stadium,Construction & Landscaping,Department Store,Discount Store,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,General Entertainment,Grocery Store,Hakka Restaurant,History Museum,Hobby Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Lounge,Medical Center,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Motel,Nail Salon,Noodle House,Park,Pet Store,Pharmacy,Pizza Place,Playground,Rental Car Location,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Spa,Thai Restaurant,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,"Agincourt North , L'Amoreaux East , Milliken ,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Birch Cliff , Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
3,Cedarbrae,0.0,0.0,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0
4,"Clairlea , Golden Mile , Oakridge",0.0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0
5,"Clarks Corners , Sullivan , Tam O'Shanter",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.111111,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0
6,"Cliffcrest , Cliffside , Scarborough Village W...",0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Dorset Park , Scarborough Town Centre , Wexfor...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
8,"East Birchmount Park , Ionview , Kennedy Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Guildwood , Morningside , West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0


Let's confirm the new size

In [36]:
# One row per neighbourhood label after grouping.
scarborough_grouped.shape

(16, 55)

Let's print each neighborhood along with the top 5 most common venues

In [37]:
num_top_venues = 5

for hood in scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose this neighbourhood's row so that each category becomes a record.
    hood_rows = scarborough_grouped.loc[scarborough_grouped['Neighborhood'] == hood]
    temp = hood_rows.T.reset_index()
    temp.columns = ['venue', 'freq']
    # Skip the first record (the neighbourhood label) and round the frequencies.
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Agincourt ----
               venue  freq
0             Lounge  0.25
1       Skating Rink  0.25
2     Breakfast Spot  0.25
3     Sandwich Place  0.25
4  Accessories Store  0.00


----Agincourt North , L'Amoreaux East , Milliken , Steeles East ----
                 venue  freq
0                 Park   0.5
1           Playground   0.5
2    Accessories Store   0.0
3  Japanese Restaurant   0.0
4    Korean Restaurant   0.0


----Birch Cliff , Cliffside West ----
                   venue  freq
0        College Stadium  0.25
1           Skating Rink  0.25
2  General Entertainment  0.25
3                   Café  0.25
4              Pet Store  0.00


----Cedarbrae ----
                  venue  freq
0  Caribbean Restaurant  0.14
1    Athletics & Sports  0.14
2       Thai Restaurant  0.14
3                Bakery  0.14
4                  Bank  0.14


----Clairlea , Golden Mile , Oakridge ----
          venue  freq
0        Bakery  0.22
1      Bus Line  0.22
2  Intersection  0.11
3          Par

Let's put that into a pandas dataframe  
First, let's write a function to sort the venues in descending order.

In [38]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in ``row``.

    The first element of ``row`` is skipped by position, the remaining values
    are sorted in descending order, and the labels of the leading
    ``num_top_venues`` entries are returned as an array. If fewer entries
    remain than requested, all of them are returned.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [113]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # The first three ranks take 'st'/'nd'/'rd'; every later rank uses 'th'.
    # An explicit bounds check replaces the former bare `except:`, which would
    # also have hidden any unrelated error raised inside the loop.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

# fill each row with that neighborhood's top venue categories, left to right
for ind in np.arange(scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Sandwich Place,Lounge,Breakfast Spot,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
1,"Agincourt North , L'Amoreaux East , Milliken ,...",Playground,Park,Vietnamese Restaurant,Chinese Restaurant,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
2,"Birch Cliff , Cliffside West",General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,Fried Chicken Joint,Fast Food Restaurant
3,Cedarbrae,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,Thai Restaurant,Vietnamese Restaurant,Construction & Landscaping,History Museum
4,"Clairlea , Golden Mile , Oakridge",Bakery,Bus Line,Soccer Field,Intersection,Fast Food Restaurant,Park,Metro Station,Vietnamese Restaurant,College Stadium,Grocery Store


### Cluster Neighborhoods

Run k-means to cluster the neighborhoods into 5 clusters.

In [114]:
# set number of clusters
kclusters = 5

# Drop the name column so only the venue-frequency columns are clustered.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`)
# is deprecated and rejected by recent pandas releases.
scarborough_grouped_clustering = scarborough_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state keeps the labels reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [115]:
# add clustering labels as the first column
# DataFrame.insert raises ValueError if the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [126]:
# Start from the Scarborough postcode table, keeping every row except 'M1X'.
# NOTE(review): presumably M1X is excluded because it has no row in the venue
# table — confirm against the earlier cells.
keep_rows = Scarborough_data['Postcode'] != 'M1X'
scarborough_merged = Scarborough_data[keep_rows]

scarborough_merged


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
1,M1S,Scarborough,Agincourt,43.7942,-79.262029
2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
3,M1P,Scarborough,"Dorset Park , Scarborough Town Centre , Wexfor...",43.75741,-79.273304
4,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
5,M1G,Scarborough,Woburn,43.770992,-79.216917
6,M1R,Scarborough,"Maryvale , Wexford",43.750072,-79.295849
7,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497
8,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village W...",43.716316,-79.239476
9,M1K,Scarborough,"East Birchmount Park , Ionview , Kennedy Park",43.727929,-79.262029


In [127]:
# Preview the first rows of the venue table.
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Agincourt,Skating Rink,Sandwich Place,Lounge,Breakfast Spot,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
1,1,"Agincourt North , L'Amoreaux East , Milliken ,...",Playground,Park,Vietnamese Restaurant,Chinese Restaurant,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
2,0,"Birch Cliff , Cliffside West",General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,Fried Chicken Joint,Fast Food Restaurant
3,0,Cedarbrae,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,Thai Restaurant,Vietnamese Restaurant,Construction & Landscaping,History Museum
4,0,"Clairlea , Golden Mile , Oakridge",Bakery,Bus Line,Soccer Field,Intersection,Fast Food Restaurant,Park,Metro Station,Vietnamese Restaurant,College Stadium,Grocery Store


In [128]:
# Attach the cluster label and top-venue columns to each postcode row by
# matching 'Neighbourhood' here against 'Neighborhood' in the venue table.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
scarborough_merged = scarborough_merged.join(venues_by_neighborhood, on='Neighbourhood')
scarborough_merged  # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353,2,Fast Food Restaurant,Vietnamese Restaurant,Coffee Shop,History Museum,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store
1,M1S,Scarborough,Agincourt,43.7942,-79.262029,0,Skating Rink,Sandwich Place,Lounge,Breakfast Spot,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,Thai Restaurant,Vietnamese Restaurant,Construction & Landscaping,History Museum
3,M1P,Scarborough,"Dorset Park , Scarborough Town Centre , Wexfor...",43.75741,-79.273304,0,Indian Restaurant,Pet Store,Chinese Restaurant,Latin American Restaurant,Vietnamese Restaurant,Skating Rink,Hakka Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
4,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711,0,Rental Car Location,Electronics Store,Medical Center,Breakfast Spot,Pizza Place,Mexican Restaurant,Intersection,Spa,Fast Food Restaurant,Coffee Shop
5,M1G,Scarborough,Woburn,43.770992,-79.216917,4,Coffee Shop,Korean Restaurant,Vietnamese Restaurant,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
6,M1R,Scarborough,"Maryvale , Wexford",43.750072,-79.295849,0,Accessories Store,Shopping Mall,Breakfast Spot,Bakery,Sandwich Place,Middle Eastern Restaurant,Auto Garage,Smoke Shop,General Entertainment,Coffee Shop
7,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497,3,History Museum,Bar,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
8,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village W...",43.716316,-79.239476,0,American Restaurant,Motel,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
9,M1K,Scarborough,"East Birchmount Park , Ionview , Kennedy Park",43.727929,-79.262029,0,Hobby Shop,Coffee Shop,Discount Store,Bus Station,Department Store,History Museum,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint


Finally, let's visualize the resulting clusters

In [136]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(scarborough_merged['Latitude'], scarborough_merged['Longitude'], scarborough_merged['Neighbourhood'], scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels are 0-based, so they index the palette directly. The
    # previous `cluster-1` only worked because label 0 wrapped to index -1.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.
#### Cluster 1

In [131]:
# Rows with cluster label 0: show column 1 plus every column from position 5 on.
cluster_label = 0
shown_columns = scarborough_merged.columns[[1, *range(5, scarborough_merged.shape[1])]]
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == cluster_label, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,0,Skating Rink,Sandwich Place,Lounge,Breakfast Spot,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
2,Scarborough,0,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,Thai Restaurant,Vietnamese Restaurant,Construction & Landscaping,History Museum
3,Scarborough,0,Indian Restaurant,Pet Store,Chinese Restaurant,Latin American Restaurant,Vietnamese Restaurant,Skating Rink,Hakka Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
4,Scarborough,0,Rental Car Location,Electronics Store,Medical Center,Breakfast Spot,Pizza Place,Mexican Restaurant,Intersection,Spa,Fast Food Restaurant,Coffee Shop
6,Scarborough,0,Accessories Store,Shopping Mall,Breakfast Spot,Bakery,Sandwich Place,Middle Eastern Restaurant,Auto Garage,Smoke Shop,General Entertainment,Coffee Shop
8,Scarborough,0,American Restaurant,Motel,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
9,Scarborough,0,Hobby Shop,Coffee Shop,Discount Store,Bus Station,Department Store,History Museum,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint
11,Scarborough,0,Pizza Place,Noodle House,Chinese Restaurant,Thai Restaurant,Italian Restaurant,Fried Chicken Joint,Fast Food Restaurant,Pharmacy,General Entertainment,Electronics Store
14,Scarborough,0,Bakery,Bus Line,Soccer Field,Intersection,Fast Food Restaurant,Park,Metro Station,Vietnamese Restaurant,College Stadium,Grocery Store
15,Scarborough,0,General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,Fried Chicken Joint,Fast Food Restaurant


#### Cluster 2

In [132]:
# Rows with cluster label 1: show column 1 plus every column from position 5 on.
cluster_label = 1
shown_columns = scarborough_merged.columns[[1, *range(5, scarborough_merged.shape[1])]]
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == cluster_label, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Scarborough,1,Playground,Park,Vietnamese Restaurant,Chinese Restaurant,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
12,Scarborough,1,Playground,Construction & Landscaping,Vietnamese Restaurant,Chinese Restaurant,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store


#### Cluster 3

In [133]:
# Rows with cluster label 2: show column 1 plus every column from position 5 on.
cluster_label = 2
shown_columns = scarborough_merged.columns[[1, *range(5, scarborough_merged.shape[1])]]
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == cluster_label, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,2,Fast Food Restaurant,Vietnamese Restaurant,Coffee Shop,History Museum,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store


#### Cluster 4

In [134]:
# Rows with cluster label 3: show column 1 plus every column from position 5 on.
cluster_label = 3
shown_columns = scarborough_merged.columns[[1, *range(5, scarborough_merged.shape[1])]]
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == cluster_label, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Scarborough,3,History Museum,Bar,Vietnamese Restaurant,Coffee Shop,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store


#### Cluster 5

In [135]:
# Rows with cluster label 4: show column 1 plus every column from position 5 on.
cluster_label = 4
shown_columns = scarborough_merged.columns[[1, *range(5, scarborough_merged.shape[1])]]
scarborough_merged.loc[scarborough_merged['Cluster Labels'] == cluster_label, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough,4,Coffee Shop,Korean Restaurant,Vietnamese Restaurant,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store


***