# Part 1: Segmentation and Clustering Neighborhoods in Toronto

## 1. Setup environment

In [376]:
# Install missing libraries
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install requests
!pip install geocoder
!pip install folium



{'tags': ['remove_output']}

In [327]:
# Import libraries
# Standard library
import json # library to handle JSON files
import os # read the Foursquare credentials from the environment

# Third-party
import folium # map rendering library
import geocoder
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from folium import plugins # MarkerCluster object instantiation
from geopy.geocoders import Nominatim
from IPython.core.display import HTML # render of html output
from sklearn.cluster import KMeans # import k-means from clustering stage


print('Libraries imported.')

Libraries imported.


## 2. Read source html file

In [328]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of parsing an error page as if
# it were the article.
page_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page_response.raise_for_status()
source = page_response.text

# Use 'lxml' parser to read the html input
soup = BeautifulSoup(source, 'lxml')

#print(soup.prettify())

In [329]:
# Locate the first 'table' tag to help parse the information
source_table = soup.find('table')
# find() returns None when no tag matches; stop here with a clear message
# instead of failing later with an AttributeError on None
if source_table is None:
    raise ValueError("No <table> element found in the downloaded page")
#print(source_table.prettify())

In [330]:
# Identified tags
#  <tr> : identifies a table row
#  <th> : identifies column headers
#  <td> : identifies cell data

# Header texts and cell texts are collected in separate flat lists
output_df_columns = []
output_df_rows = []

# Walk every <tr> of the table and store the text of its headers (<th>) and
# cells (<td>). The '\n' characters are removed from each value as it is
# collected, so the lists are never re-scanned. Parsing errors are allowed to
# propagate: continuing with partially read lists would silently misalign the
# dataframe built from them.
for source_row in source_table.find_all('tr'):
    output_df_columns.extend(header.text.replace('\n', '') for header in source_row.find_all('th'))
    output_df_rows.extend(cell.text.replace('\n', '') for cell in source_row.find_all('td'))

## 3. Generate Dataframe

In [331]:
# Column names come from the table headers (<th> tags)
num_columns = len(output_df_columns)
num_fields = len(output_df_rows)
if num_columns == 0:
    # Without headers the flat cell list cannot be split into rows
    raise ValueError('No column headers were parsed from the source table')

# The cells (<td> tags) were collected as one flat list: cut it into
# consecutive chunks of num_columns values, one chunk per dataframe row.
# A trailing incomplete chunk is ignored. The frame is built in a single call
# instead of being grown one row at a time.
complete_fields = num_fields - num_fields % num_columns
output_df = pd.DataFrame(
    [output_df_rows[start:start + num_columns] for start in range(0, complete_fields, num_columns)],
    columns=output_df_columns,
)

## 4. Data Wrangling

### 1. Requirements Specifications
1. Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
2. The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
3. More than one neighborhood can exist in one postal code area, combine into one row with the neighborhoods separated with a comma
4. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [332]:
# Rename the columns first so the steps below do not depend on the exact
# header spelling scraped from the page (the table must have three columns,
# in this order, otherwise the assignment raises)
output_df.columns = ['PostalCode','Borough','Neighborhood']
# Keep only the rows with an assigned borough
output_df = output_df[output_df['Borough'] != 'Not assigned']
# A 'Not assigned' neighborhood takes the name of its borough. This is done
# before the join so the placeholder can never end up inside a joined string.
output_df = output_df.assign(
    Neighborhood=output_df['Neighborhood'].where(
        output_df['Neighborhood'] != 'Not assigned', output_df['Borough']
    )
)
# Group by PO code and borough, joining neighborhoods with a comma
output_df = output_df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()

### 2. Check output dataframe

In [333]:
# Column Name types
output_df.dtypes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

In [334]:
# Size of output dataframe
output_df.shape

(103, 3)

In [335]:
# Output dataframe
output_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## 5. Get latitude/longitude data

### 1. Requirements specifications
Get the latitude and the longitude coordinates of each neighborhood

In [336]:
# Using the provided csv with the necessary information of latitude/longitude.
# The file is fetched with requests rather than a shell command so the cell
# works on any platform and stops on a failed download instead of reporting
# success unconditionally.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
with open('toronto_data.csv', 'wb') as csv_file:
    csv_file.write(geo_response.content)
print('Data downloaded!')

Data downloaded!


### 2. Read csv file
Convert data to dataframe

In [337]:
toronto_geo_df = pd.read_csv('toronto_data.csv')
toronto_geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 3. Check data integrity

In [338]:
# Check if data from both tables are sorted in the same order
output_df.loc[:,'PostalCode'].equals(toronto_geo_df.loc[:,'Postal Code'])

True

In [339]:
# Attach the coordinates by matching on the postal code itself instead of
# relying on both tables being in the same row order.
# - Any coordinate columns left over from a previous run are dropped first, so
#   re-running the cell does not duplicate them.
# - validate='m:1' raises if a postal code appears more than once in the
#   coordinates table, which would otherwise duplicate rows silently.
geo_by_postal_code = toronto_geo_df.rename(columns={'Postal Code': 'PostalCode'})
output_df = (
    output_df
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(geo_by_postal_code, on='PostalCode', how='left', validate='m:1')
)

In [340]:
# Final dataframe information
output_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [341]:
print('The dataframe has {} boroughs: {}.'.format(len(output_df['Borough'].unique()), output_df.Borough.unique()))


The dataframe has 11 boroughs: ['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' 'Mississauga' 'Etobicoke'
 "Queen's Park"].


## 6. Visualization Map

### 1. Gather data

In [342]:
# Proceed to fill nominatim inputs
address = 'Toronto, ON'
# user-agent: to_explorer
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; report that
# explicitly instead of failing on the attribute access below
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### 2. Borough density map
A MarkerCluster is preferred over individual CircleMarkers here for simplicity


In [345]:

# Base map centred on the Toronto coordinates
city_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# A single MarkerCluster layer, attached to the map, receives every marker
boroughs = plugins.MarkerCluster()
boroughs.add_to(city_map)

# One marker per dataframe row, labelled "<borough>-<postal code>"
marker_fields = output_df[['Latitude', 'Longitude', 'Borough', 'PostalCode']]
for lat, lng, borough, pocode in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}-{}'.format(borough, pocode), parse_html=True)
    marker = folium.Marker(location=[lat, lng], icon=None, popup=popup)
    marker.add_to(boroughs)

# render as html
HTML(city_map._repr_html_())

## 7. Exploring Boroughs in Toronto

### 1. Foursquare service credentials

In [369]:
# Foursquare service credentials.
# The values are read from the environment so real keys never have to be typed
# into (and saved with) the notebook. The placeholders remain the fallback when
# the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '{my_id}') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '{my_key}') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# The secret is masked: cell output is stored inside the notebook file
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: {my_id}
CLIENT_SECRET:{my_key}


### 2. Explore nearby venues 
Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

In [347]:
# Use previous function to get venue data from Foursquare
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=100):
    """Collect the venues returned by the Foursquare 'explore' endpoint for each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of every location to explore. They are iterated
        in parallel with zip(), so iteration stops at the shortest one.
    radius : int, default 1000
        Sent as the ``radius`` query parameter.
    limit : int, default 100
        Sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'PostalCode',
        'PostalCode Latitude', 'PostalCode Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but keeps these columns, when there are no
        locations or no venues. 'Venue Category' is None for a venue whose
        category list is empty.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.

    Side effects
    ------------
    Appends the raw items of every response to 'city_venues.json', one JSON
    document per line.
    """
    venue_columns = ['PostalCode',
                     'PostalCode Latitude',
                     'PostalCode Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Fail on an HTTP error rather than on a missing key in the error payload
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # Keep the raw payload; the newline separates the appended documents
        # so the file can be read back line by line
        with open('city_venues.json', 'a+') as f:
            json.dump(results, f)
            f.write('\n')

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come without any category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also covers the empty case
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [348]:
# call function to retrieve desired data about venues in Toronto
toronto_venues = getNearbyVenues(names=output_df['PostalCode'],
                                   latitudes=output_df['Latitude'],
                                   longitudes=output_df['Longitude']
                                  )

In [349]:
# Retrieved venues data
print('Number of venues in Toronto {}'.format(toronto_venues.shape))
toronto_venues.head()

Number of venues in Toronto (4933, 7)


Unnamed: 0,PostalCode,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,M1B,43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
3,M1B,43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
4,M1B,43.806686,-79.194353,Harvey's,43.800106,-79.198258,Fast Food Restaurant


In [370]:
# Let's check how many venues per POstal Code
toronto_venues.groupby(['PostalCode']).count().head()

Unnamed: 0_level_0,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,16,16,16,16,16,16
M1C,5,5,5,5,5,5
M1E,23,23,23,23,23,23
M1G,10,10,10,10,10,10
M1H,30,30,30,30,30,30


In [351]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 328 uniques categories.


### 3. Analyze each PO Code
Calculate which venue categories are most common in each postal code; this will be needed later for clustering

In [352]:
# One indicator column per venue category (no prefix on the column names)
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Add the postal code of each venue; the new column lands at the end
toronto_onehot = category_dummies.assign(PostalCode=toronto_venues['PostalCode'])

# Rotate the last column (the postal code) to the first position
fixed_columns = list(toronto_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [353]:
# Evaluate the weight of every feature by grouping and calculating mean values
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333,0.0


In [354]:
# Number of top venue categories to look at per postal code
num_top_venues = 5

# For every postal code, build a two-column (venue, freq) table of its category
# frequencies rounded to two decimals. The tables can only be inspected through
# the commented-out prints below; nothing is stored from this loop.
for pocode in toronto_grouped['PostalCode']:
    #print("----"+pocode+"----")
    temp = toronto_grouped[toronto_grouped['PostalCode'] == pocode].T.reset_index()
    temp.columns = ['venue','freq']
    # Skip the first row (it holds the PostalCode label) and convert in one
    # non-mutating chain; assigning a column on the iloc slice would trigger
    # pandas' SettingWithCopyWarning
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    #print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    #print('\n')

In [355]:
# Function that returns sorted venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest values in ``row``, skipping its first entry.

    Parameters
    ----------
    row : pandas.Series
        Its first position is ignored; the remaining entries are ranked by value.
    num_top_venues : int
        Number of labels to keep from the top of the ranking.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, highest value first, cut to
        ``num_top_venues`` items.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index
    return ranked_labels.values[:num_top_venues]

In [356]:
# Generate dataframe with most common venues per Postal Code
num_top_venues = 5
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: the first three ranks use
# the suffixes in `indicators`, every later rank uses 'th'
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# rank the categories of every postal code, then build the dataframe in a
# single call instead of filling it one row at a time
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in np.arange(toronto_grouped.shape[0])
]
postalcode_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
postalcode_venues_sorted.insert(0, 'PostalCode', toronto_grouped['PostalCode'])

postalcode_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Fast Food Restaurant,Coffee Shop,Trail,Paper / Office Supplies Store,Chinese Restaurant
1,M1C,Burger Joint,Park,Breakfast Spot,Playground,Italian Restaurant
2,M1E,Pizza Place,Fast Food Restaurant,Coffee Shop,Sports Bar,Food & Drink Shop
3,M1G,Park,Coffee Shop,Chinese Restaurant,Pharmacy,Mobile Phone Shop
4,M1H,Bakery,Coffee Shop,Gas Station,Indian Restaurant,Pharmacy


### 4. Cluster PO Code
Use the K-Means algorithm to cluster the data

In [357]:
# set number of clusters
kclusters = 5

# Features only: the postal code is an identifier, not a clustering input.
# The keyword form is required because the positional axis argument of drop()
# was removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='PostalCode')

# run k-means clustering
# n_init is given explicitly (10 was the long-standing default) so the result
# does not change with the scikit-learn version installed
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ += 1 # reserved label=0 for PO code with no venues
kmeans.labels_

array([1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 2, 4, 5, 2, 2,
       4, 1, 4, 2, 2, 1, 1, 2, 4, 3, 1, 1, 1, 4, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 4, 2, 2, 2, 2, 1, 4, 1, 2, 2, 2, 2, 2, 2, 4,
       1, 2, 4, 2, 4, 4, 2, 4, 4, 1, 1, 1, 1, 2], dtype=int32)

In [358]:
# add clustering labels; a copy left by a previous run of this cell is dropped
# first, because insert() refuses to add a column that already exists
if 'Cluster Labels' in postalcode_venues_sorted.columns:
    postalcode_venues_sorted = postalcode_venues_sorted.drop(columns='Cluster Labels')
postalcode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# start from the original data and add the cluster label and the most common
# venues of every postal code (join() returns a new frame, output_df is untouched)
toronto_merged = output_df.join(postalcode_venues_sorted.set_index('PostalCode'), on='PostalCode')

# postal codes without venues have no match in the join and come back as NaN;
# they are filled with 0, which is the label reserved for "no venues"
toronto_merged = toronto_merged.fillna(0)
# the NaN values turned the labels into floats; astype() returns a new frame
# and takes no inplace argument
toronto_merged = toronto_merged.astype({'Cluster Labels':'int32'})

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,1,Fast Food Restaurant,Coffee Shop,Trail,Paper / Office Supplies Store,Chinese Restaurant
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,4,Burger Joint,Park,Breakfast Spot,Playground,Italian Restaurant
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,1,Pizza Place,Fast Food Restaurant,Coffee Shop,Sports Bar,Food & Drink Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,Park,Coffee Shop,Chinese Restaurant,Pharmacy,Mobile Phone Shop
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Bakery,Coffee Shop,Gas Station,Indian Restaurant,Pharmacy


In [359]:
# check data integrity
toronto_merged.dtypes

PostalCode                object
Borough                   object
Neighborhood              object
Latitude                 float64
Longitude                float64
Cluster Labels             int32
1st Most Common Venue     object
2nd Most Common Venue     object
3rd Most Common Venue     object
4th Most Common Venue     object
5th Most Common Venue     object
dtype: object

### 5. Map Visualization
Plot the final clustered dataframe

In [362]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# End points of the optional guide lines; they are only referenced by the
# commented-out PolyLine calls at the end of this cell
points_a = [[43.82,-79.34],[43.67,-79.28]]
points_b = [[43.71,-79.295],[43.70,-79.43]]
points_c = [[43.77,-79.53],[43.70,-79.43]]
points_d = [[43.64,-79.46],[43.70,-79.43]]

# set color scheme for the clusters: kclusters+1 evenly spaced rainbow colours,
# one for each label from 0 to kclusters
colors_array = cm.rainbow(np.linspace(0, 1, kclusters + 1))
rainbow = [colors.rgb2hex(i) for i in colors_array]


# add markers to the map
for lat, lon, borough, pocode, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], 
                                              toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(borough) + '-' + str(pocode) + ' Cluster ' + str(cluster), parse_html=True)
    # label k uses colour k-1; the modulo makes explicit that label 0 wraps
    # around to the last colour of the list
    marker_color = rainbow[(cluster - 1) % len(rainbow)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
    
# Plot lines that represent the neighboorhoods: Scarborough, North York, East york and Etobicoke
#folium.PolyLine(points_a, color="green", weight=2.5, opacity=0.5).add_to(map_clusters)
#folium.PolyLine(points_b, color="green", weight=2.5, opacity=0.5).add_to(map_clusters)
#folium.PolyLine(points_c, color="green", weight=2.5, opacity=0.5).add_to(map_clusters)
#folium.PolyLine(points_d, color="green", weight=2.5, opacity=0.5).add_to(map_clusters)

HTML(map_clusters._repr_html_())

### 6. Examine clusters
Rationale behind the clustering
Note: On the first attempt at clustering, the map showed all the nodes in just two clusters. It looked like all the points were isolated, so I increased the radius of the request to Foursquare from 500m to 1000m. That is still within walking distance from {HOME}, and the results are quite different.

#### 1. Cluster = 0. Only one single entry, located in the upper right side of the map, in red color. No venues, represents Rouge National Park area.

In [363]:
# Rows labelled 0, with every column of the merged frame
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(0)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,0,0,0,0,0,0


#### 2. Cluster = 1. The Scarborough area is represented in this cluster, plus nearby areas of East York and North York. There are also some outliers from the Etobicoke area.

In [371]:
# First rows labelled 1, with every column of the merged frame
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(1)].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,1,Fast Food Restaurant,Coffee Shop,Trail,Paper / Office Supplies Store,Chinese Restaurant
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,1,Pizza Place,Fast Food Restaurant,Coffee Shop,Sports Bar,Food & Drink Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,Park,Coffee Shop,Chinese Restaurant,Pharmacy,Mobile Phone Shop
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Bakery,Coffee Shop,Gas Station,Indian Restaurant,Pharmacy
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1,Ice Cream Shop,Japanese Restaurant,Restaurant,Fast Food Restaurant,Coffee Shop


#### 3. Cluster = 2. Mainly represents North and East York Toronto area.

In [372]:
# First rows labelled 2, with every column of the merged frame
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(2)].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
18,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,2,Clothing Store,Coffee Shop,Fast Food Restaurant,Sandwich Place,Juice Bar
21,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493,2,Korean Restaurant,Café,Coffee Shop,Pizza Place,Middle Eastern Restaurant
22,M2N,North York,Willowdale South,43.77012,-79.408493,2,Coffee Shop,Ramen Restaurant,Korean Restaurant,Fast Food Restaurant,Japanese Restaurant
26,M3B,North York,Don Mills North,43.745906,-79.352188,2,Pizza Place,Japanese Restaurant,Coffee Shop,Burger Joint,Office
27,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923,2,Gym,Coffee Shop,Restaurant,Asian Restaurant,Supermarket


#### 4. Cluster = 3. Although it belongs to North York, it is represented as a single centroid, close to green park areas.

In [374]:
# Rows labelled 3, with every column of the merged frame
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(3)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
32,M3M,North York,Downsview Central,43.728496,-79.495697,3,Vietnamese Restaurant,Thai Restaurant,Baseball Field,Zoo,Field


#### 5. Cluster = 4. Mainly represents Etobicoke area, some outliers in Downtown Toronto and Scarborough.

In [375]:
# First rows labelled 4, with every column of the merged frame
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(4)].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,4,Burger Joint,Park,Breakfast Spot,Playground,Italian Restaurant
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848,4,Park,Restaurant,Thai Restaurant,Skating Rink,Diner
19,M2K,North York,Bayview Village,43.786947,-79.385975,4,Japanese Restaurant,Gas Station,Bank,Grocery Store,Chinese Restaurant
23,M2P,North York,York Mills West,43.752758,-79.400049,4,Park,Coffee Shop,Restaurant,Dog Run,French Restaurant
25,M3A,North York,Parkwoods,43.753259,-79.329656,4,Park,Shopping Mall,Bus Stop,Convenience Store,Pharmacy


#### 6. Cluster = 5. Although located in North York, it shows high school areas and is isolated in a single cluster.

In [368]:
# Rows labelled 5, with every column of the merged frame
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(5)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
20,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714,5,Park,Pool,Zoo,Filipino Restaurant,Ethiopian Restaurant


### End of Part 1