# The first two parts of this code are copied from "Segmentation and Clustering of Neighborhoods - Part 1 & Part 2 - KR #

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# first step is to get the url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
link = requests.get(url).text
toronto = BeautifulSoup(link,'lxml')

In [3]:
# extracting the information and putting it in the dataframe

# creating the dataframe
column_name = ['Postalcode', 'Borough', 'Neighborhood']
toronto_df = pd.DataFrame(columns = column_name)

# going through toronto to extract the Postal Code, Borough, and Neighborhood
info = toronto.find('div', class_='mw-parser-output')
table = info.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    toronto_df = toronto_df.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

In [4]:
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


Getting rid of Boroughs with a value of "Not assigned"

In [5]:
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park


Getting rid of the first row (all 0's) and re-naming Postalcode to Postal Code

In [6]:
toronto_df = toronto_df[toronto_df.Borough != 0]
toronto_df.rename(columns = {'Postalcode': 'Postal Code'}, inplace = True)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


Removing Neighborhoods that have a value of "Not assigned" and consolidating Borough's which have multiple Neighborhoods listed

In [7]:
toronto_df[toronto_df.Neighborhood == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [8]:
toronto_df[toronto_df.Borough == "Queen's Park"]

# Borough was erased in code that was previously run

Unnamed: 0,Postal Code,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [9]:
# Consolidating the neighborhoods

toronto_df = toronto_df.groupby(["Postal Code", "Borough"])['Neighborhood'].apply(', '.join).reset_index()

In [10]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Part 2 - Segmenting the neigborhoods by including latitude and longitude information #

In [11]:
import numpy as np

In [12]:
from geopy.geocoders import Nominatim

In [13]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [15]:
address1 = 'M1B, Toronto, Ontario'

location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('{}, {}'.format(latitude, longitude))

43.653963, -79.387207


In [16]:
address2 = 'M6P, Toronto, Ontario'

location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('{}, {}'.format(latitude, longitude))

43.653963, -79.387207


#### Installed the geocoder library as I kept getting the same coordinates regardless of postal code ####

In [17]:
import geocoder

In [18]:
def get_coordinates(postal_code):
    # initialize your variable to None
    coordinates = None
    # loop until you get the coordinates
    while(coordinates is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        coordinates = g.latlng
    return coordinates
    
get_coordinates('M6P')

[43.659935000000075, -79.46301926299998]

In [19]:
get_coordinates('M1B')

[43.811525000000074, -79.19551746399998]

#### Now that I got different coordinates, it's time to do this to the entire dataframe ####

In [20]:
pc = toronto_df['Postal Code']

lat_lng = [get_coordinates(codes) for codes in pc.tolist()]

In [21]:
df_latlong = pd.DataFrame(lat_lng, columns = ['Latitude', "Longitude"])
toronto_df['Latitude'] = df_latlong['Latitude']
toronto_df['Longitude'] = df_latlong['Longitude']

In [22]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


# Part 3 - Clustering the dataset #

In [23]:
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors


In [24]:
from sklearn.cluster import KMeans
import folium

In [25]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [26]:
toronto_df.Borough.value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

#### Since North York has the highest number of Postal Codes, this is the Borough I will focus on ####

In [27]:
# creating a map of Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Segmenting and Clustering North York

In [28]:
df_northy = toronto_df[toronto_df['Borough'] == "North York"].reset_index(drop = True)
df_northy.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.802845,-79.356207
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.78097,-79.347813
2,M2K,North York,Bayview Village,43.781015,-79.380529
3,M2L,North York,"Silver Hills, York Mills",43.757095,-79.38032
4,M2M,North York,"Newtonbrook, Willowdale",43.791475,-79.413605


Getting the geographical coordinates of North York

In [29]:
address_ny = "North York, Ontario"

geolocator1 = Nominatim(user_agent="ny_explorer")
location1 = geolocator1.geocode(address_ny)
latitude1 = location1.latitude
longitude1 = location1.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude1, longitude1))

The geograpical coordinate of North York are 43.7708175, -79.4132998.


Creating a map of North York

In [30]:
map_ny = folium.Map(location=[latitude1, longitude1], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_northy['Latitude'], df_northy['Longitude'], df_northy['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny

Next step is to use the Foursqaure API to explore each neighborhood and segment them

Exploring the first neighborhood listed in the North York dataframe.

In [31]:
df_northy.Neighborhood[0]

'Hillcrest Village'

In [32]:
hv_long = df_northy.loc[0,'Longitude']
hv_lat = df_northy.loc[0,'Latitude']
hv_name = df_northy.loc[0,'Neighborhood']

print('Latitude and Longitude of {} is {},{}.'.format(hv_name, hv_lat, hv_long))

Latitude and Longitude of Hillcrest Village is 43.80284500000005,-79.35620744999994.


Reviewing the top 100 venues in Hillcrest Village

In [35]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, hv_lat, hv_long, VERSION, radius, LIMIT)

In [36]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5db67824f96b2c002c272965'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.80734500450005,
    'lng': -79.34998403092742},
   'sw': {'lat': 43.798344995500045, 'lng': -79.36243086907247}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4f2e83dbe4b062ad2c20dd96',
       'name': 'Woodbrooke Estate',
       'location': {'address': '3740 Don Mills Road',
        'crossStreet': '& the Byways',
        'lat': 43.802067347853196,
        'lng': -79.35434683847974,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.802067347853196,
          'lng': -79.35434683847974}],
        'distance': 172,
        'posta

In [37]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [38]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Woodbrooke Estate,Residential Building (Apartment / Condo),43.802067,-79.354347
1,Duncan Creek Park,Dog Run,43.805539,-79.360695


In [39]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


In [40]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [41]:
york_venues = getNearbyVenues(names=df_northy['Neighborhood'],
                                   latitudes=df_northy['Latitude'],
                                   longitudes=df_northy['Longitude']
                                  )

Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Bedford Park, Lawrence Manor East
Lawrence Heights, Lawrence Manor
Glencairn
Downsview, North Park, Upwood Park
Humber Summit
Emery, Humberlea


In [42]:
## size of the dataframe
print(york_venues.shape)
york_venues.head()

(284, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.802845,-79.356207,Woodbrooke Estate,43.802067,-79.354347,Residential Building (Apartment / Condo)
1,Hillcrest Village,43.802845,-79.356207,Duncan Creek Park,43.805539,-79.360695,Dog Run
2,"Fairview, Henry Farm, Oriole",43.78097,-79.347813,The LEGO Store,43.778207,-79.343483,Toy / Game Store
3,"Fairview, Henry Farm, Oriole",43.78097,-79.347813,SilverCity Fairview Mall Cinemas,43.778681,-79.344085,Movie Theater
4,"Fairview, Henry Farm, Oriole",43.78097,-79.347813,CF Fairview Mall,43.777994,-79.343665,Shopping Mall


In [43]:
print('There are {} uniques categories.'.format(len(york_venues['Venue Category'].unique())))

There are 114 uniques categories.


## Analyzing each Neighborhood in North York

In [44]:
northy_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
northy_onehot['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [northy_onehot.columns[-1]] + list(northy_onehot.columns[:-1])
northy_onehot = northy_onehot[fixed_columns]

northy_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
northy_onehot.shape

(284, 115)

In [46]:
york_grouped = northy_onehot.groupby('Neighborhood').mean().reset_index()
york_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.047619,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto, Downsview East",0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Downsview Central,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Downsview Northwest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0
7,Downsview West,0.0,0.0,0.0,0.0,0.0,0.083333,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0
8,"Downsview, North Park, Upwood Park",0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Emery, Humberlea",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
york_grouped.shape

(23, 115)

## Top 5 most common venues in each neighborhood

In [48]:
num_top_venues = 5

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Downsview North, Wilson Heights----
               venue  freq
0       Home Service   1.0
1  Accessories Store   0.0
2      Metro Station   0.0
3           Platform   0.0
4        Pizza Place   0.0


----Bayview Village----
                        venue  freq
0          Golf Driving Range  0.25
1  Construction & Landscaping  0.25
2                       Trail  0.25
3                        Park  0.25
4           Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.10
1         Coffee Shop  0.10
2           Juice Bar  0.05
3         Sports Club  0.05
4                 Pub  0.05


----CFB Toronto, Downsview East----
                venue  freq
0             Airport  0.25
1         Coffee Shop  0.25
2                Park  0.25
3          Food Court  0.25
4  Mexican Restaurant  0.00


----Don Mills North----
                        venue  freq
0  Construction & Landscaping   0.2
1                 Coffee

## Clustering the neighborhoods

In [49]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [50]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

for ind in np.arange(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Downsview North, Wilson Heights",Home Service,Women's Store,Frozen Yogurt Shop,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store,Falafel Restaurant
1,Bayview Village,Construction & Landscaping,Trail,Park,Golf Driving Range,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run
2,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Butcher,Indian Restaurant,Greek Restaurant,Juice Bar,Liquor Store,Fast Food Restaurant,Cosmetics Shop,Comfort Food Restaurant
3,"CFB Toronto, Downsview East",Airport,Coffee Shop,Park,Food Court,Women's Store,Frozen Yogurt Shop,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant
4,Don Mills North,Construction & Landscaping,Park,Coffee Shop,Burger Joint,Soccer Field,Women's Store,Fried Chicken Joint,Dessert Shop,Discount Store,Dog Run


In [51]:
# setting number of clusters to 5
kclusters = 5

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 3, 3, 0, 4, 3, 3, 0, 3])

### Visualizing the North York clusters

In [52]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = df_northy

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2H,North York,Hillcrest Village,43.802845,-79.356207,3.0,Residential Building (Apartment / Condo),Dog Run,Women's Store,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Eastern European Restaurant,Electronics Store
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.78097,-79.347813,3.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Food Court,Deli / Bodega,Liquor Store,Juice Bar,Japanese Restaurant,Wings Joint
2,M2K,North York,Bayview Village,43.781015,-79.380529,0.0,Construction & Landscaping,Trail,Park,Golf Driving Range,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run
3,M2L,North York,"Silver Hills, York Mills",43.757095,-79.38032,,,,,,,,,,,
4,M2M,North York,"Newtonbrook, Willowdale",43.791475,-79.413605,3.0,Café,Middle Eastern Restaurant,Korean Restaurant,Shopping Mall,Fast Food Restaurant,Sandwich Place,Fried Chicken Joint,Supermarket,Ramen Restaurant,Grocery Store


Removing the 4th row as it contains NaN values

In [53]:
york_merged.drop(3,axis = 0, inplace = True)
york_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2H,North York,Hillcrest Village,43.802845,-79.356207,3.0,Residential Building (Apartment / Condo),Dog Run,Women's Store,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Eastern European Restaurant,Electronics Store
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.78097,-79.347813,3.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Food Court,Deli / Bodega,Liquor Store,Juice Bar,Japanese Restaurant,Wings Joint
2,M2K,North York,Bayview Village,43.781015,-79.380529,0.0,Construction & Landscaping,Trail,Park,Golf Driving Range,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run
4,M2M,North York,"Newtonbrook, Willowdale",43.791475,-79.413605,3.0,Café,Middle Eastern Restaurant,Korean Restaurant,Shopping Mall,Fast Food Restaurant,Sandwich Place,Fried Chicken Joint,Supermarket,Ramen Restaurant,Grocery Store
5,M2N,North York,Willowdale South,43.768165,-79.40742,3.0,Coffee Shop,Fast Food Restaurant,Ramen Restaurant,Café,Fried Chicken Joint,Movie Theater,Pet Store,Discount Store,Pizza Place,Plaza


In [55]:
# converting the Cluster Labels column to an integer

york_merged['Cluster Labels'] = york_merged['Cluster Labels'].astype(int)

In [56]:
york_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2H,North York,Hillcrest Village,43.802845,-79.356207,3,Residential Building (Apartment / Condo),Dog Run,Women's Store,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Eastern European Restaurant,Electronics Store
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.78097,-79.347813,3,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Food Court,Deli / Bodega,Liquor Store,Juice Bar,Japanese Restaurant,Wings Joint
2,M2K,North York,Bayview Village,43.781015,-79.380529,0,Construction & Landscaping,Trail,Park,Golf Driving Range,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run
4,M2M,North York,"Newtonbrook, Willowdale",43.791475,-79.413605,3,Café,Middle Eastern Restaurant,Korean Restaurant,Shopping Mall,Fast Food Restaurant,Sandwich Place,Fried Chicken Joint,Supermarket,Ramen Restaurant,Grocery Store
5,M2N,North York,Willowdale South,43.768165,-79.40742,3,Coffee Shop,Fast Food Restaurant,Ramen Restaurant,Café,Fried Chicken Joint,Movie Theater,Pet Store,Discount Store,Pizza Place,Plaza


In [57]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining the Clusters

In [64]:
# First cluster

york_merged.loc[york_merged['Cluster Labels'] == 0, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North York,0,Construction & Landscaping,Trail,Park,Golf Driving Range,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run
8,North York,0,Park,Food & Drink Shop,Bus Stop,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant
9,North York,0,Construction & Landscaping,Park,Coffee Shop,Burger Joint,Soccer Field,Women's Store,Fried Chicken Joint,Dessert Shop,Discount Store,Dog Run
17,North York,0,Park,Grocery Store,Women's Store,Fried Chicken Joint,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store
21,North York,0,Park,Bakery,Women's Store,Frozen Yogurt Shop,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store


In [65]:
# Second Cluster

york_merged.loc[york_merged['Cluster Labels'] == 1, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,North York,1,Home Service,Women's Store,Frozen Yogurt Shop,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store,Falafel Restaurant


In [66]:
# 3rd Cluster

york_merged.loc[york_merged['Cluster Labels'] == 2, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,North York,2,Rental Car Location,Sporting Goods Shop,Women's Store,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant


In [67]:
# 4th Cluster

york_merged.loc[york_merged['Cluster Labels'] == 3, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,3,Residential Building (Apartment / Condo),Dog Run,Women's Store,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Eastern European Restaurant,Electronics Store
1,North York,3,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Food Court,Deli / Bodega,Liquor Store,Juice Bar,Japanese Restaurant,Wings Joint
4,North York,3,Café,Middle Eastern Restaurant,Korean Restaurant,Shopping Mall,Fast Food Restaurant,Sandwich Place,Fried Chicken Joint,Supermarket,Ramen Restaurant,Grocery Store
5,North York,3,Coffee Shop,Fast Food Restaurant,Ramen Restaurant,Café,Fried Chicken Joint,Movie Theater,Pet Store,Discount Store,Pizza Place,Plaza
6,North York,3,Convenience Store,Bank,Park,Speakeasy,Home Service,Hockey Arena,Department Store,Dessert Shop,Ice Cream Shop,Discount Store
7,North York,3,Coffee Shop,Park,Eastern European Restaurant,Bakery,Bus Line,Convenience Store,Intersection,Discount Store,Dog Run,Frozen Yogurt Shop
10,North York,3,Gym,Supermarket,Intersection,Coffee Shop,Bubble Tea Shop,Grocery Store,Smoke Shop,Beer Store,Food & Drink Shop,Fast Food Restaurant
12,North York,3,Coffee Shop,Bar,Furniture / Home Store,Caribbean Restaurant,Fast Food Restaurant,Restaurant,Miscellaneous Shop,Massage Studio,Bank,Japanese Restaurant
13,North York,3,Airport,Coffee Shop,Park,Food Court,Women's Store,Frozen Yogurt Shop,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant
14,North York,3,Convenience Store,Bank,Discount Store,Coffee Shop,Clothing Store,Electronics Store,Fast Food Restaurant,Beer Store,Pizza Place,Bakery


In [68]:
# 5th Cluster

york_merged.loc[york_merged['Cluster Labels'] == 4, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,North York,4,Insurance Office,Cosmetics Shop,Department Store,Dessert Shop,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store,Falafel Restaurant,Fast Food Restaurant
