### Scrap neighborhoods in Toronto - Wikipedia

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
#Scrapping with pandas

df= pd.read_html("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=862527922")[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [4]:
df = df.rename(columns={"Postcode": "PostalCode", "Neighbourhood": "Neighborhood"})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Ignore cells with a borough that is Not assigned.

df=df[df.Borough != 'Not assigned']
df = df.sort_values(by=['PostalCode','Borough'])

df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


In [6]:
## neighborhood will be the same as the borough.

df['Neighborhood']=df['Neighborhood'].replace('Not assigned', df['Borough'])

In [7]:
## combined into one row with the neighborhoods separated with a comma

df = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(list)

In [8]:
df = df.sample(frac=1).reset_index()

In [9]:
df['Neighborhood']=df['Neighborhood'].str.join(',')

In [10]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M2R,North York,Willowdale West
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West"
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
3,M1R,Scarborough,"Maryvale,Wexford"
4,M6M,York,"Del Ray,Keelsdale,Mount Dennis,Silverthorn"


In [11]:
df.shape

(103, 3)

Trying GeoCoder Package

In [12]:
## I tried geocoder but was not working

import geocoder

In [13]:
geocoordinates = pd.read_csv('Geospatial_Coordinates.csv')

In [14]:
geocoordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
geocoordinates = geocoordinates.rename(columns={"Postal Code": "PostalCode"})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M2R,North York,Willowdale West
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West"
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
3,M1R,Scarborough,"Maryvale,Wexford"
4,M6M,York,"Del Ray,Keelsdale,Mount Dennis,Silverthorn"


In [16]:
geocoordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
geocoordinates.shape

(103, 3)

In [18]:
newdf = pd.merge(df, geocoordinates, how='right')


In [19]:
newdf.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M2R,North York,Willowdale West,43.782736,-79.442259
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
3,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849
4,M6M,York,"Del Ray,Keelsdale,Mount Dennis,Silverthorn",43.691116,-79.476013
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M5K,Downtown Toronto,"Design Exchange,Toronto Dominion Centre",43.647177,-79.381576
7,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558
8,M6S,West Toronto,"Runnymede,Swansea",43.651571,-79.484450
9,M3B,North York,Don Mills North,43.745906,-79.352188


In [20]:
newdf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M2R,North York,Willowdale West,43.782736,-79.442259
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
3,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849
4,M6M,York,"Del Ray,Keelsdale,Mount Dennis,Silverthorn",43.691116,-79.476013


In [21]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#### Use geopy library to get the latitude and longitude values of Toronto


In [22]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [23]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(newdf['Latitude'], newdf['Longitude'], newdf['Borough'], newdf['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

#### Define Foursquare Credentials and Version

In [92]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


#### Let's explore the first neighborhood in our dataframe.

In [25]:
newdf.loc[0, 'Neighborhood']

'Willowdale West'

Get the neighborhood's latitude and longitude values.

In [26]:
neighborhood_latitude = newdf.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = newdf.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = newdf.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Willowdale West are 43.7827364, -79.4422593.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

In [27]:
# First, let's create the GET request URL. Name your URL **url**.

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=5JUO2XVTNTGAMQZ2BBW55NBR33CED2EXMDT2TLGNKVWM0EOE&client_secret=EXOFUWQRDSQSWD0PQ2XESIJLDRYJAQS4SAT30QZWELDHGIDV&v=20180605&ll=43.7827364,-79.4422593&radius=500&limit=100'

In [28]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e9ef64d949393001be9f7a7'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.7872364045, 'lng': -79.43603797460023},
   'sw': {'lat': 43.778236395499995, 'lng': -79.44848062539978}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b352657f964a520c42a25e3',
       'name': 'Tov-Li',
       'location': {'address': '5982 Bathurst St.',
        'crossStreet': 'at Rockford Rd.',
        'lat': 43.78421369713337,
        'lng': -79.44609771646198,
        'labeledLatLngs': [{'label': 'display',
          'lat':

In [29]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [30]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Tov-Li,Pizza Place,43.784214,-79.446098
1,Shoppers Drug Mart,Pharmacy,43.784847,-79.446028
2,Tim Hortons,Coffee Shop,43.78094,-79.444231
3,Price Chopper,Grocery Store,43.783237,-79.446339
4,RBC Royal Bank,Bank,43.783894,-79.446603


#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:

Toronto_venues = getNearbyVenues(names=newdf['Neighborhood'],
                                   latitudes=newdf['Latitude'],
                                   longitudes=newdf['Longitude']
                                  )


Willowdale West
Forest Hill North,Forest Hill West
The Kingsway,Montgomery Road,Old Mill North
Maryvale,Wexford
Del Ray,Keelsdale,Mount Dennis,Silverthorn
Central Bay Street
Design Exchange,Toronto Dominion Centre
Business reply mail Processing Centre969 Eastern
Runnymede,Swansea
Don Mills North
Highland Creek,Rouge Hill,Port Union
Harbourfront East,Toronto Islands,Union Station
The Beaches West,India Bazaar
Parkdale,Roncesvalles
Silver Hills,York Mills
Caledonia-Fairbanks
Cedarbrae
Adelaide,King,Richmond
Thorncliffe Park
Northwest
Glencairn
Leaside
Bedford Park,Lawrence Manor East
Humber Bay Shores,Mimico South,New Toronto
Church and Wellesley
Dovercourt Village,Dufferin
The Annex,North Midtown,Yorkville
Guildwood,Morningside,West Hill
Clarks Corners,Sullivan,Tam O'Shanter
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
The Junction North,Runnymede
Hillcrest Village
Queen's Park
Kingsview Village,Martin Grove Gardens,Richview Gardens,St. Phillips
Kingsway Park South West

#### Let's check the size of the resulting dataframe

In [33]:
print(Toronto_venues.shape)
Toronto_venues.head()

(2130, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Willowdale West,43.782736,-79.442259,Tov-Li,43.784214,-79.446098,Pizza Place
1,Willowdale West,43.782736,-79.442259,Shoppers Drug Mart,43.784847,-79.446028,Pharmacy
2,Willowdale West,43.782736,-79.442259,Tim Hortons,43.78094,-79.444231,Coffee Shop
3,Willowdale West,43.782736,-79.442259,Price Chopper,43.783237,-79.446339,Grocery Store
4,Willowdale West,43.782736,-79.442259,RBC Royal Bank,43.783894,-79.446603,Bank


In [34]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",94,94,94,94,94,94
Agincourt,4,4,4,4,4,4
"Agincourt North,L'Amoreaux East,Milliken,Steeles East",2,2,2,2,2,2
"Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",9,9,9,9,9,9
"Alderwood,Long Branch",9,9,9,9,9,9
"Bathurst Manor,Downsview North,Wilson Heights",20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
"Bedford Park,Lawrence Manor East",23,23,23,23,23,23
Berczy Park,57,57,57,57,57,57
"Birch Cliff,Cliffside West",4,4,4,4,4,4


#### Let's find out how many unique categories can be curated from all the returned venues

In [35]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 271 uniques categories.


## 3. Analyze Each Neighborhood


In [36]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()


Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
Toronto_onehot.shape

(2130, 271)

#### Next, let's group rows by PostalCode and by taking the mean of the frequency of occurrence of each category

In [38]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()


Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031915,...,0.0,0.0,0.010638,0.0,0.0,0.0,0.0,0.0,0.0,0.010638
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood,Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's confirm the new size

In [39]:
Toronto_grouped.shape

(101, 271)

In [42]:
# Let's print each PostalCode along with the top 5 most common venues

num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                 venue  freq
0          Coffee Shop  0.09
1                 Café  0.05
2           Restaurant  0.04
3       Clothing Store  0.03
4  American Restaurant  0.03


----Agincourt----
                       venue  freq
0  Latin American Restaurant  0.25
1             Breakfast Spot  0.25
2               Skating Rink  0.25
3                     Lounge  0.25
4                Yoga Studio  0.00


----Agincourt North,L'Amoreaux East,Milliken,Steeles East----
                        venue  freq
0                        Park   0.5
1                  Playground   0.5
2                 Yoga Studio   0.0
3    Mediterranean Restaurant   0.0
4  Modern European Restaurant   0.0


----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                  venue  freq
0         Grocery Store  0.22
1           Pizza Place  0.11
2            Beer Store  0.11
3  Fast Food Restaurant  0.11
4   Fried Chicken Join

In [43]:
# Let's put that into a *pandas* dataframe

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [44]:
# Now let's create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Restaurant,Gym,American Restaurant,Hotel,Deli / Bodega,Thai Restaurant,Clothing Store,Pizza Place
1,Agincourt,Latin American Restaurant,Skating Rink,Lounge,Breakfast Spot,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Playground,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Pizza Place,Fast Food Restaurant,Beer Store,Sandwich Place,Coffee Shop,Fried Chicken Joint,Pharmacy,Construction & Landscaping,Concert Hall
4,"Alderwood,Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Athletics & Sports,Pub,Skating Rink,Pharmacy,Department Store,Eastern European Restaurant


## 4. Cluster Neighborhoods

In [45]:
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 2, 1, 1, 1, 1, 1, 1, 1])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [46]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = newdf

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2R,North York,Willowdale West,43.782736,-79.442259,1.0,Grocery Store,Pharmacy,Bank,Pizza Place,Coffee Shop,Women's Store,Doner Restaurant,Diner,Discount Store,Distribution Center
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,1.0,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944,2.0,Park,Pool,River,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
3,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849,1.0,Middle Eastern Restaurant,Smoke Shop,Bakery,Breakfast Spot,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store
4,M6M,York,"Del Ray,Keelsdale,Mount Dennis,Silverthorn",43.691116,-79.476013,1.0,Fast Food Restaurant,Discount Store,Sandwich Place,Fried Chicken Joint,Women's Store,Dog Run,Dim Sum Restaurant,Diner,Distribution Center,Donut Shop


In [67]:
Toronto_merged['Cluster Labels'].isna().sum()

2

In [79]:
Toronto_Merged2 = Toronto_merged.dropna(subset=['Cluster Labels'])

In [82]:
Toronto_Merged2['Cluster Labels'].isna().sum()

0

In [91]:
Toronto_Merged2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2R,North York,Willowdale West,43.782736,-79.442259,1.0,Grocery Store,Pharmacy,Bank,Pizza Place,Coffee Shop,Women's Store,Doner Restaurant,Diner,Discount Store,Distribution Center
1,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,1.0,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944,2.0,Park,Pool,River,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
3,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849,1.0,Middle Eastern Restaurant,Smoke Shop,Bakery,Breakfast Spot,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store
4,M6M,York,"Del Ray,Keelsdale,Mount Dennis,Silverthorn",43.691116,-79.476013,1.0,Fast Food Restaurant,Discount Store,Sandwich Place,Fried Chicken Joint,Women's Store,Dog Run,Dim Sum Restaurant,Diner,Distribution Center,Donut Shop


Finally, let's visualize the resulting clusters

In [87]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_Merged2['Latitude'], Toronto_Merged2['Longitude'], Toronto_Merged2['Neighborhood'], Toronto_Merged2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow [int(cluster-1)],
        fill=True,
        fill_color=rainbow [int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

In [86]:
Toronto_Merged2.loc[Toronto_Merged2['Cluster Labels'] == 0, Toronto_Merged2.columns[[1] + list(range(5, Toronto_Merged2.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,North York,0.0,Cafeteria,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store,College Stadium


#### Cluster 2

In [89]:
Toronto_Merged2.loc[Toronto_Merged2['Cluster Labels'] == 1, Toronto_Merged2.columns[[1] + list(range(5, Toronto_Merged2.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,1.0,Grocery Store,Pharmacy,Bank,Pizza Place,Coffee Shop,Women's Store,Doner Restaurant,Diner,Discount Store,Distribution Center
1,Central Toronto,1.0,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store
3,Scarborough,1.0,Middle Eastern Restaurant,Smoke Shop,Bakery,Breakfast Spot,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Women's Store
4,York,1.0,Fast Food Restaurant,Discount Store,Sandwich Place,Fried Chicken Joint,Women's Store,Dog Run,Dim Sum Restaurant,Diner,Distribution Center,Donut Shop
5,Downtown Toronto,1.0,Coffee Shop,Italian Restaurant,Café,Japanese Restaurant,Sandwich Place,Ice Cream Shop,Salad Place,Bubble Tea Shop,Burger Joint,Fried Chicken Joint
6,Downtown Toronto,1.0,Coffee Shop,Hotel,Café,Restaurant,American Restaurant,Seafood Restaurant,Salad Place,Japanese Restaurant,Sporting Goods Shop,Breakfast Spot
7,East Toronto,1.0,Smoke Shop,Pizza Place,Skate Park,Brewery,Burrito Place,Spa,Farmers Market,Fast Food Restaurant,Restaurant,Butcher
8,West Toronto,1.0,Coffee Shop,Café,Sushi Restaurant,Pizza Place,Pub,Italian Restaurant,Sandwich Place,Bar,Food,Fish & Chips Shop
9,North York,1.0,Japanese Restaurant,Gym / Fitness Center,Caribbean Restaurant,Baseball Field,Café,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant
11,Downtown Toronto,1.0,Coffee Shop,Aquarium,Italian Restaurant,Restaurant,Hotel,Café,Scenic Lookout,Fried Chicken Joint,Brewery,Sporting Goods Shop
