# Jupyter Notebook for the Capstone Project of Coursera

In [283]:
#Loading libraries
import pandas as pd
import numpy as np

In [284]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Part 1: Scrape Wikipedia Page

In [329]:
# Importing libraries
import bs4 as bs
import urllib
import requests
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests 
from pandas.io.json import json_normalize

In [330]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
canada_data = bs.BeautifulSoup(source,'html.parser')

In [331]:
# Columns name and we create an empty data frame
col_nm = ['PostalCode', 'Borough', 'Neighborhood']
city_data = pd.DataFrame(columns = col_nm)

city_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood


In [332]:
# loop through the data to find postcode, borough and neighborhood
content = canada_data.find('div', class_ = 'mw-parser-output')
table = content.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i += 1
        elif i == 1:
            borough = td.text
            i += 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
            
        city_data = city_data.append(
            {'PostalCode': postcode,'Borough': borough,'Neighborhood': neighborhood},
            ignore_index=True
        )

We should now have a data frame with entries

In [333]:
print(city_data.shape)
city_data.head()

(861, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,0,0
1,M1A,Not assigned,0
2,M1A,Not assigned,Not assigned
3,M2A,Not assigned,Not assigned
4,M2A,Not assigned,Not assigned


We can clean our data frame

In [367]:
# We remove the entries that don't have a borough (not assigned or 0)
city_data = city_data[city_data.Borough != 'Not assigned']
city_data = city_data[city_data.Borough != 0]

# We reset the index
city_data.reset_index(drop = True, inplace = True)
print(city_data.shape)

(630, 3)


In [368]:
# If an entry has a borough but no neighborhood, we assign the borough to the neighborhood

for i in range(0,city_data.shape[0]):
    if city_data.iloc[i][2] == 'Not assigned':
        city_data.iloc[i][2] = city_data.iloc[i][1]
        
# We group the entries
df = df.groupby('PostalCode').agg({'Borough':'first','Neighborhood': ', '.join}).reset_index()

df.shape

(103, 3)

In [369]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Queen's Park,"Queen's Park, Scarborough, Rouge, Rouge, Rouge..."
1,M1C,Etobicoke,"West Deane Park, West Deane Park, Highland Cre..."
2,M1E,Etobicoke,"Old Burnhamthorpe, Old Burnhamthorpe, Guildwoo..."
3,M1G,Scarborough,"Scarborough, Woburn"
4,M1H,Scarborough,"Scarborough, Cedarbrae"


## Part 2

We get the data

In [370]:
!wget -O toronto_data.csv  http://cocl.us/Geospatial_data
    
print("Download complete")

/bin/sh: wget: command not found
Download complete


We read the data and merge the two dataframes

In [371]:
geo_df = pd.read_csv("toronto_data.csv")
geo_df = geo_df.rename(columns={'Postal Code':'PostalCode'})
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [372]:
df = pd.merge(df, geo_df, on='PostalCode')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Queen's Park,"Queen's Park, Scarborough, Rouge, Rouge, Rouge...",43.806686,-79.194353
1,M1C,Etobicoke,"West Deane Park, West Deane Park, Highland Cre...",43.784535,-79.160497
2,M1E,Etobicoke,"Old Burnhamthorpe, Old Burnhamthorpe, Guildwoo...",43.763573,-79.188711
3,M1G,Scarborough,"Scarborough, Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Scarborough, Cedarbrae",43.773136,-79.239476


In [373]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


## Part 3

#### We get Toronto's address (lat, long)

In [374]:
address = 'Toronto, CA'

geo_loc = Nominatim(user_agent="toronto_explorer")
loc1 = geo_loc.geocode(address)
lat1 = loc1.latitude
long1 = loc1.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(lat1, long1))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [375]:
map1 = folium.Map(location=[lat1, long1], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map1)  
    
map1

In [376]:
CLIENT_ID = '2QJ03LTLPH3D0FLHKNBZ2HVAXN45512L4BXCLKIYECEXCH4L' # your Foursquare ID
CLIENT_SECRET = 'EUTT1SWU2IWN2KJ3O3UFPDSIVSI5WV2JY0PAZHXNO0M0OGDF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2QJ03LTLPH3D0FLHKNBZ2HVAXN45512L4BXCLKIYECEXCH4L
CLIENT_SECRET:EUTT1SWU2IWN2KJ3O3UFPDSIVSI5WV2JY0PAZHXNO0M0OGDF


### We're going to work with Toronto borough

In [377]:
toronto_borough = df[df['Borough'].str.contains('Toronto',regex=False)].reset_index(drop=True)
toronto_borough.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,"East Toronto, The Beaches",43.676357,-79.293031
1,M4K,East Toronto,"Downsview East, The Danforth West, The Danfort...",43.679557,-79.352188
2,M4L,East Toronto,"Downsview West, The Beaches West, The Beaches ...",43.668999,-79.315572
3,M4M,East Toronto,"Downsview Central, Studio District, Downsview ...",43.659526,-79.340923
4,M4N,Central Toronto,"Downsview Northwest, Lawrence Park, Downsview ...",43.72802,-79.38879


In [378]:
print(toronto_borough['Borough'].unique())

['East Toronto' 'Central Toronto' 'Downtown Toronto']


In [379]:
latitude = []
longitude = []

for neighbor in toronto_borough['Borough'].unique():
    address = neighbor+', CA'
    geolocator = Nominatim(user_agent='ny_explorer')
    location = geolocator.geocode(address)
    latitude.append(location.latitude)
    longitude.append(location.longitude)
    print('Coordinate of {}: {} {}'.format(neighbor, latitude[-1], longitude[-1]))

Coordinate of East Toronto: 43.6247901 -79.3934918
Coordinate of Central Toronto: 43.653963 -79.387207
Coordinate of Downtown Toronto: 43.6563221 -79.3809161


In [380]:
neighborhood_latitude = toronto_borough.loc[0, 'Latitude']
neighborhood_longitude = toronto_borough.loc[0, 'Longitude']

In [381]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=2QJ03LTLPH3D0FLHKNBZ2HVAXN45512L4BXCLKIYECEXCH4L&client_secret=EUTT1SWU2IWN2KJ3O3UFPDSIVSI5WV2JY0PAZHXNO0M0OGDF&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [382]:
results = requests.get(url).json()

results

{'meta': {'code': 200, 'requestId': '5e2b55b2e826ac001ba09bc0'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [390]:
# Let's get the category of the venue

def venue_category(row):
    try: 
        categories_list = row['categories']
    except: 
        categories_list = row['venue.categories']
    
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [391]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(venue_category, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [392]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


### Explore Neighborhoods in Toronto

In [393]:
# We can create a function that repeats the same process for all neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [395]:
# We can now run the above code

toronto_venues = getNearbyVenues(names=toronto_borough['Neighborhood'],
                                   latitudes=toronto_borough['Latitude'],
                                   longitudes=toronto_borough['Longitude']
                                  )



East Toronto, The Beaches
Downsview East, The Danforth West, The Danforth West, The Danforth West, Riverdale, Downsview East
Downsview West, The Beaches West, The Beaches West, The Beaches West, India Bazaar, Downsview West
Downsview Central, Studio District, Downsview Central
Downsview Northwest, Lawrence Park, Downsview Northwest
Central Toronto, Davisville North
Central Toronto, North Toronto West
Central Toronto, Davisville
Central Toronto, Moore Park, Moore Park, Moore Park, Summerhill East
Central Toronto, Deer Park, Deer Park, Deer Park, Forest Hill SE, Forest Hill SE, Forest Hill SE, Rathnelly, Rathnelly, Rathnelly, South Hill, South Hill, South Hill, Summerhill West
Downtown Toronto, Rosedale
Downtown Toronto, Cabbagetown, Cabbagetown, Cabbagetown, St. James Town
Downtown Toronto, Church and Wellesley
Victoria Village, Harbourfront, Victoria Village
Parkview Hill, Ryerson, Ryerson, Ryerson, Garden District, Parkview Hill
Woodbine Heights, St. James Town, Woodbine Heights
The B

In [396]:
print(toronto_venues.shape)
toronto_venues.head()

(1786, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"East Toronto, The Beaches",43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,"East Toronto, The Beaches",43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,"East Toronto, The Beaches",43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,"East Toronto, The Beaches",43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"Downsview East, The Danforth West, The Danfort...",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [397]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Berczy Park, Berczy Park, Caledonia-Fairbanks",5,5,5,5,5,5
"Business Reply Mail Processing Centre 969 Eastern, Business Reply Mail Processing Centre 969 Eastern, Humber Bay, Humber Bay, Humber Bay, King's Mill Park, King's Mill Park, King's Mill Park, Kingsway Park South East, Kingsway Park South East, Kingsway Park South East, Mimico NE, Mimico NE, Mimico NE, Old Mill South, Old Mill South, Old Mill South, The Queensway East, The Queensway East, The Queensway East, Royal York South East, Royal York South East, Royal York South East, Sunnylea",2,2,2,2,2,2
"Central Bay Street, Central Bay Street, Christie",18,18,18,18,18,18
"Central Toronto, Davisville",33,33,33,33,33,33
"Central Toronto, Davisville North",9,9,9,9,9,9
"Central Toronto, Deer Park, Deer Park, Deer Park, Forest Hill SE, Forest Hill SE, Forest Hill SE, Rathnelly, Rathnelly, Rathnelly, South Hill, South Hill, South Hill, Summerhill West",14,14,14,14,14,14
"Central Toronto, Moore Park, Moore Park, Moore Park, Summerhill East",4,4,4,4,4,4
"Central Toronto, North Toronto West",21,21,21,21,21,21
"Davisville North, Davisville North, Forest Hill North, Forest Hill North, Forest Hill North, Forest Hill West",5,5,5,5,5,5
"Davisville, Davisville, Harbord, Harbord, Harbord, University of Toronto",35,35,35,35,35,35


In [398]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 239 uniques categories.


### Analyze each venue

In [399]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [400]:
toronto_onehot.shape

(1786, 239)

Let's group rows by neighborhood and by takign the mean of the frequency of occurence of each category

In [401]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,"Berczy Park, Berczy Park, Caledonia-Fairbanks",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2
1,Business Reply Mail Processing Centre 969 East...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Central Bay Street, Central Bay Street, Christie",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Central Toronto, Davisville",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Central Toronto, Davisville North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Central Toronto, Deer Park, Deer Park, Deer Pa...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0
6,"Central Toronto, Moore Park, Moore Park, Moore...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Central Toronto, North Toronto West",0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Davisville North, Davisville North, Forest Hil...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Davisville, Davisville, Harbord, Harbord, Harb...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0


In [402]:
# The new size
toronto_grouped.shape

(47, 239)

In [403]:
# We can print each neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park, Berczy Park, Caledonia-Fairbanks----
                  venue  freq
0                  Park   0.4
1                Market   0.2
2         Women's Store   0.2
3  Fast Food Restaurant   0.2
4       Harbor / Marina   0.0


----Business Reply Mail Processing Centre 969 Eastern, Business Reply Mail Processing Centre 969 Eastern, Humber Bay, Humber Bay, Humber Bay, King's Mill Park, King's Mill Park, King's Mill Park, Kingsway Park South East, Kingsway Park South East, Kingsway Park South East, Mimico NE, Mimico NE, Mimico NE, Old Mill South, Old Mill South, Old Mill South, The Queensway East, The Queensway East, The Queensway East, Royal York South East, Royal York South East, Royal York South East, Sunnylea----
                        venue  freq
0  Construction & Landscaping   0.5
1              Baseball Field   0.5
2                 Yoga Studio   0.0
3  Modern European Restaurant   0.0
4          Light Rail Station   0.0


----Central Bay Street, Central Bay Street, Chris

We can put all of this into a pandas dataframe

In [404]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [405]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Berczy Park, Berczy Park, Caledonia-Fairbanks",Park,Market,Fast Food Restaurant,Women's Store,Airport,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,Business Reply Mail Processing Centre 969 East...,Construction & Landscaping,Baseball Field,Women's Store,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
2,"Central Bay Street, Central Bay Street, Christie",Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Diner,Restaurant,Baby Store,Candy Store,Gas Station
3,"Central Toronto, Davisville",Sandwich Place,Pizza Place,Dessert Shop,Coffee Shop,Gym,Café,Sushi Restaurant,Italian Restaurant,Gas Station,Indian Restaurant
4,"Central Toronto, Davisville North",Hotel,Breakfast Spot,Food & Drink Shop,Sandwich Place,Convenience Store,Department Store,Park,Gym,Doner Restaurant,Donut Shop


### Cluster neighborhoods

In [407]:
# Let's run k-means
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 0, 0, 0, 0, 4, 0, 3, 0], dtype=int32)

In [408]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_borough

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,"East Toronto, The Beaches",43.676357,-79.293031,0,Pub,Trail,Health Food Store,Women's Store,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Discount Store
1,M4K,East Toronto,"Downsview East, The Danforth West, The Danfort...",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Pub,Caribbean Restaurant,Liquor Store,Sports Bar,Dessert Shop
2,M4L,East Toronto,"Downsview West, The Beaches West, The Beaches ...",43.668999,-79.315572,0,Sandwich Place,Pizza Place,Food & Drink Shop,Board Shop,Brewery,Italian Restaurant,Burger Joint,Burrito Place,Ice Cream Shop,Fast Food Restaurant
3,M4M,East Toronto,"Downsview Central, Studio District, Downsview ...",43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Bakery,Brewery,Italian Restaurant,American Restaurant,Sandwich Place,Bookstore,Cheese Shop
4,M4N,Central Toronto,"Downsview Northwest, Lawrence Park, Downsview ...",43.72802,-79.38879,3,Park,Bus Line,Lawyer,Swim School,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store


In [410]:
# Let's visualize the resulting clusters

# create map
map_clusters = folium.Map(location=[lat1, long1], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
