# Week 3 Cousera Capstone Data Science Course - Task3

## -------------------------------------------------
## Scrape Wikipedia Page for Tonronto geodata and present it in a DataFrame
## -------------------------------------------------

#### Let's import some of the needed libraries

In [50]:
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes 
import requests 
from bs4 import BeautifulSoup

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from sklearn.cluster import KMeans

In [51]:
from urllib.request import urlopen
resp = urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
#print(resp.read())   

soup = BeautifulSoup(resp.read(),'html.parser')
soup = soup.find("table", attrs={"class":"wikitable sortable"})
#print(soup.prettify())

#soup.table

In [52]:

datasets = []
for tr in soup.find_all("tr")[1:]:
    dataset = [td.get_text() for td in tr.find_all('td')]  
    # Remove the "\n" in the last element of the list
    dataset[-1] = dataset[-1][:].replace('\n', '')
    
    # when Borough has a value but neighborhood is "Not Assigned" 
    # then assign the Borough's value to Neighborhood
    if(dataset[2]=='Not assigned' and dataset[1]!='Not assigned'): 
        dataset[2] = dataset[1]  
    else:
        None

    # when Borough is "Not Assigned regardless of the neighborhood value"    
    # then skip that has Borough
    if(dataset[1]=='Not assigned'): 
        None 
    else:
        datasets.append(dataset)


datasets[0:5]

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights']]

In [53]:
# The first tr contains the column names(headers).
headings = [th.get_text() for th in soup.find("tr").find_all("th")]
headings

['Postcode', 'Borough', 'Neighbourhood\n']

In [54]:
# Remove the "\n" in the last field of column name
headings[-1] = headings[-1][:].replace('\n', '')
headings

['Postcode', 'Borough', 'Neighbourhood']

In [55]:
raw_toronto=pd.DataFrame(datasets, columns=['Postcode', 'Borough', 'Neighborhood'])
raw_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [56]:
# get the unique Postcodes/Borough
group_raw_toronto = raw_toronto.groupby(['Postcode', 'Borough'],as_index=False).count()
#group_raw.head()

In [57]:
# Combine neigbourhoods within same Postalcose/Borough into a single tuple
list1=[]

for row in group_raw_toronto.itertuples():
    string1=''
    for nghbr in raw_toronto[raw_toronto.Postcode==row.Postcode].itertuples():
        
        if string1 =='': 
            string1 = (nghbr.Neighborhood)
        else :
            string1 = string1 +', '+(nghbr.Neighborhood)
    list1.append([row.Postcode,row.Borough,string1])
#print(list1)


toronto=pd.DataFrame(list1, columns=['Postcode', 'Borough', 'Neighborhood'])
toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [58]:
toronto.shape

(103, 3)

## -------------------------------------------------
## Include Latitude & Longitude in the Postcode/ Borough/ Nghbrhood DF
## -------------------------------------------------

In [59]:
# use the provided csv file gor geo loaction to avoid any unstability while using "geo location coder"
geo = pd.read_csv('http://cocl.us/Geospatial_data')
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [60]:
# join the two data frames and present the latidtude/longitude in the same DF as the Borough/Neighbourhood
toronto = pd.merge(toronto, geo, left_on='Postcode', right_on='Postal Code',how='left')
toronto.head(3)

Unnamed: 0,Postcode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711


In [61]:
# Remove the undesired column
del toronto['Postal Code']

In [62]:
# The Capstone week3 - task2 dataframe
toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## -------------------------------------------------
## Explore TORONTO Neighbourhoods
##### (replicate the New York city analysis)
## -------------------------------------------------

In [63]:
# Let's import needed libraries

from geopy.geocoders import Nominatim

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors 



#### Let's get the latitude and longitude of Toronto using Geopy

In [64]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="earth_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}.'.format(latitude, longitude))

Coordinates of Toronto are 43.653963, -79.387207.


### Visualisation of Toronto using the co-ordinates of Postcode areas

In [65]:
# create map of Toronto using the coordinate we retrirved
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, nghbrhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(nghbrhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto


#### I choose to analyze only the Borough of "Downtown Toronto"

In [66]:
dttoronto = toronto[toronto['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dttoronto.head(9)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568


##### Create a more concise map of Downtown Toronto

In [67]:
# Fetch Downtown Toronto coordinates

address = 'Downtown Toronto, Toronto'

geolocator = Nominatim(user_agent="earth_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of DT Toronto are {}, {}.'.format(latitude, longitude))

Coordinates of DT Toronto are 43.6541737, -79.3808116451341.


In [68]:
# create map of DT Toronto using the coordinate we retrirved
map_dttoronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, borough, nghbrhood in zip(dttoronto['Latitude'], dttoronto['Longitude'], dttoronto['Borough'], dttoronto['Neighborhood']):
    label = '{}, {}'.format(nghbrhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dttoronto)  
    
map_dttoronto


####  
#### Let's Foursquare to get places around a chosen point (Postcode) in Downtown Toronto. 
<font color="blue"><b> _ATTN: It is important to note that we are using the coordinates assigned to Postcodes in our excel file which may cover one or more neighborhoods. In this first example, our chosen postcode contains only one neighborhood : "Central bay street". Even if the postcode covered more than one neighborhood, will still treat them as a single neighborhood using the coordinates of the Postcode assigned to them._ </b></font>

##### Let's see if central bay street is in the dttoronto dataframe

In [69]:
dttoronto.loc[dttoronto['Neighborhood'] == 'Central Bay Street']


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


##### We see it exists. We will use the Latitude and Longitude assigned to the Postcode (of central bay street) and get 100 venues within 600metres of the coordinate

In [None]:
# Define my Foursquare credentials (in the next cell)

# CLIENT_ID = ?
# CLIENT_SECRET = ?
# VERSION = '20180605' 

# since there is only one tuple in the selection/s below, we use .value[0] to extract the value as a string not List nor series
cbs_lat = dttoronto.loc[dttoronto['Neighborhood'] == 'Central Bay Street', 'Latitude'].values[0] 
cbs_long =  dttoronto.loc[dttoronto['Neighborhood'] == 'Central Bay Street', 'Longitude'].values[0] 

In [1]:
# The code was removed by Watson Studio for sharing.

In [None]:

LIMIT = 100 # desired number of venues fetched from Foursquare API (My account doesn't seem to fetch more than 100 nearby venues per location when I tried)
radius = 600 # our 600 metres radius around the chosen point (Central bay street)

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    cbs_lat, 
    cbs_long, 
    radius, 
    LIMIT)
# url # display URL

In [72]:
# Get the results
results = requests.get(url).json()

In [73]:
# Avoid the lengthy json
#results


#### Now Just like the example in the ungraded LAB quiz, let's extract the category of the venues within 600meteres from Central Bay, using the "get_category_type" function from Foursquare

In [74]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### We load the json into a DF

In [75]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Jimmy's Coffee,Coffee Shop,43.658421,-79.385613
1,Tim Hortons,Coffee Shop,43.65857,-79.385123
2,Hailed Coffee,Coffee Shop,43.658833,-79.383684
3,The Queen and Beaver Public House,Gastropub,43.657472,-79.383524
4,Panago,Pizza Place,43.658258,-79.384313


In [76]:
print(nearby_venues.shape[0],'venues around Central Bay street')

100 venues around Central Bay street


#### Next we define a function just like in the lab quiz that will iterate through the Postcode locations in "Downtown Toronto" Borough, extract their coordinates and use it to get at most, 100 venues(my account couldn't get more than 100 per location) within 600meteres of each PostCode Coordinate

In [77]:
def getNearbyVenues(names, latitudes, longitudes, radius=495):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### We now apply the function we just defined, to fetch nearby venues in other areas of downtownToronto

In [78]:
dttoronto_venues = getNearbyVenues(names=dttoronto['Neighborhood'], latitudes=dttoronto['Latitude'], longitudes=dttoronto['Longitude'] )

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie


In [79]:
print(dttoronto_venues.shape)
dttoronto_venues.head()

(1272, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


In [80]:
# This will show how many venues were fetched from each postal code location
dttoronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,55,55,55,55,55,55
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",15,15,15,15,15,15
"Cabbagetown, St. James Town",45,45,45,45,45,45
Central Bay Street,83,83,83,83,83,83
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,87,87,87,87,87,87
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100


In [81]:
print(format(len(dttoronto_venues['Venue Category'].unique())), 'unique category of venues in DT toronto')

208 unique category of venues in DT toronto


#### Analyse each Location (postcode coordinates) in DT toronto usinf one_hot encoding

In [82]:
# one hot encoding
dttoronto_onehot = pd.get_dummies(dttoronto_venues[['Venue Category']], prefix="", prefix_sep="")
dttoronto_onehot.head()

Unnamed: 0,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
# return the neighborhood column to the dataframe
dttoronto_onehot['Neighborhood'] = dttoronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = list(dttoronto_onehot.columns) 
fixed_columns.remove('Neighborhood') 
fixed_columns = ['Neighborhood'] + fixed_columns
dttoronto_onehot = dttoronto_onehot[fixed_columns]

dttoronto_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
dttoronto_onehot.shape

(1272, 208)

#### We now group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [85]:
dttoronto_grouped = dttoronto_onehot.groupby('Neighborhood').mean().reset_index()
dttoronto_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.2,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.0,0.0


In [86]:
dttoronto_grouped.shape

(18, 208)

##### Let's replicate what was done with the Newyork City data but we will limit our transposed display to top 2 venues for only 10 Postcode locations

In [87]:
num_top_venues = 2

for hood in dttoronto_grouped['Neighborhood']:
        print("The top " + str(num_top_venues) + "venues for:- " + hood )
        temp = dttoronto_grouped[dttoronto_grouped['Neighborhood'] == hood].T.reset_index() #transpose
        temp.columns = ['venue','freq']
        temp = temp.iloc[1:]
        temp['freq'] = temp['freq'].astype(float)
        temp = temp.round({'freq': 2})
        print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
        print('\n') 


The top 2venues for:- Adelaide, King, Richmond
         venue  freq
0  Coffee Shop  0.06
1         Café  0.05


The top 2venues for:- Berczy Park
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.05


The top 2venues for:- CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
              venue  freq
0   Airport Service  0.20
1  Airport Terminal  0.13


The top 2venues for:- Cabbagetown, St. James Town
         venue  freq
0  Coffee Shop  0.09
1   Restaurant  0.04


The top 2venues for:- Central Bay Street
                venue  freq
0         Coffee Shop  0.17
1  Italian Restaurant  0.05


The top 2venues for:- Chinatown, Grange Park, Kensington Market
                           venue  freq
0                           Café  0.08
1  Vegetarian / Vegan Restaurant  0.06


The top 2venues for:- Christie
           venue  freq
0  Grocery Store  0.19
1           Café  0.19


The top 2venues for:- Church and Wellesley
       

#### Convert these results into a DataFrame

In [88]:
# Cull from the Lab tests with new york city, we define a function
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [89]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
dttoronto_venues_sorted = pd.DataFrame(columns=columns)
dttoronto_venues_sorted['Neighborhood'] = dttoronto_grouped['Neighborhood']

for ind in np.arange(dttoronto_grouped.shape[0]):
    dttoronto_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dttoronto_grouped.iloc[ind, :], num_top_venues)

dttoronto_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Bar,Thai Restaurant,Steakhouse,Hotel,Burger Joint,Bakery,Gym
1,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Café,Steakhouse,Beer Bar,Italian Restaurant,Restaurant,Farmers Market,Bakery
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Sculpture Garden,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Harbor / Marina,Boutique
3,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Pub,Italian Restaurant,Café,Bakery,Market,Pizza Place,Pet Store,Breakfast Spot
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Indian Restaurant,Bar,Bakery,Spa,Sandwich Place,Salad Place,Sushi Restaurant


## -------------------------------------------------
## Clustering of Neighbourhoods
##### Run *k*-means to cluster the neighborhood into 5 clusters
## -------------------------------------------------

In [90]:
# set number of clusters
kclusters = 5

dttoronto_grouped_clustering = dttoronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dttoronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 0, 1, 1, 3, 4, 1, 1, 1], dtype=int32)

In [91]:
# add clustering labels
dttoronto_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dttoronto_merged = dttoronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dttoronto_merged = dttoronto_merged.join(dttoronto_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

dttoronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,2,Park,Playground,Trail,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,1,Coffee Shop,Restaurant,Pub,Italian Restaurant,Café,Bakery,Market,Pizza Place,Pet Store,Breakfast Spot
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Gay Bar,Restaurant,Gastropub,Men's Store,Pub,Hotel,Fast Food Restaurant
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,1,Coffee Shop,Bakery,Restaurant,Pub,Park,Mexican Restaurant,Breakfast Spot,Theater,Event Space,Performing Arts Venue
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Ramen Restaurant,Theater,Bakery,Pizza Place,Sporting Goods Shop


#### Visualize Clusters

In [98]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dttoronto_merged['Latitude'], dttoronto_merged['Longitude'], dttoronto_merged['Neighborhood'], dttoronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examine Clusters

##### Check Cluster 1

In [93]:
dttoronto_merged.loc[dttoronto_merged['Cluster Labels'] == 0, dttoronto_merged.columns[[1] + list(range(5, dttoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,0,Airport Service,Airport Lounge,Airport Terminal,Sculpture Garden,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Harbor / Marina,Boutique


##### Check Cluster 2

In [94]:
dttoronto_merged.loc[dttoronto_merged['Cluster Labels'] == 1, dttoronto_merged.columns[[1] + list(range(5, dttoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,1,Coffee Shop,Restaurant,Pub,Italian Restaurant,Café,Bakery,Market,Pizza Place,Pet Store,Breakfast Spot
2,Downtown Toronto,1,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Gay Bar,Restaurant,Gastropub,Men's Store,Pub,Hotel,Fast Food Restaurant
3,Downtown Toronto,1,Coffee Shop,Bakery,Restaurant,Pub,Park,Mexican Restaurant,Breakfast Spot,Theater,Event Space,Performing Arts Venue
4,Downtown Toronto,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Ramen Restaurant,Theater,Bakery,Pizza Place,Sporting Goods Shop
5,Downtown Toronto,1,Coffee Shop,Café,Restaurant,Hotel,Cosmetics Shop,Gastropub,Breakfast Spot,Bakery,Clothing Store,Gym
6,Downtown Toronto,1,Coffee Shop,Cocktail Bar,Seafood Restaurant,Café,Steakhouse,Beer Bar,Italian Restaurant,Restaurant,Farmers Market,Bakery
7,Downtown Toronto,1,Coffee Shop,Italian Restaurant,Café,Indian Restaurant,Bar,Bakery,Spa,Sandwich Place,Salad Place,Sushi Restaurant
8,Downtown Toronto,1,Coffee Shop,Café,American Restaurant,Bar,Thai Restaurant,Steakhouse,Hotel,Burger Joint,Bakery,Gym
9,Downtown Toronto,1,Coffee Shop,Aquarium,Hotel,Italian Restaurant,Café,Fried Chicken Joint,Scenic Lookout,Brewery,Pizza Place,Restaurant
10,Downtown Toronto,1,Coffee Shop,Hotel,Café,Restaurant,Italian Restaurant,Bakery,Deli / Bodega,Gastropub,Steakhouse,American Restaurant


##### Check Cluster 3

In [95]:
dttoronto_merged.loc[dttoronto_merged['Cluster Labels'] == 2, dttoronto_merged.columns[[1] + list(range(5, dttoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,2,Park,Playground,Trail,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


##### Check Cluster 4

In [96]:
dttoronto_merged.loc[dttoronto_merged['Cluster Labels'] == 3, dttoronto_merged.columns[[1] + list(range(5, dttoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Downtown Toronto,3,Café,Restaurant,Bookstore,Bar,Japanese Restaurant,Bakery,Italian Restaurant,Nightclub,Chinese Restaurant,Pub
13,Downtown Toronto,3,Café,Vegetarian / Vegan Restaurant,Dumpling Restaurant,Bar,Mexican Restaurant,Bakery,Coffee Shop,Vietnamese Restaurant,Chinese Restaurant,Grocery Store


##### Check Cluster 5

In [97]:
dttoronto_merged.loc[dttoronto_merged['Cluster Labels'] == 4, dttoronto_merged.columns[[1] + list(range(5, dttoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,4,Grocery Store,Café,Park,Baby Store,Athletics & Sports,Italian Restaurant,Diner,Nightclub,Restaurant,Coffee Shop


<strong>We can say that :</strong>
<ol type="1">
  <li> Transportation activities dominate one of the clusters as the more popular venues are Airport service, Airports, Boats_or_Ferry , Harbor/Marina etc.</li>
  <li> Coffee shops dominate the popularity in the next cluster showing showing heavily industrialized/ commercial areas </li>
  <li> Parks, Play grounds, Yoga studio etc. dominate one of the clusters, suggesting the area may be more recreational than residential </li>
  <li> The other two clusters show more of residential trends as they include Cafe, baby stores, Diner, Italian/vegetarian/Vietnamese and other restaurants</li>
</ol> 