### Import all the libraries that we will need

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't installed geopy yet
from geopy.geocoders import Nominatim # convserts an address into longitude and latitude values
import requests # library to handle requests
from pandas import json_normalize
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncoment this line if you haven't installed it yet
!pip install folium
import folium # map rendering library


Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.8 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [2]:
# to make this notebook's output identical at every run
np.random.seed(42)

### First part  
**Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas  dataframe**

In [3]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs=pd.read_html(url, header=0)
print("number of tables: " + str(len(dfs)))

number of tables: 3


**Create the dataframe:**  
- The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [4]:
df=dfs[0] # get the first table
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**Check for null values:**

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    180 non-null    object
 1   Borough        180 non-null    object
 2   Neighbourhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


We can observe from the above cell that there are _NO Null_ values

**Only process the cells that have an assigned borough.  
Ignore cells with a borough that is _Not assigned_**.

In [6]:
df.drop(df.index[df['Borough'] == 'Not assigned'], inplace=True)

df = df.reset_index(drop=True)
df.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


**Checking for Postal Code duplicates and if they exist, the rows will be combined into one row with the neighborhoods separated by comma**

In [7]:
df_postcode = df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()

df_postcode.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
df_postcode.shape  # Returns the number of rows and columns

(103, 3)

**Check if a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.**

In [9]:
df_replace = df_postcode.replace({'Neighbourhood':'Not assigned'}, df_postcode['Borough'])
df_replace.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


**Use the `.shape` method to print the number of rows of your dataframe.**

In [10]:
df_replace.shape

(103, 3)

## Second part  
### In order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

#### Read the Geospatial_data csv file that has the geographical coordinates of each postal code:` http://cocl.us/Geospatial_datafile ` 

In [11]:
df_latlong = pd.read_csv('https://cocl.us/Geospatial_data', index_col='Postal Code')

In [12]:
df_latlong.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


**Join both the dataframes on `Postal Code` column**

In [13]:
toronto_data = df_replace.join(df_latlong, on='PostalCode')
toronto_data.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Explore and cluster the neighborhoods in Toronto.

**Use geopy library to get the latitude and longitude values of Toronto.  
In order to define an instance of the geocoders, we need to define an user_agent. We will name our agent `to_explorer.`**

In [14]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent='to_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}'.format(latitude, longitude))

The geographical coordinates of Toronto are 43.6534817, -79.3839347


**Create a map of Toronto using latitude and longitude values using _Folium_ library**

In [15]:
map_toronto = folium.Map(location = [latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label='{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],
                       radius=5,
                       popup=label,
                       color='blue',
                       fill=True,
                       fill_color='#318cc',
                       fill_opacity=0.7,
                       parse_html=False).add_to(map_toronto)

map_toronto

### Let's segment and cluster only the neighborhoods from _Central Toronto_ borough

In [16]:
central_toronto_data = toronto_data[toronto_data['Borough']=='Central Toronto'].reset_index(drop=True)
central_toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


Get the geographical coordinates of Central Toronto

In [17]:
address = 'Central Toronto, ON'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Central Toronto are {}, {}'.format(latitude, longitude))

The geographical coordinates of Central Toronto are 43.6534817, -79.3839347


**Create the map of Central Toronto using latitude and longitude values.**

In [18]:
# create the map of Central Toronto using latitude and longitude values.
map_central_to = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(central_toronto_data['Latitude'], central_toronto_data['Longitude'], central_toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],
                       radius=5,
                       popup = label,
                       color='blue',
                       fill = True,
                       fill_color='#3186cc',
                       fill_opacity=0.7,
                       parse_html=False).add_to(map_central_to)
    
map_central_to

### Now we are going to start utilizing the Foursquare API to explore the neighborhoods in Central Toronto and segment them.  
**Define Foursquare Credentials and Version**

In [20]:
# The code was removed by Watson Studio for sharing.

#### Let's explore the first neighborhood in our Central Toronto dataframe.  
Get the first neighborhood's name.

In [21]:
central_toronto_data.loc[0, 'Neighbourhood']


'Lawrence Park'

Get the neighborhood's latitude and longitude values.

In [22]:
neighborhood_latitude = central_toronto_data.loc[0, 'Latitude'] # neighborhood latitude value 
neighborhood_longitude = central_toronto_data.loc[0, 'Longitude'] # neighborhood longitude value 
neighborhood_name = central_toronto_data.loc[0, 'Neighbourhood'] # neighborhood name 
print('Latitude and longitude values of {} are {}, {}'.format(neighborhood_name,
                                                             neighborhood_latitude,
                                                             neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901


#### Now let's get the top 100 venues that are in Lawrence Park within a radius of 500 meters.  
First, let's create the GET request URL. Name your URL **url**.

In [23]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=TFH22S5UUQBMJVNP5HFRECJZAMVAO2LJDPTMRR5BZ4WBF5L2&client_secret=UEQX2MBQCYLW1ZWS3DHQJMF5BZ0XI1JY52QDAHDKKTGPIJCM&ll=43.7280205,-79.3887901&v=20180605&radius=500&limit=100'

Send the GET request and examine the results.

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60277a32b9e8291154db5efb'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

**We know that all the information is in the _items_ key. Before we procede, let's borrow the `get_category_type` function from Foursquare lab.**

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a pandas dataframe.

In [26]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venues.categories'] = nearby_venues.apply(get_category_type, axis = 1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,categories.1
0,Lawrence Park Ravine,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",43.726963,-79.394382,Park
1,Zodiac Swim School,"[{'id': '52e81612bcbc57f1066b7a44', 'name': 'S...",43.728532,-79.38286,Swim School
2,TTC Bus #162 - Lawrence-Donway,"[{'id': '4bf58dd8d48988d12b951735', 'name': 'B...",43.728026,-79.382805,Bus Line


Get the number values that were returned by Forsquare

In [27]:
print('{} venues were returned by Forsquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Forsquare.


#### Repeat the same process to all neighborhoods in Central Toronto.  
**We can borrow the `getNearbyVenues` function from Foursquare lab**

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Use the above function on each neighbourhood and create a new dataframe called _central_toronto_venues_.

In [29]:
central_toronto_venues = getNearbyVenues(names=central_toronto_data['Neighbourhood'],
                                   latitudes=central_toronto_data['Latitude'],
                                   longitudes=central_toronto_data['Longitude']
                                  )

Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville


#### Get the size of the venues dataframe from Central Toronto

In [30]:
print(central_toronto_venues.shape)
central_toronto_venues.head()

(114, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Davisville North,43.712751,-79.390197,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


**Check how many venues are in each neighbourhood**

In [31]:
central_toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,38,38,38,38,38,38
Davisville North,10,10,10,10,10,10
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",4,4,4,4,4,4
"North Toronto West, Lawrence Park",19,19,19,19,19,19
Roselawn,3,3,3,3,3,3
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",14,14,14,14,14,14
"The Annex, North Midtown, Yorkville",19,19,19,19,19,19


**Get the unique categories from venues**

In [32]:
unique_venue_categories=len(central_toronto_venues['Venue Category'].unique())
print('There are {} unique categories.'.format(unique_venue_categories))

There are 63 unique categories.


### Analyze each neghbourhood

In [33]:
# one hot encoding
central_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
central_toronto_onehot['Neighbourhood'] = central_toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [central_toronto_onehot.columns[-1]] + list(central_toronto_onehot.columns[:-1])
central_toronto_onehot = central_toronto_onehot[fixed_columns]

central_toronto_onehot.head()

Unnamed: 0,Neighbourhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Department Store,Dessert Shop,Diner,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,History Museum,Home Service,Hotel,Ice Cream Shop,Indian Restaurant,Indoor Play Area,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,New American Restaurant,Optical Shop,Park,Pet Store,Pharmacy,Pizza Place,Playground,Pub,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Spa,Sporting Goods Shop,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Davisville North,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Get the new dataframe size

In [34]:
central_toronto_onehot.shape

(114, 64)

**Group the dataframe by _Neighbiurhood_ and calculate the `mean` for each Venue category**

In [35]:
central_toronto_grouped = central_toronto_onehot.groupby('Neighbourhood').mean().reset_index()
central_toronto_grouped.head()

Unnamed: 0,Neighbourhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Department Store,Dessert Shop,Diner,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,History Museum,Home Service,Hotel,Ice Cream Shop,Indian Restaurant,Indoor Play Area,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,New American Restaurant,Optical Shop,Park,Pet Store,Pharmacy,Pizza Place,Playground,Pub,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Spa,Sporting Goods Shop,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.026316,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,0.078947,0.026316,0.026316,0.0,0.026316,0.0,0.0,0.0,0.0,0.026316,0.026316,0.026316,0.052632,0.0,0.0,0.0,0.0,0.0,0.026316,0.026316,0.052632,0.0,0.0,0.0,0.0,0.0,0.026316,0.026316,0.026316,0.0,0.026316,0.105263,0.0,0.0,0.026316,0.0,0.078947,0.026316,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.026316,0.026316,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0


In [36]:
# Get the new size:
central_toronto_grouped.shape

(9, 64)

#### Create a new dataframe that contains the top 10 venues for each neighbourhood

In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [38]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicator[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        


In [39]:
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood']=central_toronto_grouped['Neighbourhood']
neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,,,,,,,,,,
1,Davisville North,,,,,,,,,,
2,"Forest Hill North & West, Forest Hill Road Park",,,,,,,,,,
3,Lawrence Park,,,,,,,,,,
4,"Moore Park, Summerhill East",,,,,,,,,,
5,"North Toronto West, Lawrence Park",,,,,,,,,,
6,Roselawn,,,,,,,,,,
7,"Summerhill West, Rathnelly, South Hill, Forest...",,,,,,,,,,
8,"The Annex, North Midtown, Yorkville",,,,,,,,,,


In [40]:
for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted.head()    

Unnamed: 0,Neighbourhood,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Pizza Place,Dessert Shop,Sandwich Place,Café,Coffee Shop,Sushi Restaurant,Italian Restaurant,Gym,Indian Restaurant,Greek Restaurant
1,Davisville North,Hotel,Pizza Place,Food & Drink Shop,Gym / Fitness Center,Gym,Breakfast Spot,Park,Department Store,Sandwich Place,Farmers Market
2,"Forest Hill North & West, Forest Hill Road Park",Trail,Jewelry Store,Sushi Restaurant,Bus Line,Yoga Studio,Gas Station,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint
3,Lawrence Park,Bus Line,Park,Swim School,Yoga Studio,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station
4,"Moore Park, Summerhill East",Playground,Restaurant,Tennis Court,Park,Food & Drink Shop,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Yoga Studio


### Create Neighbourhoods clusters using `k-means`

In [41]:
# set number of clusters
kclusters = 5
# drop the Neighbourhood column in order to do the clustering, because it's not a numerical column
central_toronto_grouped_clustering = central_toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(central_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 2, 0, 4, 1, 3, 1, 1], dtype=int32)

**Create a new dataframe that contains the clusters labels and the top 10 venues for each neighbourhood**

In [42]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
central_toronto_merged = central_toronto_data

# merge central_toronto_grouped with central_toronto_data to add latitude/longitude for each neighbourhood
central_toronto_merged = central_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

central_toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Bus Line,Park,Swim School,Yoga Studio,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Hotel,Pizza Place,Food & Drink Shop,Gym / Fitness Center,Gym,Breakfast Spot,Park,Department Store,Sandwich Place,Farmers Market
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,1,Coffee Shop,Clothing Store,Sporting Goods Shop,Chinese Restaurant,Fast Food Restaurant,Mexican Restaurant,Diner,Park,Pet Store,Gym / Fitness Center
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,Pizza Place,Dessert Shop,Sandwich Place,Café,Coffee Shop,Sushi Restaurant,Italian Restaurant,Gym,Indian Restaurant,Greek Restaurant
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,4,Playground,Restaurant,Tennis Court,Park,Food & Drink Shop,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Yoga Studio


**Create a map to visualize the clusters**

In [43]:
# create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors=[]
for lat, lon, poi, cluster in zip(central_toronto_merged['Latitude'], central_toronto_merged['Longitude'], central_toronto_merged['Neighbourhood'], central_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon],
                       radius=5,
                       popup=label,
                       color=rainbow[cluster-1],
                       fill=True,
                       fill_color=rainbow[cluster-1],
                       fill_opacity=0.7).add_to(map_clusters)
map_clusters


### Examine Clusters  

#### Cluster 1

In [44]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 0, central_toronto_merged.columns[[0,1,2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighbourhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,0,Bus Line,Park,Swim School,Yoga Studio,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station


#### Cluster 2

In [45]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 1, central_toronto_merged.columns[[0,1,2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighbourhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4P,Central Toronto,Davisville North,1,Hotel,Pizza Place,Food & Drink Shop,Gym / Fitness Center,Gym,Breakfast Spot,Park,Department Store,Sandwich Place,Farmers Market
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",1,Coffee Shop,Clothing Store,Sporting Goods Shop,Chinese Restaurant,Fast Food Restaurant,Mexican Restaurant,Diner,Park,Pet Store,Gym / Fitness Center
3,M4S,Central Toronto,Davisville,1,Pizza Place,Dessert Shop,Sandwich Place,Café,Coffee Shop,Sushi Restaurant,Italian Restaurant,Gym,Indian Restaurant,Greek Restaurant
5,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",1,Coffee Shop,American Restaurant,Restaurant,Bagel Shop,Bank,Pizza Place,Vietnamese Restaurant,Sushi Restaurant,Supermarket,Pub
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",1,Café,Sandwich Place,Coffee Shop,Indian Restaurant,Park,Middle Eastern Restaurant,Pharmacy,Pizza Place,History Museum,Pub


#### Cluster 3

In [46]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 2, central_toronto_merged.columns[[0,1,2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighbourhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",2,Trail,Jewelry Store,Sushi Restaurant,Bus Line,Yoga Studio,Gas Station,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint


#### Cluster 4

In [47]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 3, central_toronto_merged.columns[[0,1,2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighbourhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M5N,Central Toronto,Roselawn,3,Home Service,Garden,Ice Cream Shop,Yoga Studio,Discount Store,History Museum,Gym / Fitness Center,Gym,Greek Restaurant,Gourmet Shop


#### Cluster 5

In [48]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 4, central_toronto_merged.columns[[0,1,2] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Neighbourhood,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4T,Central Toronto,"Moore Park, Summerhill East",4,Playground,Restaurant,Tennis Court,Park,Food & Drink Shop,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Yoga Studio
