## Segmenting and Clustering Neighbourhoods in Toronto

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim 
import folium  #map rendering library
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
print('All required libraries load complete')

All required libraries load complete


# PART 1: 'Scrape the Wikipedia page and Create a DataFrame'

### Extracting table from the weblink using Beautiful Soup and panda

In [2]:
#this script shows the output using the link below on 24th april 2020.

file_link = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'
res = requests.get(file_link)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))


### Saving the table as dataframe

In [3]:
new_df = pd.DataFrame(df[0])

### Checking nu. of cells with 'not assigned' in 'Borough' & 'Neighbourhood'

In [4]:
#checking how many cells have 'not assigned' in Borough 
print('Nu. of cells with \'not assigned\' in Borough: ',new_df[new_df.Borough == 'Not assigned'].Borough.value_counts()[0])

Nu. of cells with 'not assigned' in Borough:  77


In [5]:
new_df.shape

(288, 3)

In [6]:
#checking if there are null values in the columns
new_df.isnull().sum()

Postcode         0
Borough          0
Neighbourhood    0
dtype: int64

In [7]:
print('Total Boroughs with not assigned : ',new_df[new_df.Borough == 'Not assigned'].shape[0])
print('Total Neighborhood with Not assigned values : ',new_df[new_df.Neighbourhood == 'Not assigned'].shape[0])


Total Boroughs with not assigned :  77
Total Neighborhood with Not assigned values :  78


### Getting the dataframe where Borough has no 'Not Assigned'

In [8]:
new_df = new_df[new_df.Borough != 'Not assigned']

In [9]:
print('Neighborhood with No assigned Borough: ',new_df[new_df.Neighbourhood == 'Not assigned'].shape[0])
print('Hence, Need to fix this neighbourhood as Borough')

Neighborhood with No assigned Borough:  1
Hence, Need to fix this neighbourhood as Borough


In [10]:
not_assigned = new_df[new_df.Neighbourhood == 'Not assigned']

not_assigned

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [11]:
#fixing the Not assigned in Neighbour with the Borough name

new_df.loc[not_assigned.index,'Neighbourhood'] = not_assigned.Borough[not_assigned.index[0]]

In [12]:
#checking the changes  
print('Current No assigned Neighbourhood in the data: ',new_df[new_df.Neighbourhood == 'Not assigned'].shape[0])

Current No assigned Neighbourhood in the data:  0


In [13]:
new_df = new_df.dropna().groupby(['Postcode','Borough']).agg(lambda x: ','.join(x)).reset_index()

In [14]:
new_df.loc[:,'Neighbourhood'] = new_df.Neighbourhood.apply(lambda x: x.replace(' / ',', '))

In [15]:
print('The number of rows of the dataframe: ', new_df.shape[0])

The number of rows of the dataframe:  103


# PART 2: 'Built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name'

## I used OPTION 2 the provided Geospatail csv file becaue the  Google Maps Geocoding API was totally unreliable while running for me

### Using Geospatial_data csv file to get the lat and long

In [16]:
Geospatial_data_filename = 'Geospatial_Coordinates.csv'

Geospatial_Coord = pd.read_csv(Geospatial_data_filename)

Geospatial_Coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
Geospatial_Coord.shape

(103, 3)

### Merge two dataframes using 'Postal code' as a join.

In [18]:
new_df.dtypes

Postcode         object
Borough          object
Neighbourhood    object
dtype: object

In [19]:
Geospatial_Coord.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [20]:
# notice the postcode is different in new_df and Geospatial_Coord

new_df = new_df.rename(columns = {'Postcode':'Postal Code'})

#### observation : Postal code and Postal Code different

In [21]:
# Joining the two dataframes to get the lat and long
lat_long = new_df.merge(Geospatial_Coord, how = 'inner', on=['Postal Code'])

In [67]:
lat_long

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


In [23]:
lat_long.shape

(103, 5)

# PART 3: Explore and cluster the neighborhoods in Toronto. Deciding to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. 

### Getting the dataframe where Borough has 'Toronto' in it

In [24]:
df_Toronto = lat_long[lat_long['Borough'].str.contains('Toronto')].reset_index(drop = True)

In [25]:
df_Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [26]:
df_Toronto.shape

(38, 5)

### Replicating the same analysis we did with New York City data

In [27]:
df_Toronto.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Let's get the geographical location of Toronto

In [28]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude));

The geographical coordinate of Toronto are 43.6534817, -79.3839347.


### Create a map of Toronto with Neighbourhoods superimposed on top


In [29]:
neighbourhoods = df_Toronto[['Borough', 'Neighbourhood', 'Latitude','Longitude']]
neighbourhoods.head(5)

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


In [30]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(neighbourhoods['Borough'].unique()),
        neighbourhoods.shape[0]
    )
)

The dataframe has 4 boroughs and 38 neighbourhoods.


In [31]:
# create map of Toronto  using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighbourhoods['Latitude'], neighbourhoods['Longitude'], neighbourhoods['Borough'], neighbourhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto.save('map_toronto.html')
map_toronto 

Next, we are going to utilise Foursquare API to explore the neighbourhoods and segment them

### Define Foursquare Credentials and Version

In [32]:
CLIENT_ID = 'BCW0BQ1ILZUXBMX3FF52AYLGNIM0HV0BE0YMJIQVAOQ5GA0N' 
CLIENT_SECRET = 'TPA1LPD2KPRNAU1W3T2X1NWJXL0XAW1QZKR0NCNDAEF5WVZG'
VERSION = '20180605' # Foursquare API version

### Let's explore the first Neighbourhood in the dataframe

In [33]:
#Get the Neighbourhood name
df_Toronto.loc[0,'Neighbourhood']

'The Beaches'

In [34]:
# Get the neighbourhood's latitude and longitude values.

neighborhood_latitude = df_Toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_Toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_Toronto.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


### Now, let's get the top 100 venues that are in The Beaches within a radius of 500 metres

In [35]:
#create the GET request URL
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=BCW0BQ1ILZUXBMX3FF52AYLGNIM0HV0BE0YMJIQVAOQ5GA0N&client_secret=TPA1LPD2KPRNAU1W3T2X1NWJXL0XAW1QZKR0NCNDAEF5WVZG&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [36]:
results = requests.get(url).json()


In [37]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into pandas dataframe

In [38]:
json_normalize(results['response']['groups'][0]['items']).columns

  """Entry point for launching an IPython kernel.


Index(['referralId', 'reasons.count', 'reasons.items', 'venue.id',
       'venue.name', 'venue.location.address', 'venue.location.crossStreet',
       'venue.location.lat', 'venue.location.lng',
       'venue.location.labeledLatLngs', 'venue.location.distance',
       'venue.location.cc', 'venue.location.city', 'venue.location.state',
       'venue.location.country', 'venue.location.formattedAddress',
       'venue.categories', 'venue.photos.count', 'venue.photos.groups',
       'venue.location.postalCode', 'venue.venuePage.id'],
      dtype='object')

In [39]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON
print('flattened json column names: ',nearby_venues.columns)
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

flattened json column names:  Index(['referralId', 'reasons.count', 'reasons.items', 'venue.id',
       'venue.name', 'venue.location.address', 'venue.location.crossStreet',
       'venue.location.lat', 'venue.location.lng',
       'venue.location.labeledLatLngs', 'venue.location.distance',
       'venue.location.cc', 'venue.location.city', 'venue.location.state',
       'venue.location.country', 'venue.location.formattedAddress',
       'venue.categories', 'venue.photos.count', 'venue.photos.groups',
       'venue.location.postalCode', 'venue.venuePage.id'],
      dtype='object')


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869
4,Dip 'n Sip,Coffee Shop,43.678897,-79.297745


In [40]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


## Explore neighbourhoods in Toronto

In [41]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *Toronto_venues*.

In [42]:
Toronto_venues = getNearbyVenues(names=df_Toronto['Neighbourhood'],
                                   latitudes=df_Toronto['Latitude'],
                                   longitudes=df_Toronto['Longitude']
                                  )

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvall

In [43]:
print(Toronto_venues.shape)
Toronto_venues.head()

(1593, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Beaches,43.676357,-79.293031,Dip 'n Sip,43.678897,-79.297745,Coffee Shop


### checking how many venues were returned for each neighbourhood

In [44]:
Toronto_venues.groupby('Neighborhood')['Venue'].count()

Neighborhood
Adelaide,King,Richmond                                                                                  100
Berczy Park                                                                                              58
Brockton,Exhibition Place,Parkdale Village                                                               24
Business Reply Mail Processing Centre 969 Eastern                                                        16
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara     16
Cabbagetown,St. James Town                                                                               45
Central Bay Street                                                                                       65
Chinatown,Grange Park,Kensington Market                                                                  64
Christie                                                                                                 17
Church and Well

In [45]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 233 uniques categories.


## Analyse Each Neighbourhood

In [46]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
Toronto_onehot.shape

(1593, 234)

### Let's group rows by neighbourhood and by taking the mean of the frequency of occurence of each category

In [48]:
Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped.head(5)

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's print each neighbourhood along with top 5 most common venues

In [49]:
num_top_venues = 5

for hood in Toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
            venue  freq
0     Coffee Shop  0.07
1            Café  0.05
2           Hotel  0.04
3  Clothing Store  0.04
4             Gym  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1  Seafood Restaurant  0.03
2      Farmers Market  0.03
3              Bakery  0.03
4            Beer Bar  0.03


----Brockton,Exhibition Place,Parkdale Village----
            venue  freq
0            Café  0.12
1  Breakfast Spot  0.08
2          Bakery  0.08
3     Coffee Shop  0.08
4     Yoga Studio  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0  Gym / Fitness Center  0.06
1         Auto Workshop  0.06
2            Smoke Shop  0.06
3            Skate Park  0.06
4            Restaurant  0.06


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
                 venue  freq
0      Airport Service  0.19
1       Airport Lounge  0.12


### Let's put the result above in the dataframe

In [50]:
#sorting the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Let's create a new dataframe and display the top 10 venues for each neighbourhood

In [51]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighbourhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Hotel,Clothing Store,Gym,Restaurant,Bar,Steakhouse,Thai Restaurant,Lounge
1,Berczy Park,Coffee Shop,Seafood Restaurant,Beer Bar,Bakery,Cocktail Bar,Farmers Market,Cheese Shop,Café,Restaurant,Japanese Restaurant
2,"Brockton,Exhibition Place,Parkdale Village",Café,Breakfast Spot,Bakery,Coffee Shop,Furniture / Home Store,Burrito Place,Restaurant,Italian Restaurant,Stadium,Intersection
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Smoke Shop,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Harbor / Marina,Bar,Boat or Ferry


## Cluster Neighbourhoods

In [52]:
# Run K-means to cluster the neighbourhood into 5 clusters

kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 3, 0])

In [53]:
# Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood

In [54]:
# add clustering labels
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Hotel,Clothing Store,Gym,Restaurant,Bar,Steakhouse,Thai Restaurant,Lounge
1,Berczy Park,Coffee Shop,Seafood Restaurant,Beer Bar,Bakery,Cocktail Bar,Farmers Market,Cheese Shop,Café,Restaurant,Japanese Restaurant
2,"Brockton,Exhibition Place,Parkdale Village",Café,Breakfast Spot,Bakery,Coffee Shop,Furniture / Home Store,Burrito Place,Restaurant,Italian Restaurant,Stadium,Intersection
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Smoke Shop,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Harbor / Marina,Bar,Boat or Ferry
5,"Cabbagetown,St. James Town",Coffee Shop,Bakery,Café,Pub,Italian Restaurant,Market,Pizza Place,Pet Store,Restaurant,Grocery Store
6,Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Bubble Tea Shop,Salad Place,Department Store,Japanese Restaurant,Burger Joint,Modern European Restaurant
7,"Chinatown,Grange Park,Kensington Market",Café,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Burger Joint,Pizza Place,Caribbean Restaurant,Park,Bakery
8,Christie,Grocery Store,Café,Park,Baby Store,Candy Store,Restaurant,Diner,Italian Restaurant,Athletics & Sports,Nightclub
9,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Yoga Studio


In [55]:
neighbourhoods_venues_sorted = neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [56]:
df_Toronto.head(2)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188


In [57]:
Toronto_merged = df_Toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on = 'Neighbourhood')
Toronto_merged.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",43.636847,-79.428191,0,Café,Breakfast Spot,Bakery,Coffee Shop,Furniture / Home Store,Burrito Place,Restaurant,Italian Restaurant,Stadium,Intersection
34,M6P,West Toronto,"High Park,The Junction South",43.661608,-79.464763,0,Grocery Store,Mexican Restaurant,Café,Thai Restaurant,Arts & Crafts Store,Furniture / Home Store,Cajun / Creole Restaurant,Discount Store,Bar,Diner
35,M6R,West Toronto,"Parkdale,Roncesvalles",43.64896,-79.456325,0,Breakfast Spot,Gift Shop,Restaurant,Bar,Italian Restaurant,Bookstore,Dessert Shop,Movie Theater,Cuban Restaurant,Eastern European Restaurant
36,M6S,West Toronto,"Runnymede,Swansea",43.651571,-79.48445,0,Café,Coffee Shop,Sushi Restaurant,Pub,Italian Restaurant,Pizza Place,Yoga Studio,Diner,Boutique,Sandwich Place
37,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0,Yoga Studio,Smoke Shop,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center


### Let's visualise the resulting clusters

In [58]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters.save("map_clusters.html")
map_clusters

## Examine Clusters

### Cluster_0

In [59]:
cluster_0 = Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

cluster_0

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Coffee Shop,Health Food Store,Neighborhood,Trail,Pub,Yoga Studio,Dog Run,Diner,Discount Store,Distribution Center
1,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Restaurant,Furniture / Home Store,Yoga Studio,Caribbean Restaurant,Indian Restaurant,Spa
2,East Toronto,0,Food & Drink Shop,Pub,Sandwich Place,Burrito Place,Fast Food Restaurant,Italian Restaurant,Fish & Chips Shop,Restaurant,Steakhouse,Ice Cream Shop
3,East Toronto,0,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Gastropub,Yoga Studio,Fish Market,Pet Store,Park
6,Central Toronto,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Ice Cream Shop,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Pet Store,Chinese Restaurant
7,Central Toronto,0,Dessert Shop,Sandwich Place,Coffee Shop,Café,Gym,Italian Restaurant,Sushi Restaurant,Pizza Place,Seafood Restaurant,Pharmacy
9,Central Toronto,0,Coffee Shop,Pub,Pizza Place,Restaurant,Supermarket,Sushi Restaurant,Bank,Fried Chicken Joint,Sports Bar,Bagel Shop
11,Downtown Toronto,0,Coffee Shop,Bakery,Café,Pub,Italian Restaurant,Market,Pizza Place,Pet Store,Restaurant,Grocery Store
12,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Yoga Studio
13,Downtown Toronto,0,Coffee Shop,Bakery,Pub,Café,Park,Breakfast Spot,Theater,Yoga Studio,Farmers Market,Restaurant


### observation cluster_0

Cluster_0 represents the busiest areas because of the presence of airport and aquariums. Coffeshops is also the 1st most common venue here indicating a good business because of the travelling customers.

### Cluster_1

In [60]:
cluster_1 = Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

cluster_1

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,1,Park,Swim School,Bus Line,Yoga Studio,Diner,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
10,Downtown Toronto,1,Park,Playground,Trail,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
23,Central Toronto,1,Park,Jewelry Store,Trail,Sushi Restaurant,Bus Line,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


### observation from cluster_1:
Cluster_1 seems to be more of a open land with parks, trail, trail as the most common top 3 common venue. It seems it is easily accessbile via the bus lines.Also, the presence of restaurants and shops gives a good indication of business around and rough ideas on the estimated visitors for this area.

### Cluster_2

In [65]:
cluster_2 = Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]


In [66]:
cluster_2

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Central Toronto,2,Gym,Hotel,Breakfast Spot,Food & Drink Shop,Sandwich Place,Department Store,Park,Convenience Store,Distribution Center,Ethiopian Restaurant
8,Central Toronto,2,Gym,Park,Tennis Court,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


### observation cluster_2:
Gym is the most common venues in cluster_2 indicating people living in this cluster are much more into fitness and health. 

### Cluster_3

In [63]:
cluster_3 = Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

cluster_3

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
30,Downtown Toronto,3,Grocery Store,Café,Park,Baby Store,Candy Store,Restaurant,Diner,Italian Restaurant,Athletics & Sports,Nightclub


### observation from cluster_3

Grocery store is the most common venues in cluster_3 giving us an idea this cluster might include residential areas.There are cafes, parks, restaurants, athletics & sports and nightclubs. It has all the features of being a town centre too.

### Cluster _4

In [64]:
cluster_4 = Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

cluster_4

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,4,Pool,Home Service,Garden,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


#### observation cluster_4

Cluster_4 has the 1st Most Common venue as Pool. So, this area might be far away from the beach giving business opportunity for pools. 