# Important Imports
These are the imports that are important for the whole notebook.

In [187]:
import pandas as pd
import numpy as np

## For web scrapping
import requests
from bs4 import BeautifulSoup

# Part One:

### Scrapping and first cleaning
Scrape the website 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' using BeautifulSoup and extract the column names and the data ros

In [188]:
## Scrape wikipedia site
html_doc = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(html_doc, "html.parser")

tables = soup.find_all("table")
print(f"Found {len(tables)} tables in the document.")
table = tables[0]   # You have to look for which one is the correct one

# Get table rows and column_names
table_body = table.find("tbody")
table_rows = table_body.find_all("tr")
print(f"Found {len(table_rows)} rows in the table.")

column_names = table_rows[0]
rows = table_rows[1:]
print(f"Found {len(rows)} rows with values in the table.")
print(f"Column names: {column_names}.")

Found 3 tables in the document.
Found 181 rows in the table.
Found 180 rows with values in the table.
Column names: <tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighbourhood
</th></tr>.


### Data refinement
Refine data into a dataframe

In [189]:
# Process scraped values in a dataframe
column_names = [x.text.strip() for x in column_names.find_all("th")]

postal_code = []
borough = []
neighbourhood = []

for r in rows:
    values = r.find_all("td")
    postal_code.append(values[0].text.strip())
    borough.append(values[1].text.strip())
    neighbourhood.append(values[2].text.strip())

# Create the DataFrame
df = pd.DataFrame({"Postal Code": postal_code, "Borough": borough, "Neighbourhood": neighbourhood})

print(f"Shape of raw data: {df.shape}")
df.head()

Shape of raw data: (180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Filtering
Filter data so it does not contain invalid data.

In [191]:
# Filter the data so only rows with assigned boroughs are kept
df = df[df["Borough"] != "Not assigned"]

# Aggregate rows with the same boroughs but different neighbourhoods
df = df.groupby(["Postal Code", "Borough"])["Neighbourhood"].apply(lambda x: ",".join(x)).reset_index()

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [192]:
df.shape

(103, 3)

# Part Two:

In [193]:
import geocoder # import geocoder

print("Finished importing libraries.")

Finished importing libraries.


The code bellow is based on the code given in the exercise. Sadly, as already indicated in the exercise description, it does not work (for me at this point in time). Therefore I will use the .csv file.

In [196]:
# This cell does not work for me
def try_coordinates(postal_code):
    """
        Code based on snipped in corsera course
    """

    coords = None
    coords = geocoder.google('{}, Toronto, Ontario'.format(postal_code)).latlng
    
    if coords is None: print("Failed, will try again:")
    
    while(coords is None):
        print("\tFailure")
        coords = geocoder.google('{}, Toronto, Ontario'.format(postal_code)).latlng
        if (not coords is None): print()
        
    return coords

# This call does dont work for some reason, I will use the .csv file
# coordinates = try_coordinates("M5G")
# print(coordinates)

In [197]:
# Load data from website
location_df = pd.read_csv("https://cocl.us/Geospatial_data")
location_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the two dataframes:

In [198]:
print("DataFrame from Part One:")
print(df.head())
print(df.shape)
print()
print("DataFrame containing the coordinates:")
print(location_df.head())
print(df.shape)

DataFrame from Part One:
  Postal Code      Borough                           Neighbourhood
0         M1B  Scarborough                          Malvern, Rouge
1         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek
2         M1E  Scarborough       Guildwood, Morningside, West Hill
3         M1G  Scarborough                                  Woburn
4         M1H  Scarborough                               Cedarbrae
(103, 3)

DataFrame containing the coordinates:
  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
(103, 3)


In [199]:
merged_df = pd.merge(df, location_df, on="Postal Code")

merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part Three:

In [201]:
# Imports for part three
import folium
import json
from sklearn.cluster import KMeans

In [202]:
# Rename for convenience
df = merged_df

In [203]:
df["Borough"].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

In [204]:
# Filter df to only contain boroughs whose name contains "Toronto" or "toronto"
df = df[df["Borough"].str.contains(".*(T|t)oronto.*")]

df.head()

  return func(self, *args, **kwargs)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [209]:
# Set mean point as starting point
latitude = df["Latitude"].mean()
longitude = df["Longitude"].mean()

map = folium.Map(location = [latitude, longitude], zoom_start = 12)

# Iterate through rows and unpack values
for _, (_, bor, neigh, lat, long) in df.iterrows():
    
    # Generate labels to be displayed when clicked on
    label_text = '{} ({})'.format(neigh, bor)
    label = folium.Popup(label_text, parse_html=True)
    
    # Generate marker and add them to the map
    folium.CircleMarker(
        location=(lat, long),
        popup=label,
        color='green',
        fill_opacity=0.0, # Fully opaque markers
    ).add_to(map) 

map

## Note:
The following code is equivalent to the one in the given example. I did, however, modify it a bit as to better fit my taste.

In [213]:
# Read access data from a private file (as not to publish private data online)
data = pd.read_json(".private_data.json")["foursquare"]

CLIENT_ID = data["CLIENT_ID"]
CLIENT_SECRET = data["CLIENT_SECRET"]
VERSION = data["VERSION"]
LIMIT = 100
RADIUS = 500

# Actual processing
url_prefix = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&radius={}&limit={}'
url_prefix = url_prefix.format(CLIENT_ID, CLIENT_SECRET, VERSION, RADIUS, LIMIT)
url_payload = "&ll={},{}"

# Delete data so it can't be read in another cell
del(data)
del(CLIENT_ID)
del(CLIENT_SECRET)
del(VERSION)

In [214]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        res = categories_list[0]['name']
        
        # A fix, because the value "Neighborhood" confuses pandas
        if (res == "Neighborhood"): 
            return "Neighborhood (Category)"
        
        return categories_list[0]['name']

In [215]:
column_names = ["Venue", "Venue Category", "Venue Lat", "Venue Lng", "Neighborhood", "Lat", "Lng"]

def get_venues(neigh, lat, long):
    
    # Query data from foursquare
    url = url_prefix + url_payload.format(lat, long)
    result= requests.get(url).json()
    
    # Convert json in DataFrame object
    venues = result["response"]["groups"][0]["items"]
    df_venues = pd.json_normalize(venues)

    # Keep only relevant columns
    relevant_columns = ["venue.name","venue.categories", "venue.location.lat", "venue.location.lng"]
    df_venues = df_venues[relevant_columns]

    # Map the "venue.categories"-field to actual values.
    df_venues["venue.categories"] = df_venues.apply(get_category_type, axis=1)
    
    # broadcasting to all rows
    df_venues["Neighborhood"] = neigh 
    df_venues["Lat"] = lat
    df_venues["Lng"] = long
    
    # Sort columns
    df_venues.columns = column_names
    
    return df_venues

df_venues = pd.DataFrame()
for _, row in df[["Neighbourhood", "Latitude", "Longitude"]].iterrows():
    neigh = row[0]
    lat = row[1]
    long = row[2]
    
    tmp = get_venues(neigh, lat, long)
    df_venues = df_venues.append(tmp)
    
# Clean up the dataframe
df_venues = df_venues.reset_index()
df_venues = df_venues[["Neighborhood", "Lat", "Lng", "Venue", "Venue Category", "Venue Lat", "Venue Lng"]]

df_venues.head()

Unnamed: 0,Neighborhood,Lat,Lng,Venue,Venue Category,Venue Lat,Venue Lng
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,Pub,43.679181,-79.297215
3,The Beaches,43.676357,-79.293031,Upper Beaches,Neighborhood (Category),43.680563,-79.292869
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,Greek Restaurant,43.677621,-79.351434


Let's check how many venues were returned for each neighborhood


In [216]:
df_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Lat,Lng,Venue,Venue Category,Venue Lat,Venue Lng
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,68,68,68,68,68,68
Christie,16,16,16,16,16,16
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,9,9,9,9,9,9


#### Let's find out how many unique categories can be curated from all the returned venues


In [217]:
print('Uniques categories: {}'.format(df_venues['Venue Category'].unique().shape[0]))

Uniques categories: 237


## Analyzing the neighbourhoods:

In [219]:
# One-hot encoding the venue categories
onehot = pd.get_dummies(df_venues["Venue Category"])

# Update the columns
old_columns = list(onehot.columns)
onehot["Neighborhood"] = df_venues["Neighborhood"]
new_columns = ["Neighborhood"] + old_columns
onehot = onehot[new_columns]

onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
# Shape:
onehot.shape

(1624, 238)

In [221]:
grouped = onehot.groupby("Neighborhood").mean().reset_index()

print(grouped.shape)
grouped

(39, 238)


Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.014706,0.0,0.014706
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,...,0.013333,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.0,0.026667
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [222]:
# Print out the top venues in each neighborhood
num_top_venues = 7

for hood in grouped['Neighborhood']:
    # Get only entries where the neighborhood is equal to hood
    local = grouped[grouped['Neighborhood'] == hood]
    
    # Transpose and reset index
    local = local.T.reset_index()
    
    # Clean up the dataframe and correct data type of freq
    local.columns = ['Venue','Frequency']
    local = local.iloc[1:]
    local['Frequency'] = local['Frequency'].astype(float)
    local = local.round({'Frequency': 2})
    
    # Sort the results
    local = local.sort_values('Frequency', ascending=False).reset_index(drop=True)
    
    # Print the output
    print("-- {} -----".format(hood))
    print(local.head(num_top_venues))
    print('\n')

-- Berczy Park -----
                Venue  Frequency
0         Coffee Shop       0.09
1        Cocktail Bar       0.04
2            Beer Bar       0.04
3      Farmers Market       0.04
4         Cheese Shop       0.04
5              Bakery       0.04
6  Seafood Restaurant       0.04


-- Brockton, Parkdale Village, Exhibition Place -----
            Venue  Frequency
0            Café       0.13
1  Breakfast Spot       0.09
2     Coffee Shop       0.09
3       Nightclub       0.09
4       Pet Store       0.04
5    Intersection       0.04
6         Stadium       0.04


-- Business reply mail Processing Centre, South Central Letter Processing Plant Toronto -----
                  Venue  Frequency
0  Gym / Fitness Center       0.06
1         Auto Workshop       0.06
2           Pizza Place       0.06
3            Comic Shop       0.06
4      Recording Studio       0.06
5            Restaurant       0.06
6               Butcher       0.06


-- CN Tower, King and Spadina, Railway Lands, Har

Now I will create a DataFrame with the top most common venues as columns.

In [223]:
num_top_venues = 12

# Create an empty DataFrame
def number_formater(x):
    """ Helper function to generate Column names """
    if x == 1: suffix = "st" 
    elif x == 2: suffix = "nd"
    elif x == 3: suffix = "rd" 
    else: suffix = "th"
    return "{}{} Most Common".format(x, suffix)
        
columns = ["Neighborhood"]
columns += [number_formater(x+1) for x in range(num_top_venues)]
df_top = pd.DataFrame(columns=columns)

# Assign Neighborhoods
df_top["Neighborhood"] = grouped["Neighborhood"]

# Iterate through all rows:
for index in range(grouped.shape[0]):    
    # Extract top values from grouped
    row = grouped.iloc[index, 1:].sort_values(ascending=False)
    top = row[:num_top_venues]
    
    # Take their index instead of their values
    top = top.index.values
    
    # Assign values to their row
    df_top.iloc[index, 1:] = top
    
df_top.head()

Unnamed: 0,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Beer Bar,Farmers Market,Restaurant,Cheese Shop,Basketball Stadium,Sporting Goods Shop,Park,Italian Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Nightclub,Coffee Shop,Breakfast Spot,Grocery Store,Intersection,Bar,Bakery,Italian Restaurant,Climbing Gym,Restaurant,Gym
2,"Business reply mail Processing Centre, South C...",Gym / Fitness Center,Auto Workshop,Comic Shop,Park,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Farmers Market
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina,Rental Car Location,Boat or Ferry
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Salad Place,Department Store,Thai Restaurant,Burger Joint,Bubble Tea Shop,Japanese Restaurant,Portuguese Restaurant,Poke Place


## Clustering:
Now we will cluster the data to find similar neighborhoods.
We will use the k-Means Clustering Algorithm for this. In practice we would have to find an optimal k but for now I will take a k that performed well in some test runs.

In [224]:
# As a reminder: This is the data we will use for training:
grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.014706,0.0,0.014706


In [232]:
# Number of clusters for kNN
k = 4

# Prepare training data
# The data is already normalized, so no need for that
X = grouped.drop("Neighborhood", axis = 1)

clusterer = KMeans(n_clusters = k).fit(X)

print("Results:")
for _, row in grouped.iterrows():
    neigh = row[0]
    cluster = clusterer.predict([row[1:]])
    
    print(f"'{neigh}' belongs to cluster {cluster[0]}")

Results:
'Berczy Park' belongs to cluster 1
'Brockton, Parkdale Village, Exhibition Place' belongs to cluster 1
'Business reply mail Processing Centre, South Central Letter Processing Plant Toronto' belongs to cluster 1
'CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport' belongs to cluster 1
'Central Bay Street' belongs to cluster 1
'Christie' belongs to cluster 1
'Church and Wellesley' belongs to cluster 1
'Commerce Court, Victoria Hotel' belongs to cluster 1
'Davisville' belongs to cluster 1
'Davisville North' belongs to cluster 1
'Dufferin, Dovercourt Village' belongs to cluster 1
'First Canadian Place, Underground city' belongs to cluster 1
'Forest Hill North & West, Forest Hill Road Park' belongs to cluster 3
'Garden District, Ryerson' belongs to cluster 1
'Harbourfront East, Union Station, Toronto Islands' belongs to cluster 1
'High Park, The Junction South' belongs to cluster 1
'India Bazaar, The Beaches West' belongs to c

 We will now create a final dataframe which shows the top most common venues as well as the clusters to which they belong.

In [233]:
# This is the exiting dataframe with rankings:
df_top.head()

Unnamed: 0,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Beer Bar,Farmers Market,Restaurant,Cheese Shop,Basketball Stadium,Sporting Goods Shop,Park,Italian Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Nightclub,Coffee Shop,Breakfast Spot,Grocery Store,Intersection,Bar,Bakery,Italian Restaurant,Climbing Gym,Restaurant,Gym
2,"Business reply mail Processing Centre, South C...",Gym / Fitness Center,Auto Workshop,Comic Shop,Park,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Farmers Market
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina,Rental Car Location,Boat or Ferry
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Salad Place,Department Store,Thai Restaurant,Burger Joint,Bubble Tea Shop,Japanese Restaurant,Portuguese Restaurant,Poke Place


In [234]:
# Now we will generate a dataframe with only two columns: The neighborhood and its cluster.
df_neig_cluster = pd.DataFrame()
df_neig_cluster["Neighborhood"] = grouped["Neighborhood"]
df_neig_cluster["Cluster"] = clusterer.predict(grouped.drop("Neighborhood", axis = 1))

df_neig_cluster.head()

Unnamed: 0,Neighborhood,Cluster
0,Berczy Park,1
1,"Brockton, Parkdale Village, Exhibition Place",1
2,"Business reply mail Processing Centre, South C...",1
3,"CN Tower, King and Spadina, Railway Lands, Har...",1
4,Central Bay Street,1


In [235]:
# Now we will merge the previous two dataframes:
df_merged = pd.merge(df_top, df_neig_cluster, on="Neighborhood").reset_index()

df_merged.head()

Unnamed: 0,index,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common,Cluster
0,0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Beer Bar,Farmers Market,Restaurant,Cheese Shop,Basketball Stadium,Sporting Goods Shop,Park,Italian Restaurant,1
1,1,"Brockton, Parkdale Village, Exhibition Place",Café,Nightclub,Coffee Shop,Breakfast Spot,Grocery Store,Intersection,Bar,Bakery,Italian Restaurant,Climbing Gym,Restaurant,Gym,1
2,2,"Business reply mail Processing Centre, South C...",Gym / Fitness Center,Auto Workshop,Comic Shop,Park,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Farmers Market,1
3,3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina,Rental Car Location,Boat or Ferry,1
4,4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Salad Place,Department Store,Thai Restaurant,Burger Joint,Bubble Tea Shop,Japanese Restaurant,Portuguese Restaurant,Poke Place,1


We can now use the code we already used early to generate a map to generate a map where the common neighborhoods are marked:

In [236]:
# We will need the first dataframe we generated for lat and lng
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [237]:
# Set mean point as starting point
latitude = df["Latitude"].mean()
longitude = df["Longitude"].mean()

# Set the colors for the clusters
colors = ["red", "orange", "yellow", "blue", "green"]

map2 = folium.Map(location = [latitude, longitude], zoom_start = 12)

# Iterate through rows and unpack values
for _, (_, bor, neigh, lat, long) in df.iterrows():
    cluster = df_merged[df_merged["Neighborhood"] == neigh]["Cluster"]
    cluster = cluster.values[0].item()
    color = colors[cluster]
    
    # Generate labels to be displayed when clicked on
    label_text = '{} ({})'.format(neigh, bor)
    label = folium.Popup(label_text, parse_html=True)
    
    # Generate marker and add them to the map
    folium.CircleMarker(
        location=(lat, long),
        popup=label,
        color=color,
        fill_opacity=0.0, # Fully opaque markers
    ).add_to(map2) 

map2

In [238]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


## Examining the clusters

##### Cluster 0:

In [239]:
df_merged[df_merged["Cluster"] == 0]

Unnamed: 0,index,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common,Cluster
18,18,Lawrence Park,Park,Bus Line,Swim School,Dim Sum Restaurant,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,0


##### Cluster 1:

In [240]:
df_merged[df_merged["Cluster"] == 1]

Unnamed: 0,index,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common,Cluster
0,0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Beer Bar,Farmers Market,Restaurant,Cheese Shop,Basketball Stadium,Sporting Goods Shop,Park,Italian Restaurant,1
1,1,"Brockton, Parkdale Village, Exhibition Place",Café,Nightclub,Coffee Shop,Breakfast Spot,Grocery Store,Intersection,Bar,Bakery,Italian Restaurant,Climbing Gym,Restaurant,Gym,1
2,2,"Business reply mail Processing Centre, South C...",Gym / Fitness Center,Auto Workshop,Comic Shop,Park,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Farmers Market,1
3,3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Bar,Harbor / Marina,Rental Car Location,Boat or Ferry,1
4,4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Salad Place,Department Store,Thai Restaurant,Burger Joint,Bubble Tea Shop,Japanese Restaurant,Portuguese Restaurant,Poke Place,1
5,5,Christie,Grocery Store,Café,Park,Coffee Shop,Restaurant,Athletics & Sports,Italian Restaurant,Candy Store,Baby Store,Nightclub,Donut Shop,Dog Run,1
6,6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Yoga Studio,Café,Bubble Tea Shop,1
7,7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,American Restaurant,Gym,Seafood Restaurant,Japanese Restaurant,Deli / Bodega,Cocktail Bar,Beer Bar,Thai Restaurant,1
8,8,Davisville,Sandwich Place,Dessert Shop,Pizza Place,Café,Italian Restaurant,Gym,Coffee Shop,Sushi Restaurant,Pharmacy,Indian Restaurant,Farmers Market,Diner,1
9,9,Davisville North,Gym / Fitness Center,Sandwich Place,Park,Department Store,Breakfast Spot,Dance Studio,Hotel,Dog Run,Food & Drink Shop,Distribution Center,Doner Restaurant,Diner,1


##### Cluster 2:

In [241]:
df_merged[df_merged["Cluster"] == 2]

Unnamed: 0,index,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common,Cluster
27,27,Roselawn,Music Venue,Garden,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,2


##### Cluster 3:

In [242]:
df_merged[df_merged["Cluster"] == 3]

Unnamed: 0,index,Neighborhood,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common,11th Most Common,12th Most Common,Cluster
12,12,"Forest Hill North & West, Forest Hill Road Park",Park,Jewelry Store,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,3
20,20,"Moore Park, Summerhill East",Playground,Trail,Yoga Studio,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,3
26,26,Rosedale,Park,Playground,Trail,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,3


#### Results:
As we can see most neighborhoods are very similar, there are however some outliers in the north of the city. This might indicate that there is a trend there.
After multiple runs, there is typically always one big cluster and several smaller ones. The big one has often venues such as bars, coffes and nighclubs in high scoring positions, while the smaller ones 
value parks, playgroung and musice venues more.

This was my submission, thanks for reading until here.