In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans

# Part 1

## Download Dataset

In [2]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(data.content, 'html.parser')
table = soup.find('table')
trs = table.find_all('tr')
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)
        
lst = []
for row in rows:
    postalcode = row[0].text.rstrip()
    borough = row[1].text.rstrip()
    neighborhood = row[2].text.rstrip()
    if borough != 'Not assigned':
        if neighborhood == 'Not assigned':
            neighborhood = borough
        lst.append([postalcode, borough, neighborhood])

## Transform into dataframe

In [3]:
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(lst, columns=cols)
df = df.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [4]:
df.shape

(103, 3)

# Part 2

## import csv and join datasets

In [5]:
coord = pd.read_csv('https://cocl.us/Geospatial_data')
coord.rename(columns={"Postal Code": "PostalCode"}, inplace=True)

df = df.merge(coord, on="PostalCode", how="left")
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Verify dataset

In [6]:
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(df[df["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


# Part 3

## Create a map

In [7]:
address = 'Toronto'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#0a25f2',
        fill_opacity=0.6).add_to(map_toronto)  
    
map_toronto

## Explore Toronto Areas

In [8]:
borough_list = list(df.Borough.unique())
bwt = []

for x in borough_list:
    if "toronto" in x.lower():
        bwt.append(x)        
df = df[df['Borough'].isin(bwt)].reset_index(drop=True)

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#0a25f2',
        fill_opacity=0.6).add_to(map_toronto)  
    
map_toronto

## Import Foursquare

In [9]:
CLIENT_ID = 'MQHKJ53MZHBDNU3G5XF4YASFCM4JX1KNN1QRTYG3ZVSQVTBC'
CLIENT_SECRET = 'KUK4R1D1MWNMCKCIFBUDXY2543CORSN350VAH50J3ZWRHFPA'
VERSION = '20200501'
radius = 500
LIMIT = 100
venues = []
for lat, long, post, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['PostalCode'], df['Borough'], 
                                                  df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id=MQHKJ53MZHBDNU3G5XF4YASFCM4JX1KNN1QRTYG3ZVSQVTBC&client_secret=KUK4R1D1MWNMCKCIFBUDXY2543CORSN350VAH50J3ZWRHFPA&v=20200501 \
     &ll=43.653963,-79.387207&radius=500&limit=100".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
        
venues_df = pd.DataFrame(venues)
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

## Area analysis

In [18]:
trt = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")
trt['PostalCode'] = venues_df['PostalCode'] 
trt['Borough'] = venues_df['Borough'] 
trt['Neighborhoods'] = venues_df['Neighborhood'] 
fixed_columns = list(trt.columns[-3:]) + list(trt.columns[:-3])
trt = trt[fixed_columns]
trt_grouped = trt.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

top_venues = 10
indicators = ['st', 'nd', 'rd']
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues = pd.DataFrame(columns=columns)
neighborhoods_venues['PostalCode'] = trt_grouped['PostalCode']
neighborhoods_venues['Borough'] = trt_grouped['Borough']
neighborhoods_venues['Neighborhoods'] = trt_grouped['Neighborhoods']

for ind in np.arange(trt_grouped.shape[0]):
    row_categories = trt_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues.iloc[ind, 3:] = row_categories_sorted.index.values[0:top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(neighborhoods_venues.shape)
neighborhoods_venues

(39, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
1,M4K,East Toronto,"The Danforth West, Riverdale",Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
2,M4L,East Toronto,"India Bazaar, The Beaches West",Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
3,M4M,East Toronto,Studio District,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
4,M4N,Central Toronto,Lawrence Park,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
5,M4P,Central Toronto,Davisville North,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
7,M4S,Central Toronto,Davisville,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
8,M4T,Central Toronto,"Moore Park, Summerhill East",Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop


## Clustering

In [30]:
k = 5
trt_grouped_cluster = trt_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(trt_grouped_cluster)
kmeans.labels_[0:10]

  This is separate from the ipykernel package so we can avoid doing imports until


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [31]:
df_merged["Cluster Labels"] = kmeans.labels_
df_merged = df_merged.merge(neighborhoods_venues, on='PostalCode', how='left')
df_merged.head()

Unnamed: 0,PostalCode,Borough_left,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue_x,2nd Most Common Venue_x,3rd Most Common Venue_x,4th Most Common Venue_x,...,1st Most Common Venue_y,2nd Most Common Venue_y,3rd Most Common Venue_y,4th Most Common Venue_y,5th Most Common Venue_y,6th Most Common Venue_y,7th Most Common Venue_y,8th Most Common Venue_y,9th Most Common Venue_y,10th Most Common Venue_y
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Art Gallery,Café,Japanese Restaurant,...,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Coffee Shop,Art Gallery,Café,Japanese Restaurant,...,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,Coffee Shop,Art Gallery,Café,Japanese Restaurant,...,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Art Gallery,Café,Japanese Restaurant,...,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Coffee Shop,Art Gallery,Café,Japanese Restaurant,...,Coffee Shop,Art Gallery,Café,Japanese Restaurant,Sushi Restaurant,Concert Hall,Gift Shop,Gastropub,French Restaurant,Donut Shop


## 5 clusters

In [33]:
df_merged.loc[df_merged['Cluster Labels'] == 0, df_merged.columns[[1] + \
                                                                                 list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough_left,Cluster Labels,1st Most Common Venue_x,1st Most Common Venue_x.1,1st Most Common Venue_x.2,2nd Most Common Venue_x,2nd Most Common Venue_x.1,2nd Most Common Venue_x.2,3rd Most Common Venue_x,3rd Most Common Venue_x.1,...,7th Most Common Venue_y,8th Most Common Venue_y,8th Most Common Venue_y.1,8th Most Common Venue_y.2,9th Most Common Venue_y,9th Most Common Venue_y.1,9th Most Common Venue_y.2,10th Most Common Venue_y,10th Most Common Venue_y.1,10th Most Common Venue_y.2
0,East Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
1,East Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
2,East Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
3,East Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
4,Central Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
5,Central Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
6,Central Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
7,Central Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
8,Central Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop
9,Central Toronto,0,Coffee Shop,Coffee Shop,Coffee Shop,Art Gallery,Art Gallery,Art Gallery,Café,Café,...,Gift Shop,Gastropub,Gastropub,Gastropub,French Restaurant,French Restaurant,French Restaurant,Donut Shop,Donut Shop,Donut Shop


In [34]:
df_merged.loc[df_merged['Cluster Labels'] == 1, df_merged.columns[[1] + \
                                                                                 list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough_left,Cluster Labels,1st Most Common Venue_x,1st Most Common Venue_x.1,1st Most Common Venue_x.2,2nd Most Common Venue_x,2nd Most Common Venue_x.1,2nd Most Common Venue_x.2,3rd Most Common Venue_x,3rd Most Common Venue_x.1,...,7th Most Common Venue_y,8th Most Common Venue_y,8th Most Common Venue_y.1,8th Most Common Venue_y.2,9th Most Common Venue_y,9th Most Common Venue_y.1,9th Most Common Venue_y.2,10th Most Common Venue_y,10th Most Common Venue_y.1,10th Most Common Venue_y.2


In [35]:
df_merged.loc[df_merged['Cluster Labels'] == 2, df_merged.columns[[1] + \
                                                                                 list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough_left,Cluster Labels,1st Most Common Venue_x,1st Most Common Venue_x.1,1st Most Common Venue_x.2,2nd Most Common Venue_x,2nd Most Common Venue_x.1,2nd Most Common Venue_x.2,3rd Most Common Venue_x,3rd Most Common Venue_x.1,...,7th Most Common Venue_y,8th Most Common Venue_y,8th Most Common Venue_y.1,8th Most Common Venue_y.2,9th Most Common Venue_y,9th Most Common Venue_y.1,9th Most Common Venue_y.2,10th Most Common Venue_y,10th Most Common Venue_y.1,10th Most Common Venue_y.2


In [36]:
df_merged.loc[df_merged['Cluster Labels'] == 3, df_merged.columns[[1] + \
                                                                                 list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough_left,Cluster Labels,1st Most Common Venue_x,1st Most Common Venue_x.1,1st Most Common Venue_x.2,2nd Most Common Venue_x,2nd Most Common Venue_x.1,2nd Most Common Venue_x.2,3rd Most Common Venue_x,3rd Most Common Venue_x.1,...,7th Most Common Venue_y,8th Most Common Venue_y,8th Most Common Venue_y.1,8th Most Common Venue_y.2,9th Most Common Venue_y,9th Most Common Venue_y.1,9th Most Common Venue_y.2,10th Most Common Venue_y,10th Most Common Venue_y.1,10th Most Common Venue_y.2


In [37]:
df_merged.loc[df_merged['Cluster Labels'] == 4, df_merged.columns[[1] + \
                                                                                 list(range(5, df_merged.shape[1]))]]

Unnamed: 0,Borough_left,Cluster Labels,1st Most Common Venue_x,1st Most Common Venue_x.1,1st Most Common Venue_x.2,2nd Most Common Venue_x,2nd Most Common Venue_x.1,2nd Most Common Venue_x.2,3rd Most Common Venue_x,3rd Most Common Venue_x.1,...,7th Most Common Venue_y,8th Most Common Venue_y,8th Most Common Venue_y.1,8th Most Common Venue_y.2,9th Most Common Venue_y,9th Most Common Venue_y.1,9th Most Common Venue_y.2,10th Most Common Venue_y,10th Most Common Venue_y.1,10th Most Common Venue_y.2


Conclusion: I pick cluster 1.