# Part 1

In [1]:
import pandas as pd

Get all the tables from the web page

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html returns a list with one DataFrame per table it can parse from the page.
tables = pd.read_html(url)

# Number of tables found; the next cell inspects each one.
len(tables)

3

Determine which table contains the neighborhood data

In [3]:
# Preview every parsed table next to its list position so the one holding
# the postal code data can be identified by eye.
for table_index, table in enumerate(tables):
    print('Index', table_index)
    print(table.head())

Index 0
  Postal Code           Borough               Neighborhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2         M3A        North York                  Parkwoods
3         M4A        North York           Victoria Village
4         M5A  Downtown Toronto  Regent Park, Harbourfront
Index 1
                                                  0   \
0                                                NaN   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NL   
3                                                  A   

                                                  1   \
0                              Canadian postal codes   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NS   
3                                                  B   

                                                  2    3    4    5  

Save and clean the dataframe

In [4]:
# The first table holds the postal code data; drop the rows whose borough
# is 'Not assigned' and renumber the remaining rows from zero.
toronto_df = (
    tables[0]
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# (rows, columns) remaining after the 'Not assigned' boroughs were dropped.
toronto_df.shape

(103, 3)

# Part 2

The Geocoder package is too slow to be practical here, so the coordinates are loaded from the provided CSV file instead.

In [6]:
# CSV with one latitude/longitude pair per postal code.
url = 'https://cocl.us/Geospatial_data'
lat_lng_df = pd.read_csv(url)
lat_lng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Attach the coordinates to each postal code. The key has the same name in
# both frames, so a single `on=` is enough; the inner join keeps only the
# postal codes present in both.
toronto_df = pd.merge(toronto_df, lat_lng_df, on='Postal Code', how='inner')
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [8]:
# (rows, columns) after the inner merge on 'Postal Code'.
toronto_df.shape

(103, 5)

# Part 3

Create a new dataframe with only the boroughs whose names end with the word 'Toronto'

In [9]:
# Distinct borough names, in order of first appearance.
boroughs = toronto_df['Borough'].drop_duplicates().tolist()
boroughs

['North York',
 'Downtown Toronto',
 'Etobicoke',
 'Scarborough',
 'East York',
 'York',
 'East Toronto',
 'West Toronto',
 'Central Toronto',
 'Mississauga']

In [10]:
# Keep the boroughs whose name ends with 'Toronto'; this excludes names
# such as 'North York' and 'Mississauga'.
toronto_boroughs = [name for name in boroughs if name.endswith('Toronto')]
toronto_boroughs

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [11]:
# Restrict the frame to the selected boroughs and renumber the rows.
toronto_df = (
    toronto_df
    .loc[lambda frame: frame['Borough'].isin(toronto_boroughs)]
    .reset_index(drop=True)
)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [12]:
# (rows, columns) after restricting to the boroughs in toronto_boroughs.
toronto_df.shape

(39, 5)

Now there are only 39 areas to work with as opposed to 103. The next step will be to map them.

In [13]:
!pip install folium
import folium



In [14]:
# Coordinates the map is centred on.
city_lat = 43.6532
city_lng = -79.3832

toronto_map = folium.Map(location=[city_lat, city_lng], zoom_start=12)

# Draw one circle marker per postal code; its popup shows the neighborhood
# name(s) followed by the postal code.
marker_columns = ['Latitude', 'Longitude', 'Postal Code', 'Neighborhood']
for lat, lng, postal_code, neighborhood in zip(*(toronto_df[column] for column in marker_columns)):
    popup_text = '{}, {}'.format(neighborhood, postal_code)

    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(toronto_map)
toronto_map

Define Foursquare credentials and get the list of venues

In [15]:
# The code was removed by Watson Studio for sharing.

In [16]:
import json
import requests
from pandas.io.json import json_normalize

In [17]:
# Parallel columns iterated together when querying venues for each postal code.
postal_codes = toronto_df['Postal Code']
lats = toronto_df['Latitude']
lngs = toronto_df['Longitude']

# Sent as the `radius` and `limit` query parameters of every venue request
# (presumably metres and maximum number of results; confirm against the API docs).
radius = 500
limit = 100

# Accumulator filled by the request loop: one list of venue tuples per postal code.
venues = []

In [18]:
# Query the Foursquare "explore" endpoint once per postal code and collect,
# for each one, a list of
# (postal code, code lat, code lng, venue name, venue lat, venue lng, category)
# tuples.

# Reset the accumulator so re-running this cell does not append a second
# copy of every venue to the list created in the previous cell.
venues = []

explore_url = 'https://api.foursquare.com/v2/venues/explore'

for code, lat, lng in zip(postal_codes, lats, lngs):

    # Let requests build and escape the query string instead of formatting
    # the credentials into the URL by hand.
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': limit,
    }

    # The timeout keeps one stalled request from hanging the notebook, and
    # raise_for_status reports an HTTP failure explicitly instead of letting
    # it surface later as an obscure lookup error on the payload.
    response = requests.get(explore_url, params=query, timeout=30)
    response.raise_for_status()

    # An area with no results may come back without any group; treat that
    # as "no venues" rather than failing on groups[0].
    groups = response.json()['response'].get('groups') or []
    results = groups[0]['items'] if groups else []

    # Venues without a category are skipped: the analysis that follows is
    # based purely on categories, and categories[0] would raise IndexError.
    venues.append([(
            code,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name'])
        for v in results
        if v['venue'].get('categories')])

Flatten the collected venue records into a single dataframe

In [19]:
# Flatten the per-postal-code lists into one row per venue. The column names
# are passed to the constructor so the frame is still well-formed (empty,
# with the right columns) when no venues were collected; assigning
# `.columns` afterwards raises a length-mismatch error in that case.
venue_columns = [
    'Postal Code',
    'Postal Code Latitude',
    'Postal Code Longitude',
    'Venue',
    'Venue Latitude',
    'Venue Longitude',
    'Venue Category',
]
toronto_venues = pd.DataFrame(
    [item for venue_list in venues for item in venue_list],
    columns=venue_columns,
)
toronto_venues.head()

Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [20]:
# Non-null count per column for each postal code, i.e. how many venues were collected for it.
toronto_venues.groupby('Postal Code').count()

Unnamed: 0_level_0,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M4E,4,4,4,4,4,4
M4K,42,42,42,42,42,42
M4L,20,20,20,20,20,20
M4M,40,40,40,40,40,40
M4N,4,4,4,4,4,4
M4P,7,7,7,7,7,7
M4R,21,21,21,21,21,21
M4S,37,37,37,37,37,37
M4T,2,2,2,2,2,2
M4V,17,17,17,17,17,17


Prepare the data for clustering

In [21]:
# One indicator column per venue category, named after the category itself
# (no prefix), with the postal code placed in front as the first column.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot.insert(0, 'Postal Code', toronto_venues['Postal Code'])

toronto_onehot.head()

Unnamed: 0,Postal Code,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Averaging the indicator columns gives, for each postal code, the share of
# its venues that fall in each category.
toronto_grouped = toronto_onehot.groupby('Postal Code', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Postal Code,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,...,0.0,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.025
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# One row per postal code; columns are 'Postal Code' plus one per venue category.
toronto_grouped.shape

(39, 230)

Get the top venue categories for each postal code

In [24]:
# For every postal code, print its neighborhood name(s) and the five venue
# categories with the highest frequency (rounded to two decimals).
for _, grouped_row in toronto_grouped.iterrows():
    code = grouped_row['Postal Code']

    matching_names = toronto_df.loc[toronto_df['Postal Code'] == code, 'Neighborhood']
    print(code + ', ' + matching_names.iloc[0])

    # Everything after the leading 'Postal Code' entry is a category frequency.
    type_list = (
        grouped_row.iloc[1:]
        .astype(float)
        .round(2)
        .rename_axis('Venue Type')
        .reset_index(name='Frequency')
        .sort_values('Frequency', ascending=False)
        .reset_index(drop=True)
    )

    print(type_list.head())
    print('\n')

M4E, The Beaches
          Venue Type  Frequency
0              Trail       0.25
1  Health Food Store       0.25
2                Pub       0.25
3       Neighborhood       0.25
4        Men's Store       0.00


M4K, The Danforth West, Riverdale
               Venue Type  Frequency
0        Greek Restaurant       0.19
1             Coffee Shop       0.07
2      Italian Restaurant       0.07
3              Restaurant       0.05
4  Furniture / Home Store       0.05


M4L, India Bazaar, The Beaches West
           Venue Type  Frequency
0                Park       0.15
1      Sandwich Place       0.10
2   Fish & Chips Shop       0.05
3  Italian Restaurant       0.05
4         Pizza Place       0.05


M4M, Studio District
            Venue Type  Frequency
0                 Café       0.10
1          Coffee Shop       0.08
2               Bakery       0.05
3  American Restaurant       0.05
4            Gastropub       0.05


M4N, Lawrence Park
    Venue Type  Frequency
0         Park       0.

In [25]:
# Number of top-ranked venue categories to keep per postal code.
num_types = 4

columns_list = ['Postal Code'] + ["Venue Category " + str(rank) for rank in range(1, num_types + 1)]

top_venue_types = pd.DataFrame(columns=columns_list)
top_venue_types['Postal Code'] = toronto_grouped['Postal Code']

# Fill each row with the names of the highest-frequency categories for the
# postal code at the same position in toronto_grouped.
for position in range(len(toronto_grouped)):
    grouped_row = toronto_grouped.iloc[position, :]
    # Skip the leading 'Postal Code' entry, then rank the category frequencies.
    ranked = grouped_row.iloc[1:].sort_values(ascending=False)
    top_venue_types.iloc[position, 1:] = ranked.index.values[:num_types]

top_venue_types.head()

Unnamed: 0,Postal Code,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
0,M4E,Health Food Store,Neighborhood,Trail,Pub
1,M4K,Greek Restaurant,Italian Restaurant,Coffee Shop,Restaurant
2,M4L,Park,Sandwich Place,Brewery,Steakhouse
3,M4M,Café,Coffee Shop,Gastropub,Brewery
4,M4N,Park,Swim School,Bus Line,Yoga Studio


Run K-Means clustering

In [26]:
from sklearn.cluster import KMeans

In [27]:
# Number of clusters to partition the postal codes into.
num_clusters = 5

# Cluster on the category frequencies only; the postal code is an identifier,
# not a feature. `columns=` replaces the positional axis argument
# (`drop('Postal Code', 1)`), which pandas 2.x no longer accepts.
toronto_clustering = toronto_grouped.drop(columns='Postal Code')

# n_init is pinned to 10 so the result does not depend on the installed
# scikit-learn's default; random_state makes the initialisation repeatable.
KMC = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(toronto_clustering)

In [28]:
# Cluster index assigned to each row of toronto_clustering, in row order.
KMC.labels_

array([0, 3, 3, 1, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       4, 2, 3, 1, 1, 3, 3, 3, 1, 1, 3, 1, 1, 3, 3, 3, 1], dtype=int32)

In [29]:
# `insert` raises ValueError when the column already exists, so drop any
# 'Cluster' column left over from an earlier run of this cell before
# re-inserting; this makes the cell safe to re-run.
if 'Cluster' in top_venue_types.columns:
    top_venue_types = top_venue_types.drop(columns='Cluster')

# Labels are in the row order of toronto_grouped, which top_venue_types
# shares (its 'Postal Code' column was copied from toronto_grouped).
top_venue_types.insert(0, 'Cluster', KMC.labels_)

# Bring the cluster label and top categories onto the geographic frame,
# matching rows by postal code.
toronto_final = toronto_df.join(top_venue_types.set_index('Postal Code'), on='Postal Code')
toronto_final.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,Coffee Shop,Bakery,Park,Breakfast Spot
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,3,Coffee Shop,Diner,Yoga Studio,College Cafeteria
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3,Clothing Store,Coffee Shop,Café,Japanese Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Café,Coffee Shop,Restaurant,Clothing Store
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Neighborhood,Trail,Pub


Examine each cluster individually

In [30]:
# Rows assigned to cluster 0; the constant 'Cluster' column is dropped.
cluster_0 = toronto_final.loc[toronto_final['Cluster'] == 0].drop(columns=['Cluster'])
cluster_0

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,Health Food Store,Neighborhood,Trail,Pub


In [31]:
# Rows assigned to cluster 1; the constant 'Cluster' column is dropped.
cluster_1 = toronto_final.loc[toronto_final['Cluster'] == 1].drop(columns=['Cluster'])
cluster_1

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,Grocery Store,Café,Park,Athletics & Sports
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,Pharmacy,Bakery,Middle Eastern Restaurant,Music Venue
14,M6K,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191,Café,Coffee Shop,Breakfast Spot,Bakery
17,M4M,East Toronto,Studio District,43.659526,-79.340923,Café,Coffee Shop,Gastropub,Brewery
22,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763,Mexican Restaurant,Café,Thai Restaurant,Grocery Store
27,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049,Café,Bookstore,Bar,Japanese Restaurant
30,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049,Café,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant
38,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Skate Park,Pizza Place,Recording Studio,Fast Food Restaurant


In [32]:
# Rows assigned to cluster 2; the constant 'Cluster' column is dropped.
cluster_2 = toronto_final.loc[toronto_final['Cluster'] == 2].drop(columns=['Cluster'])
cluster_2

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
18,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,Park,Swim School,Bus Line,Yoga Studio
21,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307,Park,Trail,Jewelry Store,Sushi Restaurant
29,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,Park,Trail,Yoga Studio,Dance Studio
33,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,Park,Playground,Trail,Yoga Studio


In [33]:
# Rows assigned to cluster 3; the constant 'Cluster' column is dropped.
cluster_3 = toronto_final.loc[toronto_final['Cluster'] == 3].drop(columns=['Cluster'])
cluster_3

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Coffee Shop,Bakery,Park,Breakfast Spot
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Coffee Shop,Diner,Yoga Studio,College Cafeteria
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Clothing Store,Coffee Shop,Café,Japanese Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,Café,Coffee Shop,Restaurant,Clothing Store
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,Coffee Shop,Cocktail Bar,Café,Cheese Shop
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,Coffee Shop,Sandwich Place,Italian Restaurant,Café
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,Coffee Shop,Café,Hotel,Restaurant
10,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,Coffee Shop,Aquarium,Hotel,Café
11,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,Bar,Coffee Shop,Asian Restaurant,Restaurant
12,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Greek Restaurant,Italian Restaurant,Coffee Shop,Restaurant


In [34]:
# Rows assigned to cluster 4; the constant 'Cluster' column is dropped.
cluster_4 = toronto_final.loc[toronto_final['Cluster'] == 4].drop(columns=['Cluster'])
cluster_4

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Venue Category 1,Venue Category 2,Venue Category 3,Venue Category 4
19,M5N,Central Toronto,Roselawn,43.711695,-79.416936,Music Venue,Garden,Dance Studio,Ethiopian Restaurant


In [35]:
# Report how many postal codes ended up in each cluster.
for cluster_id in range(num_clusters):
    cluster_size = int((toronto_final['Cluster'] == cluster_id).sum())
    print('Cluster', cluster_id, 'size:', cluster_size)

Cluster 0 size: 1
Cluster 1 size: 8
Cluster 2 size: 4
Cluster 3 size: 25
Cluster 4 size: 1


- Clusters 0 and 4 each contain only one postal code. The top venue category is a health food store in cluster 0 and a music venue in cluster 4. It is worth noting that these areas have fewer than 5 venues each, as seen in the grouped toronto_venues dataframe, so their category frequencies are based on very little data.
- Cluster 1 contains mostly cafes.
- Cluster 2 contains mostly parks.
- Cluster 3 contains mostly coffee shops and restaurants.