Creating Notebook and Importing Libraries

In [1]:

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge beautifulsoup4 --yes 
from bs4 import BeautifulSoup # crawl wikipedia website

!conda install -c conda-forge geocoder --yes
import geocoder

print('Libraries imported.')

Collecting package metadata: done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.6.7                |           py36_0         869 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.18.1               |             py_0          51 KB  conda-forge
    openssl-1.1.1b             |       h14c3975_0         4.0 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.9 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.49-py_0

The following packages will be UPDATED:

  conda                                        4.6.4-py36_0 --> 4.6.7-py36_0
  geopy       

Scraping the Wikipedia Page

In [2]:
import requests
# Download the Wikipedia page and locate the postal-code table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
r.raise_for_status()  # stop on an HTTP error instead of parsing an error page
html = BeautifulSoup(r.text, 'lxml')
postalcodes_table = html.find('table', {'class': 'wikitable sortable'})
if postalcodes_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))

columns = ['Postcode', 'Borough', 'Neighbourhood']

# Build one list of cell texts per table row. Rows without any <td> cell
# produce an empty list and are skipped.
# The cell variable is named `cell` so it does not overwrite the response `r`.
postalcodes_arr = []
for row in postalcodes_table.find_all('tr'):
    row_data = [cell.get_text().replace('\n', '') for cell in row.find_all('td')]
    if row_data:
        postalcodes_arr.append(row_data)

postalcodes_df = pd.DataFrame(postalcodes_arr, columns=columns)

postalcodes_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Creating the Required Dataframe

In [3]:
# Remove rows whose Borough is 'Not assigned' and renumber the remaining rows from 0.
postalcodes_df = postalcodes_df[postalcodes_df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Replace 'Not assigned' Neighbourhood names with the Borough name.
# The assignment goes through .loc on the frame itself: the row objects yielded
# by iterrows() may be copies, so writing to them is not guaranteed to change
# the frame.
not_assigned = postalcodes_df['Neighbourhood'] == 'Not assigned'
postalcodes_df.loc[not_assigned, 'Neighbourhood'] = postalcodes_df.loc[not_assigned, 'Borough']

# Aggregate the Neighbourhoods that share a Postcode and Borough into one
# comma-separated string.
postalcodes_grouped_df = postalcodes_df.groupby(['Postcode','Borough']).agg(", ".join).reset_index()
postalcodes_grouped_df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
# Sanity check: display the (rows, columns) shape of the grouped frame.
postalcodes_grouped_df.shape

(103, 3)

Getting Latitude and Longitude and Using Foursquare API

In [5]:
# Downloading the geo data
import io
import os

url = 'http://cocl.us/Geospatial_data'
geo_response = requests.get(url)
geo_response.raise_for_status()  # stop on an HTTP error instead of parsing an error page as CSV
r = geo_response.content
postal_code_coords_df = pd.read_csv(io.StringIO(r.decode('utf-8')))
postal_code_coords_df.head()

# Attach the downloaded columns to each grouped row by matching the CSV's
# 'Postal Code' column against 'Postcode'.
postalcodes_grouped_df = postalcodes_grouped_df.join(postal_code_coords_df.set_index('Postal Code'), on='Postcode')
postalcodes_grouped_df.head()

# Foursquare credentials are read from the environment so that they are never
# stored in the notebook. Keys that were previously committed in this file
# should be treated as leaked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this cell.'
    )
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # value sent as the `limit` query parameter of each request

def getNearbyVenues(names, postalcode, latitudes, longitudes, radius=500):
    """Collect the venues returned by the Foursquare explore endpoint for each location.

    The four iterables are walked in parallel; one request is sent per
    (postcode, name, latitude, longitude) tuple.

    Parameters
    ----------
    names : iterable
        Neighbourhood label copied into every venue row of that location.
    postalcode : iterable
        Postcode copied into every venue row of that location.
    latitudes, longitudes : iterable
        Coordinates sent as the ``ll`` query parameter.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode', 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue was returned.
        'Venue Category' is None for a venue whose category list is empty.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.
    """
    venue_columns = ['Postcode',
                     'Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for code, name, lat, lng in zip(postalcode, names, latitudes, longitudes):

        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
        )
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                code,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category; do not fail on it
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema intact even
    # when venue_rows is empty.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [6]:
# Fetch the nearby venues for every grouped postal code, using the columns of
# postalcodes_grouped_df as the parallel inputs.
toronto_venues = getNearbyVenues(
    names=postalcodes_grouped_df['Neighbourhood'],
    postalcode=postalcodes_grouped_df['Postcode'],
    latitudes=postalcodes_grouped_df['Latitude'],
    longitudes=postalcodes_grouped_df['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Postcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


Toronto Segmentation and Clustering

In [7]:
# one hot encoding: one indicator column per venue category
toronto_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add the neighbourhood and postcode labels back to the dataframe
toronto_dummies['Neighbourhood'] = toronto_venues['Neighborhood']
toronto_dummies['Postcode'] = toronto_venues['Postcode']

# move the label columns to the front; they are selected by name so the result
# does not depend on where the assignments above placed them
label_columns = ['Postcode', 'Neighbourhood']
fixed_columns = label_columns + [col for col in toronto_dummies.columns if col not in label_columns]
toronto_dummies = toronto_dummies[fixed_columns]

toronto_dummies.head()

# Frequency of venues by Neighborhood.
# 'Postcode' holds text, so it is dropped before averaging: pandas >= 2.0 raises
# a TypeError when mean() meets a non-numeric column instead of skipping it.
toronto_venues_frequency = (
    toronto_dummies
    .drop(columns='Postcode')
    .groupby('Neighbourhood')
    .mean()
    .reset_index()
)
toronto_venues_frequency.head()

# Top 10 Venues per Neighborhood
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in a row, ignoring its first entry.

    The first element of `row` is skipped (the callers keep a label there); the
    remaining values are sorted in descending order and the index labels of the
    first `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values[:num_top_venues]
    return top_labels

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd', then '<n>th' for every later rank
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:` that would hide any error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighbourhood_venues_sorted = pd.DataFrame(columns=columns)
neighbourhood_venues_sorted['Neighbourhood'] = toronto_venues_frequency['Neighbourhood']

# fill the ranking columns row by row with the most frequent venue categories
for ind in np.arange(toronto_venues_frequency.shape[0]):
    neighbourhood_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_venues_frequency.iloc[ind, :], num_top_venues)

neighbourhood_venues_sorted.head()

# Feature matrix for clustering: the frequency table without its label column.
# `columns=` replaces the positional axis argument (`drop('Neighbourhood', 1)`),
# which pandas 2.0 no longer accepts.
toronto_clustering = toronto_venues_frequency.drop(columns='Neighbourhood')

# run k-means clustering for k = 1..19 and print the inertia of each fit
for kclusters in range(1, 20):
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)
    print(str(kclusters) + ': ' + str(kmeans.inertia_))

# Clustering with 4 Clusters
kclusters_opt = 4
kmeans_optimal = KMeans(n_clusters=kclusters_opt, random_state=0).fit(toronto_clustering)

# cluster label assigned to each row of toronto_clustering
kmeans_optimal.labels_


# Top 10 Venues per cluster
# NOTE: this is a plain assignment, so both names refer to the same frame and
# the 'Cluster' column is added to toronto_venues_frequency as well.
toronto_venue_clusters = toronto_venues_frequency
toronto_venue_clusters['Cluster'] = kmeans_optimal.labels_

# tag every one-hot venue row with the cluster of its neighbourhood
toronto_dummies_cluster = pd.merge(toronto_venue_clusters[['Neighbourhood', 'Cluster']],
                 toronto_dummies,
                 on='Neighbourhood')
# Average the one-hot columns per cluster. The text columns are dropped first:
# pandas >= 2.0 raises a TypeError when mean() meets a non-numeric column
# instead of skipping it.
cluster_venues_frequency = (
    toronto_dummies_cluster
    .drop(columns=['Neighbourhood', 'Postcode'])
    .groupby('Cluster')
    .mean()
    .reset_index()
)

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd', then '<n>th' for every later rank
columns = ['Cluster']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:` that would hide any error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per cluster
cluster_venues_sorted = pd.DataFrame(columns=columns)
cluster_venues_sorted['Cluster'] = cluster_venues_frequency['Cluster']

# fill the ranking columns row by row with the most frequent venue categories
for ind in np.arange(cluster_venues_frequency.shape[0]):
    cluster_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cluster_venues_frequency.iloc[ind, :], num_top_venues)

cluster_venues_sorted

1: 22.92027951762768
2: 20.93847494807789
3: 19.169729384299906
4: 18.166844137501492
5: 17.18875502752885
6: 16.808433172299168
7: 15.310977658050824
8: 14.235491176012886
9: 14.047861517562271
10: 12.871410462963556
11: 11.282894179464343
12: 11.619123636311379
13: 10.448228642017142
14: 10.434021727060328
15: 10.258962091596047
16: 9.61569105052453
17: 9.064668977625324
18: 8.866751581659718
19: 8.50357573928185


Unnamed: 0,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Coffee Shop,Café,Restaurant,Pizza Place,Bakery,Italian Restaurant,Hotel,Bar,Sandwich Place,Fast Food Restaurant
1,1,Baseball Field,Drugstore,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio,Farmers Market
2,2,Park,Bank,Bus Line,Grocery Store,Construction & Landscaping,Trail,Fast Food Restaurant,Playground,Market,Bakery
3,3,Garden,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Donut Shop,Department Store
