In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd

import json
import requests
from pandas.io.json import json_normalize

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

!pip install lxml html5lib beautifulsoup4

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.1
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


### Get the data from the Wikipedia table and transform them into a dataframe

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

df = pd.read_html(url, header=0)[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
# Rename two columns of the dataframes
df.rename(columns={'Postal Code' : 'PostalCode', 'Neighbourhood' : 'Neighborhood'}, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# How many boroughs are there?
df['Borough'].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [5]:
# Drop the cells where the Borough is not assigned
df.drop(df.index[df['Borough'] == 'Not assigned'], inplace=True)

df = df.reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Group the data by PostalCode and Borough
df = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
df.shape

(103, 3)

### Read the file with the geographical data of each postal code

In [8]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
geo_df.shape

(103, 3)

In [10]:
# Rename one column in the dataframe
geo_df.rename(columns={'Postal Code' : 'PostalCode'}, inplace=True)
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Merge both dataframes on PostalCode
toronto = pd.merge(df, geo_df, on='PostalCode')
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [12]:
# Filter out the data by Boroughs containing "Toronto" in its name to work only with certain Boroughs
toronto_central = toronto[toronto['Borough'].str.contains("Toronto") == 1].reset_index(drop=True)
toronto_central

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [13]:
# Create a map of Toronto to plot the Boroughs with "Toronto" in its name
latitude = 43.651070
longitude = -79.347015

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_central['Latitude'], toronto_central['Longitude'], toronto_central['Borough'], toronto_central['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

### Get Foursquare credentials

In [14]:
CLIENT_ID = 'UKO1NLU4ORXTTMYL105W4DNWI0B1UP2LIJTOVDZSSGGFUGKC'
CLIENT_SECRET = '5GIVTTMWGPSUK4FNWUFM2AAEM4QVUZJVGANVP5WJQXUV5F2V'
VERSION = '20201001'
LIMIT = 100

In [15]:
# Select the first Neighborhood of the dataframe toronto_only
toronto_central.loc[0, 'Neighborhood']

'The Beaches'

In [16]:
# Get the latitude and longitude of the 1st Neighborhood
neighborhood_latitude = toronto_central.loc[0, 'Latitude']
neighborhood_longitude = toronto_central.loc[0, 'Longitude']

neighborhood_name = toronto_central.loc[0, 'Neighborhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [17]:
# Create the GET request url
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?client_id=UKO1NLU4ORXTTMYL105W4DNWI0B1UP2LIJTOVDZSSGGFUGKC&client_secret=5GIVTTMWGPSUK4FNWUFM2AAEM4QVUZJVGANVP5WJQXUV5F2V&v=20201001&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [18]:
# Send the GEt request and the results
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fc767545b23bb23308abf5e'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [19]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [20]:
# clean the json and structure it into a dataframe
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  after removing the cwd from sys.path.


Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [21]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


#### Explore the data in Central Toronto

In [22]:
# Function to repeat the process to all the Neighborhoods in Central Toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    venues_list.append([(
        name,
        lat,
        lng,
        v['venue']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [23]:
toronto_central_venues = getNearbyVenues(names=toronto_central['Neighborhood'],
                                latitudes=toronto_central['Latitude'],
                                longitudes=toronto_central['Longitude'])

toronto_central_venues.shape

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

(16, 7)

In [24]:
print(toronto_central_venues.shape)
print('There are {} uniques categories.'.format(len(toronto_central_venues['Venue Category'].unique())))
toronto_central_venues

(16, 7)
There are 16 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Rorschach Brewing Co.,43.663483,-79.319824,Brewery
1,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Leslieville Farmers Market,43.664901,-79.319784,Farmers Market
2,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,The Sidekick,43.664484,-79.325162,Comic Shop
3,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Chino Locos,43.664653,-79.325584,Burrito Place
4,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Queen Margherita Pizza,43.664685,-79.324164,Pizza Place
5,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Chick-n-Joy,43.665181,-79.321403,Fast Food Restaurant
6,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,The Green Wood,43.664728,-79.324117,Restaurant
7,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Ashbridges Bay Skatepark,43.662548,-79.315631,Skate Park
8,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,East End Garden Centre & Hardware,43.664564,-79.324471,Garden Center
9,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Amin Car Repair Garage,43.663544,-79.32013,Auto Workshop


In [25]:
# How many venues per Neighborhood?
toronto_central_venues.groupby(['Neighborhood']).count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16


#### Are there Spanish/Mediterranean restaurants in the venues?

In [27]:
"Spanish Restaurant" in toronto_central_venues['Venue Category'].unique()
# "Mediterranean Restaurant" in toronto_central_veneus['Venue Category'].unique()
# true/false

False

#### Analyze each Neighborhood

In [28]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_central_venues[['Venue Category']], prefix="", prefix_sep="")

# add Borough column to the dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# move Neighborhood to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(4, 5)


Unnamed: 0,Neighborhood,Bar,Drugstore,Garden Center,Rental Car Location
0,"Northwest, West Humber - Clairville",0,0,0,1
1,"Northwest, West Humber - Clairville",1,0,0,0
2,"Northwest, West Humber - Clairville",0,1,0,0
3,"Northwest, West Humber - Clairville",0,0,1,0


In [29]:
# Let's group data by Neighborhood and by the mean of the frequency of ocurrence of each category

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

print(toronto_grouped.shape)
toronto_grouped.head()

(1, 5)


Unnamed: 0,Neighborhood,Bar,Drugstore,Garden Center,Rental Car Location
0,"Northwest, West Humber - Clairville",0.25,0.25,0.25,0.25


In [None]:
spanish = toronto_grouped(['Neighborhood', 'Spanish Restaurant']) # depends on the results of spanish restaurant.pay attention to how it is written
spanish.head()

# mediterranean = toronto_grouped(['Neighborhood', 'Mediterranean Restaurant'])
# mediterranean.head()

### Cluster the neighborhoods using KMeans clustering

In [None]:
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=15, random_state=8)

X = spanish.drop(['Neighborhood'], axis=1)
# X = mediterranean.drop(['Neighborhood'], axis=1)

In [None]:
kmeans.fit(X)

kmeans.labels_[0:10]

In [None]:
def get_inertia(n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=15, random_state=8)
    km.fit(X)
    return km.inertia_

In [None]:
scores = [get_inertia(x) for x in range(2,21)]

In [None]:
plt.figure(figsize=[10, 8])
sns.lineplot(x=range(2, 21), y=scores, color='r')
plt.title('K vs Error')
plt.xticks(range(2, 21))
plt.xlabel('K')
plt.ylabel('Error')

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 21))

visualizer.fit(X)
visualizer.show()

In [None]:
### We find that the optimum K value is -- so let's code for -- clusters

In [None]:
kclusters = 

toronto_grouped_clustering = spanish.drop('Neighborhood', 1)
# toronto_groupes_clustering = mediterranean.drop('Neighborhood', 1)

kmeas = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_[0:10]

In [None]:
spanish_merged = spanish().copy
# mediterranean_merged = mediterranean().copy

spanish_merged['Clusters'] = kmeans.labels_
# mediterranean_merged['Clusters'] = kmeans.labels_

spanish_merged.head()
# mediterranean_merged.head()

In [None]:
spanish_merged = spanish_merged.join(toronto_venues.set_index('Neighborhood'), on='Neighborhood')
# mediterranean_merged = mediterranean_merged.join(toronto_venues.set_index('Neighborhood'), on='Neighborhood')

print(spanish_merged.shape)
# print(mediterranean_merged.shape)

spanish_merged.head()
# mediterranea_merged.head()

In [None]:
### sort the results by Cluster
spanish_merged.sort_values(['Clusters'], inplace=True)
# mediterranean_merged.sort_values(['Clusters'], inplace=True)
spanish_merged
# mediterranean_merged

In [None]:
#### Check how many Spanish/Mediterranean restaurants are there

spanish_merged['Venue Category'].value_counts()['Spanish Restaurant']
# mediterranean_merged['Venue Category'].value_counts()['Spanish Restaurant']

In [None]:
### We create a map with the Neighborhoods and the Clusters

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lng, poi, cluster in zip(spanish_merged['Neighborhood Latitude'], spanish_merged['Neighborhood Longitude'], spanish_merged['Neighborhood'], spanish_merged['Clusters Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters


### How many Neighborhoods per cluster?

In [None]:
spanish['Cluster Labels'] = kmeans.labels_
spanish.head()

In [None]:
## grafico?

In [None]:
spanish['Cluster Labels'].value_counts()

### Analysis of each cluster

In [None]:
#### Cluster 1
#### let's create a new dataframe with Neighborhood and Borough
toronto_new = toronto[['Borough', 'Neighborhood']]
toronto_new.head()

In [None]:
cluster1 = spanish_merged.loc[spanish_merged['Cluster Labels'] == 0]
cluster1_df = pd.merge(toronto_new, cluster1, on='Neighborhood')
cluster1_df

In [None]:
cluster1_df.sort_values(['Spanish Restaurant'], ascending=False)

In [None]:
#### Cluster2
cluster2 = spanish_merged.loc[spanish_merged['Cluster Labels'] == 1]
cluster2_df = pd.merge(toronto_new, cluster2, on='Neighborhood')
cluster2_df

In [None]:
cluster2_df.sort_values(['Spanish Restaurant'], ascending=False)

In [None]:
cluster2_df['Venue Category'].value_counts(ascending=False)['Spanish Restaurant']

In [None]:
#### Cluster3
cluster3 = spanish_merged.loc[spanish_merged['Cluster Labels'] == 2]
cluster3_df = pd.merge(toronto_new, cluster3, on='Neighborhood')
cluster3_df

In [None]:
cluster3_df.sort_values(['Spanish Restaurant'], ascending=False)

In [None]:
cluster3_df['Venue Category'].value_counts(ascending=False)['Spanish Restaurant']

In [None]:
#### Cluster4
cluster4 = spanish_merged.loc[spanish_merged['Cluster Labels'] == 3]
cluster4_df = pd.merge(toronto_new, cluster4, on='Neighborhood')
cluster4_df

In [None]:
cluster4_df.sort_values(['Spanish Restaurant'], ascending=False)

In [None]:
cluster4_df['Venue Category'].value_counts(ascending=False)['Spanish Restaurant']

In [None]:
#### Plot the average number of Spanish Restaurants per Cluster

clusters_mean = [cluster1_df['Spanish Restaurant'].mean(), cluster2_df['Spanish Restaurant'].mean(), cluster3_df['Spanish Restaurant'].mean(), cluster4_df['Spanish Restaurant'].mean()]
objects = (1,2,3,4)
y_pos = np.arange(len(objects))
perf = clusters_mean
plt.bar(y_pos, perf, align='center', alpha=0.8, color=[........])
plt.xticks(y_pos, objects)
plt.ylabel('Mean')
plt.xlabel('Cluster')
plt.title('Average number of Spanish Restaurants per Cluster')

plt.show()