# Clustering Neighbourhoods in Toronto.

# Part I. Web Sraping.

#### Installation of the required packages.

In [1]:
!pip install opencage
#!conda install beautifulsoup4
#!conda install -c conda-forge folium=0.5.0 --yes



#### Import of the required modules.

In [2]:
import lxml
import folium
import requests
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors

from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize
from opencage.geocoder import OpenCageGeocode

#### Getting the Neighbourhoods table from the Wikipedia page.

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
soup = BeautifulSoup(requests.get(url).content,features = 'lxml')
table = soup.find_all('table',class_ = 'wikitable sortable')[0]
df = pd.read_html(str(table))[0]

#### Transforming the data into the required form.

In [4]:
df = df.loc[(df['Borough'] != 'Not assigned')]
mask = (df['Neighbourhood'] == 'Not assigned')
df['Neighbourhood'][mask] = df['Borough'][mask]
df = df.sort_values(['Postcode','Borough','Neighbourhood'])
df = df.groupby(['Postcode','Borough'],as_index = False).agg({'Neighbourhood':', '.join})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Printing the number of rows and columns of the resulting dataframe.

In [5]:
print("The number of rows and columns of the resulting dataframe:",df.shape)

The number of rows and columns of the resulting dataframe: (103, 3)


# Part II. Getting the coordinates for each neighbourhood.

#### Getting the geocodes for each postcode.

As neither "geopy.geocoders" nor "geocoder.google" did not work for me; I started to look for something else.
After some googling, I've found "opencage.geocoder".  
This worked well, excluding three cases, where opencage did not find the postcodes: M3A, M7A, M7R. 
I've updated their latitudes and longitudes manually.

In [6]:
# The code was removed by Watson Studio for sharing.

In [7]:
geocoder = OpenCageGeocode(key)
latitude_list = []
longitude_list = []
response = []

for i in df.index:
    address = df['Postcode'][i] + ', Toronto, Ontario'   
    results = geocoder.geocode(address) 
    for j in range(0,len(results)):
        if df['Postcode'][i] in results[j]['formatted']:
            latitude = results[j]['geometry']['lat']
            longitude = results[j]['geometry']['lng']
            break
        else:
            latitude = 0
            longitude = 0
    response.append(results[j]['formatted'])            
    latitude_list.append(latitude)
    longitude_list.append(longitude)
    #print('The geograpical coordinate are {}, {}.'.format(latitude, longitude))

df['Latitude'] = latitude_list
df['Longitude'] = longitude_list
df['response'] = response

df.loc[(df['Latitude'] == 0)]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,response
25,M3A,North York,Parkwoods,0.0,0.0,"Toronto, ON M6K 1X9, Canada"
85,M7A,Downtown Toronto,Queen's Park,0.0,0.0,"Toronto, ON M6K 1X9, Canada"
86,M7R,Mississauga,Canada Post Gateway Processing Centre,0.0,0.0,"Toronto, ON M6K 1X9, Canada"


As mentioned above, there are three cases where opencage cannot find the requested postcodes.
That is getting obvious after the opencage response check.  
Therefore, in the code below, the geocodes have been updated manually.

In [8]:
df.loc[df['Postcode'] == 'M3A','Latitude'],df.loc[df['Postcode'] == 'M3A','Longitude']  = 43.7533,-79.3297
df.loc[df['Postcode'] == 'M7A','Latitude'],df.loc[df['Postcode'] == 'M7A','Longitude']  = 43.6623,-79.3895
df.loc[df['Postcode'] == 'M7R','Latitude'],df.loc[df['Postcode'] == 'M7R','Longitude']  = 43.6370,-79.6158
df.drop('response',axis = 1, inplace = True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.765717,-79.221898
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389


#### Create a map with Toronto neighbourhoods.

In [9]:
%matplotlib notebook
toronto_coord = geocoder.geocode('Toronto, Ontario')
toronto_lat = toronto_coord[0]['geometry']['lat']
toronto_lng = toronto_coord[0]['geometry']['lng']

map_toronto = folium.Map(location = [toronto_lat,toronto_lng],zoom_start = 10)

for lat,lng,borough,neighbourhood in zip(df['Latitude'],df['Longitude'],df['Borough'],df['Neighbourhood']):
    label = '{}: {}'.format(neighbourhood,borough)
    label = folium.Popup(label,parse_html = True)
    folium.CircleMarker([lat,lng],radius = 5,popup = label,color = 'blue',fill = True,
                        fill_color = '#3186cc',fill_opacity = 0.7,parse_html = False).add_to(map_toronto)
map_toronto

# Part III. Exploring the neighbourhoods in Toronto.

## 1. Getting the venues list for each neighbourhood.

In [10]:
# The code was removed by Watson Studio for sharing.

#### Definition of the function "getNearbyVenues". This function is dedicated to getting the venue list (max 100) for each neighbourhood in a 500m radius. 

In [11]:
def getNearbyVenues(neighbourhoods,latitudes,longitudes,radius = 500,limit = 3):    
    venues_list = []
    for neighbourhood,lat,lng in zip(neighbourhoods,latitudes,longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,CLIENT_SECRET,VERSION,lat,lng,radius,limit)
        results = requests.get(url).json()['response']['groups'][0]['items']        
        venues_list.append([(neighbourhood,lat,lng,v['venue']['name'],v['venue']['location']['lat'],v['venue']['location']['lng'],v['venue']['categories'][0]['name'])
                            for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood','Neighbourhood Latitude','Neighbourhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    return(nearby_venues)

#### Getting the venues list for Toronto neighbourhoods.

In [12]:
toronto_venues = getNearbyVenues(neighbourhoods = df['Neighbourhood'],latitudes = df['Latitude'],longitudes = df['Longitude'],limit = 100)

KeyError: 'groups'

Let's check how many venues were returned for each neighbourhood.

In [None]:
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)
toronto_venues.groupby('Neighbourhood').count()

#### Short check of the obtained dataframe.

In [None]:
print('Number of unique neighbourhoods: {}'.format(len(toronto_venues['Neighbourhood'].unique())))

But in the initial dataframe we had 103 neighbourhoods.  
So first, let's find the repetitive neighbourhoods in the initial dataframe:

In [None]:
df[df.duplicated('Neighbourhood',keep = False)]

All right, then one missing row is explained by repeated neighbourhood Queen's Park. How about the other two?  
Let's check API responses for those neighbourhoods, which exist in the initial dataframe, but not in the new one.

In [None]:
for nb in list(set(df['Neighbourhood']) - set(toronto_venues['Neighbourhood'])):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,CLIENT_SECRET,VERSION,df.loc[df['Neighbourhood'] == nb,'Latitude'].values[0],df.loc[df['Neighbourhood'] == nb,'Longitude'].values[0],100,500)
    results = requests.get(url).json()['response']['warning']['text']
    print('For neighbourhood = {}: \n {}'.format(nb,results))

So, two disappeared rows are explained by the missing result of the Foursquare API.

## 2. The venue's categories processing.

#### One hot encoding. Let's add venue categories as columns.

In [None]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']],prefix = '',prefix_sep = '')
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 
toronto_onehot = toronto_onehot[[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])]
toronto_onehot.head()

In [None]:
print('The new dataframe size: {}.'.format(toronto_onehot.shape))

#### Now, let's group the rows by neighbourhood and by taking the mean of the frequency of the occurrence of each category.

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

In [None]:
print('The new dataframe size: {}.'.format(toronto_grouped.shape))

#### Let's print each neighbourhood along with the top 5 most common categories of venues.

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print('-----' + hood + '-----')
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    print(temp.sort_values('freq',ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')


#### Let's write a function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row,num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighbourhood.

In [None]:
num_top_venues = 10
indicators = ['st','nd','rd']

columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))

neighbourhoods_venues_sorted = pd.DataFrame(columns = columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind,1:] = return_most_common_venues(toronto_grouped.iloc[ind,:], num_top_venues)

neighbourhoods_venues_sorted.head()

## 3. The clustering of the neighbourhoods, based on their venues categories representation.

#### To cluster neighbourhoods, we will be using the K-Means clustering.

In [None]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood',1)
kmeans = KMeans(n_clusters = kclusters,random_state = 0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10] 

#### Now, let's add cluster labels, as well as the most common venues, into the initial dataframe.

In [None]:
neighbourhoods_venues_sorted.insert(0,'Cluster Labels',kmeans.labels_)
toronto_merged = df
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'),on = 'Neighbourhood')
toronto_merged.dropna(inplace = True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
toronto_merged.head()

#### And now, it's time to visualize the resulting clusters!

In [None]:
map_clusters = folium.Map(location = [toronto_lat,toronto_lng],zoom_start = 10)

x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat,lon,poi,cluster in zip(toronto_merged['Latitude'],toronto_merged['Longitude'],toronto_merged['Neighbourhood'],toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster),parse_html = True)
    folium.CircleMarker([lat,lon],radius = 5,popup = label,color = rainbow[cluster-1],fill = True,fill_color = rainbow[cluster-1],fill_opacity = 0.7).add_to(map_clusters)
map_clusters

### Let's have a look at the clusters one by one.

#### Cluster 1.

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

#### Cluster 2.

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

#### Cluster 3.

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

#### Cluster 4.

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

#### Cluster 5.

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

# Many thanks for your time!  
# The end.