# Explore and Cluster the Neighbourhoods in Toronto

This notebook collects neighbourhood location information for Toronto by scraping the Wikipedia page that lists Canadian postal codes, then uses the Foursquare API to retrieve specific information about each neighbourhood.

The List of postal codes of Canada Wikipedia link:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 


In [1]:
import pandas as pd
import numpy as np
import requests
import html5lib
from bs4 import BeautifulSoup# beautifulsoup4
#import geocoder

## PART 1

## Parse the Wikipedia page and find the table

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL)
# Fail loudly on an HTTP error instead of trying to parse an error page.
r.raise_for_status()

# BeautifulSoup is built on top of HTML parsing libraries such as html5lib
soup = BeautifulSoup(r.content, 'html5lib')  # specify the HTML parser

table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; without this guard the loop
    # below would die with an unhelpful AttributeError.
    raise ValueError("No 'wikitable sortable' table found at {}".format(URL))


def _cell_text(td):
    """Return the stripped text of the first link in ``td``, or of the cell itself if it has no link."""
    links = td.find_all('a')
    return (links[0] if links else td).text.strip()


# Collect the rows in a plain list and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
rows = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) != 3:
        # header rows (th cells) and malformed rows are skipped
        continue
    if tds[1].text.strip() == "Not assigned":
        # filter out rows whose borough is not assigned; the text is stripped
        # first so a trailing newline in the cell cannot defeat the comparison
        continue
    rows.append({
        "PostalCode": tds[0].text.strip(),
        "Borough": _cell_text(tds[1]),
        "Neighborhood": _cell_text(tds[2]),
    })

df = pd.DataFrame(rows, columns=["PostalCode", "Borough", "Neighborhood"])
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## combine by postal code and borough

In [3]:
# One row per (PostalCode, Borough) pair; the Neighborhood column becomes the
# list of that pair's neighbourhood names with "Not assigned" entries left out.
combined_df = (
    df.groupby(["PostalCode", "Borough"])["Neighborhood"]
    .apply(lambda names: [name for name in names if name != "Not assigned"])
    .reset_index()
)
combined_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [4]:
def _join_neighborhoods(names):
    """Join a list of neighbourhood names with ", "; pass an already-joined string through.

    The pass-through makes this cell safe to re-run: calling ", ".join() on a
    string that was joined on a previous run would split it into characters.
    """
    if isinstance(names, str):
        return names
    return ", ".join(names)


# Join every list into one comma-separated string, then fall back to the
# borough name wherever no assigned neighbourhood was left (empty string).
joined_names = combined_df["Neighborhood"].apply(_join_neighborhoods)
combined_df["Neighborhood"] = joined_names.where(joined_names != "", combined_df["Borough"])
combined_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


The final shape of combined dataframe is (103, 3)

In [5]:
# Sanity check: print the (rows, columns) shape of the combined frame.
print(combined_df.shape)

(103, 3)


# Utilize the Foursquare location data

## PART 2

## get the latitude and the longitude coordinates of each neighborhood by geocoder
Now that we have a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In [None]:
import geocoder # import geocoder

def get_location(postal_code, max_attempts=10, retry_delay=1.0):
    """Look up the latitude and longitude of a Toronto postal code via geocoder.

    The query sent to ``geocoder.google`` is ``"<postal_code>, Toronto, Ontario"``.
    A lookup that yields no coordinates is retried, but only a bounded number
    of times, so a geocoder that never answers cannot hang the notebook.

    Parameters
    ----------
    postal_code : str
        Postal code to look up.
    max_attempts : int, optional
        Maximum number of lookups before giving up.
    retry_delay : float, optional
        Seconds to wait between failed lookups; ``0`` disables the pause.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder result's ``latlng``.

    Raises
    ------
    RuntimeError
        If no attempt returned coordinates.
    """
    import time  # local import: only needed for the pause between retries

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return (lat_lng_coords[0], lat_lng_coords[1])
        # Pause before the next try, but not after the final one.
        if attempt < max_attempts and retry_delay > 0:
            time.sleep(retry_delay)

    raise RuntimeError(
        "No coordinates returned for '{}' after {} attempt(s)".format(query, max_attempts)
    )

def unzip(x):
    """Transpose an iterable of sequences: ``[(a, b), (c, d)]`` yields ``(a, c)`` then ``(b, d)``."""
    return zip(*x)

In [None]:
# Geocode every postal code; each element of `coords` is a (latitude, longitude) tuple.
coords = combined_df["PostalCode"].apply(get_location)

# Transpose the whole series of pairs into one tuple of latitudes and one of
# longitudes. unzip must be called once on the series, not per element via
# .apply, which would try to zip over the two floats of a single pair.
lat, lon = unzip(coords)
combined_df["Latitude"] = lat  # was `lan`, an undefined name (NameError)
combined_df["Longitude"] = lon

combined_df.head()

(1, 3)

## Explore and cluster the neighborhoods in Toronto

In [None]:
# Get nearby Venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` values for every request and prints each name as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables; one request is made per (name, lat, lng) triple.
    radius : int, optional
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting the credentials into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # blocking the notebook, and raise_for_status surfaces API errors
        # instead of a confusing KeyError further down
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        # A location without venues may come back with no groups or items;
        # treat that as "no rows" rather than an IndexError.
        groups = response.json()['response'].get('groups') or [{}]
        for item in groups[0].get('items', []):
            venue = item['venue']
            categories = venue.get('categories') or []
            # keep only the relevant information for each nearby venue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue can have an empty category list
                categories[0]['name'] if categories else None))

    # Passing columns= here (rather than assigning .columns afterwards) keeps
    # the empty case from failing with a length mismatch.
    return pd.DataFrame(venue_rows, columns=columns)


Clustering

Example code for Manhattan

In [None]:
# set number of clusters
kclusters = 5

# Drop the label column so only feature columns are clustered. The axis is
# named explicitly: the positional form drop('Neighborhood', 1) is rejected
# by pandas 2.0 and later.
manhattan_grouped_clustering = manhattan_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
# (not displayed here: only the last expression of a cell is rendered)
kmeans.labels_[0:10]

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# join the labelled, sorted venues onto manhattan_data by neighborhood so each
# row carries both its coordinates and its cluster label
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged.head() # check the last columns!

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    # Rows that found no match in the join have a NaN label; they belong to no
    # cluster, so they get no marker.
    if pd.isna(cluster):
        continue
    # The join can turn the labels into floats, which cannot index a list.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 is kept from the original color assignment; cluster 0 wraps
    # around to the last color in the list
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# examine clusters: rows labelled 0, showing column 1 plus every column from
# position 5 onwards
in_cluster_zero = manhattan_merged['Cluster Labels'] == 0
shown_positions = [1] + list(range(5, manhattan_merged.shape[1]))
manhattan_merged.loc[in_cluster_zero, manhattan_merged.columns[shown_positions]]