
# Segmenting and Clustering Neighborhoods in Toront

# Import important libraries

In [18]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from geopy.geocoders import Nominatim
import json
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

# All requested packages already installed.



# 1.will use 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# 2.check for status code and headers

In [45]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get(url)
print(url)
print(result.status_code)
print(result.headers)

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
200
{'Date': 'Tue, 05 May 2020 00:38:15 GMT', 'Vary': 'Accept-Encoding,Cookie,Authorization', 'Server': 'ATS/8.0.7', 'Content-Type': 'text/html; charset=UTF-8', 'X-Content-Type-Options': 'nosniff', 'P3P': 'CP="See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'Content-language': 'en', 'Last-Modified': 'Mon, 04 May 2020 01:00:47 GMT', 'Content-Encoding': 'gzip', 'Age': '54598', 'X-Cache': 'cp1081 hit, cp1081 hit/134', 'X-Cache-Status': 'hit-front', 'Server-Timing': 'cache;desc="hit-front"', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload', 'Set-Cookie': 'WMF-Last-Access=05-May-2020;Path=/;HttpOnly;secure;Expires=Sat, 06 Jun 2020 12:00:00 GMT, WMF-Last-Access-Global=05-May-2020;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Expires=Sat, 06 Jun 2020 12:00:00 GMT, GeoIP=US:::37.75:-97.82:v4; Path=/; secure; Domain=.wikipedia.org', 'X-Client-IP': '141.125.71.190', 'Cache-

# will get data + clean it

In [126]:
soup = BeautifulSoup(result.content, 'html.parser')
table = soup.find('table')
trs = table.find_all('tr')
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)
        
lst = []
for row in rows:
    postalcode = row[0].text.rstrip()
    borough = row[1].text.rstrip()
    neighborhood = row[2].text.rstrip()
    if borough != 'Not assigned':
        if neighborhood == 'Not assigned':
            neighborhood = borough
        lst.append([postalcode, borough, neighborhood])


# convert into a dataframe

In [163]:
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(lst, columns=cols)
print(df.shape)

(103, 3)


# custom groupby / agg to merge Neighborhoods
# groupby PostalCode, keep the first Borough and join() Neighborhoods

In [164]:
df = df.groupby('PostalCode').agg(
    {
        'Borough':'first', 
        'Neighborhood': ', '.join,}
    ).reset_index()

# will check if 'M5A' example is done correctly

In [165]:
df.loc[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,Regent Park / Harbourfront


# will check the df.shape

In [166]:
df.shape

(103, 3)

# will read geo data from a CSV
# need to rename a column so the merge (below) will work

In [167]:
dfgeo = pd.read_csv("http://cocl.us/Geospatial_data")
dfgeo.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)

# need to merge 2 dataframes based on a PostalCode column

In [168]:
df2 = pd.merge(df, dfgeo, on="PostalCode", how='left')

# let's check if the example returns the expected value(s)

In [169]:
df2.loc[df2['PostalCode'] == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [170]:
df2.loc[df2['PostalCode'] == 'M9V']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437


# Toronto map


In [171]:
geolocator = Nominatim(user_agent="coursera")
address = 'Toronto'
try:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))
except AttributeError:
    print('Cannot find: {}, will drop index: {}'.format(address, index))

my_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df2['Latitude'], df2['Longitude'], df2['PostalCode']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(my_map)  
    
my_map

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


# Will analyze with Foursquare

In [172]:
CLIENT_ID = '4FYKOS2RLJNKMKHHWNOTECWNY53UTQHSYV45WPNHXGOT2W' # Foursquare ID
CLIENT_SECRET = 'QTEMD12KYWI4NIHNRHXJNLUZ5KT11DUEJCS2J4EFU4LDJ0C0' # Foursquare Secret
VERSION = '20180323' # API version
# defining radius and limit of venues to get

radius=500
LIMIT=100


In [173]:
df2.set_index('PostalCode', inplace = True) 
neighborhood_latitude = df2.loc['M5G']['Latitude']
neighborhood_longitude = df2.loc['M9V']['Longitude']

In [174]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=4FYKOS2RLJNKMKHHWNOTECWNY53UTQHSYV45WPNHXGOT2W&client_secret=QTEMD12KYWI4NIHNRHXJNLUZ5KT11DUEJCS2J4EFU4LDJ0C0&v=20180323&ll=43.6579524,-79.5884369&radius=500&limit=100'

In [175]:
results = requests.get(url).json()
results

{'meta': {'code': 400,
  'errorType': 'invalid_auth',
  'errorDetail': 'Missing access credentials. See https://developer.foursquare.com/docs/api/configuration/authentication for details.',
  'requestId': '5eb19d96df2774001bf84248'},
 'response': {}}

In [176]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [177]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [181]:
toronto_venues = getNearbyVenues(names = df2['Postalcode'],
                                   latitudes = df2['Latitude'],
                                   longitudes = df2['Longitude']
                                  )
toronto_venues.head()

KeyError: 'Postalcode'