In [108]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

#####################################
# Part 1: Scraping the postal codes #
#####################################

# Fetch the page with Requests and parse it with BeautifulSoup.
# NOTE(review): the Access-Control-* entries are CORS *response* headers and have
# no effect on a request; they are kept only so `headers` is unchanged for any
# other cell that may reuse it.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# `headers` must be passed by keyword: the second positional argument of
# requests.get() is `params`, which would put the dict in the query string.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
req = response.text
# The page is HTML, so parse it with an HTML parser rather than the XML one.
soup = BeautifulSoup(req, 'html.parser')
# Only fetch the table we are interested in
html_table = soup.find('table', {'class': 'wikitable sortable'})
if html_table is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))

html_table_rows = html_table.find_all('tr')
table_data = []
# Collect the stripped text of every <td> cell, one list per table row
for table_row in html_table_rows:
    cells = [table_cell.text.strip() for table_cell in table_row.find_all('td')]
    # The header row only holds <th> cells, so it yields no data and is skipped
    if cells:
        table_data.append(cells)

# Transfer to a dataframe with the given columns
table_df = pd.DataFrame(table_data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

# Remove any rows whose Borough is missing or 'Not assigned'
has_borough = table_df['Borough'].notna() & (table_df['Borough'] != 'Not assigned')
table_df = table_df[has_borough].reset_index(drop=True)

# One row per postal code/borough, with its neighbourhoods separated by commas
table_df = (
    table_df
    .groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .apply(lambda x: ','.join(x))
    .reset_index()
)
# A neighbourhood that is 'Not assigned' takes the name of its borough.
# mask() aligns row by row and the result is assigned back explicitly, instead
# of relying on an in-place replace on a column selection.
table_df['Neighbourhood'] = table_df['Neighbourhood'].mask(
    table_df['Neighbourhood'] == 'Not assigned', table_df['Borough']
)

# Presto.gif
table_df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [109]:
#####################################
# Part deux: Adding geospatial data #
#####################################

# header=0 makes `names` replace the file's own header line; without it that
# line is read as a data row and the coordinate columns are parsed as strings.
# NOTE(review): assumes the CSV starts with a header line — confirm against the file.
csv_df = pd.read_csv(
    'https://cocl.us/Geospatial_data',
    header=0,
    names=["PostalCode", "Lat", "Long"],
)

# Attach the coordinates to every postal code (left join on PostalCode)
table_df_geospatial = table_df.join(csv_df.set_index('PostalCode'), on='PostalCode')
table_df_geospatial

Unnamed: 0,PostalCode,Borough,Neighbourhood,Lat,Long
0,M1B,Scarborough,"Malvern, Rouge",43.8066863,-79.1943534
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.7279292,-79.2620294
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.7111117,-79.2845772
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.2394761
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.2648481


In [110]:
############################################
# Part three: Mapping and being interested #
############################################

#!conda install -c conda-forge geocoder --yes
#!conda install -c conda-forge folium --yes

In [111]:
import geocoder
import folium
from geopy.geocoders import Nominatim

# Look up the coordinates of Toronto to centre the map on
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode('Toronto, Ontario')
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError("Geocoding 'Toronto, Ontario' returned no result")
latitude = location.latitude
longitude = location.longitude

# Create a map of Toronto using folium
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Put up one marker per postal code, labelled "<neighbourhood(s)>, <borough>"
for borough, neighbourhood, lat, long in zip(table_df_geospatial['Borough'], table_df_geospatial['Neighbourhood'], table_df_geospatial['Lat'], table_df_geospatial['Long']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        ).add_to(map_Toronto)

map_Toronto

In [None]:
# Defining some functions we will use often

# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must expose the category list under the key 'categories' or,
        failing that, 'venue.categories'.

    Returns
    -------
    The 'name' of the first category, or None when the category list is
    missing (None) or empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; anything else is a real error
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# Function that lists the most common venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of `row` is skipped, the remaining entries are sorted by
    value in descending order, and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]
   
# Function to get all nearby venues from the foursquare API
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare 'venues/explore' endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to explore; they are
        consumed in lock-step with zip().
    radius : number, default 500
        Sent as the `radius` query parameter.
    limit : int, default 100
        Sent as the `limit` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when there are no
        locations or no venues. 'Venue Category' is None for a venue that
        has no categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    import os  # local import so the function does not depend on another cell

    # SECURITY: credentials should not live in the notebook. Environment
    # variables take precedence; the literals are kept only as a fallback so
    # existing runs keep working. Rotate these keys and remove the fallback.
    CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'YJD1GJCLQQOVQJSOFOHN2HWKMRGMV0GH5LNVQIACQZA5OVID') # your Foursquare ID
    CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'ENNSER2SMY3JV2Y1FSANFVGLPFLVUJDEMXEBHIQ5G4DX3MEY') # your Foursquare Secret
    VERSION = '20180604' # Foursquare API version

    column_names = ['Neighbourhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; fail loudly on HTTP errors instead of on a
        # KeyError while digging through an error payload
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor also works for zero rows, where
    # assigning `.columns` afterwards would raise a length mismatch.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)


In [None]:
import json
from pandas.io.json import json_normalize

# Collect the nearby venues for every row of the geospatial table
venue_query = {
    'names': table_df_geospatial['Neighbourhood'],
    'latitudes': table_df_geospatial['Lat'],
    'longitudes': table_df_geospatial['Long'],
}
toronto_venues = getNearbyVenues(**venue_query)

# Size of the result, followed by a preview of the first rows
print(toronto_venues.shape)
toronto_venues.head()

In [None]:
# How many venue records came back for each neighbourhood?
venues_per_neighbourhood = toronto_venues.groupby('Neighbourhood').count()
venues_per_neighbourhood

In [None]:
# And what's the amount of categories we can find, by neighbourhood?


# One-hot encode the venue categories; the empty prefix keeps the bare
# category names as column names
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the neighbourhood column, then rotate it from the last position to the first
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']
column_order = toronto_onehot.columns.tolist()
fixed_columns = column_order[-1:] + column_order[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

print('Shape: {}'.format(toronto_onehot.shape))
toronto_onehot.head()

In [None]:
# Average every one-hot column per neighbourhood, i.e. the share of each
# venue category among that neighbourhood's venues

neighbourhood_groups = toronto_onehot.groupby('Neighbourhood')
toronto_grouped = neighbourhood_groups.mean().reset_index()
print('Shape: {}'.format(toronto_grouped.shape))
toronto_grouped.head()

In [None]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Set the columns for the most common venues: '1st', '2nd', '3rd', then 'Nth'
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:` around the lookup,
    # which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# One row per neighbourhood, one column per rank
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill each row with that neighbourhood's top venue categories
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
# For my next trick, we'll cluster the neighbourhoods
from sklearn.cluster import KMeans
kclusters = 5
# Cluster on the category frequencies only. `columns=` replaces the positional
# axis argument, which pandas 2.0 and later reject.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')
# n_init is pinned so the result does not change with scikit-learn's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Add the most-common-venue columns to the postal code / coordinate table,
# matching on the neighbourhood name
# NOTE(review): kmeans.labels_ is computed but never attached to any frame here —
# confirm whether a cluster label column was meant to be merged in as well.
toronto_merged = table_df_geospatial.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
toronto_merged.head()

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# Base map centred on the coordinates looked up earlier
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Rainbow palette with one hex colour per cluster
# (computed here, but not passed to the markers below)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# One circle marker per row, with the neighbourhood name as popup
markers_colors = []
marker_rows = zip(toronto_merged['Lat'], toronto_merged['Long'], toronto_merged['Neighbourhood'])
for lat, lon, poi in marker_rows:
    label = folium.Popup(str(poi), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7)
    marker.add_to(map_clusters)

map_clusters