# 1. Data Scraping

The scraping code below is adapted from the tutorial available at:
https://simpleanalytical.com/how-to-web-scrape-wikipedia-python-urllib-beautiful-soup-pandas

By: Alan Hylands

In [1]:
import urllib.request

Downloading the page

In [2]:
# Wikipedia article "List of postal codes of Canada: M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Download inside a context manager so the HTTP connection is closed as soon
# as the body has been read; `page` holds the raw HTML bytes.
with urllib.request.urlopen(url) as response:
    page = response.read()

Extracting the data from the page

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Parse the downloaded page with the lxml parser
soup = BeautifulSoup(markup=page, features="lxml")

In [5]:
# Collect every <table> element found in the parsed page
all_tables = soup.find_all(name="table")

In [6]:
# First <table> element carrying the "wikitable" CSS class
right_table = soup.find("table", attrs={"class": "wikitable"})

In [7]:
A = []  # postal codes
B = []  # boroughs
C = []  # neighbourhoods

for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows made of exactly three <td> cells are read
    if len(cells) != 3:
        continue
    # The cell strings end with '\n'. Strip surrounding whitespace instead of
    # slicing off the last character, which would eat a real letter whenever
    # the trailing newline is missing.
    postal_code = cells[0].get_text().strip()
    borough = cells[1].get_text().strip()
    if borough == "Not assigned":
        continue
    A.append(postal_code)
    B.append(borough)
    # Some zip codes have multiple neighbourhoods separated by '/';
    # keep only the first one and drop the space left in front of the '/'.
    C.append(cells[2].get_text().partition('/')[0].strip())

Putting the data in a DataFrame

In [74]:
import pandas as pd

# Assemble the three scraped lists into one frame, one column per list
df = pd.DataFrame({'Zip': A, 'Borough': B, 'Neighborhood': C})
df

Unnamed: 0,Zip,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M6A,North York,Lawrence Manor
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill
9,M5B,Downtown Toronto,"Garden District, Ryerson"


# 2. Downloading the Data of Neighborhoods and Venues

Importing the necessary libraries

In [9]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# transform JSON file into a pandas dataframe.
# json_normalize is exposed at the top level of pandas since 1.0 and the
# pandas.io.json import path is deprecated; fall back to it only for
# older pandas installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Getting the Coordinates of the different Neighborhoods

In [10]:
from geopy.extra.rate_limiter import RateLimiter

# One [Zip, Borough, Neighborhood, Latitude, Longitude] row per geocoded neighbourhood
AA = []
geolocator = Nominatim(user_agent="foursquare_agent")
# The loop issues one request per row of df. RateLimiter spaces the calls at
# least one second apart, retries failed requests and, with its default
# settings, returns None instead of raising, so a single timeout no longer
# aborts the whole run. The `if location` check below handles that None.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
for ii, cc in enumerate(np.array(df)):
    zip_code, borough, neighborhood = cc[0], cc[1], cc[2]
    address = neighborhood + ', ' + borough + ', Canada'
    location = geocode(address)
    if location:  # Some locations have no data
        latitude = location.latitude
        longitude = location.longitude
        # Saving all the data of the locations in an array
        AA.append([zip_code, borough, neighborhood, latitude, longitude])

In [11]:
# ii is the zero-based index of the last row visited by the geocoding loop
print(ii)

102


In [73]:
#Turning the array into a DataFrame
df2=pd.DataFrame(AA)
df2=df2.rename(columns={0: "Zip", 1: "Borough" , 2: "Neighborhood" , 3: "Latitude" , 4: "Longitude"})

df2

Unnamed: 0,Zip,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M6A,North York,Lawrence Manor,43.722079,-79.437507
3,M7A,Downtown Toronto,Queen's Park,43.663217,-79.38629
4,M9A,Etobicoke,Islington Avenue,43.622575,-79.514215
5,M1B,Scarborough,Malvern,43.809196,-79.221701
6,M3B,North York,Don Mills,43.775347,-79.345944
7,M6B,North York,Glencairn,43.708712,-79.440685
8,M9B,Etobicoke,West Deane Park,43.663199,-79.568568
9,M1C,Scarborough,Rouge Hill,43.780271,-79.130499


### Obtaining the coordinates of Toronto

In [13]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="TO_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Visualizing the locations of the Neighborhoods on the map of Toronto

In [14]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per neighbourhood.
# df2's columns are renamed to strings when the frame is built, so they must be
# addressed by name; the integer labels 3, 4 and 2 raise KeyError after the rename.
for lat, lng, label in zip(df2['Latitude'], df2['Longitude'], df2['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, so it is passed to the Popup only
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Toronto)

map_Toronto