In [1]:
#Install all modules for this notebook
!pip install bs4
!pip install lxml
!pip install geocoder
!conda install -c conda-forge geopy --yes

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 3.3MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.1 bs4-0.0.

In [3]:
import pandas as pd  # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
from urllib.request import urlopen # module to open URLs
from bs4 import BeautifulSoup # package used to extract data from html file
import re # module provides regular expression matching operations
import geocoder #library to get latitude and longitude
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

## Download and Explore Dataset:  Question A

In [4]:
# Web address of the page that holds the postal-code table
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Read the whole response inside a context manager so the HTTP connection is
# closed as soon as the download finishes; `html` then holds the raw page bytes
# that are handed to the parser.
with urlopen(url) as response:
    html = response.read()

In [5]:
# Parse the downloaded page into a BeautifulSoup object using the lxml parser
soup = BeautifulSoup(html, 'lxml')
# Display the type of the parsed object as the cell output
type(soup)

bs4.BeautifulSoup

In [6]:
# Get the page's <title> element and print it as a quick check of what was loaded
title = soup.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [7]:
# Extract the text of the parsed page into `text` (nothing is printed here)
text = soup.get_text()
# Uncomment the line below to print the page text:
#print(soup.text)

In [8]:
# Keep the first <table> element found on the page; its rows are parsed below
Toronto_data = soup.find_all('table')[0]

In [9]:
# Check the type of the selected table object
type(Toronto_data)

bs4.element.Tag

In [10]:
# Extract the relevant data from the table: one list of cell texts per <tr>.
# get_text() returns only the text inside each <td> (markup removed, HTML
# entities decoded), which is more reliable than stripping tags from str(cell)
# with a regular expression. Newlines are removed, as before.
# A row without any <td> cells (such as the header row) yields an empty list.
list_rows = [
    [cell.get_text().replace('\n', '') for cell in row.find_all('td')]
    for row in Toronto_data.find_all('tr')
]
# Preview the first five extracted rows
list_rows[:5]

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

In [11]:
# Create a DataFrame from the extracted rows, skipping the first entry of list_rows
Toronto_data1 = pd.DataFrame(list_rows[1:])
# Preview the first rows
Toronto_data1.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
# Keep the first <tr> of the table; the column names are read from its cells
row1 = Toronto_data.find_all('tr')[0]

In [13]:
# Get the column names from the <th> cells of the first table row.
# get_text() returns only the text inside each cell (markup removed, HTML
# entities decoded), which is more reliable than stripping tags from str(cell)
# with a regular expression. Newlines are removed, as before.
col_name = [cell.get_text().replace('\n', '') for cell in row1.find_all('th')]
print(col_name)

['Postal Code', 'Borough', 'Neighborhood']


In [14]:
# Assign the extracted column names to the DataFrame (modifies Toronto_data1 in place)
Toronto_data1.columns=col_name
# Preview the DataFrame with its new column names
Toronto_data1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [15]:
# Keep only the rows whose Borough has an assigned name
has_borough = Toronto_data1['Borough'].ne('Not assigned')
Toronto_data2 = Toronto_data1.loc[has_borough]

In [16]:
# Preview the first rows of the filtered data
Toronto_data2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [17]:
# Check whether any remaining row has a Neighborhood of 'Not assigned'
# (an empty result means there are none)
Toronto_data2[Toronto_data2['Neighborhood']=='Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood


In [18]:
# Number of rows and columns left after filtering
Toronto_data2.shape

(103, 3)

## Question B

In [19]:
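# NOTE: the geocoder-based lookup below is commented out and does not run;
# the coordinates are read from a CSV file in the next cell instead.
# initialize your variable to None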
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [20]:
# Load the CSV of postal-code coordinates from this URL into a DataFrame
path = 'https://cocl.us/Geospatial_data'
latlon = pd.read_csv(path)
# Preview the first rows
latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Join the coordinates onto the borough/neighborhood table by postal code,
# which adds the latitude and longitude columns
Toronto_data3 = pd.merge(Toronto_data2, latlon, how='inner', on='Postal Code')
# Confirm the shape of the merged DataFrame
Toronto_data3.shape

(103, 5)

In [22]:
# Show the first rows of the merged DataFrame with its 5 columns
Toronto_data3.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
