 # Web scraping 
 

In [1]:
import bs4
import requests
from bs4 import BeautifulSoup as bs # use Beautiful Soup to scrap the webpage
import re
import pandas as pd

### Use Beautiful Soup to extract the web page content & table

In [2]:
# Download the Wikipedia page listing Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever, and raise_for_status() makes an
# HTTP error fail here instead of producing a confusing parse error further down.
r = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
r.raise_for_status()

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
webpage = bs(r.content, "html.parser")
table1 = webpage.select("table.wikitable.sortable")[0]  # first sortable wikitable on the page

### Extract the table column headers

In [3]:
# The <th> cells inside the table body hold the column headers.
columns = table1.find("tbody").find_all("th")
# Strip surrounding whitespace/newlines from each header's text.
column_names = [th.text.strip() for th in columns]
column_names

['Postal Code', 'Borough', 'Neighbourhood']

### Retrieve the rows

In [4]:
table_rows = table1.find("tbody").find_all("tr")
# One list per <tr>, holding the whitespace-stripped text of each of its <td> cells.
# A row without any <td> cells contributes an empty list.
l = [
    [cell.text.strip() for cell in tr.find_all("td")]
    for tr in table_rows
]

### Build a DataFrame, removing rows where Borough is "Not assigned"

In [5]:
# Build the frame from the scraped rows. A row that had no <td> cells arrives as
# an empty list and becomes an all-missing row; drop such rows by content rather
# than by a hard-coded position, and without mutating the frame in place.
# The original index labels are kept.
df = pd.DataFrame(l, columns=column_names).dropna(how="all")
df = df[df['Borough'] != 'Not assigned']  # keep only rows whose borough is assigned

### Check whether any neighbourhoods are "Not assigned"

(df['Neighbourhood'] == 'Not assigned').any()  # True if at least one neighbourhood is still 'Not assigned'

In [6]:
# Show the table's dimensions as (rows, columns).
df.shape

(103, 3)

### Toronto neighborhoods table from the website

In [7]:
# Preview the first 10 rows of the table.
df.head(10) 

Unnamed: 0,Postal Code,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


# Geocoding 


In [8]:
# Read the geospatial CSV straight from its URL into a DataFrame.
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

### Merge the CSV data with the DataFrame to get the latitude and longitude for each postal code

In [9]:
# Inner join on the shared 'Postal Code' column: only postal codes present in
# both frames are kept.
merged_df = df.merge(geo_df, how='inner', on='Postal Code')

In [10]:
# Preview the first 10 rows of the merged table.
merged_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Creating a Data frame with Boroughs that contain the word "Toronto"

In [11]:
# Keep only the rows whose Borough text contains "Toronto".
Toronto_df = merged_df.loc[lambda frame: frame['Borough'].str.contains('Toronto')]
Toronto_df.head(10)



Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
