In [51]:
! conda install -c conda-forge beautifulsoup4 --yes

import pandas as pd
import requests 

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



**Use the BeautifulSoup package for web scraping**

In [53]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getHTMLContent(link):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        link: URL of the page to fetch; passed straight to ``urlopen``.

    Returns:
        The ``BeautifulSoup`` object built from the response body using the
        built-in ``'html.parser'`` backend.

    Any exception raised by ``urlopen`` (e.g. on an unreachable URL) is
    propagated to the caller unchanged.
    """
    # Read the body inside a context manager so the HTTP response (and its
    # underlying connection) is closed as soon as the bytes are in memory.
    with urlopen(link) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')

**Scrape the following Wikipedia page into a dataframe**

In [88]:
# Fetch the Wikipedia page and locate the postal-code table.
content = getHTMLContent('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
tables = content.find_all('table')

table = content.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
rows = table.find_all('tr')

# Column headings come from the <th> cells of the first table row.
headers = [header.get_text().strip() for header in rows[0].find_all('th')]

data_content = []
for row in rows:
    # Strip surrounding whitespace/newlines from every cell *before* comparing,
    # so the 'Not assigned' check cannot be defeated by a trailing newline.
    cells = [cell.get_text().strip() for cell in row.find_all('td')]
    # Ignore rows whose borough is 'Not assigned' (the header row has no <td> cells).
    if len(cells) > 1 and cells[1] != 'Not assigned':
        data_content.append(cells)

dataset = pd.DataFrame(data_content, columns=headers)

# If a row has a borough but a 'Not assigned' neighbourhood, the neighbourhood
# becomes the borough. This is done with a boolean mask and .loc because
# assigning to the row yielded by iterrows() is not guaranteed to write back
# to the DataFrame.
unassigned = dataset['Neighbourhood'] == 'Not assigned'
dataset.loc[unassigned, 'Neighbourhood'] = dataset.loc[unassigned, 'Borough']

# Postcodes that span several neighbourhoods are collapsed into one row, with
# the neighbourhood names joined by commas.
dataset = dataset.groupby(['Postcode', 'Borough'], as_index=False).agg({'Neighbourhood': ','.join})

dataset.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [75]:
# .shape is an attribute (not a method) holding the (rows, columns) dimensions of the dataframe
dataset_dimensions = dataset.shape
print(dataset_dimensions)

(103, 1)


**Reading the Geospatial data into a dataframe**

In [76]:
# Location of the CSV holding one latitude/longitude pair per postal code
GEODATA_URL = 'https://cocl.us/Geospatial_data'

geodata = pd.read_csv(GEODATA_URL)
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Merging the Geospatial data into the Neighborhood dataframe**

In [86]:
# Give the key column the same name in both frames so they can be joined on it.
geodata = geodata.rename(columns={'Postal Code': 'Postcode'})

# Left join: every row of `dataset` is kept and the coordinates are attached
# where the postcode is present in `geodata`.
df_merged = dataset.merge(geodata, how='left', on='Postcode')
df_merged.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
