# Scrape and Convert Data

In [1]:
!pip install bs4



In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [3]:
# Wikipedia page that lists the Canadian postal codes beginning with "M";
# this is the page the notebook scrapes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Download the page HTML. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [5]:
# Parse the downloaded HTML using the html5lib parser.
soup = BeautifulSoup(markup=data, features='html5lib')

In [6]:
# Collect every <table> element found in the parsed page.
# NOTE(review): `tables` is not referenced by any of the cells visible here;
# confirm whether it is still needed.
tables = soup.find_all('table')

In [8]:
# Collect one record per table row, then build the DataFrame in a single call.
# Growing the frame with DataFrame.append inside the loop copies it on every
# iteration, and that method was removed in pandas 2.0.
postal_columns = ["PostalCode", "Borough", "Neighborhood"]
postal_records = []

# Only the first <tbody> element of the document is scanned.
for row in soup.find("tbody").find_all("tr"):
    cells = row.find_all("td")
    # Rows with no <td> cells, or with fewer cells than there are columns,
    # cannot be mapped onto the three fields and are skipped.
    if len(cells) < len(postal_columns):
        continue
    # zip() pairs the first three cells with the column names, in order.
    postal_records.append(
        {name: cell.text.strip() for name, cell in zip(postal_columns, cells)}
    )

# Passing `columns` fixes the column order and keeps the three columns even
# when no rows were collected.
postal_data = pd.DataFrame(postal_records, columns=postal_columns)

In [9]:
# Preview the first rows of the scraped table.
postal_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# Keep only the rows whose borough is assigned. Filtering with a boolean mask
# and rebinding the name replaces the inplace=True drop; the surviving rows
# keep their original index labels.
postal_data = postal_data[postal_data['Borough'] != 'Not assigned']
postal_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Group Data

In [11]:
# For each postal code, concatenate its neighborhood strings with commas, then
# turn the PostalCode group index back into a regular column.
postal_group = (
    postal_data
    .groupby('PostalCode')['Neighborhood']
    .apply(lambda names: ','.join(names))
    .reset_index()
)

# Merge Data

In [12]:
# Give the joined column a temporary name so it does not collide with the
# per-row 'Neighborhood' column of postal_data during the merge.
postal_group.rename(columns={'Neighborhood': 'Neigh_joined'}, inplace=True)

# Attach the joined string to every row, drop the per-row column, collapse
# rows that are now identical, and restore the 'Neighborhood' column name.
merge = (
    pd.merge(postal_data, postal_group, on='PostalCode')
    .drop(columns=['Neighborhood'])
    .drop_duplicates()
    .rename(columns={'Neigh_joined': 'Neighborhood'})
)
merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [13]:
# Report the (rows, columns) size of the merged table.
merge.shape

(103, 3)