## Importing BeautifulSoup, requests and pandas libraries

In [1]:
from bs4 import BeautifulSoup # this module helps with web scraping (parsing the HTML).
import requests  # this module helps us to download a web page
import pandas as pd  # used to turn the scraped rows into a DataFrame

## Defining the data source: a Wikipedia web page

In [2]:
# Address of the Wikipedia page "List of postal codes of Canada: M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

## Using requests to download the web page text and BeautifulSoup to parse the HTML

In [3]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on an HTTP error (4xx/5xx)
# instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse the downloaded HTML text with the html5lib parser.
soup = BeautifulSoup(data, 'html5lib')

## Extracting the first table from the soup

In [4]:
table_contents = []  # blank list that will hold the table data

# find() returns only the first <table> in the document, or None when the
# document contains no table. Fail here with a clear message rather than with
# an AttributeError further down.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

## Extracting data from table and creating a list with the data

In [5]:
# Start from an empty list so that re-running this cell does not append the
# same rows a second time.
table_contents = []

for td in table.find_all('td'):    # iterate over every <td> cell of the table
    # Skip cells that lack the expected <p>/<span> markup instead of crashing.
    if td.p is None or td.span is None:
        continue

    details = td.span.text
    if details == 'Not assigned':  # ignore cells that contain 'Not assigned'
        continue

    # The text before the first '(' is the borough. The text between the first
    # and the second '(' is the neighborhood list; it is left empty when the
    # cell has no parenthesis at all.
    parts = details.split('(')
    neighborhood = parts[1] if len(parts) > 1 else ''
    # Strip the closing bracket(s) at the ends, turn '/' separators into ',',
    # turn any remaining ')' into a space, then trim the surrounding spaces.
    neighborhood = neighborhood.strip(')').replace('/', ',').replace(')', ' ').strip(' ')

    table_contents.append({
        'PostalCode': td.p.text[:3],   # first 3 characters of the <p> text
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    })


## Transforming the table_contents list into a pandas DataFrame

In [6]:
# Borough values that were extracted with extra text fused onto the name,
# mapped to the cleaned label that should replace them.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

df = pd.DataFrame(table_contents)  # convert the list of dictionaries to a dataframe
df['Borough'] = df['Borough'].replace(borough_fixes)  # clean the few cells that were not extracted right

df


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


## Number of rows in the dataframe

In [7]:
# shape is (rows, columns); only the row count is reported.
n_rows = df.shape[0]
message = 'The number of rows in the dataframe is: {}'.format(n_rows)
print(message)

The number of rows in the dataframe is: 103
