## Importing BeautifulSoup, requests and pandas libraries

In [2]:
from bs4 import BeautifulSoup # HTML parser used for the web scraping below
import requests  # HTTP client used to download the web page
import pandas as pd  # DataFrame handling for the scraped table and the coordinates CSV

## Defining the data source: a Wikipedia web page

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

## Using requests to download the web page text and BeautifulSoup to parse it as HTML

In [4]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# (4xx/5xx) instead of letting an error page be parsed as if it were the data.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse the downloaded markup with the html5lib parser.
soup = BeautifulSoup(data, 'html5lib')

## Extracting the first table from the soup

In [5]:
# Take the first <table> element found in the parsed page ...
table = soup.find('table')
# ... and start with an empty list that will receive one dict per table cell.
table_contents = list()

## Extracting data from table and creating a list with the data

In [6]:
# Walk every regular table cell (<td>) and turn each assigned postal code into
# one record with the keys 'PostalCode', 'Borough' and 'Neighborhood'.
for row in table.find_all('td'):           # find_all replaces the deprecated findAll alias
    # A cell without a <p> or <span> child cannot be parsed; skip it instead
    # of crashing with AttributeError on None.
    if row.p is None or row.span is None:
        continue

    span_text = row.span.text
    if span_text == 'Not assigned':        # ignore postal codes that have no borough
        continue

    # The span text is expected to look like "Borough(Neighborhood / Neighborhood)":
    # the part before the first "(" is the borough, the part right after it
    # holds the neighborhoods.
    pieces = span_text.split('(')
    if len(pieces) > 1:
        # Drop the closing bracket(s), turn "/" separators into ",", blank out
        # any ")" left in the middle and trim surrounding spaces.
        neighborhood = pieces[1].strip(')').replace('/', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised part at all: keep the record with an empty
        # neighborhood rather than raising IndexError.
        neighborhood = ''

    cell = {
        'PostalCode': row.p.text[:3],      # first 3 characters of the cell's <p> text
        'Borough': pieces[0],
        'Neighborhood': neighborhood,
    }
    table_contents.append(cell)            # add the record to the table contents


## Transforming the table_contents list into a pandas DataFrame

In [7]:
# Borough values that were not extracted cleanly from the page (extra text got
# glued onto the borough name), mapped to the label that should replace them.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# Convert the list of records to a DataFrame and patch the Borough column.
df = pd.DataFrame(table_contents).assign(
    Borough=lambda frame: frame['Borough'].replace(borough_fixes)
)

# Dropping duplicate neighborhoods was left disabled:
#df.drop_duplicates(subset = 'Neighborhood', keep='first', inplace= True) # dropping duplicates
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


## Number of rows in the dataframe

In [8]:
# Report how many rows the scraped frame holds (len(df) equals df.shape[0]).
row_count = len(df)
print('The number of rows in the dataframe is: {}'.format(row_count))

The number of rows in the dataframe is: 103


## Installing geocoder

In [9]:
! pip install geocoder



## Obtaining coordinates of postal codes -- geocoder did not work (the loop never exited)

In [10]:
# NOTE: this whole cell is disabled on purpose. The loop below keeps retrying
# for as long as g.latlng is None, so it never exits if the geocoder never
# returns coordinates. The coordinates are read from a CSV file instead.
# (Why the geocoder returned nothing is not recorded here -- TODO confirm.)

# import geocoder # import geocoder

# initialize your variable to None
# lat_lng_coords = None

# loop until you get the coordinates
# while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
#  lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

## Importing the CSV file with coordinates

In [11]:
# Load the coordinates from the local CSV and rename its 'Postal Code' column
# to 'PostalCode', the column name used by the scraped frame.
file = 'Geospatial_Coordinates.csv'
Geo_coord = pd.read_csv(file).rename(columns={'Postal Code': 'PostalCode'})
Geo_coord

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [12]:
# Left-join the coordinates onto the postal-code frame so every scraped row is
# kept. No `on=` is given, so pandas joins on the columns both frames share.
df = df.merge(Geo_coord, how='left')
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509
