### Import all the libraries

In [1]:
# Import all libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Download the page

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of passing an
# error page on to the parsing step.
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
page.raise_for_status()
page

<Response [200]>

### Parse the page with BeautifulSoup

In [3]:
# Turn the first <table> on the downloaded page into a list of rows, where each
# row is the list of stripped <td> texts: [postcode, borough, neighbourhood].
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Fall back to the table itself when the markup has no explicit <tbody>.
table_body = table.find('tbody')
if table_body is None:
    table_body = table

canada_postal_codes = []
rows = table_body.find_all('tr')
for table_row in rows:
    cell_texts = [cell.text.strip() for cell in table_row.find_all('td')]
    # Rows without any <td> cell (such as the header row) produce an empty list;
    # skip them here instead of blindly slicing off the first element afterwards.
    if cell_texts:
        canada_postal_codes.append(cell_texts)

canada_postal_codes[0:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

### Convert the list to a pandas DataFrame

In [4]:
# Column labels for the three fields scraped from each table row.
columns = [
    "Postcode",
    "Borough",
    "Neighbourhood",
]

# Wrap the scraped rows in a DataFrame and preview the first five records.
canada_df = pd.DataFrame(data=canada_postal_codes, columns=columns)
canada_df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning Data

In [5]:
# Keep only the rows whose borough is assigned, and renumber them from zero.
# Built as a single chained expression (no in-place mutation of a filtered
# frame), so the cell gives the same result every time it is re-run.
canada_df_cleaned = (
    canada_df[canada_df["Borough"] != "Not assigned"]
    .reset_index(drop=True)
)
canada_df_cleaned.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Merge neighbourhoods that share the same postcode

In [6]:
# Collapse the rows to one per postcode. Within each group the distinct values of
# every remaining column are joined with ", ". The values are sorted before
# joining because iterating a bare set of strings has no guaranteed order, which
# would make the joined text differ from one run to the next.
canada_grouped = (
    canada_df_cleaned
    .groupby('Postcode', as_index=False)
    .agg(lambda values: ', '.join(sorted(set(values))))
)
canada_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Dealing with "Not assigned" neighbourhoods

In [7]:
# Show the grouped rows whose neighbourhood is still the 'Not assigned' placeholder.
canada_grouped.loc[canada_grouped['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [8]:
# Where the neighbourhood is the 'Not assigned' placeholder, use the borough name
# instead. A boolean mask updates all matching rows in one vectorised assignment
# rather than looping over the frame row by row.
unassigned_mask = canada_grouped['Neighbourhood'] == 'Not assigned'
canada_grouped.loc[unassigned_mask, 'Neighbourhood'] = canada_grouped.loc[unassigned_mask, 'Borough']

In [9]:
# Re-inspect the M7A record (the row that previously had no neighbourhood).
# It is looked up by postcode rather than by a fixed row position, so the check
# still shows the intended row if the scraped table changes size or order.
canada_grouped[canada_grouped['Postcode'] == 'M7A'].iloc[0]

Postcode                  M7A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 85, dtype: object

In [10]:
# Dimensions of the grouped frame as a (rows, columns) tuple.
canada_grouped.shape

(103, 3)

### Add Geocoding data

In [13]:
!conda install -c conda-forge geopy 
!conda install -c conda-forge geocoder

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge

orderedset-2.0 100% |################################| Time: 0:00:00  50.19 MB/s
ratelim-0.1.6- 100% |################################| Time: 0:00:00  12.60 MB/s
geocoder-1.38. 100% |################################| Time: 0:00:00  40.44 MB/s


In [33]:
import geocoder  # geocoding client used for the trial lookup below

# Start with no coordinates, then attempt one lookup through the Google provider.
lat_lng_coords = None

geocode_result = geocoder.google("Santa Cruz", components="country:ES")
lat_lng_coords = geocode_result.latlng
print(lat_lng_coords)

# The lookup yields no coordinates here, so the coordinates are taken from a
# downloaded CSV file instead.

None


### Download the provided CSV to continue

In [35]:
# Read the provided geospatial CSV straight from its URL and preview
# the first five records.
canada_geo_data = pd.read_csv(
    filepath_or_buffer="http://cocl.us/Geospatial_data",
)
canada_geo_data.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [51]:
# Attach the borough/neighbourhood columns to the coordinates by matching the
# postal code itself. A side-by-side concat pairs rows purely by position, which
# silently mis-pairs coordinates if the two frames ever differ in order or length.
# Both key columns are kept, so the resulting column layout is:
# Postal Code, Latitude, Longitude, Postcode, Borough, Neighbourhood.
canada_geo_data_full = canada_geo_data.merge(
    canada_grouped,
    left_on='Postal Code',
    right_on='Postcode',
    how='inner',
)
canada_geo_data_full.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Postcode,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,M1B,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,43.763573,-79.188711,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,M1G,Scarborough,Woburn
4,M1H,43.773136,-79.239476,M1H,Scarborough,Cedarbrae


### Rearrange the columns

In [52]:
# Final column order. The duplicate 'Postcode' key column is simply left out of
# the selection instead of being selected and then dropped.
cols = ['PostalCode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']

# Rename "Postal Code" to "PostalCode" first, then select the columns in order.
# Only the column label is renamed; the row index is left untouched. rename()
# ignores labels that are absent, so re-running this cell is harmless.
canada_geo_data_full = (
    canada_geo_data_full
    .rename(columns={"Postal Code": "PostalCode"})
    .loc[:, cols]
)

canada_geo_data_full.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
