### Import required libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Download the URL for scraping

In [2]:
# Fetch the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as if it were data.
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

#find the table containing the data we need
table = soup.find(class_='wikitable sortable')
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("Could not find a 'wikitable sortable' table on the page")
table_row = table.find_all('tr')

### Data Scraping

#### Initialize DataFrame

In [3]:
# Column layout of the scraped table; start from an empty frame that has this schema.
column_names = ['PostCode', 'Borough', 'Neighborhood']
postal = pd.DataFrame(data=None, columns=column_names)
postal

Unnamed: 0,PostCode,Borough,Neighborhood


#### Fill DataFrame

In [4]:
# Collect one record per table row and build the DataFrame in a single call.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop copied the
# whole frame on every iteration. Rebuilding `postal` from the records also makes this
# cell safe to re-run without duplicating rows.
records = []
for tr in table_row[1:]:  # table_row[0] is skipped (presumably the header row)
    cells = [data.get_text() for data in tr.find_all('td')]
    records.append({'PostCode': cells[0],
                    'Borough': cells[1],
                    'Neighborhood': cells[2]})

postal = pd.DataFrame(records, columns=['PostCode', 'Borough', 'Neighborhood'])
postal.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


#### Remove the trailing newline from each neighborhood

In [5]:
# Keep only the text that precedes the first newline in each neighborhood entry.
postal['Neighborhood'] = postal['Neighborhood'].map(lambda text: text.partition('\n')[0])
postal.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 1. Ignore cells with a borough that is Not assigned

#### 2. Merge Neighborhoods with same postal code

In [6]:
#Ignore cells with a borough that is Not assigned
postal = postal.loc[postal['Borough'] != 'Not assigned']

#Merge neighborhoods with same postal code
# Join every neighborhood of a postal code into one comma-separated string, in row order.
# A per-row loop that rebuilds the string from the unchanged group slice keeps only the
# first and last neighborhood when a postal code has three or more rows; joining the
# whole group at once keeps all of them.
merged_names = postal.groupby('PostCode')['Neighborhood'].transform(lambda names: ','.join(names))

# Keep the first row of each postal code (its index label, borough and position in the
# frame are preserved) and give it the merged neighborhood string.
dfc = postal.assign(Neighborhood=merged_names).drop_duplicates(subset='PostCode', keep='first')

dfc.head()

Unnamed: 0,PostCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront,Regent Park"
6,M6A,North York,"Lawrence Heights,Lawrence Manor"
8,M7A,Queen's Park,Not assigned


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough


In [7]:
# Where the borough is known but the neighborhood is 'Not assigned', reuse the borough name.
# Writing to the row yielded by iterrows() is not guaranteed to reach the DataFrame
# (the row may be a copy), so select the rows with a boolean mask and assign through .loc.
needs_name = (dfc['Neighborhood'] == 'Not assigned') & (dfc['Borough'] != 'Not assigned')
dfc.loc[needs_name, 'Neighborhood'] = dfc.loc[needs_name, 'Borough']

dfc.head()

Unnamed: 0,PostCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront,Regent Park"
6,M6A,North York,"Lawrence Heights,Lawrence Manor"
8,M7A,Queen's Park,Queen's Park


#### Print shape of final DataFrame

In [8]:
# Renumber the rows 0..n-1 and discard the old index labels in one step; drop=True avoids
# materialising the old index as a column that then has to be removed by name.
dfc = dfc.reset_index(drop=True)
dfc.shape

(103, 3)

### Get latitude and longitude using geopy

In [9]:
from geopy.geocoders import Nominatim

In [10]:
# Look up a single sample address with the Nominatim geocoder (this is a network call).
address = 'Lawrence Heights, North York'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; report that instead of
    # failing with an AttributeError on location.latitude.
    latitude = longitude = None
    print('No geocoding result found for {}.'.format(address))
else:
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Lawrence Heights, North York are 43.7227784, -79.4509332.



Since each query takes some time, and geopy fails to return a result for a few addresses, we instead use a CSV file that already contains the latitude and longitude for each postal code.

#### Read the CSV file containing latitudes and longitudes

In [11]:
import pandas as pd
# Load the pre-computed latitude/longitude table from the local CSV file.
df_ll = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df_ll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge the two dataframes

In [12]:
# Left-join the coordinates onto the postal-code table, then discard the duplicate key
# column that comes from the right-hand frame.
df_postal = (
    dfc.merge(df_ll, how='left', left_on='PostCode', right_on='Postal Code')
       .drop(columns=['Postal Code'])
)
df_postal.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [13]:
# Sanity check: the first three columns of the merged frame must still equal dfc
df_temp = df_postal.loc[:, ['PostCode', 'Borough', 'Neighborhood']]
df_temp.equals(dfc)

True

### Create map with Folium


In [14]:
import folium
# Centre the map on fixed coordinates (they match the M3A row of the merged table).
latitude, longitude = 43.753259, -79.329656
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

In [15]:
# add markers to map
neighborhoods = df_postal
for pc, lat, lng, borough, neighborhood in zip(neighborhoods['PostCode'], neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    # Three placeholders for three values: with only two, str.format silently ignored
    # the borough and it never appeared in the popup.
    label = '{}, {}, {}'.format(pc, neighborhood, borough)
    # parse_html=True makes folium treat the label as text rather than raw HTML.
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to Popup, not CircleMarker, so it is not passed here.
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7).add_to(map_newyork)

In [16]:
# Evaluate the map as the cell's last expression so the notebook displays it.
map_newyork