# This notebook scrapes the Wikipedia list of Toronto postal codes, boroughs and neighbourhoods, and geocodes each postal code.

#### Installation of the "beautifulsoup4" package.

In [1]:
#!conda install beautifulsoup4

#### Import of the required modules.

In [2]:
from io import StringIO

import lxml
import requests
import pandas as pd
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

#### Getting the Neighborhoods table from the Wikipedia page.

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on a hung connection or an HTTP error response instead of
# handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, features='lxml')

# Take the first table matched by the 'wikitable sortable' class filter and
# report its absence explicitly rather than through an IndexError.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + url)

# pandas deprecates passing literal HTML as a plain string to read_html,
# so the markup is wrapped in a file-like StringIO object.
df = pd.read_html(StringIO(str(table)))[0]

#### Transforming the data into the required form.

In [4]:
# Keep only postal codes that have a borough. The explicit .copy() makes the
# filtered frame independent, so the assignment below is guaranteed to write
# into it rather than into a temporary object.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

# A neighbourhood recorded as 'Not assigned' takes the name of its borough.
# A single .loc call replaces the chained df[col][mask] = ... assignment,
# which pandas cannot guarantee will modify df.
mask = df['Neighbourhood'] == 'Not assigned'
df.loc[mask, 'Neighbourhood'] = df.loc[mask, 'Borough']

# Collapse to one row per (Postcode, Borough); sorting first makes the joined
# neighbourhood names appear in sorted order within each row.
df = df.sort_values(['Postcode', 'Borough', 'Neighbourhood'])
df = df.groupby(['Postcode', 'Borough'], as_index=False).agg({'Neighbourhood': ', '.join})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Printing the number of rows and columns of the resulting dataframe.

In [None]:
# Report the dimensions of the aggregated frame as a (rows, columns) tuple.
frame_shape = df.shape
print("The number of rows and columns of the resulting dataframe:", frame_shape)

The number of rows and columns of the resulting dataframe: (103, 3)


In [None]:
#!pip install opencage
from opencage.geocoder import OpenCageGeocode

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# NOTE(review): `key` is not defined in any visible cell; presumably it is set
# by the credentials cell that was removed for sharing - confirm before running.
geocoder = OpenCageGeocode(key)

latitude_list = []
longitude_list = []

# Geocode the postal codes in row order so both lists stay aligned with df.
for postcode in df['Postcode']:
    address = postcode + ', Toronto, Ontario'
    results = geocoder.geocode(address)
    if results:
        # Same choice as before: use the last candidate in the response.
        geometry = results[-1]['geometry']
        latitude, longitude = geometry['lat'], geometry['lng']
    else:
        # No match for this postal code: record missing values instead of
        # failing on an empty result and discarding the whole run.
        latitude, longitude = None, None
    latitude_list.append(latitude)
    longitude_list.append(longitude)

The geograpical coordinate are 43.8113, -79.193.
The geograpical coordinate are 43.7878, -79.1564.
The geograpical coordinate are 43.7678, -79.1866.
The geograpical coordinate are 43.7712, -79.2144.
The geograpical coordinate are 43.7686, -79.2389.
The geograpical coordinate are 43.7464, -79.2323.
The geograpical coordinate are 43.7298, -79.2639.
The geograpical coordinate are 43.7122, -79.2843.
The geograpical coordinate are 43.7247, -79.2312.
The geograpical coordinate are 43.6952, -79.2646.
The geograpical coordinate are 43.7612, -79.2707.
The geograpical coordinate are 43.7507, -79.3003.
The geograpical coordinate are 43.7946, -79.2644.
The geograpical coordinate are 43.7812, -79.3036.
The geograpical coordinate are 43.8177, -79.2819.
The geograpical coordinate are 43.8016, -79.3216.
The geograpical coordinate are 43.834, -79.2069.
The geograpical coordinate are 43.8015, -79.3577.
The geograpical coordinate are 43.7801, -79.3479.
The geograpical coordinate are 43.7797, -79.3813.
Th

In [None]:
# Attach the geocoded coordinates as two new columns, then display the frame.
for column_name, column_values in (('Latitude', latitude_list),
                                   ('Longitude', longitude_list)):
    df[column_name] = column_values
df