# Scrape the Wikipedia page and wrangle the data, clean it, and then read it into a pandas dataframe

## Scrape the Wikipedia page

In [None]:
import requests
import pandas as pd

In [None]:
# Download the raw HTML of the Wikipedia page listing Toronto ("M") postal codes.
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wikipedia_page = requests.get(wikipedia_link, timeout=30)
# Fail fast on a 4xx/5xx response instead of handing an error page to the parser.
wikipedia_page.raise_for_status()
wikipedia_doc = wikipedia_page.text

## Use BeautifulSoup to read in the table of postal codes

In [None]:
from bs4 import BeautifulSoup

# Parse the downloaded markup and pick out the first table whose class
# attribute is exactly "wikitable sortable".
soup = BeautifulSoup(wikipedia_doc, 'html.parser')
table = soup.find('table', attrs={'class': 'wikitable sortable'})
# Uncomment to inspect the raw table markup:
# table

## Convert the table into a pandas dataframe

In [None]:
from io import StringIO

col_names = ["PostalCode", "Borough", "Neighborhood"]

# Give a clear error if the table lookup found nothing, rather than letting
# read_html try to parse the text "None".
if table is None:
    raise ValueError("Postal code table not found in the Wikipedia page")

# Passing markup to read_html as a bare string is deprecated in recent pandas;
# wrapping it in StringIO works on both old and new versions.
tables = pd.read_html(StringIO(str(table)), skiprows=1)

# read_html already returns a list of DataFrames, so take the first one
# directly and give it the column names used by the rest of the notebook.
df = tables[0]
df.columns = col_names
df.head()

## Ignore cells with a borough that is Not assigned

In [None]:
# Keep only the rows whose borough has been assigned, then renumber the index.
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df.head()

## If more than one neighborhood exists in one postal code area, these neighborhoods will be combined into one row, with the neighborhoods separated by commas

In [None]:
def combine_neighborhood(series):
    """Concatenate the string values of *series* into one comma-separated string.

    Missing values are omitted from the result, because that is how
    ``Series.str.cat`` behaves when no ``na_rep`` is given.
    """
    separator = ', '
    joined = series.str.cat(sep=separator)
    return joined

# One group per (postal code, borough) pair.
df_by_pcode = df.groupby(by=["PostalCode", "Borough"])
# Collapse each group's neighborhoods into a single comma-separated string...
df = df_by_pcode.agg({'Neighborhood': combine_neighborhood})
# ...and turn the group keys back into ordinary columns.
df = df.reset_index()
df.head()

## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [None]:
def impute_neirghborhood(row):
    """Return *row* with an unassigned neighborhood replaced by its borough.

    Args:
        row: A mapping-like record (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with ``'Neighborhood'`` and
            ``'Borough'`` entries.

    Returns:
        The row unchanged when its neighborhood is anything other than
        ``'Not assigned'``; otherwise a copy whose ``'Neighborhood'`` is set to
        the row's ``'Borough'``.
    """
    if row['Neighborhood'] != 'Not assigned':
        return row

    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by apply(), so the input row is left untouched.
    imputed = row.copy()
    imputed['Neighborhood'] = row['Borough']
    return imputed

# Run the imputation once per row (axis='columns' is the named form of axis=1).
df = df.apply(func=impute_neirghborhood, axis='columns')
df.head()

# Get the latitude and the longitude coordinates of each neighborhood

## Load geospatial coordinates from the provided csv file

In [None]:
# Read the coordinates file and rename its postal-code column so that it
# matches the "PostalCode" column of the scraped table.
lat_lng_coords = pd.read_csv("Geospatial_Coordinates.csv").rename(
    columns={'Postal Code': 'PostalCode'}
)
lat_lng_coords.head()

## Add geospatial coordinates to the existing data frame

In [None]:
# Inner join on the shared PostalCode column: only postal codes present in
# both frames are kept.
df_with_location = df.merge(lat_lng_coords, how='inner', on='PostalCode')
df_with_location

## Add geospatial coordinates using geocoder (Optional)

In [None]:
# Disabled alternative: the geocoder-based lookup below is wrapped in a string
# literal, so none of it executes.
# NOTE(review): if this is re-enabled, the `while` loop retries with no limit
# and no delay for as long as `g.latlng` is None — add a retry cap first.
'''
import geocoder

def get_location(row):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(row['PostalCode']))
      lat_lng_coords = g.latlng

    row['Latitude'] = lat_lng_coords[0]
    row['Longitude'] = lat_lng_coords[1]
    
    return row

df_with_location = df.apply(get_location, axis=1)
df_with_location.head(10)
'''