# Extracting Data

In [1]:
# Extract neighborhood data from Wikipedia using requests + BeautifulSoup and
# load it into a pandas DataFrame with columns Postcode / Borough / Neighborhood.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bsoup

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the real article.
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
page.raise_for_status()

# Parse the page and locate the first <table> element.
soup = bsoup(page.content, "html.parser")
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Extract table data: one list of cell texts per table row.
output_rows = []
for table_row in table.find_all('tr'):
    columns = table_row.find_all('td')
    if not columns:
        # Rows with no <td> cells (e.g. a header row) carry no data. Skipping
        # them here replaces the old "append an empty row, then drop row 0"
        # step, which would delete a real record if the first row had data.
        continue
    # Remove embedded newlines and surrounding whitespace so that later
    # comparisons against values such as 'Not assigned' match exactly.
    output_rows.append([column.text.replace('\n', '').strip() for column in columns])

# Put the data in a dataframe (index starts at 0; no placeholder row to drop).
df = pd.DataFrame(output_rows, columns=['Postcode', 'Borough', 'Neighborhood'])

# Data Cleaning

In [2]:
# Data Cleaning
# 1. Drop rows whose Borough is 'Not assigned'.
# 2. If a row has a Borough but a 'Not assigned' Neighborhood, use the Borough
#    as the Neighborhood. Series.mask substitutes row by row, aligned on the
#    index, instead of relying on how Series.replace treats a Series passed
#    as its replacement value.
# Everything is done in one chain that builds a new frame, so nothing is
# assigned onto a filtered slice.
df = (
    df.loc[df['Borough'] != 'Not assigned']
    .reset_index(drop=True)
    .assign(
        Neighborhood=lambda frame: frame['Neighborhood'].mask(
            frame['Neighborhood'] == 'Not assigned', frame['Borough']
        )
    )
)

# Group neighborhoods with the same postcode into one comma-separated string
df_group = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
df_group.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Geocoding: Latitude and Longitude by Postcode

In [12]:
# Read the geospatial CSV and rename its 'Postal Code' column to 'Postcode'
file_path = "http://cocl.us/Geospatial_data"
geofile = pd.read_csv(file_path).rename(columns={'Postal Code': 'Postcode'})
geofile.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Join the grouped neighborhoods with the coordinates on the shared Postcode column
df_merge = df_group.merge(geofile, on='Postcode')
df_merge.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
