### Week 3 - IBM Capstone project

In [104]:
import pandas as pd
import numpy as np
import requests

In [105]:
# Use Beautiful Soup to extract the table from wikipedia page
from bs4 import BeautifulSoup

In [106]:
# Wikipedia page listing the Canadian postal codes that begin with "M"; scraped below.
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [107]:
# Download the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error (4xx/5xx) into an exception
# here instead of letting an error page be parsed as if it were the article.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the raw bytes with the standard-library HTML parser.
soup = BeautifulSoup(page.content,'html.parser')

In [108]:
# Store the first table whose class attribute is 'wikitable sortable' in my_table.
my_table = soup.find('table',{'class':'wikitable sortable'})

# find() returns None when nothing matches; fail here with a clear message rather
# than with an AttributeError when the table is used later.
if my_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found at " + URL)

### Extract the table rows

In [109]:
# Lists that hold the extracted columns; entry i of each list comes from the same row.
postal_code = []
borough = []
neighborhood = []

for row in my_table.find_all('tr'):
    cells = row.find_all('td')

    # Three cells are read below, so skip any row that has fewer than three <td>
    # elements (for example a header row, which presumably uses <th> instead).
    # Checking for "more than one" cell would let a two-cell row through and
    # raise IndexError on cells[2].
    if len(cells) < 3:
        continue

    postal_code.append(cells[0].text.strip())
    borough.append(cells[1].text.strip())
    neighborhood.append(cells[2].text.strip())

### Convert the table to a data frame

In [110]:

# Assemble the three scraped lists into one DataFrame, one column per list.
df1 = pd.DataFrame(
    data=dict(
        PostalCode=postal_code,
        Borough=borough,
        Neighborhood=neighborhood,
    )
)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [111]:
# Use the Borough name as the Neighborhood name only where the Neighborhood is the
# placeholder 'Not assigned'.
def neigh(b,n):
    """Return ``b`` when ``n`` equals 'Not assigned'; otherwise return ``n`` unchanged.

    b -- borough name of the row
    n -- neighborhood name of the row
    """
    return b if n == 'Not assigned' else n
        
# Vectorized fill: wherever Neighborhood is the 'Not assigned' placeholder, take that
# row's Borough instead; every other value is kept. Series.mask works on whole
# columns, so no Python function is called per row, and an empty frame simply
# yields an empty column.
df1['Neighborhood'] = df1['Neighborhood'].mask(
    df1['Neighborhood'] == 'Not assigned',
    df1['Borough'],
)


### Replace 'Not assigned' with NaN, then drop the rows whose Borough is NaN

In [112]:
# Mark unassigned boroughs as missing. The result is assigned back to the column
# (instead of calling replace(..., inplace=True) on the column selection) so that
# df1 itself is updated even when pandas copy-on-write is active. np.nan is used
# because the np.NaN alias no longer exists in NumPy 2.0.
df1['Borough'] = df1['Borough'].replace('Not assigned', np.nan)

# Drop only the rows whose Borough is missing, then renumber the rows from 0.
df1 = df1.dropna(subset=['Borough']).reset_index(drop=True)

df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Check whether any postal code is duplicated

In [113]:
# True when at least one PostalCode value occurs more than once, False otherwise.
boolean = df1['PostalCode'].duplicated().any()
boolean

False

In [114]:
# (rows, columns) of df1 at this point in the notebook.
df1.shape

(103, 3)

### Week 3 - Assignment 2: Get the geocodes for each Postal Code

In [115]:
# Load the coordinates file and key it by its 'Postal Code' column.
df = pd.read_csv("Geospatial_Coordinates.csv").set_index('Postal Code')

# Key df1 by postal code as well so the two tables can be aligned on their index.
# The guard keeps this cell re-runnable: once 'PostalCode' has become the index it
# is no longer a column, and an unconditional set_index would raise KeyError.
if 'PostalCode' in df1.columns:
    df1 = df1.set_index('PostalCode')

In [116]:
# Combine the geocode file with the dataframe from before. Both frames are indexed
# by postal code. A left join keeps exactly the rows (and row order) of df1; a
# postal code that appears only in the geocode file is ignored instead of being
# added as a row with missing Borough/Neighborhood, which an outer alignment
# such as pd.concat(axis=1) would do.
result = df1.join(df, how='left')

In [117]:
# Copy the postal code from the index into a regular 'Postal Code' column and
# renumber the rows from 0. The guard keeps the cell re-runnable: after the first
# run the index is 0..n-1, and copying it again would overwrite the postal codes
# with row numbers.
if 'Postal Code' not in result.columns:
    result['Postal Code'] = result.index
    result = result.reset_index(drop=True)
result

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Postal Code
0,North York,Parkwoods,43.753259,-79.329656,M3A
1,North York,Victoria Village,43.725882,-79.315572,M4A
2,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,M5A
3,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,M6A
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,M7A
...,...,...,...,...,...
98,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,M8X
99,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,M4Y
100,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,M7Y
101,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,M8Y
