## This is the capstone project

### Importing libraries

In [2]:
import pandas as pd 
import numpy as np 
import requests 
from bs4 import BeautifulSoup

### Reading the webpage from Wikipedia and extracting the table

In [39]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # despite its name, this holds the page's HTML text
soup = BeautifulSoup(website_url,'lxml')
# Pick the first table carrying the 'wikitable sortable' class; fail loudly if it is absent
# so the problem is reported here rather than as an AttributeError on None in a later cell.
My_table = soup.find('table',{'class':'wikitable sortable'})
if My_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

### Getting column names

In [40]:
# Header cells (<th>) of the table supply the column names; newlines are removed from each.
colsTemp = My_table.find_all('th')
columns = [header.text.replace("\n","") for header in colsTemp]
columns

['Postcode', 'Borough', 'Neighbourhood']

### Getting data (without the columns) and creating the Data Frame

In [41]:
# Collect every table row, discarding the first one (the header row).
tr = My_table.find_all("tr")
tr.pop(0)
# One list of newline-free cell texts per remaining row.
data = [
    [cell.text.replace("\n","") for cell in row.find_all("td")]
    for row in tr
]
# Build the frame and label it with the header names extracted earlier.
df = pd.DataFrame(data)
df.columns = columns
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Dropping all the Boroughs that are 'Not assigned'

In [42]:
# Remove, in place, every row whose Borough is the placeholder 'Not assigned'.
unassigned_rows = df.loc[df['Borough'] == 'Not assigned'].index
df.drop(index=unassigned_rows, inplace=True)

### Grouping the dataframe by Postcode and Borough, joining the neighbourhood names, and creating a new dataframe from the result

In [43]:
# One group per (Postcode, Borough) pair; each group's neighbourhood names are
# concatenated into a single comma-separated string.
postcodes = df.groupby(['Postcode','Borough'])
data = [
    [postcode, borough, group['Neighbourhood'].str.cat(sep=', ')]
    for (postcode, borough), group in postcodes
]
# Rebuild the frame from the aggregated rows, reusing the original column names.
df = pd.DataFrame(data)
df.columns = columns
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Assigning the name of the Borough to the Neighbourhoods that are 'Not assigned'

In [44]:
# Index labels of the rows whose Neighbourhood is still the placeholder 'Not assigned'.
no_neig = df[df['Neighbourhood'] == 'Not assigned'].index
# Copy the Borough name into those rows with a single .loc assignment.
# The previous form, df.iloc[i]['Neighbourhood'] = ..., was chained assignment: it wrote to
# the temporary row object returned by df.iloc[i], which may be a copy, so the update was
# not guaranteed to reach df.
df.loc[no_neig, 'Neighbourhood'] = df.loc[no_neig, 'Borough']

### Getting the shape

In [45]:
# Display the (rows, columns) dimensions of the cleaned frame.
df.shape

(103, 3)

### Adding geographical data

In [12]:

# Imports for reading the coordinates file from object storage.
# NOTE(review): `sys`, `Config` and `ibm_boto3` are not used in the lines visible in this
# cell — presumably they served the client construction, which is not shown; confirm.
import sys
import types
from ibm_botocore.client import Config
import ibm_boto3

# Stand-in iterator method; it is bound onto `body` below only when `body` has no
# __iter__ attribute of its own.
def __iter__(self): return 0

# Fetch the 'Geospatial_Coordinates.csv' object from the bucket and keep its 'Body' entry.
# NOTE(review): `client_3a3d9cb89de24675ace899164170aa08` is not defined anywhere in the
# visible notebook, so this cell depends on it being created elsewhere — confirm.
body = client_3a3d9cb89de24675ace899164170aa08.get_object(Bucket='datasciencemachinelearning-donotdelete-pr-5xkkwzevcnf9sv',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Parse the CSV content into a DataFrame and preview its first rows.
df_geo = pd.read_csv(body)
df_geo.head()



Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merge the two dataframes

In [46]:
# Attach the coordinates: inner-join on the postal code (named 'Postcode' on the left and
# 'Postal Code' on the right), then discard the right-hand key column, which duplicates it.
df = pd.merge(
    df,
    df_geo,
    how='inner',
    left_on='Postcode',
    right_on='Postal Code',
).drop(columns=['Postal Code'])

In [51]:
# Preview the first rows of the merged frame, now including Latitude and Longitude.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
