### Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200419172635-0001
KERNEL_ID = ad164ac0-683f-4611-bd7d-86e9b88464ee


### Get data from Wikipedia

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
wiki_html = response.text
soup = BeautifulSoup(wiki_html, 'html.parser')

# soup.tbody resolves to the first <tbody> in the document; fail with a clear
# message if the page does not contain one.
if soup.tbody is None:
    raise ValueError('No <tbody> element found at {}'.format(url))

# One list of stripped cell texts per table row. Rows without any <td> cells
# yield an empty list, which later becomes a row of missing values in pandas.
data = [
    [td.get_text().strip() for td in tor.find_all('td')]
    for tor in soup.tbody.find_all('tr')
]

### Data cleaning

In [3]:
# Clean Borough column
# Load the scraped rows into pandas
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighborhood2'])

# Build the cleaned frame in a single chain, without in-place mutation:
#   1. keep only rows whose Borough is not "Not assigned"
#   2. drop rows containing missing values (the original notebook relies on
#      this to remove the first scraped row)
#   3. combine multiple rows into one row per (PostalCode, Borough), joining
#      their Neighborhood2 values with ", "
df = (
    df[df['Borough'] != 'Not assigned']
    .dropna()
    .groupby(['PostalCode', 'Borough'])['Neighborhood2']
    .apply(', '.join)
    .reset_index()
)

In [4]:
# Clean Neighborhood column
# Replace "Not assigned" in Neighborhood column with the value in Borough column
def custom_fx(data):
    """Return the neighborhood label for a single row.

    `data` is a row-like mapping indexed by column name. When its
    'Neighborhood2' entry equals 'Not assigned', the 'Borough' entry is
    returned instead; otherwise the 'Neighborhood2' entry is returned as is.
    """
    neighborhood = data['Neighborhood2']
    return data['Borough'] if neighborhood == 'Not assigned' else neighborhood

# Apply the function and delete the Neighborhood2 column.
# Guarded so the cell can be re-run: once 'Neighborhood2' has been removed,
# applying custom_fx again would raise a KeyError on that missing column.
if 'Neighborhood2' in df.columns:
    df = (
        df.assign(Neighborhood=df.apply(custom_fx, axis='columns'))
        .drop(columns='Neighborhood2')
    )

# Check that there is no more "Not assigned" in Neighborhood column
not_assigned_count = int((df['Neighborhood'] == 'Not assigned').sum())
print("There are {} rows that have 'Not assigned' in Neighborhood column in the dataframe".format(
    not_assigned_count
))

df.head()

There are 0 rows that have 'Not assigned' in Neighborhood column in the dataframe


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Save the dataframe as a CSV file in the IBM Watson project's cloud storage.
# The real project id and token were removed before sharing this notebook;
# "prj-id" and "prj-token" below are placeholders.
# NOTE(review): `sc` is not defined in the visible cells - presumably it is
# provided by the Spark kernel; confirm before running elsewhere.

from project_lib import Project

project = Project(sc, "prj-id", "prj-token")

# Serialize without the index column, then upload, replacing any existing file.
tor_csv = df.to_csv(index=False)
project.save_data(file_name="tor_df.csv", data=tor_csv, overwrite=True)

{'file_name': 'tor_df.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'courseracapstoneprojectweek31-donotdelete-pr-0htiy8o9vqapci',
 'asset_id': '7b978183-4d5d-4ed8-96e1-6f57cb777ead'}