# IBM Capstone Course
### Week 3 Peer-graded Assignment

In this assignment, we will explore the neighborhoods in the city of Toronto.

### Importing Libraries

In [37]:
from bs4 import BeautifulSoup as bsoup
from urllib.request import urlopen as uReq
import requests
import lxml
import pandas as pd
from pandas import DataFrame
import numpy as np

### Scraping and Parsing Data

In [47]:
# URL of the Wikipedia page to be scraped
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout stops the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response into
# an exception here instead of letting an error page be parsed as data.
req = requests.get(url, timeout=30)
req.raise_for_status()

# Parse the response body with BeautifulSoup's "html.parser" backend
page = bsoup(req.text, "html.parser")

# Take the first <table> element in the document and collect its rows.
# Fail with a clear message if the page contains no table at all.
t = page.table
if t is None:
    raise ValueError('No <table> element found at ' + url)
results = t.find_all('tr')

In [111]:
# Turn every scraped row into a list of strings: split the row's text on
# newlines and discard the first and last pieces (the empty strings at the
# two ends of the split).
table = [row.text.split('\n')[1:-1] for row in results]

### Converting Table into a Pandas Dataframe

In [110]:
# The first parsed row carries the column names; everything after it is data.
header = table[0]
body = table[1:]
df = pd.DataFrame(body, columns=header)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Data Cleaning and Sorting

#### Dropping Rows with "Not assigned" Boroughs

In [101]:
# Keep only the rows whose Borough value is something other than 'Not assigned'
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

#### Correcting the Index of the Data Frame

In [102]:
# Replace the existing index with a fresh 0..n-1 one; drop=True discards the
# old index instead of keeping it as a column.
df = df.reset_index(drop=True)

#### Correcting Column Names

In [108]:
# Rename the two columns in a single pass using one old-name -> new-name mapping
column_names = {
    'Postcode': 'PostalCode',
    'Neighbourhood': 'Neighborhood',
}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### If a row has a Borough but an unassigned Neighbourhood then the Neighbourhood will be the same as the Borough

In [105]:
# Rows flagged here have a Neighborhood of 'Not assigned'; for exactly those
# rows, overwrite Neighborhood with the Borough value (pandas aligns the
# right-hand side to the selected rows by index label).
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df['Borough']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### For multiple neighbourhoods per Postal Code, the neighbourhoods will be combined in one row, separated by commas

In [109]:
# Group rows sharing the same (PostalCode, Borough) pair, join each group's
# Neighborhood strings with ', ', then turn the group keys back into columns.
by_postal_code = df.groupby(['PostalCode', 'Borough'])
joined_names = by_postal_code['Neighborhood'].apply(', '.join)
df = joined_names.reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Shape of the Data Frame

In [98]:
# Display the frame's dimensions as (number of rows, number of columns)
n_rows, n_cols = df.shape
(n_rows, n_cols)

(103, 3)