# IBM Capstone Course
### Week 3 Peer-graded Assignment

In this assignment, we will explore the neighborhoods in the city of Toronto.

### Importing Libraries

In [37]:
from bs4 import BeautifulSoup as bsoup
from urllib.request import urlopen as uReq
import requests
import lxml
import pandas as pd
from pandas import DataFrame
import numpy as np

### Scraping and Parsing Data

In [122]:
# URL of the page to be scraped
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of parsing an error page further down.
req=requests.get(url, timeout=30)
req.raise_for_status()

# Parse the response body with BeautifulSoup's built-in HTML parser
page=bsoup(req.text,"html.parser")

# Take the first <table> in the document and collect its <tr> rows
t=page.table
if t is None:
    # page.table is None when the document contains no <table>; fail with a
    # clear message rather than an AttributeError on the next line.
    raise ValueError('No <table> element found at ' + url)
results=t.find_all('tr')

In [123]:
# Turn every scraped <tr> into a list of cell strings. Splitting the row text
# on newlines leaves an empty string at each end, which the [1:-1] slice drops.
table = [tr.text.split('\n')[1:-1] for tr in results]

### Converting Table into a Pandas Dataframe

In [124]:
# The first scraped row supplies the column names; the remaining rows are data.
header_row = table[0]
data_rows = table[1:]
df = pd.DataFrame(data=data_rows, columns=header_row)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Data Cleaning and Sorting

#### Dropping Rows with "Not assigned" Boroughs

In [125]:
# Keep only the rows whose Borough is something other than "Not assigned"
df = df.loc[df['Borough'].ne('Not assigned')]

#### Correcting the Index of the Data Frame

In [126]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
df = df.reset_index(drop=True)

#### Correcting Column Names

In [127]:
# Rename both columns in one pass: 'Postcode' -> 'Postal Code' and
# 'Neighbourhood' -> 'Neighborhood'.
column_renames = {
    'Postcode': 'Postal Code',
    'Neighbourhood': 'Neighborhood',
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### If a row has a Borough but an unassigned Neighbourhood then the Neighbourhood will be the same as the Borough

In [128]:
# Rows whose Neighborhood is 'Not assigned' take the Borough value of the
# same row; all other rows are left untouched.
is_unassigned = df['Neighborhood'].eq('Not assigned')
df.loc[is_unassigned, 'Neighborhood'] = df.loc[is_unassigned, 'Borough']
df.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### For multiple neighbourhoods per Postal Code, the neighbourhoods will be combined in one row, separated by commas

In [130]:
# One row per (Postal Code, Borough) pair: the Neighborhood values of each
# group are concatenated into a single ', '-separated string.
grouped_neighborhoods = df.groupby(['Postal Code', 'Borough'])['Neighborhood']
joined_neighborhoods = grouped_neighborhoods.agg(', '.join)
# reset_index() turns the group keys back into ordinary columns.
df = joined_neighborhoods.reset_index()
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Shape of the Data Frame

In [131]:
# Display the (number of rows, number of columns) of the grouped data frame
df.shape

(103, 3)

# Part 2: Attach Latitude and Longitude to the Dataframe

In [133]:
# Read the coordinates CSV; the relative path is resolved against the
# notebook's working directory.
geo_data = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging

In [137]:
# Attach the columns of geo_data to df by matching on "Postal Code".
# how="inner" (the merge default) keeps only postal codes present in both frames.
new_df = pd.merge(left=df, right=geo_data, how="inner", on="Postal Code")
new_df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Explore and Cluster the Neighbourhoods of Toronto