# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto (Week 3)

In [1]:
import numpy as np

In [2]:
from bs4 import BeautifulSoup

In [3]:
import requests

In [4]:
# Download the Wikipedia page that lists Toronto ("M") postal codes.
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an
# error page flow into the parser.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
# NOTE: despite its name, website_url holds the page's HTML text, not a URL.
website_url = response.text

##### Parse the contents of the wiki page using BeautifulSoup

In [5]:
# Build a navigable parse tree from the downloaded HTML using the lxml parser.
# (Call soup.prettify() to inspect the parsed markup while debugging.)
soup = BeautifulSoup(website_url, features='lxml')

##### Now, let's fetch the table contents in the wiki page 

In [6]:
# Locate the first table carrying both the "wikitable" and "sortable" classes.
# A CSS selector matches regardless of class order or extra classes, whereas
# matching the literal string "wikitable sortable" breaks if the attribute
# changes in any way.
table = soup.select_one("table.wikitable.sortable")
if table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # None in a later cell.
    raise ValueError(
        "No table with classes 'wikitable sortable' was found on the page; "
        "the page layout may have changed."
    )

##### Loop through the table and collect row values

In [7]:
# Column accumulators, filled in table order: one entry per accepted row.
Postcode = []
Borough = []
Neighbourhood = []

for row in table.find_all("tr"):
    cells = row.find_all("td")
    # Only rows with exactly three <td> cells are data rows; anything else
    # (e.g. a header row with no <td> cells) is skipped.
    if len(cells) != 3:
        continue
    # get_text(strip=True) yields a plain str with surrounding whitespace and
    # newlines removed for every column, and returns '' for an empty cell
    # instead of None (which previously crashed on .replace).
    postcode, borough, neighbourhood = (
        cell.get_text(" ", strip=True) for cell in cells
    )
    Postcode.append(postcode)
    Borough.append(borough)
    Neighbourhood.append(neighbourhood)

##### Let's convert the collected values to a dataframe

In [8]:
import pandas as pd

In [9]:
# Assemble the three scraped lists into one frame; dict insertion order
# fixes the column order as Postalcode, Borough, Neighbourhood.
df = pd.DataFrame({
    'Postalcode': Postcode,
    'Borough': Borough,
    'Neighbourhood': Neighbourhood,
})

In [10]:
# Report the (rows, columns) dimensions of the scraped frame.
df.shape

(289, 3)

##### So, 289 rows were scraped from the wiki page. Let's check the head.

In [11]:
# Preview the first rows to sanity-check the scraped columns.
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


##### Convert Borough's "Not assigned" to nulls

In [12]:
# Mark unassigned boroughs as missing so they can be dropped with dropna().
# Assign the result back to the column: calling replace(..., inplace=True) on
# df["Borough"] is a chained in-place operation that acts on a temporary
# Series and does not update df under pandas Copy-on-Write.
df["Borough"] = df["Borough"].replace("Not assigned", np.nan)

##### Now drop the rows where Borough is null, then reset the dataframe index

In [13]:
# Keep only the rows that have a Borough, then renumber the index from 0
# because dropping rows leaves gaps in the original numbering.
df = df.dropna(subset=["Borough"], axis=0).reset_index(drop=True)

In [14]:
# Check the shape again to see how many rows remain after dropping the
# rows with a missing Borough.
df.shape

(212, 3)

##### So, 212 records with a non-null Borough were found.
##### Now, let's group the Neighbourhood rows by Postalcode and Borough

In [15]:
# Collapse to one row per (Postalcode, Borough) pair, joining that pair's
# neighbourhood names into a single comma-separated string. Only the
# Neighbourhood column is aggregated, so no numeric branch is needed.
df = (
    df.groupby(['Postalcode', 'Borough'], as_index=False)['Neighbourhood']
      .agg(', '.join)
)

In [16]:
# Preview the first ten grouped rows; each (Postalcode, Borough) pair is now
# a single row with its neighbourhoods joined by ", ".
df.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


##### Replace the Neighbourhood value with the Borough wherever Neighbourhood == "Not assigned"

In [17]:
# Find the rows whose neighbourhood is still the "Not assigned" placeholder,
# then fall back to the borough name for each, logging every substitution.
unassigned_rows = df.index[df['Neighbourhood'] == "Not assigned"]
for row_label in unassigned_rows:
    borough_name = df.at[row_label, 'Borough']
    print("Replaced for ", df.at[row_label, 'Neighbourhood'],  "with", borough_name, "at index ", row_label )
    df.at[row_label, 'Neighbourhood'] = borough_name
        


Replaced for  Not assigned with Queen's Park at index  85


### Finally, let's check the shape after the groupBy

In [18]:
# Final (rows, columns) dimensions of the grouped frame.
df.shape

(103, 3)

#### Thank you for reviewing my notebook. Have a wonderful day !!!