## IBM Applied Data Science Capstone - Week 3 (Part 1)
### Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto
#### Guilherme P. de Carvalho

Importing the essential libraries

In [1]:
from io import StringIO #wraps literal html so pd.read_html receives a file-like object

import pandas as pd
import requests #for data wrangling from an url
from bs4 import BeautifulSoup #library for html handling

## Data wrangling

Getting the Postal Code data from Wikipedia
Before doing it, I opened the URL and inspected the page source. This analysis helped me identify the HTML section where the data was.

In [2]:
# url where the data is
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# pulling data from the url
# - the timeout keeps the cell from hanging forever on a stalled connection
# - raise_for_status() stops here on a 4xx/5xx answer instead of parsing an error page
wiki_raw = requests.get(wiki_url, timeout=30)
wiki_raw.raise_for_status()

# making the content a bs object
soup = BeautifulSoup(wiki_raw.text, 'lxml')

# getting the data which matters for the project, accessing directly the Postal Code table
match = soup.find('table', class_="wikitable sortable")

# soup.find returns None when nothing matches; fail here with a clear message
# rather than later with an AttributeError on match.prettify()
if match is None:
    raise ValueError('No table with class "wikitable sortable" found at ' + wiki_url)

In [3]:
# prettify() renders the table tag as indented html text.
# The text is wrapped in StringIO because pd.read_html expects a path, url or file-like object;
# passing literal html as a plain string is deprecated in recent pandas versions.
# The result (a list of DataFrames) is kept in a variable for further manipulation.
wiki_tab = pd.read_html(StringIO(match.prettify()))

## Data Cleansing

In [4]:
# pd.read_html returned a list of DataFrames; the Postal Code table is its first element
wiki_tab_temp = wiki_tab[0]
wiki_tab_temp.head(5)

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [5]:
# the first row of the table holds the header text, so its values become the column labels
header_row = wiki_tab_temp.iloc[0]
wiki_tab_temp.columns = header_row
wiki_tab_temp.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [6]:
# row label 0 still holds the header text that was 'promoted' to column labels; keep every other row
kept_labels = wiki_tab_temp.index.drop(0)
wiki_tab_temp = wiki_tab_temp.reindex(kept_labels)
wiki_tab_temp.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
# as per Instructor guidance, keep only the rows whose 'Borough' is not 'Not assigned',
# then renumber the remaining rows from 0
has_borough = wiki_tab_temp['Borough'] != 'Not assigned'
wiki_tab_temp = wiki_tab_temp[has_borough].reset_index(drop=True)
wiki_tab_temp.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [8]:
# setting a valid value for rows whose 'Neighbourhood' column equals 'Not assigned' as per instructor guidance:
# such rows take the name of their 'Borough'.
# The fix is written through .loc on the DataFrame itself: a row yielded by iterrows() may be a copy,
# so assigning to it is not guaranteed to reach wiki_tab_temp.
to_fix = wiki_tab_temp['Neighbourhood'] == 'Not assigned'

# report every row that is about to change
for borough in wiki_tab_temp.loc[to_fix, 'Borough']:
    print('Before: ', 'Not assigned')
    print('After: ', borough)

wiki_tab_temp.loc[to_fix, 'Neighbourhood'] = wiki_tab_temp.loc[to_fix, 'Borough']

# counted from the mask, so the total is correct for zero, one or many fixed rows
print('Total lines fixed: ', int(to_fix.sum()))

# expected result: no lines
wiki_tab_temp[wiki_tab_temp['Neighbourhood']=='Not assigned']

Before:  Not assigned
After:  Queen's Park
Total lines fixed:  1


Unnamed: 0,Postcode,Borough,Neighbourhood


In [9]:
# renumbering the rows from 0; the previous row labels are discarded (drop=True), not kept as a column
wiki_tab_temp = wiki_tab_temp.reset_index(drop=True)
wiki_tab_temp.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [10]:
# removing duplicates: one row per Postcode, with the Neighbourhood values of that Postcode
# joined by ', ' in the order they appear in the table.
# groupby replaces the previous row-by-row loop, which wrote through chained indexing
# (wiki_tab_temp.iloc[i][2] = ...) -- pandas does not guarantee that such writes reach the frame --
# and which only merged rows that happened to be adjacent.
wiki_tab_temp = (
    wiki_tab_temp
    .groupby('Postcode', sort=False)  # sort=False keeps postcodes in order of first appearance
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
    .reset_index()  # 'Postcode' back as the first column, rows numbered from 0
)

wiki_tab_temp.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [20]:
# checking the df dimensions as a (rows, columns) tuple
n_rows, n_cols = wiki_tab_temp.shape
(n_rows, n_cols)

(103, 3)