<H2> Read Toronto Neighborhood Information from web page </H2>

In [4]:
import requests

# Get HTML data from the web link
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

In [10]:
from bs4 import BeautifulSoup

# Create a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.
soup = BeautifulSoup(website_url,'lxml')

# Extracting table related HTML data
table = soup.find("table",{"class":"wikitable sortable"})

# Extract the table headers in a list
table_headers =list()
for th in table.find_all('th'):
    table_headers.append(th.getText().strip())
print(table_headers)

In [36]:
# Extracting the table rows
table_rows =list()
table_html_rows = table.find_all('tr')
for row in table_html_rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    # Ignoring the header
    if len(cols)!=0:
        table_rows.append([ele for ele in cols if ele]) # Get rid of empty values
    

In [38]:
print(table_rows[0:10])

[['M1A', 'Not assigned', 'Not assigned'], ['M2A', 'Not assigned', 'Not assigned'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Harbourfront'], ['M5A', 'Downtown Toronto', 'Regent Park'], ['M6A', 'North York', 'Lawrence Heights'], ['M6A', 'North York', 'Lawrence Manor'], ['M7A', "Queen's Park", 'Not assigned'], ['M8A', 'Not assigned', 'Not assigned']]


<H2> Creating a Toronto Data Frame </H2>

<H5> Generating Panda Data Frame</H5>

In [108]:
# Generating the data frame from lists
import pandas as pd
TorontoDf = pd.DataFrame(table_rows)
table_headers[0]="PostalCode"
table_headers[2]="Neighborhood"
TorontoDf.columns= table_headers

TorontoDf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<H5> Filtering not assigned 'boroughs' </H5>

In [109]:
print("Total Rows =%d"%TorontoDf.shape[0])

# Removing the rows with no borough
TorontoDf = TorontoDf[TorontoDf.Borough != 'Not assigned']
TorontoDf.reset_index(drop=True, inplace=True)
print("Total Rows =%d"%TorontoDf.shape[0])


Total Rows =288
Total Rows =211


In [110]:
TorontoDf.head()
print(TorontoDf.shape)

(211, 3)


<H5> Merge the Neighbourhood values for similar Postal code and borough </H5>

In [112]:
TorontoDf= TorontoDf.groupby(["PostalCode","Borough"])['Neighborhood'].apply(', '.join).reset_index()
TorontoDf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


<H5> Assigning Borough value to Not Assigned Neighborhood </H5>

In [116]:
# Finding incides of all rows, where we have a not assigned neighbuorhood
potential_indices= TorontoDf.loc[(TorontoDf.Neighborhood == "Not assigned") & (TorontoDf.Borough != "Not assigned")].index.values.astype(int) 

print(TorontoDf.loc[85])

# Assigning the Borough values to all neighbhourhood
for index in potential_indices:
    print(index)
    TorontoDf.loc[index]["Neighborhood"] = TorontoDf.loc[index]["Borough"]
    
TorontoDf.head(10)

85


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [119]:
#Making sure that neighborhood value is updated
print(TorontoDf.loc[85])


PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 85, dtype: object


In [120]:
# Total elements in the data frame
print(TorontoDf.shape)

(103, 3)
