# Applied Data Science Capstone Week 3
## Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

This is the first step in the assignment: "Segmenting and Clustering Neighborhoods in Toronto".

In [1]:
from bs4 import BeautifulSoup # import BeautifulSoup
import pandas as pd # import Pandas
import numpy as np # import Numpy

First, retrieve the page from the Wikipedia website and save it as a local file.

In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M" and save it
# as html_doc.html (relative path, so it lands in the current working directory).
# using the curl command instead of wget in MacOS
!curl https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M -o html_doc.html
# NOTE(review): this message is printed unconditionally; nothing here checks
# whether the curl command actually succeeded.
print('Data downloaded!')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 54726  100 54726    0     0   201k      0 --:--:-- --:--:-- --:--:--  201k
Data downloaded!


Then open the file and extract the text of the table in the middle of the HTML file.

In [3]:
# Open the saved page and load it into a Beautiful Soup object.
# The encoding is stated explicitly: curl stored the raw bytes of the page
# (Wikipedia serves UTF-8), whereas open() would otherwise decode with the
# platform's locale encoding and could fail or garble non-ASCII characters.
with open("html_doc.html", encoding="utf-8") as fp:
    # NOTE(review): no parser is named, so Beautiful Soup picks whichever one is
    # installed. The slice positions used below were found under that setup, so
    # re-check them before pinning a specific parser.
    soup = BeautifulSoup(fp)

# Every non-empty, whitespace-stripped text fragment of the page, in document order.
text = list(soup.stripped_strings)

# By inspection of `text`, the table cells (postal code, borough, neighborhood)
# occupy positions 41 up to, but not including, 581: 540 strings = 180 rows x 3.
# These offsets are tied to the downloaded snapshot of the page.
text_extract = text[41:581]

Now we create the Data Frame with the columns PostalCode, Borough and Neighborhood.

In [4]:
# Column labels of the postal-code table, in display order.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from an empty frame that already carries those labels.
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,PostalCode,Borough,Neighborhood


And then we insert the data from text_extract into the Data Frame.  

In [5]:
# `text_extract` is a flat list laid out as
# [postal code, borough, neighborhood, postal code, borough, ...],
# so walk it in steps of three and turn each triple into one record.
#
# All records are collected first and the frame is built in a single call:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending row by row copies the whole frame on every iteration.
records = [
    {
        'PostalCode': text_extract[start],
        'Borough': text_extract[start + 1],
        'Neighborhood': text_extract[start + 2],
    }
    for start in range(0, len(text_extract), 3)
]

# Passing `columns` keeps the expected column order (and the columns
# themselves, should `records` ever be empty).
df = pd.DataFrame(records, columns=column_names)

df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


Then we only select the rows with assigned Borough.

In [6]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the remaining rows from 0 (the old index is discarded).
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
# Report the dimensions of the filtered frame: (rows, columns), then rows alone.
print("The shape of the dataframe is : {} ".format(df.shape))
print("The number of rows is : {} ".format(len(df)))


The shape of the dataframe is : (103, 3) 
The number of rows is : 103 


Now we will retrieve the coordinates from the downloaded CSV file and add them to the original Data Frame.

In [8]:
# Read the coordinates file and rename its 'Postal Code' column to
# 'PostalCode' so the key column has the same name as in `df`.
geo_codes = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)

geo_codes

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [9]:
# Look up each row's PostalCode in the coordinates table (indexed by
# PostalCode) and attach the matching Latitude/Longitude columns.
# DataFrame.join defaults to a left join, so every row of `df` is kept in
# its existing order; the result is then renumbered from 0.
merged_df = (
    df
    .join(geo_codes.set_index('PostalCode'), on='PostalCode')
    .reset_index(drop=True)
)
merged_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
