## Applied Data Science Capstone - Week 3 Notebook

First the necessary libraries and methods are imported.

In [15]:
# Import necessary libraries
import json 
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import folium # map rendering library

The Toronto neighborhood data is then scraped from the Wikipedia page, parsed with BeautifulSoup, and cleaned with string operations.

In [2]:
# Scrape Wikipedia webpage with Toronto data.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
tor_data = response.text

# Parse data using BeautifulSoup
soup = BeautifulSoup(tor_data, 'html.parser')

# The parsing below works on the first <table> of the page; stop with a clear
# message if the page no longer contains one.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found on the page; its layout may have changed.')

# Iterate through table and extract one dictionary per assigned postal code
table_contents = []
for row in table.find_all('td'):  # find_all is the current spelling of the legacy findAll alias
    # Skip cells that lack the <p>/<span> markup the extraction relies on.
    if row.p is None or row.span is None:
        continue
    span_text = row.span.text
    if span_text == 'Not assigned':  # Cells with no borough data are ignored.
        continue

    # Text before the first parenthesis is the borough; the text inside the
    # parentheses is the neighborhood list (empty when there is no parenthesis).
    pieces = span_text.split('(')
    neighborhood_text = pieces[1] if len(pieces) > 1 else ''

    cell = {}
    cell['PostalCode'] = row.p.text[:3]  # The postal code is the first three characters of the cell text.
    cell['Borough'] = pieces[0]
    # ' /' separators become commas so that several neighborhoods sharing one
    # postal code end up as a comma-separated list; stray ')' and spaces are trimmed.
    cell['Neighborhood'] = neighborhood_text.strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    table_contents.append(cell)

Read results into a Pandas dataframe. Then clean up some of the borough names.

In [3]:
# Build the dataframe from the list of scraped records
df = pd.DataFrame(table_contents)

# Long, clumsy borough names mapped to the shorter labels used from here on
borough_renames = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_renames)

Finally, we print the first five rows of the dataframe and use the `.shape` attribute to check its dimensions (rows, columns).

In [4]:
# Print the first five rows as plain text, then let the cell's last
# expression show the dataframe dimensions as (rows, columns).
first_rows = df.head()
print(first_rows)
df.shape

  PostalCode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Regent Park, Harbourfront
3        M6A        North York  Lawrence Manor, Lawrence Heights
4        M7A      Queen's Park     Ontario Provincial Government


(103, 3)

Next, the geospatial coordinates associated with each postal code are read in from the provided csv file. The first five rows are printed along with the column names to confirm the data was read successfully.

In [5]:
# Load the coordinates file and rename 'Postal Code' to 'PostalCode',
# the column name used by the scraped dataframe.
coords = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={'Postal Code': 'PostalCode'})

# Quick sanity check: first rows as text, then the resulting column names
print(coords.head())
coords.columns

  PostalCode   Latitude  Longitude
0        M1B  43.806686 -79.194353
1        M1C  43.784535 -79.160497
2        M1E  43.763573 -79.188711
3        M1G  43.770992 -79.216917
4        M1H  43.773136 -79.239476


Index(['PostalCode', 'Latitude', 'Longitude'], dtype='object')

In [17]:
# Attach latitude/longitude to the scraped postal-code table
# (default merge settings, restricted to the three coordinate columns).
coord_columns = ['PostalCode', 'Latitude', 'Longitude']
neighborhoods = pd.merge(df, coords[coord_columns])
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Geopy is used to obtain latitude and longitude for Toronto, Canada in order to center the map to be produced with Folium.

In [16]:
# Look up the coordinates of Toronto with Nominatim; latitude/longitude are
# used afterwards as the centre of the folium maps.
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the address cannot be resolved; stop with a clear
# message rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Next, all of the neighborhoods (across every borough) are plotted on the map as a first exploration of the data.

In [18]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of the merged dataframe, with a
# "<neighborhood>, <borough>" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in neighborhoods[marker_columns].itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression renders the map in the notebook
map_toronto

Now the dataframe is filtered to only include neighborhoods in the North York borough.

In [44]:
# Keep only the rows whose borough is exactly 'North York'
is_north_york = neighborhoods['Borough'] == 'North York'
NY_data = neighborhoods.loc[is_north_york]
NY_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
7,M3B,North York,Don Mills North,43.745906,-79.352188
10,M6B,North York,Glencairn,43.709577,-79.445073
13,M3C,North York,Don Mills South,43.7259,-79.340923
27,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
33,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
34,M3J,North York,"Northwood Park, York University",43.76798,-79.487262


The North York neighborhoods are then plotted on the map.

In [45]:
# Map for the North York subset; it uses the same centre (the geocoded
# Toronto coordinates) and zoom level as the city-wide map.
map_ny = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every marker on this map
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Walk the four columns in lockstep and drop one marker per North York row
ny_columns = ('Latitude', 'Longitude', 'Borough', 'Neighborhood')
for lat, lng, borough, neighborhood in zip(*(NY_data[column] for column in ny_columns)):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_ny)

# Last expression renders the map in the notebook
map_ny