**Inspecting the page's HTML shows that the data we are interested in  
is stored in a table with the class `wikitable`, and that we can reach it through the `table`,   
`tr`, and `td` tags. Let us begin by installing and importing the necessary libraries.**

In [5]:
# import requests for getting the HTML contents, lxml.html for parsing, and pandas 
!pip install lxml
import requests
import lxml.html as lh
import pandas as pd

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 6.0MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0


**Send an HTTP request to the URL of the webpage we wish to access;  
the server responds by returning the HTML content of the webpage.**

In [6]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the postal-code page.
tor = requests.get(URL, timeout=30)
tor.raise_for_status()

# Parse the raw HTML bytes of the response into an lxml element tree
tor_M = lh.fromstring(tor.content)

# Collect every <tr> element of the document; this includes the rows of any
# table on the page, not only the postal-code table
tor_M_tr = tor_M.xpath('//tr')

In [7]:
# Count the child elements (cells) of each of the first 12 <tr> rows
list(map(len, tor_M_tr[:12]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [8]:
# Treat the first <tr> as the header row: each of its cells names one column.
col = []
for position, header_cell in enumerate(tor_M_tr[0], start=1):
    # strip() removes the newline/whitespace surrounding the header text
    title = header_cell.text_content().strip()
    print("%d: %s" % (position, title))
    # Pair the column title with an empty list that will collect its values
    col.append((title, []))

col  # display the (title, values) pairs to confirm the headers were read correctly


1: Postal code
2: Borough
3: Neighborhood


[('Postal code', []), ('Borough', []), ('Neighborhood', [])]

**Now we begin to read data from the webpage and store the results row by row.**

In [9]:
# Start from empty column lists so that re-running this cell does not append
# every row a second time.
for _, values in col:
    values.clear()

# The first row is the header, so the data starts on the second row
for T in tor_M_tr[1:]:
    # A row that does not have exactly 3 cells is not from our table; stop there
    if len(T) != 3:
        break

    # i is the column index of the current cell
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # The first column is kept as text; in the other columns, text that is
        # purely numeric is converted to an int
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text
                pass
        # Append the value to the list of the i'th column
        col[i][1].append(data)

**Create the pandas dataframe**

In [10]:
# Turn the (title, values) pairs into a {title: values} mapping and build the frame
Dict = dict(col)
df = pd.DataFrame(Dict)
# Remove the newline characters left in the entries by the HTML cell text
df = df.replace('\n', '', regex=True)
df.head()  # preview the first five rows


Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [11]:
# Keep only the rows whose Borough is assigned and renumber the index from 0.
# Selecting into a new frame with reset_index(drop=True) replaces the in-place
# drop followed by reset_index() and the manual deletion of the 'index' column
# that reset_index() would otherwise add.
df = df[df['Borough'] != "Not assigned"].reset_index(drop=True)

# Look at the last five rows of the table.
df.tail()


Unnamed: 0,Postal code,Borough,Neighborhood
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...
102,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...
103,,Canadian postal codes,


In [12]:
# Remove rows that are not real postal-code entries, such as the trailing row
# whose postal code is empty. Filtering on the letter-digit-letter shape of the
# code, instead of slicing up to a hard-coded row label, keeps working when the
# number of rows on the page changes. copy() makes df_tor an independent frame.
is_postal_code = df['Postal code'].str.strip().str.match(r'^[A-Z]\d[A-Z]$', na=False)
df_tor = df[is_postal_code].copy()
df_tor.shape

(103, 3)

**Now we will get geospatial data from a csv file, and add the longitude and   
latitude values of the neighborhoods, forming a single dataframe**

In [13]:
# Order the rows by postal code and renumber the index from 0
df_sorted = df_tor.sort_values('Postal code').reset_index(drop=True)
df_sorted


Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [19]:
# Load the CSV of latitude and longitude values into a data frame
geo_tor = pd.read_csv("http://cocl.us/Geospatial_data")
geo_tor.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Line the coordinates up with df_sorted before discarding the key column:
# each postal code of df_sorted is looked up in the geospatial table, so row k
# of geo_tor belongs to row k of df_sorted regardless of how the CSV is ordered.
# A postal code that is missing from the CSV gets NaN coordinates instead of
# shifting every following row.
geo_tor = (
    geo_tor.set_index('Postal Code')
    .reindex(df_sorted['Postal code'])
    .reset_index(drop=True)  # drops the postal-code key, leaving only the coordinates
)
geo_tor


Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476
5,43.744734,-79.239476
6,43.727929,-79.262029
7,43.711112,-79.284577
8,43.716316,-79.239476
9,43.692657,-79.264848


In [21]:
# Create a single data frame by placing the geo_tor columns next to the
# df_sorted columns (rows are matched by index label). Columns of geo_tor that
# an earlier run of this cell already added to df_sorted are dropped first, so
# re-running the cell does not duplicate them.
df_sorted = pd.concat(
    [df_sorted.drop(columns=geo_tor.columns, errors='ignore'), geo_tor],
    axis=1,
)
df_sorted

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


In [22]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
# None removes the display limit: every column and every row of a frame is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# NOTE: the install command on the next line is active (not commented out), so
# it runs each time this cell is executed.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as pandas.json_normalize;
# confirm this import path against the installed pandas version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# printed once every import above has succeeded
print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


**Use geopy library to get the latitude and longitude values of Toronto.**  
In order to define an instance of the geocoder, we need to define a user_agent.   
We will name our agent tor_explorer.


In [23]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match for the address; fail
# with a clear message here rather than with an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


**Create a map of Toronto and superimpose the neighborhoods on top of it.**

In [24]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row; its popup reads "<neighborhood>, <borough>"
marker_rows = zip(
    df_sorted['Latitude'],
    df_sorted['Longitude'],
    df_sorted['Borough'],
    df_sorted['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

**Use the interactive map to zoom and get the name of a neighborhood by   
clicking on a blue circle mark.**  

To simplify the map, we segment and cluster only the neighborhoods in North York,  
so let us slice the original dataframe and create a new dataframe of the North York data.


In [26]:
# Keep only the rows whose borough is North York, renumbered from 0
is_north_york = df_sorted['Borough'] == 'North York'
north_york_data = df_sorted.loc[is_north_york].reset_index(drop=True)
north_york_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,Fairview / Henry Farm / Oriole,43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,York Mills / Silver Hills,43.75749,-79.374714
4,M2M,North York,Willowdale / Newtonbrook,43.789053,-79.408493


In [27]:
# Get the geographical coordinates of North York.
address = 'North York, Toronto'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match for the address; fail
# with a clear message here rather than with an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


In [28]:
# create map of North York using latitude and longitude values
map_north_york = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per North York row; its popup shows the neighborhood name
marker_rows = zip(
    north_york_data['Latitude'],
    north_york_data['Longitude'],
    north_york_data['Neighborhood'],
)
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_north_york)

map_north_york