Build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Parse every HTML table on the Wikipedia page, treating each table's first row as its header.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(WIKI_URL, header=0)

# read_html returns a list of DataFrames; the first table is taken as the postal-code table.
dfToronto = df[0]
dfToronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Ignore the cells without a Borough:

In [2]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
has_borough = dfToronto['Borough'] != 'Not assigned'
dfToronto = dfToronto[has_borough].reset_index(drop=True)

Group the neighbourhoods that share a postal code and borough, and join their names with commas:

In [3]:
# One output row per (Postal Code, Borough) pair; the remaining string
# columns of each group are concatenated with ',' as the separator.
postal_groups = dfToronto.groupby(['Postal Code', 'Borough'], as_index=False)
dfToronto_grouped = postal_groups.agg(','.join)

Finally, where a neighbourhood is "Not assigned", use the borough name as the neighbourhood:

In [4]:
# Rows whose neighbourhood is the placeholder text get their borough name instead.
unassigned = dfToronto_grouped['Neighbourhood'] == "Not assigned"
dfToronto_grouped.loc[unassigned, 'Neighbourhood'] = dfToronto_grouped.loc[unassigned, 'Borough']
dfToronto_grouped.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# Show the (rows, columns) dimensions of the grouped frame as a quick sanity check.
dfToronto_grouped.shape

(103, 3)

Get dataframe from csv file that has the geographical coordinates of each postal code:

In [6]:
# Download the CSV of coordinates; the merge below joins it on its 'Postal Code' column.
GEO_CSV_URL = 'https://cocl.us/Geospatial_data'
coordinates = pd.read_csv(GEO_CSV_URL)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Append the latitude and longitude to each Postal Code, using a simple inner join with the CSV data.

In [7]:
# Inner join on 'Postal Code': only postal codes present in both frames are kept.
neighborhood = dfToronto_grouped.merge(
    coordinates,
    how='inner',
    on='Postal Code',
)
neighborhood.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Cluster and plot only the neighbourhoods whose Borough name contains "Toronto".

In [8]:
# Keep only the rows whose Borough contains the literal text 'Toronto'
# (regex=False makes this a plain substring test, not a pattern match).
in_toronto = neighborhood['Borough'].str.contains('Toronto', regex=False)
neighborhood = neighborhood[in_toronto]
neighborhood.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Visualise using Folium

In [9]:
# Install folium 0.5.0 from the conda-forge channel via a shell escape.
# NOTE(review): `!conda` runs in a subshell and may not target the kernel's
# environment; confirm, or install folium before launching the notebook.
!conda install -c conda-forge folium=0.5.0 --yes
import folium
# Base map centred at latitude 43.651070, longitude -79.347015 with initial zoom level 10.
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

Solving environment: ...working... done

# All requested packages already installed.





  current version: 4.4.10
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base conda




In [10]:
# Add one blue circle marker per row of `neighborhood` to the base map.
# Each popup reads "<neighbourhood(s)>, <borough>".
marker_rows = zip(
    neighborhood['Latitude'],
    neighborhood['Longitude'],
    neighborhood['Borough'],
    neighborhood['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html belongs to the Popup; it is not a CircleMarker option, so it
    # is set here only and no longer passed to CircleMarker below.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto

The map might not be visible on Github. Check out the README for the map.
Use KMeans clustering to cluster the neighbourhoods.

In [11]:
from sklearn.cluster import KMeans

In [12]:
# Number of clusters to fit.
k = 5

# Cluster on the coordinates only. Selecting the two columns explicitly
# (instead of dropping the others with a positional axis argument, which
# pandas 2.0 no longer accepts) also keeps a previous run's 'Cluster Labels'
# column out of the features when this cell is re-executed.
toronto_clustering = neighborhood[['Latitude', 'Longitude']]

# random_state fixes the initialisation; n_init=10 pins the number of
# restarts so results do not change with scikit-learn's newer default.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustering)

# Make the cell safe to re-run: discard labels from an earlier execution
# before inserting the fresh ones as the first column.
neighborhood = neighborhood.drop(columns='Cluster Labels', errors='ignore')
neighborhood.insert(0, 'Cluster Labels', kmeans.labels_)

In [15]:
# Preview the first rows; 'Cluster Labels' was inserted as the first column by the clustering cell.
neighborhood.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,0,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,0,M4M,East Toronto,Studio District,43.659526,-79.340923
44,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [20]:
import matplotlib.cm as cm
import matplotlib.colors as colors

The below map might not be visible on Github. Check out the README for the map.

In [19]:
# create map (same centre and zoom as the unclustered map)
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters: k evenly spaced colours from the
# rainbow colormap, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map, one per row, coloured by cluster label
for lat, lon, cluster in zip(neighborhood['Latitude'], neighborhood['Longitude'], neighborhood['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so they index the palette directly;
    # the previous `cluster-1` only worked for label 0 by wrapping to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters