In [1]:
!pip install pip install beautifulsoup4
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup



In [2]:
!pip install lxml



Download the Wikipedia page with `requests.get()`

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page in the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
website_url = response.text

In [4]:
# Parse the downloaded HTML text into a BeautifulSoup tree.
# NOTE(review): the 'html5' feature needs an HTML5 parser (presumably html5lib),
# which the install cells above do not install - confirm it is available.
soup = BeautifulSoup(markup=website_url, features='html5')

In [5]:
# Collect every <table> element found in the parsed page.
table = soup.find_all(name='table')

In [6]:
# pd.read_html returns one DataFrame per parsed <table>; keep the first one.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))[0]

In [7]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Data Cleaning and Preprocessing

Drop the rows whose borough is "Not assigned"

In [8]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df1 = df[has_borough]
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Group by postal code; when a postal code has several neighbourhoods, they are joined into a single comma-separated string

In [9]:
# One row per (Postal Code, Borough) pair; the neighbourhood names of each
# group are concatenated with ", ".
neighborhoods_by_code = df1.groupby(['Postal Code', 'Borough'])["Neighborhood"]
df2 = neighborhoods_by_code.apply(", ".join).reset_index()
df2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough

In [10]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name;
# every other neighbourhood value is kept unchanged.
has_neighborhood = df2['Neighborhood'] != 'Not assigned'
df2['Neighborhood'] = df2['Neighborhood'].where(has_neighborhood, df2['Borough'])
df2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Shape (number of rows, number of columns) of df2

In [11]:
# (rows, columns) of the grouped frame.
df2.shape

(103, 3)

In [12]:
# Load the postal-code coordinate table (Postal Code, Latitude, Longitude) from the remote CSV.
lat_lon = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach the coordinates to each postal code (default inner join on 'Postal Code').
df3 = df2.merge(lat_lon, on='Postal Code')
df3.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [35]:
# Keep only the boroughs whose name contains 'Toronto'.
# regex=False makes str.contains treat the pattern as a literal substring
# rather than a regular expression; the match is still case-sensitive
# (case=True is the default).
# .copy() makes df4 an independent frame, because a later cell adds a column to it.
df4 = df3[df3['Borough'].str.contains('Toronto', regex=False)].copy()
df4

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [39]:
!pip install folium
import folium



In [56]:
# Base map centred on the fixed coordinates used throughout this notebook.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

# One circle marker per row of df4, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighbourhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker option, so it is no
    # longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='#C70039',
        fill_opacity=1,
    ).add_to(map_toronto)
map_toronto

# Clustering by K-Means

In [None]:
from sklearn.cluster import KMeans

k = 4
# Select the coordinate columns explicitly. Dropping "everything else" lets a
# label column left over from an earlier run leak into the features, and the
# positional-axis form drop([...], 1) was removed in pandas 2.
toronto_clustering = df4[['Latitude', 'Longitude']]
# n_init is pinned to the historical default so results do not depend on the
# installed scikit-learn version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)
# insert() raises if the column already exists, so remove any previous copy
# first; this makes the cell safe to re-run.
if 'Clustering Labels' in df4.columns:
    df4 = df4.drop(columns='Clustering Labels')
df4.insert(0, 'Clustering Labels', kmeans.labels_)

In [66]:
# Cluster index assigned to each row used to fit the model, in row order.
kmeans.labels_

array([0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       3, 3, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0], dtype=int32)

In [71]:
# Inspect df4 after the cluster labels were inserted.
df4

Unnamed: 0,Clustering Labels,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,0,0,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,0,0,M4M,East Toronto,Studio District,43.659526,-79.340923
44,3,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,3,1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,3,1,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,3,1,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,3,1,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,3,1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [73]:
# 'Cluster Labels' only exists when a label column is left over from an
# earlier run; errors='ignore' keeps this cell working on a fresh kernel,
# where the column is absent. The keyword form replaces the positional axis
# argument, which was removed in pandas 2.
df5 = df4.drop(columns=['Cluster Labels'], errors='ignore')

In [74]:
# Inspect the frame that feeds the cluster map.
df5

Unnamed: 0,Clustering Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,0,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,0,M4M,East Toronto,Studio District,43.659526,-79.340923
44,3,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,3,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,3,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,3,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,3,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,3,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [83]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# set color scheme for the clusters: k evenly spaced rainbow colours as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(df5['Latitude'], df5['Longitude'], df5['Clustering Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # The labels are 0-based (see the kmeans.labels_ output), so index the
    # palette directly; cluster-1 made cluster 0 wrap round to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters