In [143]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


In [144]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [145]:
s = requests.Session()
response = s.get(url, timeout=10)
response

<Response [200]>

In [146]:
# parse response content to html
soup = BeautifulSoup(response.content, 'html.parser')

In [147]:
#title of wikipedia page
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [148]:
#get right table to scrap
right_table = soup.find('table',{"class":'wikitable sortable'})

In [149]:
# Number of columns in the table
for row in right_table.findAll("tr"):
    cells = row.findAll('td')

len(cells)

3

In [150]:
# number of rows in the table including header
rows = right_table.findAll("tr")
len(rows)

181

In [151]:
# header attributes of the table
header = [th.text.rstrip() for th in rows[0].find_all('th')]
print(header)
print('------------')
print(len(header))

['Postal Code', 'Borough', 'Neighbourhood']
------------
3


In [152]:
lst_data = []
for row in rows[1:]:
            data = [d.text.rstrip() for d in row.find_all('td')]
            lst_data.append(data)

In [153]:
# select also works as find_all
lst_data1 = []
for row in rows[1:]:
            data = [d.text.rstrip() for d in row.select('td')]
            lst_data1.append(data)

In [154]:
# sample records
lst_data1[0:3]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

In [155]:
#Scrap the data and append to respective lists

c1=[]
c2=[]
c3=[]
for row in right_table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells)==3: #Only extract table body not heading
        c1.append(cells[0].find(text=True))
        c2.append(cells[1].find(text=True)) 
        c3.append(cells[2].find(text=True))



In [156]:
# create a dictionary
#d = dict([(x,0) for x in header])
d = dict([('PostalCode', 0), ('Borough', 0), ('Neighbourhood', 0)])
d

{'PostalCode': 0, 'Borough': 0, 'Neighbourhood': 0}

In [157]:
# append dictionary with corresponding data list.
d['PostalCode'] = c1
d['Borough']= c2
d['Neighbourhood']=c3

In [158]:
# convert dict to DataFrame
df = pd.DataFrame(d)

# Top 5 records
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [159]:
df = df.drop(df[df.Borough == 'Not assigned\n'].index)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
5,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
6,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [160]:
df = df.replace('\n', '', regex=True)
#df.rename(columns={'Postal Code': 'PostalCode', 'Borough': 'Boroug', 'Neighbourhood': 'Neighbourhoo'})
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [161]:
df.tail()
df['PostalCode'][2]

'M3A'

In [162]:
#Combine neighborhoods that exist in one postal code area
df.groupby('PostalCode')['Neighbourhood'].apply(' '.join).reset_index(drop=True)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [163]:
df['PostalCode'][1]

'M4A'

In [164]:
#Replace "Not assigned" neighbourhoods for the respective Borough value
i=0
for index, row in df.iterrows():
    if df.Neighbourhood[index] == 'Not assigned':
        i += 1
        df.Neighbourhood[index] = df.Borough[index]
        
if i==0:
    print('No rows with "Not assigned" neighbourhoods')
else:
    print(i,' rows with "Not assgined" neighbourhoods replaced')
        

No rows with "Not assigned" neighbourhoods


In [165]:
df.shape

(103, 3)

In [167]:
'''
#!pip install geocoder
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

pc = df.loc[df['PostalCode'] == 'M5G']
postal_code = pc.Neighbourhood.to_string() + ', ' + pc.Borough.to_string()
# loop until you get the coordinates
while(lat_lng_coords is None):
    print(postal_code)
    #print('{}, Toronto, Ontario'.format(postal_code))
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

print('Latitude: ',latitude, '\n Longitude: ', longitude)
'''

"\n#!pip install geocoder\nimport geocoder # import geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\npc = df.loc[df['PostalCode'] == 'M5G']\npostal_code = pc.Neighbourhood.to_string() + ', ' + pc.Borough.to_string()\n# loop until you get the coordinates\nwhile(lat_lng_coords is None):\n    print(postal_code)\n    #print('{}, Toronto, Ontario'.format(postal_code))\n    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))\n    lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]\n\nprint('Latitude: ',latitude, '\n Longitude: ', longitude)\n"

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
df_cord.shape

In [None]:
df_cord.columns = ['PostalCode', 'Latitude', 'Longitude']
#df_cord.rename({'Postal Code': 'PostalCode'})
df_cord.head()

In [None]:
df_merge = pd.merge(df, df_cord, on='PostalCode')
df_merge.head()

In [None]:
# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 


! pip install folium==0.5.0
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

In [None]:
latitude = df_merge['Latitude'][0]
longitude = df_merge['Longitude'][0]

# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Borough'], df_merge['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [141]:
# set number of clusters
kclusters = 6

grouped_clustering = df_merge.drop(['Neighbourhood', 'PostalCode', 'Borough'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 2, 3, 2, 5, 4, 1, 1, 2], dtype=int32)

In [142]:
# add clustering labels
df_merge.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

In [140]:
#Visualizing resulting clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Neighbourhood'], df_merge['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters