In [1]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 
from geopy.geocoders import Nominatim
import requests 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import xml
!pip install folium
import folium



In [2]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url,'lxml')

In [3]:
table_post = soup.find('table')
fields = table_post.find_all('td')

postcode = []
borough = []
neighbourhood = []

for i in range(0, len(fields), 3):
    postcode.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())
        
df = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
df.columns = ['Postcode', 'Borough', 'Neighbourhood']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
df['Borough'].replace('Not assigned', np.nan, inplace=True)
df.dropna(subset=['Borough'], inplace=True)

df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df.columns = ['Postcode', 'Borough', 'Neighbourhood']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
df.shape

(103, 3)

 ### Defining the new Dataframe Format

In [11]:
df["Latitude"] = ""
df["Longitude"] = ""
df.shape

(103, 5)

In [13]:
# Cleaning the neighborhood with multiple values ( Selecting the first one)
df["Neighbourhood"] = df["Neighbourhood"].str.split(",", n = 1, expand = True) 
df["Neighbourhood"] = df["Neighbourhood"].str.split("-", n = 1, expand = True) 
df["Neighbourhood"].head(5)

0       Malvern
1    Rouge Hill
2     Guildwood
3        Woburn
4     Cedarbrae
Name: Neighbourhood, dtype: object

In [14]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,,
1,M1C,Scarborough,Rouge Hill,,
2,M1E,Scarborough,Guildwood,,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,
5,M1J,Scarborough,Scarborough Village,,
6,M1K,Scarborough,Kennedy Park,,
7,M1L,Scarborough,Golden Mile,,
8,M1M,Scarborough,Cliffside,,
9,M1N,Scarborough,Birch Cliff,,


### To get coordinates and populate the df

In [15]:
df1 = df.loc[0:25]
df2 = df.loc[26:50]
df3 = df.loc[51:75]
df4 = df.loc[76:102]

In [17]:
# Need to drop those Neighborhood that the geocode does not find
to_drop_unknown = []
geolocator = Nominatim(user_agent="ny_explorer")
for index, row in df1.iterrows():
    address = row['Neighbourhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df1.loc[index, 'Latitude'] = latitude
        df1.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of Malvern, Toronto are 43.8091955, -79.2217008.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The geograpical coordinate of Rouge Hill, Toronto are 43.7802711, -79.1304992.
The geograpical coordinate of Guildwood, Toronto are 43.7552251, -79.1982293.
The geograpical coordinate of Woburn, Toronto are 43.7598243, -79.2252908.
The geograpical coordinate of Cedarbrae, Toronto are 43.75646655, -79.22669244258802.
The geograpical coordinate of Scarborough Village, Toronto are 43.7437422, -79.2116324.
The geograpical coordinate of Kennedy Park, Toronto are 43.724878, -79.2539688.
The geograpical coordinate of Golden Mile, Toronto are 43.7278414, -79.2876217.
The geograpical coordinate of Cliffside, Toronto are 43.7111699, -79.2481769.
The geograpical coordinate of Birch Cliff, Toronto are 43.6918051, -79.2644935.
The geograpical coordinate of Dorset Park, Toronto are 43.7528467, -79.282067.
The geograpical coordinate of Wexford, Toronto are 43.7453767, -79.2947155.
The geograpical coordinate of Agincourt, Toronto are 43.7853531, -79.2785494.
The geograpical coordinate of Clarks Corner

In [18]:
geolocator = Nominatim(user_agent="ny_explorer2")
for index, row in df2.iterrows():
    address = row['Neighbourhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df2.loc[index, 'Latitude'] = latitude
        df2.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of Don Mills, Toronto are 43.775347, -79.3459439.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The geograpical coordinate of Don Mills, Toronto are 43.775347, -79.3459439.
The geograpical coordinate of Bathurst Manor, Toronto are 43.6655189, -79.4119373.
The geograpical coordinate of Northwood Park, Toronto are 43.7541351, -79.50448.
The geograpical coordinate of Downsview, Toronto are 43.7492988, -79.462248.
The geograpical coordinate of Downsview, Toronto are 43.7492988, -79.462248.
The geograpical coordinate of Downsview, Toronto are 43.7492988, -79.462248.
The geograpical coordinate of Downsview, Toronto are 43.7492988, -79.462248.
The geograpical coordinate of Victoria Village, Toronto are 43.732658, -79.3111892.
The geograpical coordinate of Parkview Hill, Toronto are 43.7062977, -79.3219073.
The geograpical coordinate of Woodbine Heights, Toronto are 43.69992, -79.319279.
The geograpical coordinate of The Beaches, Toronto are 43.6710244, -79.296712.
The geograpical coordinate of Leaside, Toronto are 43.7047983, -79.3680904.
The geograpical coordinate of Thorncliffe Park, 

In [19]:
geolocator = Nominatim(user_agent="ny_explorer3")
for index, row in df3.iterrows():
    address = row['Neighbourhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df3.loc[index, 'Latitude'] = latitude
        df3.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of St. James Town, Toronto are 43.6694032, -79.3727041.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The geograpical coordinate of Church and Wellesley, Toronto are 43.6708625, -79.37279241253721.
The geograpical coordinate of Regent Park, Toronto are 43.6607056, -79.3604569.
The geograpical coordinate of Garden District, Toronto are 43.6564995, -79.3771141.
The geograpical coordinate of St. James Town, Toronto are 43.6694032, -79.3727041.
The geograpical coordinate of Berczy Park, Toronto are 43.64798435, -79.37539591138858.
The geograpical coordinate of Central Bay Street, Toronto are 43.6597555, -79.3853931.
The geograpical coordinate of Richmond, Toronto are 43.6485875, -79.3913729.
The geograpical coordinate of Harbourfront East, Toronto are 43.6400801, -79.3801495.
The geograpical coordinate of Toronto Dominion Centre, Toronto are 43.6473768, -79.3813719429129.
The geograpical coordinate of Commerce Court, Toronto are 43.648163800000006, -79.37776594404383.
The geograpical coordinate of Bedford Park, Toronto are 43.7373876, -79.4109253.
The geograpical coordinate of Roselawn, To

In [20]:
geolocator = Nominatim(user_agent="ny_explorer4")
for index, row in df4.iterrows():
    address = row['Neighbourhood'] + ', Toronto'
    try:
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
        df4.loc[index, 'Latitude'] = latitude
        df4.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot do: {}, will drop index: {}'.format(address, index))
        to_drop_unknown.append(index)

The geograpical coordinate of Dufferin, Toronto are 43.6602019, -79.4357191.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The geograpical coordinate of Little Portugal, Toronto are 43.64741325, -79.43111632546047.
The geograpical coordinate of Brockton, Toronto are 43.6509173, -79.4400216.
The geograpical coordinate of North Park, Toronto are 43.7186899, -79.4775337.
Cannot do: Del Ray, Toronto, will drop index: 80
The geograpical coordinate of Runnymede, Toronto are 43.6517026, -79.4759978.
The geograpical coordinate of High Park, Toronto are 43.6538668, -79.4668644.
The geograpical coordinate of Parkdale, Toronto are 43.6404954, -79.4368965.
The geograpical coordinate of Runnymede, Toronto are 43.6517026, -79.4759978.
The geograpical coordinate of Queen's Park, Toronto are 43.659659, -79.3903399.
Cannot do: Canada Post Gateway Processing Centre, Toronto, will drop index: 86
Cannot do: Business reply mail Processing Centre, Toronto, will drop index: 87
The geograpical coordinate of New Toronto, Toronto are 43.6007625, -79.505264.
The geograpical coordinate of Alderwood, Toronto are 43.6017173, -79.545232

In [21]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)

(26, 5)
(25, 5)
(25, 5)
(27, 5)


In [22]:
df = df1.append(df2, ignore_index = True)
df = df.append(df3, ignore_index = True)
df = df.append(df4, ignore_index = True)

In [23]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,43.8092,-79.2217
1,M1C,Scarborough,Rouge Hill,43.7803,-79.1305
2,M1E,Scarborough,Guildwood,43.7552,-79.1982
3,M1G,Scarborough,Woburn,43.7598,-79.2253
4,M1H,Scarborough,Cedarbrae,43.7565,-79.2267


### Cleaning the dataframe

In [24]:
clean_df = df.drop(to_drop_unknown)
clean_df['Latitude'].replace('', np.nan, inplace=True)
clean_df.dropna(subset=['Latitude'], inplace=True)
clean_df.shape

(99, 5)

In [25]:
clean_df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,43.809196,-79.2217
1,M1C,Scarborough,Rouge Hill,43.780271,-79.1305
2,M1E,Scarborough,Guildwood,43.755225,-79.1982
3,M1G,Scarborough,Woburn,43.759824,-79.2253
4,M1H,Scarborough,Cedarbrae,43.756467,-79.2267
5,M1J,Scarborough,Scarborough Village,43.743742,-79.2116
6,M1K,Scarborough,Kennedy Park,43.724878,-79.254
7,M1L,Scarborough,Golden Mile,43.727841,-79.2876
8,M1M,Scarborough,Cliffside,43.71117,-79.2482
9,M1N,Scarborough,Birch Cliff,43.691805,-79.2645


### Mapping the area of Neighbourhood of Toronto

In [26]:
try:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
    df.loc[index, 'Latitude'] = latitude
    df.loc[index, 'Longitude'] = longitude
except AttributeError:
    print('Cannot do: {}, will drop index: {}'.format(address, index))

The geograpical coordinate of Northwest, Toronto are 43.6465466, -79.4195263.


In [27]:
my_map = folium.Map(location=[latitude, longitude], zoom_start= 11)
my_map

### Showcasing map if not displayed in Github

In [28]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://raw.githubusercontent.com/RohitLearner/IBM-Applied-D.S-Capstone-Project/master/Week%203/Results/Toronto_map.png")

### Plotting the final neighborhood of Toronto Map

In [30]:
n = 91  # Unable to show Marker in Map if Marker > 91
clean_df1 = clean_df.tail(n)
clean_df1.shape

(91, 5)

In [32]:
import folium
my_map = folium.Map(location=[latitude, longitude], zoom_start=11)
# add markers to map
for lat, lng, label in zip(clean_df1['Latitude'], clean_df1['Longitude'], clean_df1['Neighbourhood']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(my_map)  
my_map

### Showcasing map if not displayed in Github

In [33]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://raw.githubusercontent.com/RohitLearner/IBM-Applied-D.S-Capstone-Project/master/Week%203/Results/Toronto_Marked_Map.png")