In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans        #for clustering 
import folium                             #for real maps
import json                               #handle json files
from geopy.geocoders import Nominatim     #convert address to lat and long
from pandas.io.json import json_normalize #covert json file to pandas dataframe
import requests                           #library to handle requests

<h3>PART 1 ASSIGNMENT - WIKIPEDIA DATAFRAME</h3>

Record the Wikipedia URL and read the page's tables with the pandas `read_html` function.

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
Link = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [3]:
# read_html returns a list holding one DataFrame per HTML table on the page.
df = pd.read_html(io=Link)
df

[    Postcode           Borough          Neighbourhood
 0        M1A      Not assigned           Not assigned
 1        M2A      Not assigned           Not assigned
 2        M3A        North York              Parkwoods
 3        M4A        North York       Victoria Village
 4        M5A  Downtown Toronto           Harbourfront
 ..       ...               ...                    ...
 282      M8Z         Etobicoke              Mimico NW
 283      M8Z         Etobicoke     The Queensway West
 284      M8Z         Etobicoke  Royal York South West
 285      M8Z         Etobicoke         South of Bloor
 286      M9Z      Not assigned           Not assigned
 
 [287 rows x 3 columns],
                                                   0   \
 0                                                NaN   
 1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
 2                                                 NL   
 3                                                  A   
 
                          

Store the relevant table (287 rows × 3 columns) in a new dataframe.

In [4]:
# The first table returned by read_html is the Postcode / Borough / Neighbourhood
# table; the remaining list entries are other tables from the same page.
df_new = df[0]
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# (rows, columns) of the selected table.
df_new.shape

(287, 3)

In [6]:
# Rows per borough, including the "Not assigned" placeholder value.
df_new['Borough'].value_counts()

Not assigned        77
Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Name: Borough, dtype: int64

Step 1 - Ignore the rows whose Borough is "Not assigned"

In [10]:
# Keep only the rows that carry a real borough name.
df_clean = df_new.loc[df_new['Borough'].ne('Not assigned')]

In [11]:
# Show the filtered frame; the original row labels are kept, so the index has gaps.
df_clean

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


Step 2 - Find the rows whose Neighbourhood is "Not assigned", so that the Borough name can be copied into Neighbourhood
(Ans: There are none)

In [12]:
# Rows that have a borough but still no neighbourhood name.
df_clean[df_clean['Neighbourhood'] == "Not assigned"]

Unnamed: 0,Postcode,Borough,Neighbourhood


Step 3 - Find the postal codes that are repeated so that their rows can be merged. Use the groupby function to aggregate the Neighbourhood values.

In [13]:
# Rows per postcode; a value above 1 means the postcode is listed once per
# neighbourhood and its rows need to be merged.
count = df_clean['Postcode'].value_counts()
count

M9V    8
M8Y    8
M5V    7
M8Z    5
M9B    5
      ..
M6E    1
M4H    1
M3L    1
M1S    1
M9L    1
Name: Postcode, Length: 103, dtype: int64

In [33]:
# Rows per borough after dropping the "Not assigned" rows.
df_clean['Borough'].value_counts()

Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Name: Borough, dtype: int64

In [14]:
# Collapse to one row per (Postcode, Borough) pair, joining the neighbourhood
# names of each group into a single comma-separated string.
df_toro = df_clean.groupby(by=['Postcode', 'Borough']).aggregate(', '.join)

In [15]:
# Move the group keys (Postcode, Borough) from the index back into columns.
# The guard makes the cell idempotent: an unconditional reset_index() would add
# a spurious "index" column if the cell were run a second time.
if 'Postcode' in df_toro.index.names:
    df_toro = df_toro.reset_index()

In [16]:
# Preview the aggregated frame: one row per postcode.
df_toro.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
# (rows, columns) after merging the repeated postcodes.
df_toro.shape

(103, 3)

<h3>PART 2 ASSIGNMENT - TORONTO DATAFRAME</h3>

Step 4 - Get the Lat and Long for each PostCode using GEOPY

Step 5 - The previous step didn't work as planned, hence using the CSV file

In [22]:
# Load the postcode coordinates and rename the key column so that it matches
# the "Postcode" column of the Wikipedia-derived frame.
toro_csv = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'Postcode'})
)
print(toro_csv.head(), toro_csv.shape)

  Postcode   Latitude  Longitude
0      M1B  43.806686 -79.194353
1      M1C  43.784535 -79.160497
2      M1E  43.763573 -79.188711
3      M1G  43.770992 -79.216917
4      M1H  43.773136 -79.239476 (103, 3)


In [23]:
# Combine the two data sets: attach Latitude/Longitude to every postcode row.
# A left join keeps exactly the postcodes present in df_toro; an outer join
# would also add CSV-only postcodes that have no Borough/Neighbourhood.
# validate= raises pandas.errors.MergeError if either side repeats a postcode.
df_toronto = pd.merge(
    df_toro,
    toro_csv,
    how='left',
    on='Postcode',
    validate='one_to_one',
)

# Fail early if any postcode received no coordinates from the CSV.
missing_coords = df_toronto.loc[
    df_toronto[['Latitude', 'Longitude']].isna().any(axis=1), 'Postcode'
]
assert missing_coords.empty, 'No coordinates for postcodes: {}'.format(missing_coords.tolist())

In [28]:
# Preview the first 12 rows of the merged frame (postcode data plus coordinates).
df_toronto.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [26]:
# (rows, columns) of the merged frame.
df_toronto.shape

(103, 5)

<h3>PART 3 ASSIGNMENT - UNDERSTAND TORONTO</h3>

Use the Geopy library to get the latitude and longitude of Toronto

In [29]:
# Look up the coordinates of Toronto; they are used as the centre of the maps.
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop with a clear
# message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


Create a map of TORONTO with neighborhoods superimposed on top

In [31]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one blue circle per postcode row; the popup reads "Neighbourhood, Borough".
marker_rows = zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Now let's create a map with Borough = Downtown Toronto

In [34]:
# Subset of postcodes in the "Downtown Toronto" borough, renumbered from 0.
df_downtoronto = (
    df_toronto
    .query("Borough == 'Downtown Toronto'")
    .reset_index(drop=True)
)
df_downtoronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [41]:
# Closer-zoom map, centred on the same geocoded Toronto coordinates.
map_downtoronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# Draw one red circle per Downtown Toronto postcode; the popup reads
# "Neighbourhood, Borough".
marker_rows = zip(
    df_downtoronto['Latitude'],
    df_downtoronto['Longitude'],
    df_downtoronto['Borough'],
    df_downtoronto['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=10,
        popup=label,
        color='red',
        fill=True,
        fill_color='#cc3146',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_downtoronto)

map_downtoronto