# Applied Capstone Project - Week 4

# Capstone Project - The Battle of Neighborhoods

#### 1. Installing and Importing Python Libraries and Dependencies

In [27]:
import pandas as pd
import requests
import numpy as np
import geocoder
import folium
import requests 
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
import xml
import matplotlib.pyplot as plt
# matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print("All Required Libraries Imported!")

All Required Libraries Imported!


#### 2. Data Extraction and Cleaning

Using BeautifulSoup Scraping List of Postal Codes of Given Wikipedia Page. Link: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [28]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(data, 'lxml')

Create lists to store the data. Use split ,strip and replace functions for getting Borough and Neighborhood information.

In [29]:
table_contents = []
table = soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

Transform the data into pandas dataframe

In [30]:
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

Remove rows with 'Not assigned' in Borough column

In [31]:
df = df[df.Borough != 'Not assigned'].reset_index(drop = True)

Group multiple neighborhood with the same postal code

In [32]:
df_group = df.groupby(['PostalCode', 'Borough'], as_index=False).agg(lambda x: ", ".join(x))
df_group['Neighborhood'] = df_group['Neighborhood'].str.replace('/', ',')
df_group

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Filter only to North York as per project objective

In [33]:
df = df[df['Borough']=='North York'].reset_index(drop = True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M6A,North York,"Lawrence Manor, Lawrence Heights"
3,M3B,North York,Don Mills North
4,M6B,North York,Glencairn
5,M3C,North York,Don Mills South
6,M2H,North York,Hillcrest Village
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North"
8,M2J,North York,"Fairview, Henry Farm, Oriole"
9,M3J,North York,"Northwood Park, York University"


In [34]:
def get_latilong(postal_code):
    lati_long_coords = None
    while(lati_long_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lati_long_coords = g.latlng
    return lati_long_coords
    
get_latilong('M4G')

[43.709020000000066, -79.36348999999996]

In [35]:
# Retrieving Postal Code Co-ordinates
postal_codes = df['PostalCode']    
coords = [ get_latilong(postal_code) for postal_code in postal_codes.tolist() ]

In [36]:
# Adding Columns Latitude & Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [37]:
df[df.PostalCode == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude


In [42]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
3,M3B,North York,Don Mills North,43.74923,-79.36186
4,M6B,North York,Glencairn,43.70687,-79.44812
5,M3C,North York,Don Mills South,43.72168,-79.34352
6,M2H,North York,Hillcrest Village,43.80225,-79.35558
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.75788,-79.44847
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.78097,-79.34781
9,M3J,North York,"Northwood Park, York University",43.76476,-79.48798


In [41]:
address = 'North York, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York, Ontario, are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York, Ontario, are 43.7543263, -79.44911696639593.


In [40]:
# create map of Toronto using latitude and longitude values
map_northyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_northyork)  
    
map_northyork

NameError: name 'neighborhoods' is not defined