# Capstone Neighborhood Clustering Lab

## Part 1: Acquire and Clean Data

In [1]:
#install necessary libraries and set options
import pandas as pd
import requests
from bs4 import BeautifulSoup
!pip install geocoder
import geocoder
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.4 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [2]:
#scrape data from Wikipedia page and store in dataframe
#the URL pins a specific page revision (oldid), so the scraped table is the same on every run
req = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1012118802", timeout=30)
req.raise_for_status()  #fail on an HTTP error instead of silently parsing an error page
soup = BeautifulSoup(req.content,'lxml')
table = soup.find_all('table')[0]  #first <table> element on the page
df = pd.read_html(str(table))  #read_html returns a list of dataframes, one per parsed table
neighborhood = df[0]  #the single parsed table is already a dataframe

In [3]:
#check the shape and first 10 rows of the dataframe prior to cleaning
#the shape is printed explicitly because only the last expression of a cell is displayed automatically
print(neighborhood.shape)
neighborhood.head(10)

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [4]:
#count the rows whose Borough value is "Not assigned" (shown as the shape of the matching subset).
#these rows will be removed.
neighborhood.loc[neighborhood['Borough'].eq("Not assigned")].shape

(77, 3)

In [5]:
#create a new dataframe where rows with "Not assigned" in the Borough column have been removed.
#the index is renumbered from 0 in the same expression, so no dataframe is modified in place
#and re-running this cell always rebuilds the result from the raw table
neighborhood_filtered = (
    neighborhood[neighborhood['Borough'] != "Not assigned"]
    .reset_index(drop = True)
)

In [6]:
#check the shape and first 10 rows of the new cleaned dataframe
#the shape is printed explicitly because only the last expression of a cell is displayed automatically
print(neighborhood_filtered.shape)
neighborhood_filtered.head(10)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
#look for rows whose Neighbourhood value is "Not assigned" in the cleaned dataframe;
#a shape with 0 rows means there are none
neighborhood_filtered.loc[neighborhood_filtered['Neighbourhood'].eq("Not assigned")].shape

(0, 3)

In [8]:
#check to see if any postal codes are listed more than once
#fail loudly if a postal code repeats, rather than relying on reading the summary table by eye
assert neighborhood_filtered['Postal Code'].is_unique, "duplicate postal codes found"
#summary statistics: for 'Postal Code', 'unique' equal to 'count' also indicates no duplicates
neighborhood_filtered.describe()

Unnamed: 0,Postal Code,Borough,Neighbourhood
count,103,103,103
unique,103,11,99
top,M6C,North York,Downsview
freq,1,24,4


In [9]:
#confirm the (rows, columns) shape of the cleaned dataframe
neighborhood_filtered.shape

(103, 3)

## Part 2: Get Latitude and Longitude Data

In [43]:
#geocode every postal code with the ArcGIS provider to obtain its latitude and longitude
MAX_GEOCODE_ATTEMPTS = 10  #upper bound on lookups per postal code so a failing code cannot loop forever

latitude=[]
longitude=[]
for code in neighborhood_filtered['Postal Code']:
    query = '{}, Toronto, Ontario'.format(code)
    #repeat the lookup while it returns no coordinates, up to the attempt limit
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis(query)
        if g.latlng is not None:
            break
    else:
        #every attempt came back empty: stop with a clear error instead of hanging
        raise RuntimeError('no coordinates returned for {} after {} attempts'.format(code, MAX_GEOCODE_ATTEMPTS))
    latlng = g.latlng  #pair of [latitude, longitude]
    latitude.append(latlng[0])
    longitude.append(latlng[1])

#create dataframes for the latitude and longitude lists
#column labels are passed as lists: a set is unordered and newer pandas versions raise a ValueError for it
latitude_df = pd.DataFrame(latitude, columns = ['latitude'])
longitude_df = pd.DataFrame(longitude, columns = ['longitude'])

#merge the latitude and longitude dataframes with the neighborhood dataframe
#all frames are joined on their index; the coordinate lists were built in the same row order
#as neighborhood_filtered, so the rows line up
lat_long_df = latitude_df.merge(longitude_df, how = 'left', left_index = True, right_index = True)
neighborhood_merged = neighborhood_filtered.merge(lat_long_df, how = 'left', left_index = True, right_index = True)

In [44]:
#display the neighborhood dataframe with the latitude and longitude columns merged in
neighborhood_merged

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66263,-79.52831
6,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
7,M3B,North York,Don Mills,43.74923,-79.36186
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.70718,-79.31192
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804


## Part 3: Explore and Cluster Neighborhoods in Toronto