# Week 3 Neighborhoods in Toronto

## Part 1 - Reading the data

## Import libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option ('display.max_columns', None)
pd.set_option ('display.max_rows', None)
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize # to flatten json file and use pandas to read it

! pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude

from sklearn.cluster import KMeans

! pip install BeautifulSoup4 requests

print("import done")

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/07/e1/9c72de674d5c2b8fcb0738a5ceeb5424941fefa080bfe4e240d0bacb5a38/geopy-2.0.0-py3-none-any.whl (111kB)
[K     |████████████████████████████████| 112kB 9.3MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.0.0
Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 7.4MB/s eta 0:00:01
Collecting soupsieve>1.2; python_version >= "3.0" (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c96

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Header name/value pairs kept in a single mapping.
# NOTE(review): these look like CORS *response* headers rather than request headers,
# and the visible request cell calls requests.get(url) without them - confirm whether
# this mapping is needed at all.
headers = dict([
    ('Access-Control-Allow-Origin', '*'),
    ('Access-Control-Allow-Methods', 'GET'),
    ('Access-Control-Allow-Headers', 'Content-Type'),
    ('Access-Control-Max-Age', '3600'),
])

## Request the data from Wikipedia

In [4]:
# Download the Wikipedia page listing the postal codes that start with "M" and parse its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # bound the wait so the cell cannot hang indefinitely
response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
source = response.text
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
# installed (and warns), so the parse tree could differ between environments.
soup = BeautifulSoup(source, 'html.parser')

In [5]:
# Find the div with class "mw-parser-output", then step down to the <tbody>
# of the first <table> inside it.
table_data = soup.find('div', attrs={'class': 'mw-parser-output'})
first_table = table_data.find('table')
table = first_table.find('tbody')

In [6]:
# Turn the HTML table rows into a DataFrame with one column per entry in `columns`.
columns = ['Postal Code', 'Borough', 'Neighbourhood']
data = {column: [] for column in columns}  # column name -> list of cell texts

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Skip rows that do not have a <td> for every column (e.g. a header row,
    # presumably built from <th> cells); a partial row would leave the column
    # lists with different lengths and misalign or break the DataFrame.
    if len(cells) < len(columns):
        continue
    for column, cell in zip(columns, cells):
        data[column].append(cell.text.replace('\n', ''))  # drop embedded newlines

df = pd.DataFrame(data, columns=columns)
print(df.shape)
df.tail(10)

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
170,M9Y,Not assigned,Not assigned
171,M1Z,Not assigned,Not assigned
172,M2Z,Not assigned,Not assigned
173,M3Z,Not assigned,Not assigned
174,M4Z,Not assigned,Not assigned
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


In [7]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)
df.shape

(103, 3)

In [8]:
# Shape of the subset whose Neighbourhood is still 'Not assigned'
# (the first number is the count of such rows).
unassigned_neighbourhood = df['Neighbourhood'].eq('Not assigned')
df.loc[unassigned_neighbourhood].shape

(0, 3)

There are no rows whose Neighbourhood is "Not assigned".

In [9]:
# Report the (rows, columns) of the cleaned frame.
final_shape = df.shape
print('So the final data frame has the shape of:', final_shape)

So the final data frame has the shape of: (103, 3)


## Part 2 - Adding longitudes and latitudes

In [10]:
# Read the CSV found at data_url into a DataFrame.
data_url = 'http://cocl.us/Geospatial_data'
geodata = pd.read_csv(filepath_or_buffer=data_url)

In [11]:
# Display the (rows, columns) of the geodata frame.
geodata.shape

(103, 3)

In [12]:
# Join the geodata columns onto df by 'Postal Code'; the inner join keeps only
# the postal codes present in both frames.
# NOTE: df is overwritten, so re-running this cell merges the geodata columns a second time.
df = df.merge(right=geodata, on='Postal Code', how='inner')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3 - Clustering