# Applied Data Science Week 3 Project

## 1. Scrape the Wikipedia page listing Canadian postal codes that start with M into a data frame
> *https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M*

In [2]:
!pip install beautifulsoup4
# !pip install wget

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 7.4MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/02/fb/1c65691a9aeb7bd6ac2aa505b84cb8b49ac29c976411c6ab3659425e045f/soupsieve-2.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.1


In [3]:
from bs4 import BeautifulSoup
import pandas as pd
# import wget
import requests

In [4]:
# Fetch the Wikipedia page that holds the postal-code table.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
page.raise_for_status()

In [5]:
# Parse the downloaded page and locate the first table with class "wikitable".
parsed_html = BeautifulSoup(page.content, 'html.parser')
table = parsed_html.body.find('table', attrs={'class': 'wikitable'})
if table is None:
    raise ValueError("No table with class 'wikitable' was found in the page")

# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Collect one record per table row, then build the dataframe in a single step.
# (Appending to a DataFrame row by row is quadratic, and DataFrame.append
# no longer exists in pandas 2.x.)
records = []
# find_all('tr') yields only row tags and leaves the parse tree untouched;
# iterating .children while extracting nodes skips siblings as the list shrinks.
for trow in table.find_all('tr'):
    cells = trow.find_all('td')
    # Rows without a <td> for every column (such as a header row made of <th>
    # cells) are skipped.
    if len(cells) < len(column_names):
        continue
    # get_text() also works when a cell contains nested tags, where .string
    # would be None; zip() keeps only the first len(column_names) cells.
    records.append({
        name: cell.get_text().strip()
        for name, cell in zip(column_names, cells)
    })

# instantiate the dataframe
neighborhoods = pd.DataFrame(records, columns=column_names)

neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Now clean up the dataframe by dropping rows whose Borough is "Not assigned"

In [6]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = neighborhoods['Borough'] != 'Not assigned'
neighborhoods = neighborhoods[has_borough]

# Summarise what is left: distinct borough values and remaining row count.
borough_count = neighborhoods['Borough'].nunique(dropna=False)
row_count = len(neighborhoods)
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(
    borough_count, row_count
)
print(summary)

neighborhoods.head()

The dataframe has 10 boroughs and 103 neighborhoods.


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Verify there are no "Not assigned" cells in Neighborhood

In [7]:
# `df` is a second name for the same frame object (no copy is made).
df = neighborhoods
# Show up to five rows whose Neighborhood is "Not assigned".
unassigned_mask = df['Neighborhood'] == "Not assigned"
df[unassigned_mask].head()

Unnamed: 0,PostalCode,Borough,Neighborhood


In [128]:
# df = neighborhoods.groupby(["PostalCode"]).count()
# df = df.reset_index()
# df.head()

# # df.describe()
# # df[df.PostalCode == "M1A"]

In [8]:

# Report the (rows, columns) tuple of the current frame.
frame_shape = df.shape
print("The shape of the dataframe is {}".format(frame_shape))

The shape of the dataframe is (103, 3)


## 2. Get the latitude and longitude for each postal code from the geospatial data CSV

In [15]:
# Download the geospatial CSV and store it next to the notebook.
response = requests.get("https://cocl.us/Geospatial_data", timeout=30)
# Stop on an HTTP error rather than saving an error page as geodata.csv.
response.raise_for_status()

# Write the raw bytes so the payload is stored exactly as received (no
# re-encoding or newline translation); the context manager closes the file
# handle even if the write fails.
with open('geodata.csv', 'wb') as geo_file:
    bytes_written = geo_file.write(response.content)

# Last expression of the cell: the number of bytes written.
bytes_written

2891

In [19]:
# Load the downloaded CSV and rename its "Postal Code" column to "PostalCode",
# the column name the scraped `neighborhoods` frame uses.
geodata = (
    pd.read_csv("geodata.csv")
    .rename(columns={"Postal Code": "PostalCode"})
)
geodata.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge both data frames

In [23]:
# Join the two frames on PostalCode. The default join is an inner join, so
# only postal codes present in both frames are kept.
df = neighborhoods.merge(geodata, on="PostalCode")
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Now cluster the neighborhoods in Toronto and show on map