# Complete notebook for segmenting and clustering neighborhoods in Toronto.

## 1. Initial libraries to be imported:

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import requests # library to handle requests

! conda install -c anaconda beautifulsoup4 --yes  #install the python package to parse the html page (wikipedia)
from bs4 import BeautifulSoup

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be UPDATED:

    openssl: 1.1.1d-h516909a_0 conda-forge --> 1.1.1-h7b6447c_0 anaconda

The following packages will be DOWNGRADED:

    certifi: 2019.11.28-py36_0 conda-forge --> 2019.9.11-py36_0 anaconda

Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Libraries imported.


## 2. Scrapping the wikipedia webpage:

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)

soup = BeautifulSoup(r.content, 'html5lib')

table = soup.find('div', attrs = {'id':'container'})

print('Page scrapped.')

Page scrapped.


### 2.1. Extracting table contents from wikipedia page:

In [4]:
postalCodes = []
boroughs = []
neighborhoods = []
colNum = 1
passVal = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string)>2:
            passVal = False
            if colNum == 1:
                if passVal == False and cell.string[1].isdigit():
                    postalCodes.append(cell.string)
                    colNum = 2
                else:
                    continue
            elif colNum == 2:
                if cell.string == 'Not assigned':
                    passVal = True
                    del postalCodes[-1]
                    colNum = 1
                    continue
                else:
                    boroughs.append(cell.string)
                    colNum = 3
            elif colNum == 3:
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string)
                colNum = 1
                
print('Data Collected.')

Data Collected.


### 2.2. Define column names and create empty dataframe:

In [5]:
col_names = ['PostalCode','Borough','Neighborhood']

tordf = pd.DataFrame(columns = col_names)

tordf

Unnamed: 0,PostalCode,Borough,Neighborhood


### 2.3. Add extracted data to the dataframe columns:

In [6]:
for data in range(len(neighborhoods)):
    code = postalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]
    tordf = tordf.append({'PostalCode':code, 'Borough':borough, 'Neighborhood':neighborhood_name},
                                ignore_index = True)

In [7]:
tordf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North\n


#### Note that there is no 'Regent Park' neighborhood for M5A postal code in the wikipedia page table.

### 2.4. Now, group neighborhoods (comma separated) that belong to the same postal code:

In [8]:
tordf = tordf.groupby(['PostalCode','Borough'], as_index=False, sort=False).agg(lambda x:', '.join(x))

In [9]:
tordf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


#### Note the number of rows is 103 and columns is 3 of consolidated dataframe:

In [10]:
tordf.shape

(103, 3)

## 3. Now, moving on to the second part of the assignment...Adding latitude and logitude columns to the dataframe.

### 3.1. First, fetch the lat,long values from the Geospatial_data csv file:

In [11]:
dfll = pd.read_csv('https://cocl.us/Geospatial_data')
dfll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 3.2. Rename the Postal Code column (remove space) to later merge the dataframes:

In [12]:
dfll.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

In [13]:
dfll.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 3.3. Merge the neighborhood dataframe (tordf) with latlong dataframe (dfll) on the 'PostalCode' column, to get lat long values only for required postal codes:

In [14]:
tordfll = tordf.merge(dfll, on=['PostalCode'])

In [17]:
tordfll.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
