# Segmenting and Clustering Neighborhoods in New York City

### Install Beautifulsoup Library

In [1]:
!conda install -c anaconda beautifulsoup4 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         156 KB  anaconda
    beautifulsoup4-4.8.2       |           py36_0         161 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    ca-certificates-2019.11.27 |                0         132 KB  anaconda
    ------------------------------------------------------------
                                           Total:         5.5 MB

The following packages will be UPDATED:

    beautifulsoup4:  4.7.1-py36_1      --> 4.8.2-py36_0      anaconda
    ca-certificates: 2019.11.27-0      --> 2019.11.27-0      anaconda
    certifi:         2019.11.28-py36_0 --> 2019.11.28-py36_0 anaconda
    openssl:         1.1.1d-

### Import BeautifulSoup and Requests

In [1]:
from bs4 import BeautifulSoup
import requests
import lxml.html as lh

### Parse the html page

In [2]:
html_path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(html_path)
doc = lh.fromstring(page.content)
html_page = page.text

tr_elements = doc.xpath('//tr')

### Parse Table Header

In [3]:
col = []
i = 0
for t in tr_elements[0]:
    i += 1
    name = t.text_content().rstrip('\r\n') #strip the trailing new line
    print(name)
    print('%d: "%s"'%(i, name))
    col.append((name,[]))

Postcode
1: "Postcode"
Borough
2: "Borough"
Neighborhood
3: "Neighborhood"


### Create Pandas DataFrame

In [4]:
for j in range(1,len(tr_elements)):
    
    T = tr_elements[j] #jth row
    
    if len(T) != 3: #check if it is the lenght of our header
        break
    
    i = 0 # index of the column
    
    for t in T.iterchildren(): #iterate to each element of the row
        data = t.text_content().rstrip('\r\n') #strip the trailing newline
        
        if i > 0: #check if row is empty
            try:
                data = int(data)
            except:
                pass
            
        col[i][1].append(data) #append data to the empty list of the i'th column
        i += 1
        

In [5]:
import pandas as pd
import numpy as np

dic = {title:column for (title, column) in col}
df =  pd.DataFrame(dic)
print(df.head())
print("\nDataFrame shape is: {}".format(df.shape))

  Postcode           Borough      Neighborhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront

DataFrame shape is: (287, 3)


#### Copy value of Borough to Neighborhood if Borough is != "Not assigned"

In [6]:
df.Neighborhood = np.where(df.Neighborhood == 'Not assigned', df.Borough, df.Neighborhood) 
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
df.drop(df[df['Neighborhood'] == 'Not assigned'].index)
df.reset_index(drop = True, inplace = True) #reset index
df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Queen's Park


In [8]:
print("New DataFrame shape is: {}".format(df.shape))

New DataFrame shape is: (287, 3)


In [20]:
df.sort_values(by="Postcode")
df_new = df[df.Borough != 'Not assigned']
#df.reset_index(drop = True, inplace = True)
df_new.reset_index(drop = True,inplace = True)
df_new.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [13]:
!wget -O geolocation.csv "https://cocl.us/Geospatial_data"

--2020-01-13 15:36:37--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.194, 158.85.108.83, 158.85.108.86
Connecting to cocl.us (cocl.us)|169.48.113.194|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-01-13 15:36:40--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.26.197, 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-01-13 15:36:40--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2s

In [14]:
df_geoloc = pd.read_csv('geolocation.csv')
df_geoloc = df_geoloc.rename(columns={"Postal Code":"Postcode"})
df_geoloc = df_geoloc.sort_values(by=['Postcode'])
df_geoloc.head(10)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [21]:
df_merge = pd.merge(df_new, df_geoloc, on='Postcode')
df_merge.groupby(df_merge.Neighborhood)
df_merge.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188
