In [None]:
# default_exp scraper

# Scraper

> This module scrapes https://www.point2homes.com for every land listing for sale on Vancouver Island and ranks the listings by price per acre.

In [None]:
#hide
# Star import stays first so the explicit imports below win any name clash.
from nbdev.showdoc import *
import os
import re
from getpass import getpass
from time import sleep

import pandas as pd
import xmltodict as xd
from lxml import etree
from lxml.etree import HTML
from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
def read_land_column(lands):
    """Convert listing elements into a list of flat record dicts.

    Each element in ``lands`` is serialised with ``etree.tostring`` and parsed
    with ``xmltodict``. The child ``div`` entries under the root ``div`` are
    then read by position:

    * index 0 -> ``@data-address``
    * index 1 -> ``@data-price``
    * index 2 -> ``ul/li[0]/strong`` (lot size) and ``ul/li[1]/#text``
      (property type)
    * index 4 -> ``a/@href`` (detail link)

    A listing that is missing any field after the address is reported with a
    ``Failed on <address>`` line and skipped.

    Parameters
    ----------
    lands : iterable
        Elements accepted by ``etree.tostring`` (presumably the
        ``item_information`` divs selected by the caller -- confirm).

    Returns
    -------
    list of dict
        One dict per successfully parsed listing, with the keys ``address``,
        ``price``, ``lot_size``, ``property_type`` and ``link``.
    """
    land_list = []
    for land in lands:
        data = xd.parse(etree.tostring(land))['div']['div']
        address = data[0]['@data-address']
        try:
            record = {
                'address': address,
                'price': data[1]['@data-price'],
                'lot_size': data[2]['ul']['li'][0]['strong'],
                'property_type': data[2]['ul']['li'][1]['#text'],
                'link': data[4]['a']['@href'],
            }
        # Only structural lookup failures mean "this listing is incomplete".
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit
        # and hide genuine programming errors.
        except (KeyError, IndexError, TypeError):
            print(f'Failed on {address}')
        else:
            land_list.append(record)
    return land_list

In [None]:
# Point2Homes results URL for land for sale on Vancouver Island, BC.
url = "https://www.point2homes.com/CA/Land-For-Sale-Maps/BC/Vancouver-Island.html"

# Start a Firefox WebDriver session and load the results page.
browser = webdriver.Firefox()
browser.get(url)


# Accumulates the listing records gathered by the pagination loop.
pages = []

In [None]:
from time import sleep

In [None]:
# Walk the paginated results: parse the page currently shown in the browser,
# then follow the "next" pager link until that link is marked disabled.
while True:
    html = browser.page_source
    tree = HTML(html)
    lands = tree.xpath("//div[@class='item_information']")

    pages = [*pages, *read_land_column(lands)]
    print(len(pages))

    next_button = browser.find_element(By.XPATH, "//a[starts-with(@class,'pager-next')]")
    if 'disabled' in next_button.get_attribute('class'):
        break

    print('Next Page')
    next_button.click()
    sleep(10)  # fixed pause, presumably so the next page can load before it is read

Failed on 1182 Silversmith Place 
99
Next Page
Failed on 1182 Silversmith Place 
Failed on 1178 Silversmith Place 
197
Next Page
Failed on 43 KESTREL DRIVE 
Failed on LT 4 LANTZVILLE ROAD 
295
Next Page
Failed on 890 CRUIKSHANK RIDGE 
Failed on 988 KHENIPSEN ROAD 
393
Next Page
Failed on 715 GLACIER VIEW CIRC 
Failed on LT 3 LANTZVILLE ROAD 
Failed on 776 PARK AVE 
Failed on 711 KELSEY WAY 
489
Next Page
Failed on 12 5TH STREET 
Failed on 1915 CUMBERLAND ROAD 
Failed on 4090 FALK ROAD 
586
Next Page
Failed on 5130 ARGYLE STREET 
Failed on 3083 COBBLE HILL ROAD 
684
Next Page
Failed on 7010 SPROUT ROAD 
Failed on LT 14 & 15 Joan Ave 
782
Next Page
Failed on Lot 4 ALLADIN AVE 
Failed on 1219-1221 Carlisle Ave 
880
Next Page
Failed on 2096 PENINSULA ROAD 
979
Next Page
Failed on 1932 Mt. Newton Cross Rd 
1078
Next Page
Failed on 5814 LINLEY VALLEY DR 
1086


In [None]:
# Build the listings table and drop repeats of the same property.
# Addresses are compared case-insensitively with surrounding whitespace
# removed, so variants such as 'LT 9 & 10 HECATE COVE' and
# 'LT 9 & 10 Hecate Cove ' collapse into one row (the first one seen is kept).
df = (
    pd.DataFrame(pages)
    .assign(_address_key=lambda frame: frame['address'].str.strip().str.casefold())
    .drop_duplicates('_address_key')
    .drop(columns='_address_key')
)

In [None]:
import re

In [None]:
def extract_numbers(s):
    """Parse the numeric content of a display string into a float.

    Every numeric fragment in ``s`` (a decimal such as ``0.842`` or a plain
    run of digits) is concatenated in order before conversion, which is what
    removes thousands separators: ``'$799,000 CAD'`` -> ``799000.0``.
    A string holding several unrelated numbers therefore has them run
    together rather than parsed separately.

    Parameters
    ----------
    s : str
        Text such as a price (``'$799,000 CAD'``) or lot size (``'0.842 ac'``).

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If ``s`` contains no digits, or the concatenated fragments do not
        form a valid float.
    """
    # Raw string: "\d" and "\." are invalid escapes in a normal literal.
    fragments = re.findall(r"[-+]?\d*\.\d+|\d+", s)
    if not fragments:
        raise ValueError(f"no numeric content found in {s!r}")
    return float(''.join(fragments))

In [None]:
extract_numbers('$799,000 CAD')

799000.0

In [None]:
# Numeric version of the displayed price string.
df['price_float'] = df['price'].map(extract_numbers)

In [None]:
# Numeric version of the displayed lot-size string (unit suffix discarded).
df['lot_size_float'] = df['lot_size'].map(extract_numbers)

In [None]:
# Price divided by lot size.
# NOTE(review): assumes every lot_size is expressed in acres; any other unit
# would skew this ratio -- confirm against the scraped values.
df['Dollar per Acre'] = df['price_float'].div(df['lot_size_float'])

In [None]:
df.head()

Unnamed: 0,address,price,lot_size,property_type,link,price_float,lot_size_float,Dollar per Acre
0,LOT 1 Oak Leaf Drive,"$799,000 CAD",0.842 ac,Land,/CA/Vacant-Land-For-Sale/BC/Nanoose-Bay/LOT-1-...,799000.0,0.842,948931.11639
1,2970 Glen Eagles Road LT 30,"$47,900 CAD",0.048 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/C...,47900.0,0.048,997916.666667
2,2970 Glen Eagles Road LT 29,"$47,900 CAD",0.05 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/C...,47900.0,0.05,958000.0
3,7000 The Point Road LT 7,"$899,000 CAD",143 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/D...,899000.0,143.0,6286.713287
4,LOT C Oak Leaf Drive,"$659,000 CAD",8.5 ac,Land,/CA/Vacant-Land-For-Sale/BC/Nanoose-Bay/LOT-C-...,659000.0,8.5,77529.411765


In [None]:
# Order the table so the lowest price-per-acre listings come first.
df = df.sort_values(by='Dollar per Acre', ascending=True)

In [None]:
df.head()

Unnamed: 0,address,price,lot_size,property_type,link,price_float,lot_size_float,Dollar per Acre
1131,12-15 West Coast Road,"$1,200,000 CAD",422.54 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/C...,1200000.0,422.54,2839.967814
726,Lot 85 West Coast Rd,"$739,900 CAD",160 ac,Land,/CA/Vacant-Land-For-Sale/BC/Sooke/Lot-85-West-...,739900.0,160.0,4624.375
1068,LT 9 & 10 HECATE COVE,"$140,000 CAD",28.05 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/M...,140000.0,28.05,4991.087344
1129,LT 9 & 10 Hecate Cove,"$140,000 CAD",28.05 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/M...,140000.0,28.05,4991.087344
59,BLK 1242 Goldstream Heights (off),"$295,000 CAD",52 ac,Land,/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/C...,295000.0,52.0,5673.076923


In [None]:
df.head(40)['link'].values

array(['/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Capital/12-15-West-Coast-Road/68422928.html',
       '/CA/Vacant-Land-For-Sale/BC/Sooke/Lot-85-West-Coast-Rd/85853262.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Mount-Waddington/LT-9-10-HECATE-COVE/69743973.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Mount-Waddington/LT-9-10-Hecate-Cove/69747658.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Cowichan-Valley/BLK-1242-Goldstream-Heights-off/50881059.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Powell-River-District/GLINE-RD/88276963.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Denman-Island/7000-The-Point-Road/79764534.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Denman-Island/LT-7-7000-THE-POINT-ROAD/79742463.html',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Alberni-Clayoquot/FORTUNE-CHANNEL/77176101.html',
       '/CA/Vacant-Land-For-Sale/BC/Campbell-River/LT-5-CEDAR-CREEK-ROAD/864724

In [None]:
# Turn the site-relative detail paths into absolute URLs. Rows that already
# carry the host are left untouched, so re-running this cell does not prepend
# the prefix a second time.
BASE_URL = 'https://www.point2homes.com'
df['link'] = df['link'].where(
    df['link'].str.startswith(BASE_URL, na=False),
    BASE_URL + df['link'],
)

In [None]:
# Round each price-per-acre value to the nearest whole dollar with the
# built-in round().
df['Dollar per Acre'] = df['Dollar per Acre'].map(round)

In [None]:
# Write the table to CSV; index=False leaves the row index out of the file.
df.to_csv('VancouverIsland.csv', index=False)

In [None]:
# Number of listing records collected so far.
len(pages)

121

In [None]:
# NOTE(review): `land_list` only appears as a local variable inside
# read_land_column; no top-level assignment is visible in this notebook, so
# this cell relies on leftover kernel state -- confirm (perhaps `pages` was
# meant). It also rebinds `df`, discarding any columns derived earlier.
df = pd.DataFrame(land_list)

In [None]:
# Write the listings ordered by ascending price to a second CSV file.
# NOTE(review): requires a `price_float` column on `df`; a frame built
# straight from raw records does not have one -- confirm the cell order.
df.sort_values('price_float').to_csv('VancouverIsland-Cheapest.csv',index=False)

In [None]:
df.iloc[92].values

array(['LT 38 RUXTON ISLAND ', '$75,000 CAD', '2.15 ac', 'Land',
       '/CA/Vacant-Land-For-Sale/BC/Vancouver-Island/Cowichan-Valley/LT-38-RUXTON-ISLAND/88932209.html'],
      dtype=object)

In [None]:
# Fill in and submit the site's login form.
# Credentials come from the environment (falling back to an interactive
# prompt) so they are never stored in the notebook or its outputs.
username = browser.find_element(By.XPATH, "//input[starts-with(@name,'username')]")
password = browser.find_element(By.XPATH, "//input[starts-with(@name,'password')]")

username.send_keys(os.environ.get("POINT2HOMES_USERNAME") or input("Username: "))
password.send_keys(os.environ.get("POINT2HOMES_PASSWORD") or getpass("Password: "))
browser.find_element(By.XPATH, "//input[starts-with(@name,'Login')]").click()

In [None]:
# Scratch note: the pagination link's class attribute starts with "pager-next".

In [None]:
# price, ac, price/ac, datetime, id, area

In [None]:
1038 / 100

10.38

In [None]:
# Template of the fields planned for one scraped listing; values are filled
# in later, so each key starts out as None.
item = {
    'address': None,
    'price': None,
    'characteristics': None,
    'labels': None,
    'detail_link': None,
}

SyntaxError: invalid syntax (<ipython-input-111-5589d66c2c9c>, line 3)

In [None]:
import pandas as pd

In [None]:
from lxml import etree

In [None]:
etree.tostring(tree)

b'<div class="item_information">\n        <div class="item_address" data-url="/CA/Vacant-Land-For-Sale/BC/Nanoose-Bay/LOT-1-Oak-Leaf-Drive/82258309.html" data-tracking="" data-address="LOT 1 Oak Leaf Drive ">\n            <div class="address-container">\n                <span>LOT 1 Oak Leaf Drive </span>\n            </div>\n\n        </div>\n<div class="price " data-price="$799,000 CAD">\n    <span class="green">\n            <span> $799,000 CAD</span>\n    </span>\n\n</div>\n        <div class="characteristics-cnt">\n\n\n<ul>\n                    <li data-label="Lot Size" class="ic-lotsize">\n            <strong>0.842 ac</strong> <span class="gray">Lot Size</span>\n        </li>\n            <li class="property-type ic-proptype">\n            Land\n        </li>\n</ul>\n\n\n        </div>\n        <div class="labels">\n            <div class="featured-icon" title="Featured"/>\n\n        </div>\n        <div class="lnk_details">\n            <a rel="" href="/CA/Vacant-Land-For-Sale/BC

In [None]:
etree.tostringlist(tree)

[b'<div class="item_information">\n        <div class="item_address" data-url="/CA/Vacant-Land-For-Sale/BC/Nanoose-Bay/LOT-1-Oak-Leaf-Drive/82258309.html" data-tracking="" data-address="LOT 1 Oak Leaf Drive ">\n            <div class="address-container">\n                <span>LOT 1 Oak Leaf Drive </span>\n            </div>\n\n        </div>\n<div class="price " data-price="$799,000 CAD">\n    <span class="green">\n            <span> $799,000 CAD</span>\n    </span>\n\n</div>\n        <div class="characteristics-cnt">\n\n\n<ul>\n                    <li data-label="Lot Size" class="ic-lotsize">\n            <strong>0.842 ac</strong> <span class="gray">Lot Size</span>\n        </li>\n            <li class="property-type ic-proptype">\n            Land\n        </li>\n</ul>\n\n\n        </div>\n        <div class="labels">\n            <div class="featured-icon" title="Featured"/>\n\n        </div>\n        <div class="lnk_details">\n            <a rel="" href="/CA/Vacant-Land-For-Sale/B

In [None]:
pd.read_html(tree)

  return tuple(x.decode(encoding, errors) if x else '' for x in args)


TypeError: Cannot read object of type '_Element'

In [None]:
tree = lands[0]

In [None]:
# ".//" keeps the search inside this listing; a leading "//" is evaluated
# against the whole document and can return another listing's price.
# normalize-space() tolerates the trailing blank in class="price ".
price = tree.xpath(".//div[normalize-space(@class)='price']")


In [None]:
price

[<Element div at 0x7fb8ac045a88>]

In [None]:
e = price[0]

In [None]:
e.text

'$47,900 CAD'

In [None]:
# ".//" restricts the lookup to this listing's subtree; a leading "//" would
# search the entire document instead.
address = tree.xpath(".//div[@class='item_address']")


In [None]:
address[0].attrib['data-address']

'LOT 1 Oak Leaf Drive '

In [None]:
# Scratch note: the listing address is stored in the "data-address" attribute.

In [None]:
descriptions = tree.xpath("//div[@class='info member-info']")
links = tree.xpath("//a[@class='learn-more']")

In [None]:
lands

[<Element div at 0x7fb8ac0d0408>,
 <Element div at 0x7fb8ac0d0d08>,
 <Element div at 0x7fb8ac0f57c8>,
 <Element div at 0x7fb8ac045448>,
 <Element div at 0x7fb8ac045488>,
 <Element div at 0x7fb8ac045508>,
 <Element div at 0x7fb896df03c8>,
 <Element div at 0x7fb896df0408>,
 <Element div at 0x7fb896df0448>,
 <Element div at 0x7fb8ac0454c8>,
 <Element div at 0x7fb896df0488>,
 <Element div at 0x7fb896df04c8>,
 <Element div at 0x7fb896df0508>,
 <Element div at 0x7fb896df0548>,
 <Element div at 0x7fb896df0588>,
 <Element div at 0x7fb896df05c8>,
 <Element div at 0x7fb896df0608>,
 <Element div at 0x7fb896df0648>,
 <Element div at 0x7fb896df0688>,
 <Element div at 0x7fb896df06c8>,
 <Element div at 0x7fb896df0708>,
 <Element div at 0x7fb896df0748>,
 <Element div at 0x7fb896df0788>,
 <Element div at 0x7fb896df07c8>,
 <Element div at 0x7fb896df0808>,
 <Element div at 0x7fb896df0848>,
 <Element div at 0x7fb896df0888>,
 <Element div at 0x7fb896df08c8>,
 <Element div at 0x7fb896df0908>,
 <Element div 