In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, InvalidSessionIdException, SessionNotCreatedException, NoSuchFrameException
from selenium.webdriver.remote import webelement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import numpy as np
import csv
import pandas as pd
import os
import time
import pyperclip as pc
import math
from xml.dom.minidom import Attr
from urllib3.exceptions import MaxRetryError

from webdriver_manager.chrome import ChromeDriverManager
# from webdriver_manager.firefox import FirefoxDriverManager
from webdriver_manager.firefox import GeckoDriverManager

In [2]:
# Notebook-wide settings: the enquiry page to drive and the CSV handed to the scraper.
url = "https://services2.hdb.gov.sg/webapp/BB29ETHN/BB29STREET"
data_source = "./bto_limits.csv"

# Do not truncate long cell text when a DataFrame is displayed.
pd.options.display.max_colwidth = None

# Initialize Scraper

In [56]:
class Scraper:
    """Fill a CSV-backed table by submitting the enquiry form at ``url`` with Selenium.

    The first CSV column is read as ``POSTAL``; every other column is expected
    to be named ``<ethnic_group>_<enquire_as>`` (for example ``chinese_buyer``)
    and receives the text returned by :meth:`query` for that combination.

    Attributes:
        webdriver: headless Chrome session, already navigated to ``url``.
        properties: DataFrame loaded from ``data_source``; results are written
            into it in place.
        enquire_as_map: element id of the radio button for each enquiry role.
        ethnic_map / ethnic_map2: XPaths of the ethnic-group controls
            (only ``ethnic_map2`` is clicked by :meth:`query`).
    """

    # XPaths of the elements inspected after the form has been submitted.
    _NOT_FOUND_XPATH = '//*[@id="rcdNotFndForm"]/div[2]/div/div/h4'
    _BUYER_RESULT_XPATH = '//*[@id="print"]/div[3]/div/table/tbody/tr/td[2]'
    _SELLER_RESULT_XPATH = '//*[@id="print"]/div[3]/div/div/div/div/p'

    def __init__(self, url, data_source):
        """Load ``data_source`` and open ``url`` in a headless Chrome window.

        Args:
            url: address of the enquiry form.
            data_source: path of the CSV holding ``POSTAL`` plus result columns.

        Raises:
            Whatever ``pd.read_csv`` or Selenium raise; if the browser was
            already started when setup fails, it is quit before re-raising.
        """
        # Read the CSV before starting Chrome so a bad path cannot leave a
        # browser process behind.
        properties = pd.read_csv(data_source,
                                 engine='python',
                                 encoding="ISO-8859-1",
                                 header=0)
        # Result columns receive strings; a column that is still entirely empty
        # loads as float64, so store them as object to keep `.at` writes valid.
        self.properties = properties.astype(
            {column: object for column in properties.columns[1:]})

        self.enquire_as_map = {'seller': "enqBySeller", 'buyer': "enqByBuyer"}
        self.ethnic_map = {'chinese': '//*[@id="ethGroupChinese"]',
                           'malay': '//*[@id="ethGroupMalay"]',
                           'indian': '//*[@id="ethGroupInd"]'}
        self.ethnic_map2 = {'chinese': '//*[@id="ethGroupLrgChinese"]',
                            'malay': '//*[@id="ethGroupLrgMalay"]',
                            'indian': '//*[@id="ethGroupLrgInd"]'}

        options = ChromeOptions()
        options.add_argument("--headless")
        # NOTE(review): newer Selenium releases expect a Service object instead
        # of a positional driver path - confirm against the installed version.
        self.webdriver: webdriver.Chrome = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        try:
            self.webdriver.set_window_size(2000, 1000)
            self.webdriver.get(url)
            self.webdriver.implicitly_wait(2)
        except Exception:
            # Do not leak the browser when the initial navigation fails.
            self.webdriver.quit()
            raise

    @staticmethod
    def _shortcut_modifier():
        """Return the modifier key used for select-all / paste on this OS."""
        import sys
        return Keys.COMMAND if sys.platform == "darwin" else Keys.CONTROL

    @staticmethod
    def _format_postal(postal_code) -> str:
        """Return ``postal_code`` as the text to paste into the form.

        Whole-number floats (which pandas produces when the POSTAL column
        contains a missing value) are rendered without the trailing ``.0``.
        """
        # NOTE(review): leading zeros are lost if the CSV column was parsed as a
        # number - confirm whether postal codes need zero-padding.
        if isinstance(postal_code, float) and postal_code.is_integer():
            return str(int(postal_code))
        return str(postal_code)

    def _hover_and_click(self, by, locator) -> None:
        """Move the pointer onto the element found by ``(by, locator)`` and click it."""
        ActionChains(self.webdriver).move_to_element(
            self.webdriver.find_element(by, locator)).perform()
        # Re-locate after hovering, as the original flow did.
        self.webdriver.find_element(by, locator).click()

    def query(self, enquire_as: str, postal_code: int, ethnic_group: str) -> str:
        """Submit one enquiry and return the text shown on the result page.

        Args:
            enquire_as: key of ``enquire_as_map`` (``'seller'`` or ``'buyer'``).
            postal_code: postal code to enter in the form.
            ethnic_group: key of ``ethnic_map2`` (``'chinese'``, ``'malay'``
                or ``'indian'``).

        Returns:
            ``'no records found'`` when the not-found heading appears,
            otherwise the text of the result element.

        Raises:
            KeyError: for an unknown ``enquire_as`` or ``ethnic_group``.
            TimeoutException: when an awaited element does not show up in time.
            NoSuchElementException: when a form or result element is missing.
        """
        driver = self.webdriver
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, 'btnProceed')))
        print(enquire_as, postal_code, ethnic_group)

        self._hover_and_click(By.ID, self.enquire_as_map[enquire_as])

        # Clear the postal-code field, then paste the new value from the clipboard.
        modifier = self._shortcut_modifier()
        postal_field = driver.find_element(By.ID, 'postalCde')
        postal_field.send_keys(modifier + "a")
        postal_field.send_keys(Keys.DELETE)
        pc.copy(self._format_postal(postal_code))
        time.sleep(0.5)
        ActionChains(driver) \
            .key_down(modifier) \
            .key_down('v') \
            .key_up('v') \
            .key_up(modifier) \
            .perform()

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, 'ethGroupLrgChinese')))
        driver.find_element(By.XPATH, self.ethnic_map2[ethnic_group]).click()
        self._hover_and_click(By.XPATH, '//*[@id="citizenSing"]')
        self._hover_and_click(By.ID, 'btnProceed')
        time.sleep(1)
        print(driver.current_url)

        # A not-found heading within 2 s means there is nothing to read.
        try:
            WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.XPATH, self._NOT_FOUND_XPATH)))
            print("block doesn't exist")
            driver.back()
            WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "btnProceed")))
            return 'no records found'
        except TimeoutException:
            print("has record")

        if "buyer" in enquire_as:
            result_xpath = self._BUYER_RESULT_XPATH
        else:
            result_xpath = self._SELLER_RESULT_XPATH
        try:
            # A timeout here raises TimeoutException, which is not handled in
            # this method and therefore propagates to the caller.
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, result_xpath)))
            ActionChains(driver).move_to_element(
                driver.find_element(By.XPATH, result_xpath)).perform()
        except NoSuchFrameException:
            print("no such frame lor")
        text = driver.find_element(By.XPATH, result_xpath).text
        print(text)

        # Return to the form so the next enquiry starts from a known page.
        driver.back()
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "btnProceed")))
        return text

    def query_HDB(self, index: int, postal_code: int, isNone = False) -> None:
        """Fill every result column of row ``index``.

        Args:
            index: row label in ``self.properties``.
            postal_code: postal code of that row.
            isNone: when true, no enquiry is made and each result column is
                set to the placeholder ``"Hi"``.
        """
        result_columns = self.properties.columns[1:]
        if isNone:
            for column in result_columns:
                self.properties.at[index, column] = "Hi"
            return
        for column in result_columns:
            print(index)
            # Column names encode the enquiry as "<ethnic_group>_<enquire_as>".
            ethnic_group, enquire_as = column.split("_")
            self.properties.at[index, column] = self.query(
                enquire_as=enquire_as, postal_code=postal_code, ethnic_group=ethnic_group)

    def query_all(self) -> None:
        """Process every row of ``self.properties`` that still has an empty cell."""
        for index, row in self.properties.iterrows():
            if not row.isnull().any():
                continue
            print(index)
            # NaN is truthy, so `not row.POSTAL` alone would treat a missing
            # postal code as present and send it to the website.
            postal_missing = bool(pd.isna(row.POSTAL)) or not row.POSTAL
            self.query_HDB(index, row.POSTAL, postal_missing)

In [57]:
# scraper = Scraper(url, data_source)
# scraper.query_all()
# scraper.properties.to_csv("bto_limits.csv", index=False, encoding="ISO-8859-1")

700
700
buyer 752126 chinese
https://services2.hdb.gov.sg/webapp/BB29ETHN/BB29STREET
has record


TimeoutException: Message: 


In [21]:
# try:
#     scraper.webdriver.quit()
# except NameError:
#     print("crawler not initialised")
# except MaxRetryError:
#     print("crawler quitted / MaxRetryError")
# except InvalidSessionIdException:
#     print("invalid ID")
# except SessionNotCreatedException:
#     print("need to close session")
# except AttributeError:
#     print("manually close")

# scraper = Scraper(url, data_source)
# try:
#     scraper.query_all()
# finally:
#     scraper.properties.to_csv("bto_limits.csv", index=False, encoding="ISO-8859-1")

In [36]:
# Retry loop: each attempt starts a fresh browser, resumes from the rows that
# are still incomplete in the CSV, and saves progress whatever happens.
MAX_ATTEMPTS = 100
for i in range(MAX_ATTEMPTS):
    print(i)
    scraper = Scraper(url, data_source)
    try:
        scraper.query_all()
    except NoSuchElementException:
        print("no such element :(")
    except NoSuchFrameException:
        print("no such frame :(")
    else:
        # A pass finished without error, so every pending row was attempted;
        # starting another browser would only repeat the same work.
        break
    finally:
        # TimeoutException is not handled above, so it ends the loop, but only
        # after progress is saved and the browser is closed here.
        try:
            # Write back to the file the scraper was loaded from.
            scraper.properties.to_csv(data_source, index=False, encoding="ISO-8859-1")
        finally:
            # Quit even if saving fails, so no browser process is left behind.
            scraper.webdriver.quit()

0
697
697
buyer 751126 chinese
no such element :(
1


[WDM] - Downloading: 16.2kB [00:00, 6.05MB/s]                   


KeyboardInterrupt: 