In [1]:
import os
if not os.path.exists("/Users/home/projects/selenium/docs/2023-05-18__ACT-LEGISLATION_HTML"):
    os.makedirs(
        "/Users/home/projects/selenium/docs/2023-05-18__ACT-LEGISLATION_HTML")

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from typing import List, Union, Any, Optional

In [3]:
import logging

logging.basicConfig(
    filename='../logs/scrape_austlii.log',
    filemode='w',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO
)

In [4]:
import os
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_experimental_option('prefs', {
    # Change default directory for downloads
    "download.default_directory": "/Users/home/projects/selenium/docs/TO_BE_CLEANED/",
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    # It will not show PDF directly in chrome
    "plugins.always_open_pdf_externally": True
})
chrome_options.add_argument("--headless")

if not os.path.exists("../docs/TO_BE_CLEANED/"):
    os.makedirs("../docs/TO_BE_CLEANED/")

In [6]:
from typing import Any
from selenium.common.exceptions import WebDriverException


class AustliiScraper:
    def __init__(self, url: str, driver: webdriver.Chrome = None, path: str = None):
        self.urls = [url]
        self.path = "../docs/2023-05-18__ACT-LEGISLATION_HTML/"
        self.isIndexed = False

        try:
            if driver is None:
                self.driver = webdriver.Chrome(options=chrome_options)
            else:
                self.driver = driver
        except WebDriverException as e:
            print("Error initializing webdriver: ", e)
            return

        while True:
            if len(self.urls) == 0:
                break
            else:
                print("Getting url: " + self.urls[0])
                self.get_url_content()

        self.shutdown()

    def get_url_content(self) -> str:
        try:
            self.driver.get(self.urls[0])
            source = self.driver.page_source
        except WebDriverException as e:
            print("Error getting URL: ", e)
            return

        try:
            # get html of document
            soup = BeautifulSoup(source, "html.parser")
            article = soup.find("article", {"class": "the-document"})
            title = article.find("h1").text
            # Replace any characters in title that are not suitable for filenames
            title = "".join(c for c in title if c.isalnum()
                            or c in [' ', '.']).rstrip()
        except (AttributeError, TypeError) as e:
            print("Error parsing HTML: ", e)
            self.urls.pop(0)
            return

        if self.isIndexed:
            self.save_url_content(article, title)
        else:
            self.path = self.path + title + "/"
            self.make_dir(self.path)
            self.save_url_content(article, title)
            self.isIndexed = True

            print("Getting urls from index page")
            try:
                documentEl = self.driver.find_elements(
                    By.CLASS_NAME, "the-document")[0]
                els = documentEl.find_elements(By.TAG_NAME, "a")
            except IndexError as e:
                print("Error finding elements in the page: ", e)
                self.urls.pop(0)
                return

            for el in els:
                url = el.get_attribute("href")
                if url not in self.urls and url is not None:
                    self.urls.append(url)
        self.urls.pop(0)

    def make_dir(self, path: str) -> None:
        try:
            if not os.path.exists(path):
                os.makedirs(path)
        except OSError as e:
            print("Error creating directory: ", e)

    def save_url_content(self, article, title):
        if self.path is not None:
            try:
                if not os.path.exists(self.path):
                    os.makedirs(self.path)
                if not self.path.endswith('/'):
                    self.path += '/'
            except OSError as e:
                print("Error handling directory path: ", e)
                return

            try:
                with open(self.path + title + ".html", "w") as file:
                    file.write(str(article))
            except (OSError, IOError) as e:
                print("Error writing to file: ", e)

    def shutdown(self):
        try:
            self.driver.quit()
        except WebDriverException as e:
            print("Error shutting down webdriver: ", e)

In [58]:
url = "https://www.austlii.edu.au/cgi-bin/viewdb/au/legis/act/consol_act/ca190082/"
AustliiScraper(url)

Getting url: https://www.austlii.edu.au/cgi-bin/viewdb/au/legis/act/consol_act/ca190082/
Getting urls from index page


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=113.0.5672.126)
Stacktrace:
0   chromedriver                        0x00000001029338ac chromedriver + 4257964
1   chromedriver                        0x000000010292bf40 chromedriver + 4226880
2   chromedriver                        0x00000001025689d4 chromedriver + 281044
3   chromedriver                        0x0000000102542db0 chromedriver + 126384
4   chromedriver                        0x00000001025c7938 chromedriver + 670008
5   chromedriver                        0x00000001025d9fe8 chromedriver + 745448
6   chromedriver                        0x000000010259798c chromedriver + 473484
7   chromedriver                        0x000000010259898c chromedriver + 477580
8   chromedriver                        0x00000001028f2900 chromedriver + 3991808
9   chromedriver                        0x00000001028f6354 chromedriver + 4006740
10  chromedriver                        0x00000001028f6940 chromedriver + 4008256
11  chromedriver                        0x00000001028fc33c chromedriver + 4031292
12  chromedriver                        0x00000001028f6f34 chromedriver + 4009780
13  chromedriver                        0x00000001028cf490 chromedriver + 3847312
14  chromedriver                        0x00000001029149f4 chromedriver + 4131316
15  chromedriver                        0x0000000102914b4c chromedriver + 4131660
16  chromedriver                        0x0000000102925230 chromedriver + 4198960
17  libsystem_pthread.dylib             0x000000018ae73fa8 _pthread_start + 148
18  libsystem_pthread.dylib             0x000000018ae6eda0 thread_start + 8


In [11]:
url = "https://www.austlii.edu.au/cgi-bin/viewtoc/au/legis/act/consol_act/toc-A.html"
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)

In [12]:
letters = driver.find_element(By.XPATH, '//*[@id="panel-letter"]')

In [13]:
anchors = letters.find_elements(By.TAG_NAME, "a")

In [14]:
ATOZURLS = []
for anchor in anchors:
    ATOZURLS.append(anchor.get_attribute("href"))

In [15]:
ATOZURLS.pop(0)

'https://www.austlii.edu.au/cgi-bin/viewtoc/au/legis/act/consol_act/toc-A.html#'

In [19]:
STOWURLS = ATOZURLS[16:]

In [20]:
from typing import List, Optional, Tuple


def get_links(XPATH: str, url: Optional[str] = None, driver: Optional[webdriver.Chrome] = None, quit: bool = True) -> List[Tuple[str, str]]:
    if not driver:
        driver = webdriver.Chrome(options=chrome_options)
    if url:
        driver.get(url)
    els = WebDriverWait(driver, timeout=3).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, XPATH))
    )
    links = [el.get_attribute('href') for el in els]
    if quit:
        driver.quit()
    return links

In [21]:
URLS = []
for url in STOWURLS:
    URLS.append(get_links('//*[@id="page-main"]/div/div/ul/li[*]/a', url))
URLS

[['https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sogca1987308/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/soga1954128/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/somva1977224/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sapta1934396/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sala1912183/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sda1906155/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sia2003255/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/spa2018231/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/swa1992129/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sarlaa2023444/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sagicpa2020518/',
  'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/co

In [22]:
len(URLS)

5

In [23]:
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver


def process_url(url):
    driver = webdriver.Chrome(options=chrome_options)
    scraper = AustliiScraper(url, driver)
    return url


# urls is your list of lists of URLs
all_urls = [url for sublist in URLS for url in sublist]  # flatten the list

all_urls

['https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sogca1987308/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/soga1954128/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/somva1977224/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sapta1934396/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sala1912183/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sda1906155/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sia2003255/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/spa2018231/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/swa1992129/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sarlaa2023444/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sagicpa2020518/',
 'https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/spp

In [24]:
len(all_urls)

62

In [25]:
# Now you can use ThreadPoolExecutor to process these URLs concurrently.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(process_url, all_urls))

Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sogca1987308/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/soga1954128/Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sda1906155/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/somva1977224/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sia2003255/

Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/swa1992129/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sala1912183/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/spa2018231/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sapta1934396/
Getting url: https://www.austlii.edu.au/cgi-bin/viewdoc/au/legis/act/consol_act/sarlaa2023444/
Getting urls from index page
Getting url: https://www.austlii.edu.au/