In [1]:
import os
if not os.path.exists("../docs/"):
    os.makedirs("../docs/")

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from typing import List, Union, Any, Optional
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

In [3]:
import logging

logging.basicConfig(
    filename='../logs/scrape_austlii.log',
    filemode='w',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO
)

In [4]:
import os
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_experimental_option('prefs', {
    # Change default directory for downloads
    "download.default_directory": "/Users/home/projects/selenium/docs/TO_BE_CLEANED/",
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    # It will not show PDF directly in chrome
    "plugins.always_open_pdf_externally": True
})

if not os.path.exists("../docs/TO_BE_CLEANED/"):
    os.makedirs("../docs/TO_BE_CLEANED/")

## Scrape 1 austlii url


In [54]:
def get_url_content(path: Optional[str] = None, url: Optional[str] = None, driver: Optional[webdriver.Chrome] = None) -> str:
    if driver is None and url is None:
        raise ValueError("Either url or driver must be provided")
    if driver is None:
        driver = webdriver.Chrome(options=chrome_options)
    if url is not None:
        driver.get(url)
    if path is not None:
        if not os.path.exists(path):
            os.makedirs(path)
        if not path.endswith('/'):
            path += '/'

    article = driver.find_elements(By.TAG_NAME, 'article')
    article = article[0].text

    # Extract the title from the first line of the article text
    title = article.split('\n')[0]

    # Replace any characters in title that are not suitable for filenames
    title = "".join(c for c in title if c.isalnum()
                    or c in [' ', '.']).rstrip()

    if path is not None:
        with open(path + title + ".txt", "w") as f:
            f.write(article)
    return article

### Unit test


In [55]:
url = "http://www.austlii.edu.au/cgi-bin/viewdoc/au/cases/act/ACTCA//2023/15.html"
article = get_url_content(url=url)
print(article)

## Recursively scrape all of austlii


### Funcs


In [9]:
def write_to_file(article: str, path: str) -> None:
    if not os.path.exists(path):
        os.makedirs(path)
    if not path.endswith('/'):
        path += '/'

    lines = article.split('\n')

    # Find the index of the first line containing alphanumeric characters
    first_alnum_line_index = next((i for i, line in enumerate(
        lines) if any(c.isalnum() for c in line)), None)

    if first_alnum_line_index is not None:
        # Extract the title from the first line of the article text
        title = lines[first_alnum_line_index]
        # Replace any characters in title that are not suitable for filenames
        title = "".join(c for c in title if c.isalnum()
                        or c in [' ', '.']).rstrip()

        content = '\n'.join(lines[first_alnum_line_index:])

        with open(f'{path}{title}.txt', 'w') as f:
            f.write(content)

In [10]:
def scrape_austlii(url: str, path: str) -> List[str]:
    if not os.path.exists(path):
        os.makedirs(path)
    if not path.endswith('/'):
        path += '/'

    retv = []

    try:
        driver = webdriver.Chrome(options=chrome_options)
        logging.info('Launching Chrome browser.')
    except Exception as e:
        logging.error(f'Failed to launch Chrome browser: {e}')

    try:
        driver.get(url)
        logging.info('Navigating to the URL.')
    except Exception as e:
        logging.error(f'Failed to navigate to the URL: {e}')

    try:
        # Find all <a> tags linking to a document on the current page
        els = WebDriverWait(driver, timeout=3).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, '//div[@class="card"]/ul/li/a'))
        )
        # Extract the href attribute from each <a> tag
        links = [el.get_attribute('href') for el in els]
        logging.info('Extracting the href attribute from each <a> tag.')
        logging.info(f'Found {len(links)} links.')

        # Iterate through each link and download the document
        for link in links:
            # Navigate to the document page
            driver.get(link)
            logging.info(f'Navigated to the link: {link}.')

            try:
                # Look for an <a> tag that begins with “Plain text”
                download1 = WebDriverWait(driver, timeout=3).until(
                    lambda d: d.find_element(By.XPATH, '//a[starts-with(text(), "Plain text")]'))
                download1.click()
                logging.info('Clicked on "Plain text" download link.')
            except Exception:
                try:
                    logging.info(
                        'No "RTF format" download link found. Calling get_url_content() function.')
                    text = get_url_content(driver=driver)
                    write_to_file(article=text, path=path)
                    retv.append(text)
                    logging.info('Writing the document to file.')
                    continue
                except Exception as e:
                    logging.error(
                        f'Failed to get the document content: {e}')
                    continue
            logging.info('Waiting for the document to be initialised.')
            source = driver.page_source
            text = BeautifulSoup(source, 'html.parser').text
            logging.info('Getting the page source.')
            retv.append(text)
            write_to_file(article=text, path=path)
            logging.info('Writing the document to file.')
    except Exception as e:
        logging.error(
            f'Failed during the navigation and click operations: {e}')
    return retv

In [6]:
from typing import List, Optional, Tuple


def get_links(XPATH: str, url: Optional[str] = None, driver: Optional[webdriver.Chrome] = None, quit: bool = True) -> List[Tuple[str, str]]:
    if not driver:
        driver = webdriver.Chrome(options=chrome_options)
    if url:
        driver.get(url)
    els = WebDriverWait(driver, timeout=3).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, XPATH))
    )
    links = [el.get_attribute('href') for el in els]
    texts = [el.text for el in els]
    if quit:
        driver.quit()
    return list(zip(texts, links))

In [8]:
from datetime import date
from concurrent.futures import ThreadPoolExecutor


def concurrent_scrape(cth_links, base_path):
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for doc_type, urls in cth_links.items():
            # Convert the doc_type to uppercase and replace any spaces with underscores
            doc_type_formatted = doc_type.upper().replace(' ', '_')
            for url in urls:
                # Create a folder path with the current date and document type
                today_date = date.today().isoformat()
                path_specific = f'{base_path}/{today_date}__{doc_type_formatted}'
                futures.append(executor.submit(
                    scrape_austlii, url, path_specific))

        results = []
        for future in futures:
            results.append(future.result())

        return results

### Scraping


In [11]:
driver = webdriver.Chrome(options=chrome_options)
starting_url = 'http://www8.austlii.edu.au/au/cth/'
cth_links = {}

link_mappings = {
    'cth_cases': '//*[@id="cth_cases"]//a',
    'cth_legis': '//*[@id="cth_legis"]//a',
    'cth_other': '//*[@id="cth_other"]//a',
    'cth_tax_rulings': '//*[@id="cth_tax_rulings"]//a'
}

for key, xpath in link_mappings.items():
    cth_links[key] = get_links(url=starting_url, XPATH=xpath, driver=driver)

for k, v in cth_links.items():
    print(k, len(v))

MaxRetryError: HTTPConnectionPool(host='localhost', port=57315): Max retries exceeded with url: /session/bf4da2bc3aba8f80fc4233933703dca8/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10a42cdd0>: Failed to establish a new connection: [Errno 61] Connection refused'))

### ACT


In [14]:
driver = webdriver.Chrome(options=chrome_options)
starting_url = 'http://www8.austlii.edu.au/au/act/'
cth_links = {}

link_mappings = {
    'act_cases': '//*[@id="act_cases"]//a',
    'act_legis': '//*[@id="act_legis"]//a',
    'act_other': '//*[@id="act_other"]//a',
}

for key, xpath in link_mappings.items():
    cth_links[key] = get_links(url=starting_url, XPATH=xpath, driver=driver)

for k, v in cth_links.items():
    print(k, len(v))

MaxRetryError: HTTPConnectionPool(host='localhost', port=58718): Max retries exceeded with url: /session/adea86b85255a80bb0af897271f76280/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x108727490>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [12]:
act_cases = get_links(
    url=starting_url, XPATH='//*[@id="act_cases"]//a', driver=driver)
act_cases

TimeoutException: Message: 


## Data cleaning


In [94]:
import os
import subprocess

input_directory = "../docs/TO_BE_CLEANED/"
output_directory = "../docs/TO_BE_CLEANED_TXT/"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Iterate over files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith(".rtf"):
        input_path = os.path.join(input_directory, filename)
        output_path = os.path.join(
            output_directory, os.path.splitext(filename)[0] + ".txt")

        # Use subprocess to run the "textutil" command-line tool to convert the file
        subprocess.run(["textutil", "-convert", "txt",
                       input_path, "-output", output_path])

        print(f"Converted {filename} to {output_path}")

Converted 1866.rtf to ../docs/TO_BE_CLEANED_TXT/1866.txt
Converted 458.rtf to ../docs/TO_BE_CLEANED_TXT/458.txt
Converted 1 (33).rtf to ../docs/TO_BE_CLEANED_TXT/1 (33).txt
Converted 470.rtf to ../docs/TO_BE_CLEANED_TXT/470.txt
Converted 2 (6).rtf to ../docs/TO_BE_CLEANED_TXT/2 (6).txt
Converted 4 (11).rtf to ../docs/TO_BE_CLEANED_TXT/4 (11).txt
Converted 5 (2).rtf to ../docs/TO_BE_CLEANED_TXT/5 (2).txt
Converted 74 (2).rtf to ../docs/TO_BE_CLEANED_TXT/74 (2).txt
Converted 64 (2).rtf to ../docs/TO_BE_CLEANED_TXT/64 (2).txt
Converted 15 (4).rtf to ../docs/TO_BE_CLEANED_TXT/15 (4).txt
Converted 15.rtf to ../docs/TO_BE_CLEANED_TXT/15.txt
Converted 38 (3).rtf to ../docs/TO_BE_CLEANED_TXT/38 (3).txt
Converted 28 (3).rtf to ../docs/TO_BE_CLEANED_TXT/28 (3).txt
Converted 29.rtf to ../docs/TO_BE_CLEANED_TXT/29.txt
Converted 85 (1).rtf to ../docs/TO_BE_CLEANED_TXT/85 (1).txt
Converted 6 (4).rtf to ../docs/TO_BE_CLEANED_TXT/6 (4).txt
Converted 16 (2).rtf to ../docs/TO_BE_CLEANED_TXT/16 (2).txt
C