# Scrape Federal Register of Legislation


In [62]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import os

In [2]:
if not os.path.exists("./docs"):
    os.makedirs("./docs")

In [107]:
urls = [
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/A/0/0/principal",  # 0
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/B/0/0/principal",  # 1
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/C/0/0/principal",  # 2
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/D/0/0/principal",  # 3
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/E/0/0/principal",  # 4
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/F/0/0/principal",  # 5
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/G/0/0/principal",  # 6
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/H/0/0/principal",  # 7
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/I/0/0/principal",  # 8
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/J/0/0/principal",  # 9
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/K/0/0/principal",  # 10
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/L/0/0/principal",  # 11
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/M/0/0/principal",  # 12
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/N/0/0/principal",  # 13
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/O/0/0/principal",  # 14
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/P/0/0/principal",  # 15
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/Q/0/0/principal",  # 16
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/R/0/0/principal",  # 17
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/S/0/0/principal",  # 18
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/T/0/0/principal",  # 19
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/U/0/0/principal",  # 20
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/V/0/0/principal",  # 21
    "https://www.legislation.gov.au/Browse/Results/ByTitle/Acts/InForce/W/0/0/principal",  # 22
]

In [21]:
# Set up the webdriver. This assumes you're using Chrome; adjust as necessary.
driver = webdriver.Chrome("chromedriver")

In [22]:
# Open the webpage
driver.get(urls[0])

# Wait until the page has fully loaded
wait = WebDriverWait(driver, 10)

In [25]:
def download(driver):
    # Get all download buttons
    download_buttons = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//input[@value='Download']"))
    )

    # Iterate over each button
    for i in range(len(download_buttons)):
        # Need to find the buttons again because the DOM might have changed
        download_buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//input[@value='Download']")
            )
        )

        # Click the button
        download_buttons[i].click()

        # Find the "Text" link and click it
        text_link = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//a[contains(text(), 'Text')]"))
        )
        text_link.click()

        # Get the page source and parse it with Beautiful Soup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find the div with class "right" and print its inner HTML
        right_div = soup.find("div", {"class": "right"})
        title = driver.title
        url = driver.current_url

        # Write the HTML to a file
        with open(f"./docs/{title}", "w") as f:
            f.write(f"{title}\n{url}\n{right_div}")

        # Go back to the original page
        driver.back()
        driver.back()

        # Wait for a bit to make sure the page is fully loaded before the next iteration
        time.sleep(1)

In [27]:
download(driver)

In [26]:
def extract_numbers(driver):
    # Get the page source and parse it with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Find the div with the class 'rgWrap rgInfoPart'
    div = soup.find("div", class_="rgWrap rgInfoPart")

    # If the div isn't found, return an empty list
    if div is None:
        return []

    # Find all the strong tags in the div
    strongs = div.find_all("strong")

    # Extract the numbers from the strong tags
    numbers = []
    for strong in strongs:
        # Try to convert the string to an integer
        try:
            number = int(strong.text)
        except ValueError:
            # If it can't be converted to an integer, skip it
            continue

        # Append the number to the list
        numbers.append(number)
    return numbers

In [24]:
# Get the number of pages
num_pages = extract_numbers(driver)
num_pages

[161, 4]

In [77]:
from selenium.common.exceptions import NoSuchElementException


def click_next_page(driver):
    try:
        # Find the input element with title 'Next Page' and class 'rgPageNext'
        next_button = driver.find_elements(
            By.XPATH,
            '//*[@id="ctl00_MainContent_gridBrowse_ctl00"]/thead/tr[1]/td/table/tbody/tr/td/div[3]/input[1]',
        )

        # Scroll to the element
        driver.execute_script("arguments[0].scrollIntoView();", next_button[0])

        # Click on the element
        driver.execute_script("arguments[0].click();", next_button[0])
        return True

    except NoSuchElementException:
        print("Next Page button not found.")
        return False

In [88]:
def download_letter(driver, url):
    # Open the webpage
    driver.get(url)

    # Wait until the page has fully loaded
    wait = WebDriverWait(driver, 10)

    # Get the number of pages
    num_pages = extract_numbers(driver)

    # Iterate over each page
    for i in range(num_pages[1]):
        # Download the HTML
        download(driver)

        # Click the next page button
        click_next_page(driver)

In [105]:
driver = webdriver.Chrome("chromedriver")

In [106]:
urls2 = urls[19:]
for url in urls2:
    download_letter(driver, url)

TimeoutException: Message: 


In [102]:
driver.get(urls[19])
click_next_page(driver)
download(driver)

True

# Process downloaded content


In [121]:
import re


class Document:
    def __init__(self, path):
        with open(path, "r") as f:
            lines = f.readlines()
            self.title = lines[0].strip()
            self.url = lines[1].strip()
            self.metadata = {"title": self.title, "url": self.url}
            html = "".join(lines[2:])
        self.text = self._get_text(html)

    @staticmethod
    def _get_text(html):
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text()
        text = re.sub(r"\n+", "\n", text)
        text = re.sub(r"\xa0", "", text)
        return text

In [123]:
from glob import glob

files = glob("./docs/*")
len(files)

1208

In [125]:
docs = [Document(f) for f in files]

In [140]:
import pandas as pd

df = pd.DataFrame([doc.text for doc in docs])
df.head()

Unnamed: 0,0
0,"\n Cocos (Keeling) Islands Act 1955 No.34, 1..."
1,"\n CSL Sale Act 1993 No.88, 1993 Compilation..."
2,\n Immigration (Guardianship of Children) Ac...
3,"\n Air Navigation Act 1920 No.50, 1920 Compi..."
4,"\n Telstra Corporation Act 1991 No.79, 1991 ..."


In [141]:
df2 = pd.DataFrame([d.metadata for d in docs])
df2.head()

Unnamed: 0,title,url
0,Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...
1,CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...
2,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...
3,Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...
4,Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...


In [145]:
df_combined = pd.concat([df, df2], axis=1)
df_combined.head()

Unnamed: 0,0,title,url
0,"\n Cocos (Keeling) Islands Act 1955 No.34, 1...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...
1,"\n CSL Sale Act 1993 No.88, 1993 Compilation...",CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...
2,\n Immigration (Guardianship of Children) Ac...,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...
3,"\n Air Navigation Act 1920 No.50, 1920 Compi...",Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...
4,"\n Telstra Corporation Act 1991 No.79, 1991 ...",Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...


In [148]:
df_combined = df_combined.rename(columns={0: "text"})

In [149]:
df_combined.head()

Unnamed: 0,text,title,url
0,"\n Cocos (Keeling) Islands Act 1955 No.34, 1...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...
1,"\n CSL Sale Act 1993 No.88, 1993 Compilation...",CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...
2,\n Immigration (Guardianship of Children) Ac...,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...
3,"\n Air Navigation Act 1920 No.50, 1920 Compi...",Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...
4,"\n Telstra Corporation Act 1991 No.79, 1991 ...",Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...


In [150]:
df_combined.to_csv("legislation.csv", index=False)

In [1]:
import pandas as pd


def load_legislation():
    df = pd.read_csv("legislation.csv")
    return df

In [2]:
df = load_legislation()
df.head()

Unnamed: 0,text,title,url
0,"\n Cocos (Keeling) Islands Act 1955 No.34, 1...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...
1,"\n CSL Sale Act 1993 No.88, 1993 Compilation...",CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...
2,\n Immigration (Guardianship of Children) Ac...,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...
3,"\n Air Navigation Act 1920 No.50, 1920 Compi...",Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...
4,"\n Telstra Corporation Act 1991 No.79, 1991 ...",Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...


In [10]:
# Remove the initial ‘\n ‘ at the start of each value in the ‘text’ column
df["text"] = df["text"].str.lstrip("\n ")
df.head()

Unnamed: 0,text,title,url,char_count
0,"Cocos (Keeling) Islands Act 1955 No.34, 1955 C...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...,85149
1,"CSL Sale Act 1993 No.88, 1993 Compilation No.7...",CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...,44602
2,Immigration (Guardianship of Children) Act 194...,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...,27553
3,"Air Navigation Act 1920 No.50, 1920 Compilatio...",Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...,208660
4,"Telstra Corporation Act 1991 No.79, 1991 Compi...",Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...,167989


In [4]:
df.to_csv("legislation.csv", index=False)

In [5]:
# Calculate the average number of characters for each value in the ‘text’ column
df["char_count"] = df["text"].apply(lambda x: len(x))
average_char_count = df["char_count"].mean()
print("Average character count: ", average_char_count)

Average character count:  100893.05049668874


In [8]:
average_token_count = average_char_count / 4 * 3
average_token_count

75669.78787251655

In [9]:
total_char_count = df["char_count"].sum() / 4 * 3

total_char_count

91409103.75

In [11]:
# Define the function to split text into chunks with overlap
def split_text(text, limit, overlap):
    return [text[i: i + limit] for i in range(0, len(text), limit - overlap)]

In [12]:
# Create a copy of original dataframe
df_new = df.copy()

# Apply this function to the text column
df_new['text_chunks'] = df_new['text'].apply(lambda x: split_text(x, 96, 20))

# Explode the new column to create a new row for each chunk
df_new = df_new.explode('text_chunks')

# Replace 'text' column with the chunks
df_new['text'] = df_new['text_chunks']

# Drop the unnecessary 'text_chunks' column
df_new = df_new.drop(columns=['text_chunks'])

# Add new 'char_count' column for each chunk
df_new['char_count'] = df_new['text'].apply(len)

In [13]:
df_new.head()

Unnamed: 0,text,title,url,char_count
0,"Cocos (Keeling) Islands Act 1955 No.34, 1955 C...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...,96
0,ate: 18 December 2020 Includes amendments up t...,Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...,96
0,d: 9 February 2021 About this compilation This...,Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...,96
0,tion of the Cocos (Keeling) Islands Act 1955 t...,Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...,96
0,as amended and in force on 18 December 2020 (t...,Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...,96


In [14]:
df_new.shape

(1604255, 4)

In [15]:
df_new.to_csv("legislation_96char_chunks.csv", index=False)

In [None]:
from langchain.schema import Document


def create_document(row):
    page_content = row['text']
    metadata = {
        'source': row['url'],
        'title': row['title'],
    }
    return Document(page_content=page_content, metadata=metadata)


documents = df_new.apply(create_document, axis=1).tolist()
len(documents)

In [1]:
from bs4 import BeautifulSoup
import pandas as pd

df = pd.read_csv("legislation.csv")

In [2]:
df.head()

Unnamed: 0,text,title,url
0,"Cocos (Keeling) Islands Act 1955 No.34, 1955 C...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...
1,"CSL Sale Act 1993 No.88, 1993 Compilation No.7...",CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...
2,Immigration (Guardianship of Children) Act 194...,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...
3,"Air Navigation Act 1920 No.50, 1920 Compilatio...",Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...
4,"Telstra Corporation Act 1991 No.79, 1991 Compi...",Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...


In [46]:
with open ('output.txt') as f:
    text = f.read()

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

texts = RecursiveCharacterTextSplitter().split_text(text)
len(texts)

24

In [48]:
from langchain.docstore.document import Document

docs = [Document(page_content=t) for t in texts]
len(docs)

24

In [49]:
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import OpenAI


llm = OpenAI(temperature=0)
chain = load_summarize_chain(llm, chain_type="map_reduce")
chain.run(docs)

' This case examines the complexities of the concept of "break and entry" as established in historical circumstances, and how it applies to contemporary society. The appellant was charged with breaking and entering a dwelling house and committing a serious indictable offence. The Court of Criminal Appeal found that an entry into a dwelling-house effected pursuant to a contractual right will involve a "break" if made without the consent of the "actual occupant". The Crown argued that the appellant\'s conduct did not constitute a trespass, as he had a right of exclusive possession from a residential tenancy agreement. The Court of Criminal Appeal expressed concern that the lesser offences to which the appellant entered pleas of guilty did not take into account the fact that the conduct took place in the complainant\'s home, which is the rationale of the offence in s.112. The appeal was allowed and the appellant was found not guilty.'

In [50]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(docs, embedding=embeddings)

In [51]:
vectorstore.similarity_search(query='What was the appellant charged with?', top_k=5)

[Document(page_content='4.  The trial was conducted in the District Court of New South Wales by\n    Williams\xa0SC\xa0DCJ without a jury. Williams\xa0SC\xa0DCJ found that the Crown\n    had not established an essential precondition to liability for the\n    offence under s\xa0112 and directed a verdict of not guilty.\n    Williams\xa0SC\xa0DCJ made no finding (as his Honour was not required to\n    do so) as to the appellant\\\'s intent or purpose at the time of entry\n    into the apartment. His Honour accepted the appellant\\\'s argument\n    that, because he was a tenant of the apartment at the time of the\n    alleged offence, he had a right to enter and could not be guilty of\n    breaking into his own premises. His Honour found that there was \\"no\n    doubt that the tenancy agreement granted to the complainant and the\n    accused a right of occupation at the relevant time\\". As appears\n    below, the Crown sought to contest the scope of the appellant\\\'s\n    right of occu

In [56]:
from langchain import OpenAI
from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(OpenAI(temperature=0), chain_type="stuff", retriever=vectorstore.as_retriever())

In [58]:
chain({"query": "What was the appellant charged with?"})

{'query': 'What was the appellant charged with?',
 'result': ' The appellant was charged with an offence under s 112(2) of the Crimes Act.'}

In [63]:
driver = webdriver.Chrome("chromedriver")
# Open the webpage
driver.get("https://www.austlii.edu.au/cgi-bin/viewdoc/au/cases/cth/HCA/1989/62.html")

# Wait until the page has fully loaded
wait = WebDriverWait(driver, 10)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [65]:
html_content = driver.page_source

In [67]:
driver.quit()

In [None]:

# Create a BeautifulSoup object and parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find the <article> element with class="the-document"
article_element = soup.find('article', class_='the-document')

# Extract the inner HTML of the <article> element using the prettify() method
inner_html = article_element.prettify()

print(inner_html)

In [73]:
texts = RecursiveCharacterTextSplitter().split_text(inner_html)
vectorstore = FAISS.from_texts(texts, embedding=embeddings)
vectorstore.similarity_search(query='What was the outcome of the case?', top_k=5)

[Document(page_content='<article class="the-document">\n <h1>\n  Chan Yee Kin v Minister for Immigration &amp; Ethnic Affairs [1989] HCA 62; (1989) 169 CLR 379 (9 December 1989)\n </h1>\n <!--make_database: source=/home/philip/austlii/db/HCA/data/1989_62.html-->\n <!--sino date 9 December 1989-->\n <center>\n  <h2>\n   <b>\n    HIGH COURT OF AUSTRALIA\n   </b>\n  </h2>\n  <p>\n   CHAN v. MINISTER FOR IMMIGRATION AND ETHNIC AFFAIRS\n   <a class="autolink_findcases_inserted" href="/cgi-bin/viewdoc/au/cases/cth/FCA/1990/364.html" title="View Case">\n    [1989] HCA 62\n   </a>\n   ; (1989) 169 CLR 379\n   <br/>\n   <br/>\n   F.C. 89/034\n  </p>\n  <p>\n   Immigration - Administrative Law (Cth)\n  </p>\n  <p>\n   High Court of Australia\n   <br/>\n   <br/>\n   Mason C.J.(1), Dawson(2), Toohey(3), Gaudron(4) and McHugh(5) JJ.\n  </p>\n  <p>\n  </p>\n </center>\n <p>\n  <b>\n   CATCHWORDS\n  </b>\n </p>\n Immigration - Refusal by Minister\'s delegate of refugee status - Act\nreferring to stat

In [80]:
from langchain.chat_models import ChatOpenAI
docs = [Document(page_content=t) for t in texts]
chain = load_summarize_chain(ChatOpenAI(), chain_type="map_reduce")
chain.run(docs)

KeyboardInterrupt: 

In [81]:
df

Unnamed: 0,text,title,url
0,"Cocos (Keeling) Islands Act 1955 No.34, 1955 C...",Cocos (Keeling) Islands Act 1955,https://www.legislation.gov.au/Details/C2021C0...
1,"CSL Sale Act 1993 No.88, 1993 Compilation No.7...",CSL Sale Act 1993,https://www.legislation.gov.au/Details/C2018C0...
2,Immigration (Guardianship of Children) Act 194...,Immigration (Guardianship of Children) Act 1946,https://www.legislation.gov.au/Details/C2016C0...
3,"Air Navigation Act 1920 No.50, 1920 Compilatio...",Air Navigation Act 1920,https://www.legislation.gov.au/Details/C2016C0...
4,"Telstra Corporation Act 1991 No.79, 1991 Compi...",Telstra Corporation Act 1991,https://www.legislation.gov.au/Details/C2022C0...
...,...,...,...
1203,Australian Defence Force Cover Act 2015 No.118...,Australian Defence Force Cover Act 2015,https://www.legislation.gov.au/Details/C2022C0...
1204,"Domicile Act 1982 No.1, 1982 Compilation No.4 ...",Domicile Act 1982,https://www.legislation.gov.au/Details/C2019C0...
1205,Loan Act 1973 No. 19 of 1973 AN ACT To...,Loan Act 1973,https://www.legislation.gov.au/Details/C2004A0...
1206,"Age Discrimination Act 2004 No.68, 2004 Compil...",Age Discrimination Act 2004,https://www.legislation.gov.au/Details/C2022C0...


In [82]:
total_characters = df['text'].str.len().sum()
total_characters

121878805

In [84]:
total_tokens = total_characters / 4 * 3

In [86]:
total_cost = total_tokens * 0.0004 / 1000
total_cost

36.5636415