In [2]:
from glob import glob

act_legislations = glob('docs/2023-05-18__ACT-LEGISLATION/*.txt')

len(act_legislations)

336

In [3]:
act_legislations[:5]

['docs/2023-05-18__ACT-LEGISLATION/ENCLOSED LANDS PROTECTION ACT 1943.txt',
 'docs/2023-05-18__ACT-LEGISLATION/HUMAN RIGHTS COMMISSION ACT 2005.txt',
 'docs/2023-05-18__ACT-LEGISLATION/DUTIES ACT 1999.txt',
 'docs/2023-05-18__ACT-LEGISLATION/MERCANTILE LAW ACT 1962.txt',
 'docs/2023-05-18__ACT-LEGISLATION/WORKPLACE LEGISLATION AMENDMENT ACT 2022.txt']

In [17]:
ACT_LEGISLATION = []
for legislation in act_legislations:
    with open(legislation, 'r') as f:
        ACT_LEGISLATION.append(f.read())

len(ACT_LEGISLATION)

336

In [18]:
import tiktoken
tokens = 0
enc = tiktoken.get_encoding("cl100k_base")
for legislation in ACT_LEGISLATION:
    tokens += len(enc.encode(legislation))
tokens

10622614

In [21]:
ACT_LEGISLATION[0]

'ENCLOSED LANDS PROTECTION ACT 1943 \n\n- TABLE OF PROVISIONS\n\n1. Name of Act   \n2. Dictionary  \n3. Notes  \n4. Penalty for unlawful entry on enclosed lands   \n5. Penalty for leaving gate open   \n6. Requirement to give name etc   \n7. Owner may destroy goats   \nDICTIONARY\nENDNOTES\n\nENCLOSED LANDS PROTECTION ACT 1943\n- LONG TITLE\nAn Act relating to protection of enclosed lands from intrusion and\ntrespass\n       \n       \n \n\nENCLOSED LANDS PROTECTION ACT 1943\n- SECT 1\nName of Act \nThis Act is the Enclosed Lands Protection Act 1943.\n\nENCLOSED LANDS PROTECTION ACT 1943\n- SECT 2\nDictionary\nThe dictionary at the end of this Act is part of this Act.\nNote 1     The dictionary at the end of this Act defines certain terms\nused in this Act.\nNote 2     A definition in the dictionary applies to the entire Act\nunless the definition, or another provision of the Act, provides otherwise or\nthe contrary intention otherwise appears (see Legislation Act\n, s 155 and s 156 (1)

In [49]:
import uuid


def process_documents(documents):
    result = []
    metadata = []

    for doc in documents:
        lines = doc.split('\n')
        title = lines[0].strip()  # strip leading/trailing whitespace
        if not title:  # skip if title is empty
            continue

        # remove newline characters before the first occurrence of title
        first_title_index = doc.find(title)
        if first_title_index > 0:
            doc = doc[first_title_index:]

        second_title_index = doc.find(title, doc.find(title)+1)
        if second_title_index == -1:  # skip if title doesn't occur twice
            continue

        doc = doc[second_title_index:]
        chunks = doc.split(title)

        for i, chunk in enumerate(chunks):
            lines = chunk.split('\n')
            section = next(
                (line for line in lines if line.startswith('- SECT')), None)
            if section:
                section = section.replace('- ', '')  # remove preceding '- '
                if len(chunk) > 2000:
                    # split chunk into smaller chunks
                    sub_chunks = [chunk[i:i+2000]
                                  for i in range(0, len(chunk), 2000)]
                    for j, sub_chunk in enumerate(sub_chunks):
                        sub_chunk = title + '\n' + sub_chunk  # add title as first line
                        result.append(sub_chunk)
                        uuid_str = str(uuid.uuid4())
                        metadata.append(
                            {'id': uuid_str, 'title': title, 'section': f'{section}_{j}'})
                else:
                    chunk = title + '\n' + chunk  # add title as first line
                    result.append(chunk)
                    uuid_str = str(uuid.uuid4())
                    metadata.append(
                        {'id': uuid_str, 'title': title, 'section': section})

    return result, metadata

In [50]:
texts, metadata = process_documents(ACT_LEGISLATION)

In [None]:
documents = [doc for doc in texts]
ids = [meta['id'] for meta in metadata]

# reformat metadatas to match the format expected by collection.add()
metadatas = [{'chapter': meta['title'], 'verse': meta['section']}
             for meta in metadata]

collection.add(documents=documents, metadatas=metadatas, ids=ids)

In [2]:
import csv
rows = []
with open('/Users/home/projects/selenium/act_legislation.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        rows.append(row)

len(rows)

1

In [3]:
import requests
from InstructorEmbedding import INSTRUCTOR
from typing import List
from bs4 import BeautifulSoup
INSTRUCTION = 'This is a sample instruction'
model = INSTRUCTOR('/Users/home/gh/instructor-embed-api/instructor-large')


def split_text(input_string, overlap=200):
    result = []
    if len(input_string) > 2000:
        mid_point = len(input_string) // 2
        # Ensure overlap while splitting
        first_part = input_string[:mid_point + overlap]
        second_part = input_string[mid_point:]
        result += split_text(first_part)
        result += split_text(second_part)
    else:
        result.append(input_string)
    return result


def download_html(url: str):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    return soup.text


def embed(payload: List[List[str]]):
    embeddings = model.encode(payload)
    return embeddings


def construct_payload(text: str, instruction: str = INSTRUCTION):
    retv = []
    if len(text) > 2000:
        split_texts = split_text(text)
        for text in split_texts:
            retv.append([instruction, text])
    else:
        retv.append([instruction, text])
    return retv

load INSTRUCTOR_Transformer
max_seq_length  512


In [4]:
section_text = download_html(
    'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/CEMETERIES%20AND%20CREMATORIA%20ACT%202020/s30.html')
payload = construct_payload(section_text)
embeddings = []
try:
    print("embedding section")
    embeddings = embed(payload)
except Exception as e:
    print("failed to embed section: ",  e)

embedding section


In [8]:
len(section_text[0][0])

2516

In [9]:
len(payload)

1

In [10]:
payload[0]

['This is a sample instruction',
 ['<html><head></head><body><article class="the-document" id="2b033adf-ac5d-4cd1-8746-78b7ac6b8394">\n<h2>\nCEMETERIES AND CREMATORIA ACT 2020 - SECT 70\nImmediate suspension of licenceâ\x80\x94danger to public health\n</h2>\n\n<b>Immediate suspension of licenceâ\x80\x94danger to public health</b>\n<p>&nbsp; &nbsp; (1) &nbsp; &nbsp; The regulator may suspend a licence to operate\na facility immediately if the regulator believes on reasonable grounds\nthatâ\x80\x94 </p> <p>&nbsp; &nbsp; &nbsp; &nbsp; (a) &nbsp; &nbsp; the licensee has\nengaged in disciplinary conduct; and </p> <p>&nbsp; &nbsp; &nbsp; &nbsp; (b) &nbsp;\n&nbsp; there is a danger to public health as a result of the conduct. </p> <p>&nbsp;\n&nbsp; (2) &nbsp; &nbsp; The regulator must tell the licensee, in writing,\nthat the regulator is suspending the licence starting immediately and the\nreasons for the suspension. </p> <p><i>Note &nbsp; &nbsp; </i>For what must be included in a\nstatement 

In [7]:
section_text = [download_html(section_url)]

['encode',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

In [2]:
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('/Users/home/gh/instructor-embed-api/instructor-large')
vectors = model.encode(payload)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


NameError: name 'payload' is not defined

# HI


In [6]:
import json
with open('/Users/home/projects/selenium-py/ACT-LEGISLATION_HTML copy/legislations.json', 'r') as f:
    data = json.load(f)

print(len(data))

338


In [8]:
import os


path = "/Users/home/projects/selenium-py/ACT-LEGISLATION_HTML copy/ABORIGINAL AND TORRES STRAIT ISLANDER CHILDREN AND YOUNG PEOPLE COMMISSIONER ACT 2022/"
legislation_name = os.path.basename(os.path.dirname(path))

with open(f'{path}index.html', 'r') as f:
    index_html = f.read()

print("legislation name: ", legislation_name)
index_html

legislation name:  ABORIGINAL AND TORRES STRAIT ISLANDER CHILDREN AND YOUNG PEOPLE COMMISSIONER ACT 2022


'<html><head></head><body><article class="the-document" id="e6ca4536-5125-4f74-9435-3651786407bc">\n<h2>\nABORIGINAL AND TORRES STRAIT ISLANDER CHILDREN AND YOUNG PEOPLE COMMISSIONER ACT 2022\n</h2>\n<h2>Table of Provisions</h2>\n<ul>\n<a name="longtitle"></a> <li><a href="longtitle.html">Long Title</a> </li>\n</ul>\n<a name="p1"></a> \n<ul>\n<a name="s1"></a> <li><a class="leg-number" href="s1.html">1</a> Name of Act &nbsp;</li>\n<a name="s3"></a> <li><a class="leg-number" href="s3.html">3</a> Dictionary &nbsp;</li>\n<a name="s4"></a> <li><a class="leg-number" href="s4.html">4</a> Notes &nbsp;</li>\n<a name="s5"></a> <li><a class="leg-number" href="s5.html">5</a> Offences against Act—application of Criminal Code etc &nbsp;</li>\n</ul>\n<a name="p2"></a> \n<ul>\n<a name="s6"></a> <li><a class="leg-number" href="s6.html">6</a> Objects of Act &nbsp;</li>\n<a name="s7"></a> <li><a class="leg-number" href="s7.html">7</a> Aboriginal and Torres Strait Islander cultural principles &nbsp;</li>

In [37]:
from bs4 import BeautifulSoup
import re
root = "/Users/home/projects/selenium-py/ACT-LEGISLATION_HTML copy"


def read_and_parse_html(path):
    with open(path, 'r', encoding='utf-8') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html.parser')
    return soup


for legislation in data:
    path = f'{root}/{legislation["name"]}/index.html'
    html = read_and_parse_html(path)
    for section in legislation['sections']:
        if section['order_value'] == 1:
            continue
        elif section['order_value'] == 2:
            html = re.sub(r'<a href="longtitle.html">',
                          f"<a href='#{section['id']}'>",
                          str(html))
        elif section['order_value'] == 5:
            html = re.sub(r'(<a href=")notes.html(.*">)',
                          f"\\1#{section['id']}\\2",
                          str(html))

        elif section['order_value'] == 3:
            html = re.sub(f'<a class="leg-number" href="s{section["section_order"]}.html">',
                          f"<a class='leg-number' href='#{section['id']}'>",
                          str(html))
        elif section['order_value'] == 4:
            html = re.sub(f'<a href="sch{section["section_order"]}.html">',
                          f"<a href='#{section['id']}'>",
                          str(html))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(html))

In [45]:
import glob

index_files = glob.glob(f'{root}/*/index.html', recursive=True)
for path in index_files:
    with open(path, 'r', encoding='utf-8') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'html.parser')
    for a in soup.find_all('a', href=True):
        if a['href'].startswith('#'):
            continue
        else:
            with open('logs/wrong-hrefs.txt', 'a') as f:
                f.write(f'\n{path}, {a["href"]}')

In [35]:
vectors = model.encode([['Represent the query for retrieval', 'ACT']])
len(vectors)

1

In [38]:
import requests
import json

url = 'https://law-docs-c5783da.svc.us-east4-gcp.pinecone.io/query'

headers = {
    'Content-Type': 'application/json',
    'Api-Key': '3172ab22-d119-46f2-acb2-9ab0122441a5',
}

data = {
    "vector": vectors[0].tolist(),
    "topK": 5,
    "includeMetadata": True,
    "includeValues": True,
    "namespace": "act-legislation"
}

response = requests.post(url, headers=headers, data=json.dumps(data))
resp_json = response.json()

In [40]:
for match in resp_json['matches']:
    print(match['metadata'])

{'jurisdiction': 'ACT', 'legislation_combined_url': 'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/EDUCATION%20AND%20CARE%20SERVICES%20NATIONAL%20LAW%20ACT%20ACT%202011/combined.html', 'legislation_id': '105b205f-2921-49e1-9afe-ad3c867cb5a7', 'legislation_index_url': 'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/EDUCATION%20AND%20CARE%20SERVICES%20NATIONAL%20LAW%20ACT%20ACT%202011/index.html', 'legislation_name': 'EDUCATION AND CARE SERVICES NATIONAL LAW ACT ACT 2011', 'legislation_year': '2011', 'section_id': '32f1233d-f166-40d3-a01e-ab2cbe722fc7', 'section_name': 'Name of Act', 'section_url': 'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/EDUCATION%20AND%20CARE%20SERVICES%20NATIONAL%20LAW%20ACT%20ACT%202011/s1.html', 'text': '\n\nEDUCATION AND CARE SERVICES NATIONAL LAW (ACT) ACT 2011 - SECT 1\nName of Act\n\nName of Act\nThis Act is the Education and Care Services National Law (ACT) Act  2011 . \n\n\n'}
{'jurisdiction': 'ACT', 'legislati

In [3]:
import requests
from bs4 import BeautifulSoup


def download_html(url: str):
    response = requests.get(url)
    # response.encoding = 'utf-8'  # explicitly set encoding to utf-8
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup.text

In [4]:
download_html(
    'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/CEMETERIES%20AND%20CREMATORIA%20ACT%202020/s91.html')

"\n\nCEMETERIES AND CREMATORIA ACT 2020 - SECT 91\nOffenceâ\x80\x94fail to comply with direction to give name and address\n\nOffenceâ\x80\x94fail to comply with direction to give name and address\n\xa0 \xa0 (1) \xa0 \xa0 A person commits an offence ifâ\x80\x94  \xa0\n\xa0 \xa0 \xa0 (a) \xa0 \xa0 an authorised person directs the person\nto give their full name and address under section 90; and  \xa0 \xa0\n\xa0 \xa0 (b) \xa0 \xa0 the authorised person produces the authorised\nperson's identity card for inspection by the person; and  \xa0 \xa0 \xa0\n\xa0 (c) \xa0 \xa0 the authorised person warns the person that failure\nto comply with the direction is an offence; and  \xa0 \xa0 \xa0 \xa0\n(d) \xa0 \xa0 the person did not give the authorised person their full\nname and address.  Maximum penalty: 10 penalty units.  Note \xa0 \xa0 It\nis an offence to make a false or misleading statement or give false or\nmisleading information (see Criminal Code  , pt 3.4).  \xa0 \xa0 (2)\n\xa0 \xa0 An offe