## Utils


In [4]:
import os
import shutil


def copy_directory(src, dst):
    try:
        shutil.copytree(src, dst)
    except OSError as e:
        print('Directory not copied. Error: %s' % e)

In [5]:
def copy_file(src, dst):
    try:
        shutil.copy(src, dst)
    except OSError as e:
        print('File not copied. Error: %s' % e)

## Remove anchors


In [8]:
import os
from bs4 import BeautifulSoup


def remove_links(root_dir):
    for legislation_folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, legislation_folder)
        if os.path.isdir(folder_path):
            for html_file in os.listdir(folder_path):
                if html_file != 'index.html':
                    html_file_path = os.path.join(folder_path, html_file)
                    with open(html_file_path, 'r', errors='ignore') as f:  # ignore decode errors
                        html = f.read()
                        soup = BeautifulSoup(html, 'html.parser')

                        for a in soup.find_all('a'):
                            # Replace the 'a' tag with its text content
                            a.replace_with(a.text)

                    # Re-encode the file in UTF-8 while writing
                    with open(html_file_path, 'w', encoding='utf-8') as f:
                        f.write(str(soup))

In [30]:
dst = '../COPY_ACT-LEGISLATION_HTML/'
remove_links(dst)

KeyboardInterrupt: 

## Change section_url format


In [10]:
copy_file('../legislations.json', '../COPY_legislations.json')

In [11]:
import json


def remove_date_from_url(json_file_path):
    with open(json_file_path, 'r') as f:
        data = json.load(f)

    # Iterate over each legislation and section
    for legislation in data:
        for section in legislation['sections']:
            # Replace the date in the URL
            section['section_url'] = section['section_url'].replace(
                '2023-05-27__', '')

    # Write the updated data back to the file
    with open(json_file_path, 'w') as f:
        json.dump(data, f, indent=2)

In [12]:
remove_date_from_url('../COPY_legislations.json')

## Remove comments


In [1]:
import os
from bs4 import BeautifulSoup, Comment


def remove_comments(root_dir):
    for legislation_folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, legislation_folder)
        if os.path.isdir(folder_path):
            for html_file in os.listdir(folder_path):
                html_file_path = os.path.join(folder_path, html_file)
                with open(html_file_path, 'r+', errors='ignore') as f:
                    html = f.read()
                    soup = BeautifulSoup(html, 'html.parser')

                    # find all comments and remove those matching "<!--sino noindex-->"
                    comments = soup.findAll(
                        text=lambda text: isinstance(text, Comment))
                    for comment in comments:
                        if 'sino noindex' in comment:
                            comment.extract()

                    # write the modified html back to the file
                    f.seek(0)
                    f.write(str(soup))
                    f.truncate()

In [31]:
# copy_directory('../ACT-LEGISLATION_HTML/', '../COPY_ACT-LEGISLATION_HTML/')
remove_comments(dst)

  comments = soup.findAll(text=lambda text: isinstance(text, Comment))


## Map section_order to numbers


In [1]:
import json
import re


def edit_json(filename):
    # Load JSON
    with open(filename, 'r') as f:
        data = json.load(f)

    # Order mapping
    order_mapping = {'index': 1, 'long_name': 2,
                     'section': 3, 'schedule': 4, 'notes': 5}

    # Edit JSON
    for legislation in data:
        for section in legislation['sections']:
            # Determine category
            if section['section_name'].lower() == 'index':
                category = 'index'
            elif section['section_name'].lower() == 'long title':
                category = 'long_name'
            elif section['section_name'].lower().startswith('schedule'):
                category = 'schedule'
            elif section['section_name'].lower() == 'notes':
                category = 'notes'
            else:
                category = 'section'

            # Assign order value
            section['order_value'] = order_mapping[category]

            # Split section_order into prefix and suffix parts
            match = re.match(r"([0-9]+)([a-z]*)",
                             section['section_order'], re.I)
            if match:
                items = match.groups()
            else:
                items = (section['section_order'],)
            section['section_order_prefix'] = items[0]
            section['section_order_suffix'] = items[1] if len(
                items) > 1 else ''

    # Save JSON
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

In [6]:
copy_file('/Users/home/gh/selenium/prisma/act_legislation.json',
          '../COPY_act_legislation.json')
edit_json('../COPY_act_legislation.json')

## Remove unnecessary line breaks


In [11]:
import os


def remove_string_from_files(root_directory, file_name, string_to_remove):
    for directory_path, directory_names, file_names in os.walk(root_directory):
        if file_name in file_names:
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = file.read()
            data = data.replace(string_to_remove, "")
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(data)

# use function

In [33]:
a = '<p><br/><br/><font size="2"></font><font size="1"></font><br/><br/><font size="1">\n</font><br/><br/><font size="1"> </font><br/><br/><font size="1">\n</font><br/><br/></p>'
b = '<p><br/><br/><font size="2"></font><font size="1"></font><br/><br/><font size="1">\n</font><br/><br/><font size="1"> </font><br/><br/><font size="1"> </font></p><p align="right"></p>'
c = '<p><br/><br/><font size="2"></font><font size="1"></font><br/><br/><font size="1"></font><br/><br/><font size="1"> </font><br/><br/><font size="1"></font><br/><br/><br/><br/></p>'
remove_string_from_files(dst, 'LONG TITLE.html', a)
remove_string_from_files(dst, 'LONG TITLE.html', b)
remove_string_from_files(dst, 'LONG TITLE.html', c)

## Combine html files


In [34]:
import os
import re


def concatenate_html_files(dir_path):
    # define the regex pattern for s#.html and sch-#.html
    s_pattern = re.compile(r'^s(\d+)([a-zA-Z]*)\.html$')
    sch_pattern = re.compile(r'^sch-(\d+)([a-zA-Z]*)\.html$')

    for root, dirs, files in os.walk(dir_path):
        # sort all the html files
        html_files = sorted(f for f in files if f.endswith('.html'))

        ordered_files = []
        if 'LONG TITLE.html' in html_files:
            ordered_files.append('LONG TITLE.html')
            html_files.remove('LONG TITLE.html')

        s_files = sorted((f for f in html_files if s_pattern.match(f)),
                         key=lambda x: (int(s_pattern.match(x).group(1)), s_pattern.match(x).group(2)))
        ordered_files.extend(s_files)

        sch_files = sorted((f for f in html_files if sch_pattern.match(f)),
                           key=lambda x: (int(sch_pattern.match(x).group(1)), sch_pattern.match(x).group(2)))
        ordered_files.extend(sch_files)

        if 'NOTES.html' in html_files:
            ordered_files.append('NOTES.html')

        # combine all html files into one
        with open(os.path.join(root, 'combined.html'), 'w') as outfile:
            for fname in ordered_files:
                with open(os.path.join(root, fname)) as infile:
                    outfile.write(infile.read())

In [35]:
dir_path = '../COPY_ACT-LEGISLATION_HTML/BAIL ACT 1992'
concatenate_html_files(dir_path)

In [36]:
dir_path = '../COPY_ACT-LEGISLATION_HTML'
concatenate_html_files(dir_path)

## Remove redundant h3 and rename h1 to h2


In [37]:
import os
from bs4 import BeautifulSoup


def modify_html_tags(directory):
    # Walk the directory
    for dirpath, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.html'):
                filepath = os.path.join(dirpath, filename)

                # Open and read the file
                with open(filepath, 'r') as file:
                    html_content = file.read()

                # Parse the HTML with BeautifulSoup
                soup = BeautifulSoup(html_content, 'html.parser')

                # Find all h3 tags and remove them
                for h3_tag in soup.find_all('h3'):
                    h3_tag.decompose()

                # Find all h1 tags and change them to h2
                for h1_tag in soup.find_all('h1'):
                    h1_tag.name = 'h2'

                # Write the modified HTML back to the file
                with open(filepath, 'w') as file:
                    file.write(str(soup))

In [39]:
copy_directory('../ACT-LEGISLATION_HTML/', '../COPY_ACT-LEGISLATION_HTML/')

In [40]:
dir_path = '../COPY_ACT-LEGISLATION_HTML/BAIL ACT 1992'
modify_html_tags(dir_path)

In [41]:
modify_html_tags('../COPY_ACT-LEGISLATION_HTML')