## Utilities


In [12]:
import shutil
from collections import defaultdict
import os

In [13]:
def copy_directory(source, destination):
    try:
        shutil.copytree(source, destination)
        print(f"Directory copied from {source} to {destination}")
    except shutil.Error as e:
        print(f"Directory copy failed: {e}")
    except OSError as e:
        print(f"Directory copy failed: {e}")


def delete_file(file_path):
    try:
        os.remove(file_path)
        print(f"The file '{file_path}' has been successfully deleted.")
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred while deleting the file '{file_path}': {e}")


def delete_folder(folder_path):
    try:
        shutil.rmtree(folder_path)
        print(f"The folder '{folder_path}' has been successfully deleted.")
    except FileNotFoundError:
        print(f"Folder '{folder_path}' not found.")
    except Exception as e:
        print(
            f"An error occurred while deleting the folder '{folder_path}': {e}")

### Copy source dir to target dir


In [23]:
main_directory = "../COPY_ACT-LEGISLATION_HTML"
source_directory = "../docs/2023-05-18__ACT-LEGISLATION_HTML"
delete_folder(main_directory)
copy_directory(source_directory, main_directory)

Folder '../COPY_ACT-LEGISLATION_HTML' not found.
Directory copied from ../docs/2023-05-18__ACT-LEGISLATION_HTML to ../COPY_ACT-LEGISLATION_HTML


### Logger


In [15]:
import logging

In [16]:
# setup the logger
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# set the log file
log_file = f"{main_directory}/legislation.log"
handler = logging.FileHandler(log_file)
logger.addHandler(handler)

## Parser


In [6]:
import re
import os
import json
import urllib.parse

### Parser funcs


In [17]:
def should_include_postfix(file_name, doc_type):
    if doc_type == 'SECTION':
        # Returns the number of capital letters following the section number
        post_num_pattern = r'SECT\s\d+([A-Z]+)'
        post_num_match = re.search(post_num_pattern, file_name)
        return len(post_num_match.group(1)) if post_num_match else 0
    elif doc_type == 'SCHEDULE':
        # Returns the number of capital letters following the schedule number
        post_num_pattern = r'SCHEDULE\s(\d+)([A-Z]+)'
        post_num_match = re.search(post_num_pattern, file_name)
        return len(post_num_match.group(2)) if post_num_match else 0


def extract_section_details(file_name, directory_name):
    base_url = "https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML"

    # match section pattern
    sect_pattern = r'SECT\s(\d+)([A-Z]*)(.*)([\d\.]*)\.html$'
    sect_match = re.search(sect_pattern, file_name)

    if sect_match:
        sect_num = 's' + sect_match.group(1)
        postfix = sect_match.group(2)
        num_of_capital_letters = should_include_postfix(file_name, 'SECTION')
        if postfix and num_of_capital_letters > 1:
            sect_num += postfix[:num_of_capital_letters - 1].lower()
            sect_name = postfix[-1]+sect_match.group(3)
        else:
            sect_name = sect_match.group(2) + sect_match.group(3)
        sect_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sect_num)}.html"
        return sect_num, sect_name, sect_url, None

    # match schedule pattern
    schedule_pattern = r'SCHEDULE (\d+)([A-Z]*)\.html$'
    schedule_match = re.search(schedule_pattern, file_name)

    if schedule_match:
        sched_num = 'sch-' + schedule_match.group(1)
        postfix = schedule_match.group(2)
        num_of_capital_letters = should_include_postfix(file_name, 'SCHEDULE')
        if postfix and num_of_capital_letters > 1:
            sched_num += postfix[:num_of_capital_letters - 1].lower()
            sched_name = 'SCHEDULE ' + schedule_match.group(1) + postfix[-1]
        else:
            sched_num += postfix.lower()
            sched_name = 'SCHEDULE ' + schedule_match.group(1) + postfix
        sched_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sched_num)}.html"
        return sched_num, sched_name, sched_url, None
    else:
        schedule_pattern = r'SCHEDULE\.html$'
        schedule_match = re.search(schedule_pattern, file_name)
        if schedule_match:
            sched_num = 'sch-0'
            sched_name = 'SCHEDULE'
            sched_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sched_num)}.html"
            return sched_num, sched_name, sched_url, None
        else:
            schedule_pattern = r'SCHEDULE\s(.*)\.html$'
            schedule_match = re.search(schedule_pattern, file_name)
            if schedule_match:
                sched_num = 'sch-' + schedule_match.group(1)
                sched_name = 'SCHEDULE ' + schedule_match.group(1)
                sched_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sched_num)}.html"
                return sched_num, sched_name, sched_url, None

    # match longtitle pattern
    longtitle_pattern = r'LONG TITLE\.html$'
    longtitle_match = re.search(longtitle_pattern, file_name)

    if longtitle_match:
        sect_url = f"{base_url}/{urllib.parse.quote(directory_name)}/LONG%20TITLE.html"
        # return "LONG TITLE" instead of '0'
        return "LONG TITLE", 'LONG TITLE', sect_url, None

    # match notes pattern
    notes_pattern = r'NOTES\.html$'
    notes_match = re.search(notes_pattern, file_name)

    if notes_match:
        sect_url = f"{base_url}/{urllib.parse.quote(directory_name)}/NOTES.html"
        return "NOTES", 'NOTES', sect_url, None

    if not any([sect_match, schedule_match, longtitle_match, notes_match]):
        logger.error(f'Unmatched file: {file_name}')
        return None, None, None, file_name


def get_index_url(directory_name):
    base_url = "https://storage.googleapis.com/law-docs/2023-05-27__ACT-LEGISLATION_HTML"

    index_url = f"{base_url}/{urllib.parse.quote(directory_name)}/index.html"
    return index_url


def rename_files_in_directory(directory):
    legislation_name = os.path.basename(directory)
    year_match = re.search(r'\d{4}$', legislation_name)
    if year_match:
        year = year_match.group(0)
    else:
        logger.error(f"Could not get year for {legislation_name}")
        year = None
    sections = []
    unmatched_files = []
    for file in os.listdir(directory):
        f = os.path.splitext(file)[0]
        if f == legislation_name:
            os.rename(os.path.join(directory, file),
                      os.path.join(directory, "index.html"))
            sections.append({
                "section_name": "index",
                "section_order": "index",
                "section_url": get_index_url(legislation_name)
            })
        else:
            sect_num, sect_name, sect_url, unmatched_file = extract_section_details(
                file, legislation_name)
            if sect_num is not None and sect_name is not None:
                new_file_name = f"{sect_num}.html"

                os.rename(os.path.join(directory, file),
                          os.path.join(directory, new_file_name))

                if sect_num.startswith('sch'):  # split the schedule num
                    _, section_order = sect_num.split('-')
                elif sect_num.startswith('s'):  # split the section num
                    _, section_order = sect_num[0], sect_num[1:]
                else:  # for "LONG TITLE" and "NOTES"
                    section_order = sect_num

                sections.append({
                    "section_name": sect_name,
                    "section_order": section_order,
                    "section_url": sect_url
                })
            elif unmatched_file:
                unmatched_files.append(unmatched_file)
            else:
                logger.error(f"Could not extract section details for {file}")
    return {"legislation": legislation_name, "year": year, "sections": sections, "unmatched_files": unmatched_files}


def create_legislation_json(directory):
    legislation_details = rename_files_in_directory(directory)
    return legislation_details

### Process all directories


In [19]:
def process_all_directories(main_directory):
    all_legislations = []
    missing_sections = {}
    original_file_counts = {}

    for directory in os.listdir(main_directory):
        directory_path = os.path.join(main_directory, directory)
        if os.path.isdir(directory_path):
            original_file_counts[directory] = len(os.listdir(directory_path))

    # Process directories and count sections in JSON
    for directory in os.listdir(main_directory):
        directory_path = os.path.join(main_directory, directory)
        if os.path.isdir(directory_path):
            legislation_dict = create_legislation_json(directory_path)
            section_count = len(legislation_dict["sections"])
            unmatched_files = legislation_dict["unmatched_files"]

            if original_file_counts[directory] != section_count:
                missing_sections[directory] = unmatched_files

            all_legislations.append(legislation_dict)

    with open('../COPY_ACT-LEGISLATION_HTML/legislations.json', 'w') as f:
        json.dump(all_legislations, f, indent=2)

    return missing_sections

## Validation


In [20]:
GREEN = '\033[92m'
RED = '\033[91m'
RESET = '\033[0m'

In [21]:
def check_unique_urls(path):
    # Open the json file
    with open(path) as json_file:
        json_data = json.load(json_file)

    url_set = set()

    # List to store duplicate sections
    duplicate_sections = []

    # Iterate through each legislation
    for legislation in json_data:

        # Iterate through each section in the legislation
        for section in legislation['sections']:

            # Get the URL of the section
            url = section.get('section_url', None)

            if url:
                # If URL is already in the set, it's not unique, add it to duplicate_sections
                if url in url_set:
                    print(section)
                    duplicate_sections.append(section)
                else:
                    # If URL is not in the set, add it to the set
                    url_set.add(url)

    # Write duplicate sections to a JSON file
    with open('../2023-05-27__ACT-LEGISLATION_HTML/duplicate_sections.json', 'w') as outfile:
        json.dump(duplicate_sections, outfile, indent=4)


def group_sections_by_url(file_path):
    # Load the data from the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Use a defaultdict to easily group data by section_url
    grouped_data = defaultdict(list)

    # Iterate over each section in data
    for section in data:
        # Append the section to the list of sections for the corresponding section_url
        grouped_data[section["section_url"]].append(section)

    # Convert back to a regular dict for serialization
    grouped_data = dict(grouped_data)

    # Write the grouped data back to the JSON file
    with open(file_path, 'w') as file:
        json.dump(grouped_data, file, indent=4)


def count_sections(data):
    count = 0
    for legislation in data:
        for section in legislation['sections']:
            count += 1
    return count


def count_files(directory):
    count = 0
    for root, _, files in os.walk(directory):
        count += len(files)
    return count


def count_folders(directory):
    count = 0
    for _, dirnames, _ in os.walk(directory):
        count += len(dirnames)
    return count


def check_sum(target_path, source_path):
    with open(os.path.join(target_path, "legislations.json")) as json_file:
        json_data = json.load(json_file)

    num_legislation_source = count_folders(source_path)
    num_legislation_target = count_folders(target_path)
    num_legislation_json = len(json_data)
    if num_legislation_json == num_legislation_source == num_legislation_target:
        print(GREEN+"Legislation count check passed!" + RESET)
        print(f"    Number of folders source: {num_legislation_source}")
        print(f"    Number of folders target: {num_legislation_target}")
        print(f"    Number of legislations json: {num_legislation_json}")
    else:
        print(RED+"Legislation count check failed!" + RESET)
        print(f"    Number of folders source: {num_legislation_source}")
        print(f"    Number of folders target: {num_legislation_target}")
        print(f"    Number of legislations json: {num_legislation_json}")

    num_files_source = count_files(source_path)
    num_files_target = count_files(target_path)
    num_sections_json = count_sections(json_data)
    if num_sections_json == num_files_source == num_files_target:
        print(GREEN+"Section count check passed!" + RESET)
        print(f"    Number of files source: {num_files_source}")
        print(f"    Number of files target: {num_files_target}")
        print(f"    Number of sections json: {num_sections_json}")
    else:
        print(RED+"Section count check failed!" + RESET)
        print(f"    Number of files source: {num_files_source}")
        print(f"    Number of files target: {num_files_target}")
        print(f"    Number of sections json: {num_sections_json}")

        print(
            RED+f"MISSING {num_files_source - num_files_target} SECTIONS IN TARGET" + RESET)
        print(
            RED+f"MISSING {num_files_source - num_sections_json} SECTIONS IN JSON" + RESET)

In [24]:

missing = process_all_directories(main_directory)
check_unique_urls(main_directory+"/legislations.json")
check_sum(main_directory, source_directory)

print(RED+f"MISSING: {missing}" + RESET)

2023-06-01 20:21:02,758 - ERROR - Could not get year for MAGNA CARTA 1297 25 EDW 1 C 29
2023-06-01 20:21:02,895 - ERROR - Could not get year for HEAVY VEHICLE NATIONAL LAW ACT
2023-06-01 20:21:03,784 - ERROR - Could not get year for ELECTRONIC CONVEYANCING NATIONAL LAW ACT
2023-06-01 20:21:04,001 - ERROR - Could not get year for APPROPRIATION OFFICE OF THE LEGISLATIVE ASSEMBLY ACT 20222023 NO 2
2023-06-01 20:21:04,342 - ERROR - Could not get year for APPROPRIATION ACT 20222023 NO 2
2023-06-01 20:21:04,348 - ERROR - Could not get year for HEALTH PRACTITIONER REGULATION NATIONAL LAW ACT


{'section_name': 'N3yearly investigation by actuary', 'section_order': '79', 'section_url': 'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/LONG%20SERVICE%20LEAVE%20PORTABLE%20SCHEMES%20ACT%202009/s79.html'}
{'section_name': 'S 3500 and Legislation Act s 47', 'section_order': '126a', 'section_url': 'https://storage.googleapis.com/law-docs/ACT-LEGISLATION_HTML/CONSTRUCTION%20OCCUPATIONS%20LICENSING%20ACT%202004/s126a.html'}


FileNotFoundError: [Errno 2] No such file or directory: '../2023-05-27__ACT-LEGISLATION_HTML/duplicate_sections.json'

In [385]:
schedule_pattern = r'SCHEDULE (\d+)([A-Z]*)\.html$'
schedule_match = re.search(schedule_pattern, "SCHEDULE 1A.html")
schedule_match.group(1)

'1'

In [386]:
post_num_pattern = r'SCHEDULE\s(\d+)([A-Z]+)'
post_num_match = re.search(post_num_pattern, "SCHEDULE 1A.html")
len(post_num_match.group(2)) if post_num_match else 0

1

In [387]:
def TEST_extract_section_details(file_name, directory_name):
    base_url = "https://storage.googleapis.com/law-docs/2023-05-27__ACT-LEGISLATION_HTML"
    schedule_pattern = r'SCHEDULE (\d+)([A-Z]*)\.html$'
    schedule_match = re.search(schedule_pattern, file_name)
    if schedule_match:
        sched_num = 'sch-' + schedule_match.group(1)
        postfix = schedule_match.group(2)
        num_of_capital_letters = should_include_postfix(file_name)
        if postfix and num_of_capital_letters > 1:
            sched_num += postfix[:num_of_capital_letters - 1].lower()
            sched_name = 'SCHEDULE ' + schedule_match.group(1) + postfix[-1]
        else:
            sched_num += postfix.lower()
            sched_name = 'SCHEDULE ' + schedule_match.group(1) + postfix
        sched_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sched_num)}.html"
        return sched_num, sched_name, sched_url, None
    else:
        schedule_pattern = r'SCHEDULE\.html$'
        schedule_match = re.search(schedule_pattern, file_name)
        if schedule_match:
            sched_num = 'sch-0'
            sched_name = 'SCHEDULE'
            sched_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sched_num)}.html"
            return sched_num, sched_name, sched_url, None
        else:
            schedule_pattern = r'SCHEDULE\s(.*)\.html$'
            schedule_match = re.search(schedule_pattern, file_name)
            if schedule_match:
                sched_num = 'sch-' + schedule_match.group(1)
                sched_name = 'SCHEDULE ' + schedule_match.group(1)
                sched_url = f"{base_url}/{urllib.parse.quote(directory_name)}/{urllib.parse.quote(sched_num)}.html"
                return sched_num, sched_name, sched_url, None

In [390]:
check_unique_urls("../2023-05-27__ACT-LEGISLATION_HTML/legislations.json")

In [1]:
import os

def get_folders(directory):
    # List to store the names of folders
    folder_list = []

    # Iterate over all the items in the directory
    for item in os.listdir(directory):
        # Check if the item is a directory
        if os.path.isdir(os.path.join(directory, item)):
            # Check if the last 4 characters of the folder name are not numeric
            if not item[-4:].isnumeric():
                # If not, add the folder name to the list
                folder_list.append(item)

    return folder_list


In [3]:
get_folders('/Users/home/projects/selenium-py/docs/2023-05-18__ACT-LEGISLATION_HTML')

[]

In [2]:
import os
from bs4 import BeautifulSoup
import codecs

def find_article_tags(directory, log_file):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file == 'combined.html':
                filepath = os.path.join(root, file)
                with codecs.open(filepath, 'r', 'utf-8') as f:
                    soup = BeautifulSoup(f, 'html.parser')
                    for article in soup.find_all('article'):
                        if 'id' not in article.attrs:
                            h2 = article.find('h2')
                            if h2:
                                with open(log_file, 'a') as log:
                                    log.write(f'File Path: {filepath}, h2 content: {h2.text}\n')

# Example usage


In [4]:
find_article_tags('/Users/home/projects/selenium-py/ACT-LEGISLATION_HTML copy', 'log.txt')