In [1]:
# Load dotenv
from dotenv import load_dotenv
import os

load_dotenv()

# Crawl settings, read from the process environment after load_dotenv() ran.
# The first two have no fallback and are None when the variable is unset.
LEGISLATION_URL_PREFIX = os.environ.get("LEGISLATION_URL_PREFIX")
LEGISLATION_URI_LIST_FILE = os.environ.get("LEGISLATION_URI_LIST_FILE")
# Directory that receives the generated JSON files.
JSON_OUTPUT_DIR = os.environ.get("JSON_OUTPUT_DIR", "json_out")
# Passed to LegislationCrawler as max_depth; defaults to 2.
DEPTH_LIMIT = int(os.environ.get("DEPTH_LIMIT", 2))

In [2]:
from bs4 import BeautifulSoup


class XmlSchemaAccumulator:
    """Merge the element structure of parsed documents into one nested tree.

    ``schema_tree`` maps a tag name to a dict with two keys: ``"attributes"``
    (a set of attribute names seen on that tag at that position) and
    ``"children"`` (a nested mapping of the same shape).
    """

    def __init__(self):
        self.schema_tree = {}

    def add_soup(self, soup):
        """Fold the direct children of ``soup`` into the tree and return the tree."""
        top_level_tags = soup.find_all(recursive=False)
        for top_tag in top_level_tags:
            self._traverse(top_tag, self.schema_tree)
        return self.schema_tree

    def _traverse(self, tag, current_level):
        """Record ``tag`` (and, recursively, its child tags) under ``current_level``."""
        if tag.name is None:
            return

        entry = current_level.setdefault(
            tag.name, {"attributes": set(), "children": {}}
        )
        entry["attributes"].update(tag.attrs.keys())

        for child_tag in tag.find_all(recursive=False):
            self._traverse(child_tag, entry["children"])

    def print_current_schema(self, node=None, depth=0):
        """Print the tree, one tag per line, indented two spaces per level.

        Called without ``node`` it prints a header line and starts from the
        root of ``schema_tree``; the recursive calls pass the child mapping.
        """
        if node is None:
            print("\n--- Current Accumulated XML Schema ---")
            node = self.schema_tree

        padding = "  " * depth
        for tag_name in node:
            tag_data = node[tag_name]
            joined_attrs = ", ".join(sorted(tag_data["attributes"]))
            if joined_attrs:
                suffix = f" (Attributes: {joined_attrs})"
            else:
                suffix = ""

            print(f"{padding}└── <{tag_name}>{suffix}")
            self.print_current_schema(tag_data["children"], depth + 1)

In [3]:
import os
import requests
import json
from collections import deque
from tqdm.auto import tqdm


class LegislationCrawler:
    def __init__(self, max_depth=2):
        """Set up the crawl state and make sure the working directories exist.

        Args:
            max_depth: Citation URIs found in a document are only collected
                for further crawling while that document's depth is below
                this value.
        """
        self.max_depth = max_depth
        # Base document URLs (without "/data.xml") that were already handled.
        self.visited_urls = set()
        # Pending (url, depth) pairs, consumed from the left by crawl().
        self.queue = deque()
        self.schema_accumulator = XmlSchemaAccumulator()
        # Declared here so the attribute always exists; _get_xml_soup
        # overwrites it and crawl() shows it in the progress bar.
        self.last_fetch_status = ""

        os.makedirs(".cache", exist_ok=True)
        os.makedirs(JSON_OUTPUT_DIR, exist_ok=True)

    def get_safe_filename(self, url):
        """Turn a document URL into a flat name for cache and output files.

        The URL scheme and the host part of LEGISLATION_URL_PREFIX are
        dropped, then every remaining "/" becomes "_".

        Args:
            url: Document URL, with or without a scheme.

        Returns:
            str: The flattened file name.
        """
        without_scheme = url.split("://")[-1]
        # The prefix is concatenated directly into request URLs, so it
        # normally carries a scheme itself. Strip it the same way as the URL,
        # otherwise it can never match the already scheme-less string. An
        # unset prefix (None) simply means nothing is removed.
        prefix_host = (LEGISLATION_URL_PREFIX or "").split("://")[-1].rstrip("/")
        if prefix_host:
            without_scheme = without_scheme.replace(prefix_host + "/", "")
        return without_scheme.replace("/", "_")

    def normalize_url(self, uri):
        if not uri:
            return None
        clean_uri = uri.replace("/id/", "/")
        clean_uri = clean_uri.rstrip("/")
        if not clean_uri.endswith("data.xml"):
            return f"{clean_uri}/data.xml"
        return clean_uri

    def clean_url(self, url):
        return url.replace("/id/", "/").rstrip("/")

    def extract_identifier(self, soup):
        identifier = {}
        title = soup.find("dc:title")
        identifier["title"] = title.text.strip() if title else None
        description = soup.find("dc:description")
        identifier["description"] = description.text.strip() if description else None
        publisher = soup.find("dc:publisher")
        identifier["publisher"] = publisher.text.strip() if publisher else None
        modified = soup.find("dc:modified")
        identifier["modified"] = modified.text.strip() if modified else None
        identifier_tag = soup.find("dc:identifier")
        identifier["uri"] = identifier_tag.text.strip() if identifier_tag else None
        valid = soup.find("dct:valid")
        identifier["valid_date"] = valid.text.strip() if valid else None
        return identifier

    def extract_super(self, soup):
        super = {}
        supersedes = soup.find("ukm:Supersedes")
        if supersedes:
            super["supersedes"] = self.clean_url(supersedes.get("URI"))
        superseded_by = soup.find("ukm:SupersededBy")
        if superseded_by:
            super["superseded_by"] = self.clean_url(superseded_by.get("URI"))

        return super

    def extract_metadata(self, soup):
        metadata = {}
        metadata_block = soup.find(
            ["ukm:PrimaryMetadata", "ukm:SecondaryMetadata", "ukm:EUMetadata"]
        )
        if not metadata_block:
            return metadata

        year = metadata_block.find("ukm:Year")
        metadata["year"] = year.get("Value") if year else None
        number = metadata_block.find("ukm:Number")
        metadata["number"] = number.get("Value") if number else None
        enactment = metadata_block.find("ukm:EnactmentDate")
        metadata["enactment_date"] = enactment.get("Date") if enactment else None
        status = metadata_block.find("ukm:DocumentStatus")
        metadata["status"] = status.get("Value") if status else None
        isbn = metadata_block.find("ukm:ISBN")
        metadata["isbn"] = isbn.get("Value") if isbn else None
        category = metadata_block.find("ukm:DocumentCategory")
        metadata["category"] = category.get("Value") if category else None
        coming_into_force = metadata_block.find("ukm:ComingIntoForce")
        if coming_into_force:
            date_tag = coming_into_force.find("ukm:DateTime")
            metadata["coming_into_force"] = (
                date_tag.get("Date").strip() if date_tag else None
            )

        unapplied_effects_list = []
        for effect in metadata_block.find_all("ukm:UnappliedEffect"):
            effect_data = {
                "effect_id": effect.get("EffectId"),
                "type": effect.get("Type"),
                "affected_provisions": effect.get("AffectedProvisions"),
                "affecting_provisions": effect.get("AffectingProvisions"),
                "requires_applied": effect.get("RequiresApplied") == "true",
                "notes": effect.get("Notes"),
                "modified_date": effect.get("Modified"),
                "affecting_title": None,
                "in_force_date": None,
                "in_force_qualification": None,
            }
            affecting_title = effect.find("ukm:AffectingTitle")
            if affecting_title:
                effect_data["affecting_title"] = affecting_title.text.strip()
            in_force = effect.find("ukm:InForce")
            if in_force:
                if in_force.get("Date"):
                    effect_data["in_force_date"] = in_force.get("Date")
                elif in_force.get("Prospective") == "true":
                    effect_data["in_force_date"] = "Prospective"
                effect_data["in_force_qualification"] = in_force.get(
                    "Qualification"
                ) or in_force.get("OtherQualification")
            unapplied_effects_list.append(effect_data)

        metadata["unapplied_effects"] = unapplied_effects_list
        return metadata

    def _get_xml_soup(self, xml_url, safe_name, current_depth, timeout=30):
        """Return the parsed XML for ``xml_url``, using the ``.cache`` directory.

        A cached copy is read when present. Otherwise the document is
        downloaded, pretty-printed, written to the cache and re-parsed from
        the pretty-printed bytes, so fresh and cached runs parse the same
        content. ``self.last_fetch_status`` is set to "Cached", "Fetched" or
        "Failed" plus the depth.

        Args:
            xml_url: URL of the document's data.xml.
            safe_name: File name to use inside the ``.cache`` directory.
            current_depth: Crawl depth, used only in the status text.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot block the crawl indefinitely.

        Returns:
            BeautifulSoup parsed with the "xml" parser, or None when the
            request fails.
        """
        cache_filepath = os.path.join(".cache", safe_name)

        if os.path.exists(cache_filepath):
            self.last_fetch_status = f"Cached (Depth {current_depth})"
            with open(cache_filepath, "rb") as f:
                xml_content = f.read()
            return BeautifulSoup(xml_content, "xml")

        try:
            response = requests.get(xml_url, timeout=timeout)
            response.raise_for_status()
            raw_content = response.content
        except requests.exceptions.RequestException:
            # Best effort: a document that cannot be fetched is skipped, but
            # the progress bar reports the failure instead of "Fetched".
            self.last_fetch_status = f"Failed (Depth {current_depth})"
            return None

        self.last_fetch_status = f"Fetched (Depth {current_depth})"
        xml_content = BeautifulSoup(raw_content, "xml").prettify(encoding="utf-8")
        with open(cache_filepath, "wb") as f:
            f.write(xml_content)
        return BeautifulSoup(xml_content, "xml")

    def _extract_commentaries(self, soup, current_depth, found_citations):
        commentaries_map = {}
        for comm in soup.find_all("Commentary"):
            comm_id = comm.get("id")
            if not comm_id:
                continue

            full_text = comm.get_text(separator=" ", strip=True)
            citations = []

            for cit in comm.find_all("Citation"):
                cit_uri = cit.get("URI")
                citations.append(
                    {
                        "id": cit.get("id"),
                        "uri": cit_uri,
                        "title": cit.get("Title"),
                        "class": cit.get("Class"),
                        "year": cit.get("Year"),
                        "number": cit.get("Number"),
                        "text": cit.text.strip(),
                    }
                )
                if cit_uri and current_depth < self.max_depth:
                    found_citations.add(cit_uri)

            citation_subrefs = []
            for subref in comm.find_all("CitationSubRef"):
                citation_subrefs.append(
                    {
                        "id": subref.get("id"),
                        "uri": subref.get("URI"),
                        "citation_ref": subref.get("CitationRef"),
                        "section_ref": subref.get("SectionRef"),
                        "text": subref.text.strip(),
                    }
                )

            commentaries_map[comm_id] = {
                "type": comm.get("Type"),
                "text": full_text,
                "citations": citations,
                "citation_subrefs": citation_subrefs,
            }
        return commentaries_map

    def _resolve_commentaries(self, element, commentaries_map, exclude_parent=None):
        resolved = []
        for cref in element.find_all("CommentaryRef"):
            if exclude_parent and cref.find_parent(exclude_parent) is not None:
                continue
            ref_id = cref.get("Ref")
            if ref_id in commentaries_map:
                resolved.append({"ref_id": ref_id, **commentaries_map[ref_id]})
        return resolved

    def _get_joined_text(self, element):
        return " ".join(
            [t.get_text(separator=" ", strip=True) for t in element.find_all("Text")]
        )

    def _extract_body_parts(self, soup, commentaries_map):
        document_tree = {}
        body_p1s = [p1 for p1 in soup.find_all("P1") if not p1.find_parent("Schedules")]

        for idx, section in enumerate(body_p1s, start=1):
            part = section.find_parent("Part")
            chapter = section.find_parent("Chapter")
            p1group = section.find_parent("P1group")

            part_num = (
                part.find("Number").text.strip()
                if part and part.find("Number")
                else "No Part"
            )
            part_uri = part.get("DocumentURI") if part else None
            part_restrict_start_date = part.get("RestrictStartDate") if part else None
            part_restrict_end_date = part.get("RestrictEndDate") if part else None
            part_status = part.get("Status") if part else None
            part_title = (
                part.find("Title").text.strip() if part and part.find("Title") else None
            )
            chapter_uri = chapter.get("DocumentURI") if chapter else None
            chapter_restrict_start_date = (
                chapter.get("RestrictStartDate") if chapter else None
            )
            chapter_restrict_end_date = (
                chapter.get("RestrictEndDate") if chapter else None
            )
            chapter_status = chapter.get("Status") if chapter else None
            chapter_num = (
                chapter.find("Number").text.strip()
                if chapter and chapter.find("Number")
                else "No Chapter"
            )
            chapter_title = (
                chapter.find("Title").text.strip()
                if chapter and chapter.find("Title")
                else None
            )

            chap_dict_key = chapter_uri or chapter_num

            if part_num not in document_tree:
                document_tree[part_num] = {
                    "title": part_title,
                    "part_uri": part_uri,
                    "restrict_start_date": part_restrict_start_date,
                    "restrict_end_date": part_restrict_end_date,
                    "status": part_status,
                    "chapters": {},
                }

            if chap_dict_key not in document_tree[part_num]["chapters"]:
                document_tree[part_num]["chapters"][chap_dict_key] = {
                    "chapter_uri": chapter_uri,
                    "chapter_number": (
                        chapter_num if chapter_num != "No Chapter" else None
                    ),
                    "restrict_start_date": chapter_restrict_start_date,
                    "restrict_end_date": chapter_restrict_end_date,
                    "status": chapter_status,
                    "title": chapter_title,
                    "sections": [],
                }

            section_num = (
                section.find("Pnumber").text.strip()
                if section.find("Pnumber")
                else None
            )
            section_title = (
                p1group.find("Title").text.strip()
                if p1group and p1group.find("Title")
                else None
            )
            section_restrict_start_date = (
                p1group.get("RestrictStartDate") if p1group else None
            )
            section_restrict_end_date = (
                p1group.get("RestrictEndDate") if p1group else None
            )
            section_restrict_extent = p1group.get("RestrictExtent") if p1group else None
            section_status = p1group.get("Status") if p1group else None

            section_data = {
                "order": idx,
                "section_number": section_num,
                "title": section_title,
                "uri": section.get("DocumentURI") or section.get("id"),
                "restrict_start_date": section_restrict_start_date,
                "restrict_end_date": section_restrict_end_date,
                "restrict_extent": section_restrict_extent,
                "status": section_status,
                "commentaries": self._resolve_commentaries(
                    section, commentaries_map, exclude_parent="P2"
                ),
                "paragraphs": [],
            }

            paragraphs = section.find_all(["P2", "P3", "P4"])
            if not paragraphs:
                section_data["text"] = self._get_joined_text(section)
            else:
                for idx, para in enumerate(paragraphs, start=1):
                    para_num = (
                        para.find("Pnumber").text.strip()
                        if para.find("Pnumber")
                        else None
                    )
                    para_restrict_start_date = para.get("RestrictStartDate")
                    para_restrict_end_date = para.get("RestrictEndDate")
                    para_restrict_extent = para.get("RestrictExtent")
                    para_status = para.get("Status")
                    section_data["paragraphs"].append(
                        {
                            "order": idx,
                            "paragraph_number": para_num,
                            "text": self._get_joined_text(para),
                            "uri": para.get("DocumentURI") or para.get("id"),
                            "restrict_start_date": para_restrict_start_date,
                            "restrict_end_date": para_restrict_end_date,
                            "restrict_extent": para_restrict_extent,
                            "status": para_status,
                            "commentaries": self._resolve_commentaries(
                                para, commentaries_map
                            ),
                        }
                    )

            document_tree[part_num]["chapters"][chap_dict_key]["sections"].append(
                section_data
            )

        parts_list = []
        for part_idx, (p_num, p_data) in enumerate(document_tree.items(), start=1):
            part_obj = {
                "order": part_idx,
                "part_number": p_num if p_num != "No Part" else None,
                "uri": p_data["part_uri"],
                "restrict_start_date": p_data["restrict_start_date"],
                "restrict_end_date": p_data["restrict_end_date"],
                "status": p_data["status"],
                "title": p_data["title"],
                "chapters": [],
            }
            for chap_idx, (chap_key, c_data) in enumerate(
                p_data["chapters"].items(), start=1
            ):
                part_obj["chapters"].append(
                    {
                        "order": chap_idx,
                        "chapter_number": c_data["chapter_number"],
                        "uri": c_data["chapter_uri"],
                        "restrict_start_date": c_data["restrict_start_date"],
                        "restrict_end_date": c_data["restrict_end_date"],
                        "status": c_data["status"],
                        "title": c_data["title"],
                        "sections": c_data["sections"],
                    }
                )
            parts_list.append(part_obj)

        return parts_list

    def _extract_schedules(self, soup, commentaries_map):
        schedules_list = []
        schedules_root = soup.find("Schedules")

        if not schedules_root:
            return schedules_list

        for sched_idx, schedule in enumerate(
            schedules_root.find_all("Schedule"), start=1
        ):
            sched_num = (
                schedule.find("Number").text.strip()
                if schedule.find("Number")
                else None
            )
            sched_title_node = schedule.find("Title")
            sched_title = (
                sched_title_node.get_text(strip=True) if sched_title_node else None
            )
            sched_ref = (
                schedule.find("Reference").text.strip()
                if schedule.find("Reference")
                else None
            )

            sched_obj = {
                "order": sched_idx,
                "schedule_number": sched_num,
                "title": sched_title,
                "reference": sched_ref,
                "uri": schedule.get("DocumentURI") or schedule.get("id"),
                "paragraphs": [],
            }

            for p1_idx, p1 in enumerate(schedule.find_all("P1"), start=1):
                p1_num = p1.find("Pnumber").text.strip() if p1.find("Pnumber") else None
                pblock = p1.find_parent("Pblock")
                pblock_title = (
                    pblock.find("Title").get_text(strip=True)
                    if pblock and pblock.find("Title")
                    else None
                )

                p1_data = {
                    "order": p1_idx,
                    "paragraph_number": p1_num,
                    "crossheading": pblock_title,
                    "uri": p1.get("DocumentURI") or p1.get("id"),
                    "commentaries": self._resolve_commentaries(
                        p1, commentaries_map, exclude_parent="P2"
                    ),
                    "subparagraphs": [],
                }

                p2s = p1.find_all("P2")
                if not p2s:
                    p1_data["text"] = self._get_joined_text(p1)
                else:
                    for p2_idx, p2 in enumerate(p2s, start=1):
                        p2_num = (
                            p2.find("Pnumber").text.strip()
                            if p2.find("Pnumber")
                            else None
                        )
                        p1_data["subparagraphs"].append(
                            {
                                "order": p2_idx,
                                "subparagraph_number": p2_num,
                                "text": self._get_joined_text(p2),
                                "uri": p2.get("DocumentURI") or p2.get("id"),
                                "commentaries": self._resolve_commentaries(
                                    p2, commentaries_map
                                ),
                            }
                        )

                sched_obj["paragraphs"].append(p1_data)
            schedules_list.append(sched_obj)
        return schedules_list

    def _extract_explanatory_notes(self, soup, current_depth, found_citations):
        notes_root = soup.find("ExplanatoryNotes")
        if not notes_root:
            return None

        notes_uri = notes_root.get("DocumentURI") or notes_root.get("IdURI")

        paragraphs = []
        for p in notes_root.find_all("P"):
            p_text = self._get_joined_text(p)

            citations = []
            for cit in p.find_all("Citation"):
                cit_uri = cit.get("URI")
                citations.append(
                    {
                        "id": cit.get("id"),
                        "uri": cit_uri,
                        "title": cit.get("Title"),
                        "class": cit.get("Class"),
                        "year": cit.get("Year"),
                        "number": cit.get("Number"),
                        "text": cit.text.strip(),
                    }
                )
                if cit_uri and current_depth < self.max_depth:
                    found_citations.add(cit_uri)

            paragraphs.append({"text": p_text, "citations": citations})

        return {"uri": notes_uri, "paragraphs": paragraphs}

    def fetch_and_parse(self, target_url, current_depth):
        """Fetch one document, write its JSON file and queue follow-up URLs.

        The document is skipped when ``target_url`` is empty, was already
        visited, or cannot be fetched. Otherwise its structure is written to
        ``<JSON_OUTPUT_DIR>/<year>/<name>.json``. Collected citation URIs
        are queued one level deeper, and supersession links are queued at the
        same depth.

        Returns:
            None in every case; the results are the written file and the
            updated ``self.queue`` / ``self.visited_urls``.
        """
        xml_url = self.normalize_url(target_url)
        if not xml_url:
            return None

        # Documents are tracked by their URL without the "/data.xml" suffix.
        document_url = xml_url.replace("/data.xml", "")
        if document_url in self.visited_urls:
            return None
        self.visited_urls.add(document_url)

        file_name = self.get_safe_filename(xml_url)
        soup = self._get_xml_soup(xml_url, file_name, current_depth)
        if not soup:
            return None

        self.schema_accumulator.add_soup(soup)

        identifier = self.extract_identifier(soup)
        metadata = self.extract_metadata(soup)
        supersession = self.extract_super(soup)

        # Output files are grouped in one directory per document year.
        year_label = metadata.get("year") or "unknown_year"
        year_dir = os.path.join(JSON_OUTPUT_DIR, str(year_label))
        os.makedirs(year_dir, exist_ok=True)
        json_filepath = os.path.join(year_dir, file_name.replace(".xml", ".json"))

        found_citations = set()
        commentaries_map = self._extract_commentaries(
            soup, current_depth, found_citations
        )
        parts_list = self._extract_body_parts(soup, commentaries_map)
        schedules_list = self._extract_schedules(soup, commentaries_map)
        explanatory_notes = self._extract_explanatory_notes(
            soup, current_depth, found_citations
        )

        document_record = {
            "legislation_url": document_url,
            "identifier": identifier,
            "super": supersession,
            "metadata": metadata,
            "parts": parts_list,
            "schedules": schedules_list,
            "explanatory_notes": explanatory_notes,
        }
        serialized = json.dumps(document_record, indent=4)
        with open(json_filepath, "w", encoding="utf-8") as json_file:
            json_file.write(serialized)

        # Queue cited documents that have not been visited yet, one level deeper.
        for citation_uri in found_citations:
            citation_xml_url = self.normalize_url(citation_uri)
            if not citation_xml_url:
                continue
            if citation_xml_url.replace("/data.xml", "") in self.visited_urls:
                continue
            self.queue.append((citation_uri, current_depth + 1))

        # Supersession links stay at the current depth so they are always
        # retrieved, regardless of the depth limit.
        for relation in ("supersedes", "superseded_by"):
            related_url = supersession.get(relation)
            if related_url:
                self.queue.append((related_url, current_depth))

    def crawl(self, start_url):
        """Process ``start_url`` and everything queued from it, breadth-first.

        Documents are taken from the left of ``self.queue`` until it is
        empty. A tqdm bar shows progress; its total is re-estimated on every
        iteration as "processed so far + currently queued".
        """
        self.queue.append((start_url, 0))

        with tqdm(desc="Crawling Legislation", unit="docs") as progress:
            while len(self.queue) > 0:
                progress.total = progress.n + len(self.queue)
                progress.refresh()

                next_url, next_depth = self.queue.popleft()
                self.fetch_and_parse(next_url, next_depth)

                progress.set_postfix(
                    {
                        "queue": len(self.queue),
                        "action": getattr(self, "last_fetch_status", ""),
                    }
                )
                progress.update(1)

In [4]:
crawler = LegislationCrawler(max_depth=DEPTH_LIMIT)

# Every non-blank line of LEGISLATION_URI_LIST_FILE is one URI to crawl.
legislation_uris = []
with open(LEGISLATION_URI_LIST_FILE, "r") as f:
    for line in f:
        if line.strip():
            legislation_uris.append(line.strip())

# The same crawler instance is reused, so documents already visited for an
# earlier URI are not processed again.
for uri in tqdm(legislation_uris, desc="Processing Legislation URIs"):
    full_url = f"{LEGISLATION_URL_PREFIX}{uri}"
    crawler.crawl(full_url)

Processing Legislation URIs:   0%|          | 0/25 [00:00<?, ?it/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

Crawling Legislation: 0docs [00:00, ?docs/s]

In [5]:
# Print the tag/attribute tree accumulated from every document parsed by the crawler.
crawler.schema_accumulator.print_current_schema()


--- Current Accumulated XML Schema ---
└── <Legislation> (Attributes: ConfersPower, DocumentURI, IdURI, NumberOfProvisions, RestrictEndDate, RestrictExtent, RestrictStartDate, SchemaVersion, Status, xml:lang, xmlns, xmlns:xsi, xsi:schemaLocation)
  └── <Metadata> (Attributes: xmlns:atom, xmlns:dc, xmlns:dct, xmlns:ukm)
    └── <identifier>
    └── <title> (Attributes: xml:lang)
    └── <description>
    └── <type>
    └── <format>
    └── <language>
    └── <publisher>
    └── <modified>
    └── <contributor>
    └── <valid>
    └── <link> (Attributes: href, hreflang, rel, title, type)
    └── <PrimaryMetadata>
      └── <DocumentClassification>
        └── <DocumentCategory> (Attributes: Value)
        └── <DocumentMainType> (Attributes: Value)
        └── <DocumentStatus> (Attributes: Value)
      └── <Year> (Attributes: Value)
      └── <Number> (Attributes: Value)
      └── <EnactmentDate> (Attributes: Date)
      └── <ISBN> (Attributes: Value)
      └── <UnappliedEffects>
       