NER and RE

In [None]:
import networkx as nx
import csv
import os
import html
import pandas as pd
import re
from collections import defaultdict
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import html

def extract_source_act_name(root):
    """Return the name of the Act described by ``root``, or ``None``.

    The first ``<title>`` descendant with non-empty text wins.  Otherwise
    every ``<text>`` element nested in a ``<para>`` is searched for a
    short-title phrase (for example "This Act may be cited as"); whatever
    follows the first matching phrase is returned with trailing full stops
    removed.

    Args:
        root: Parsed XML element to search.

    Returns:
        The HTML-unescaped title, the text following a short-title phrase,
        or ``None`` when neither is present.
    """
    title_element = root.find(".//title")
    if title_element is not None and title_element.text:
        return html.unescape(title_element.text.strip())

    citation_patterns = (
        "This Act may be cited as",
        "This Act is the",
        "This Act shall be known as",
        "This Act is called",
        "The Short Title of this Act is",
    )

    for para in root.iter("para"):
        for text_element in para.iter("text"):
            # itertext() yields tail text as well as element text, so words
            # that follow an inline child element are no longer lost.
            raw_text = html.unescape("".join(text_element.itertext()))
            # Collapse whitespace runs so line breaks inside the sentence do
            # not stop a phrase from matching.
            full_text = " ".join(raw_text.split())

            for pattern in citation_patterns:
                if pattern in full_text:
                    act_title = full_text.split(pattern, 1)[-1].strip()
                    return act_title.rstrip(".").rstrip()

    return None


def load_official_act_patterns(csv_file):
    """Load Act names from a CSV file and compile one regex per name.

    Names are read from the ``"Extracted Text"`` column, de-duplicated and
    ordered longest first.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of ``(name, compiled_pattern)`` tuples.  An empty list is
        returned, and a warning is logged, when the file cannot be read or
        lacks the expected column.
    """
    import logging

    try:
        df = pd.read_csv(csv_file, encoding="utf-8")
        # astype(str) keeps len() and re.escape() safe for non-string cells.
        names = df["Extracted Text"].dropna().astype(str).unique()
    except (OSError, KeyError, ValueError) as exc:
        # OSError: unreadable file; KeyError: missing column; ValueError
        # covers pandas parser/empty-data errors and decoding failures.
        logging.getLogger(__name__).warning(
            "Could not load Act names from %r: %s", csv_file, exc
        )
        return []

    unique_names = sorted(
        (name for name in names if name.strip()), key=len, reverse=True
    )
    # Lookarounds rather than \b: \b fails next to a name that starts or ends
    # with a non-word character such as ")" (e.g. "Foo Act (No 2)").
    return [
        (name, re.compile(rf"(?<!\w){re.escape(name)}(?!\w)", re.IGNORECASE))
        for name in unique_names
    ]



def extract_all_act_names(text, compiled_patterns):
    """Return the Act names whose pattern is found in ``text``.

    A name is dropped when it is a substring of a name that has already been
    kept, so the result depends on the order of ``compiled_patterns``.

    Args:
        text: String to search.
        compiled_patterns: Iterable of ``(name, compiled_regex)`` pairs.

    Returns:
        A list of the kept names (built from a set, so order is unspecified).
    """
    kept = set()

    for act_name, regex in compiled_patterns:
        if not regex.search(text):
            continue
        # Skip a name already covered by a previously kept name.
        if any(act_name in earlier for earlier in kept):
            continue
        kept.add(act_name)

    return list(kept)








def normalize_act_name(name):
    """Return ``name`` lower-cased with a straight apostrophe and single spaces.

    Args:
        name: Act name to normalise.

    Returns:
        The lower-cased name with "’" replaced by "'", every run of
        whitespace collapsed to one space, and surrounding whitespace removed.
    """
    straightened = name.lower().replace("’", "'")
    # A single replace("  ", " ") pass leaves double spaces behind whenever
    # three or more spaces occur in a row; collapse whole runs instead.
    return re.sub(r"\s+", " ", straightened).strip()

def find_principal_act_name(root, all_elements):
    """Return the name of the principal Act that this Act amends, if any.

    Several lookups are tried in a fixed order and the first hit is returned.
    Most read the ``atidlm:name`` attribute of an ``atidlm:link`` inside a
    citation; the later ones fall back to ``<leg-title>`` text or to the
    citation's full text.

    Args:
        root: Root XML element, used for the ``findall`` based lookups.
        all_elements: Elements scanned by tag suffix.  It is iterated several
            times, so it must be re-iterable (e.g. a list).

    Returns:
        The principal Act's name, or ``None`` when no lookup succeeds.
    """
    ns_map = {'atidlm': 'http://www.arbortext.com/namespace/atidlm'}

    def lowered_text(node):
        # Full text content of the node, trimmed and lower-cased.
        return "".join(node.itertext()).strip().lower()

    def linked_name(container):
        # Name attribute of the link inside the first citation, if any.
        citation = container.find(".//{*}citation")
        if citation is None:
            return None
        link = citation.find(".//atidlm:link", ns_map)
        if link is None:
            return None
        return link.attrib.get("{http://www.arbortext.com/namespace/atidlm}name")

    def scan(tag_suffix, accepts):
        # First linked name among elements whose tag ends with tag_suffix
        # and whose lower-cased text satisfies accepts().
        for node in all_elements:
            if not node.tag.endswith(tag_suffix):
                continue
            if accepts(lowered_text(node)):
                hit = linked_name(node)
                if hit:
                    return hit
        return None

    # 1. "... is called the principal Act"
    found = scan("para", lambda t: "is called the principal act" in t)
    if found:
        return found

    # 2. Short-title paragraph that also mentions the principal Act.
    found = scan(
        "para",
        lambda t: "this act may be cited as" in t and "principal act" in t,
    )
    if found:
        return found

    # 3. Long title of the form "An Act to amend the ...".
    for long_title in root.findall(".//long-title"):
        first_para = long_title.find(".//para")
        if first_para is None:
            continue
        if "an act to amend the" in lowered_text(first_para):
            found = linked_name(first_para)
            if found:
                return found

    # 4. "This Act amends the ... (the principal Act)", with a <leg-title>
    #    fallback when the citation carries no usable link.
    for node in all_elements:
        if not node.tag.endswith("para"):
            continue
        body = lowered_text(node)
        if "this act amends the" not in body or "the principal act" not in body:
            continue
        citation = node.find(".//{*}citation")
        if citation is None:
            continue
        link = citation.find(".//atidlm:link", ns_map)
        if link is not None:
            linked = link.attrib.get("{http://www.arbortext.com/namespace/atidlm}name")
            if linked:
                return linked
        title = citation.find(".//leg-title")
        if title is not None and title.text:
            return title.text.strip()

    # 5. A text element that starts with "This Act amends the".
    found = scan("text", lambda t: t.startswith("this act amends the"))
    if found:
        return found

    # 6. Looser variant of lookup 1.
    found = scan("para", lambda t: "is called" in t and "the principal act" in t)
    if found:
        return found

    # 7. First paragraph of each <subprov>, using <leg-title> text.
    for subprov in root.findall(".//subprov"):
        first_para = subprov.find(".//para")
        if first_para is None:
            continue
        body = lowered_text(first_para)
        if "this" in body and "amends the" in body and "principal act" in body:
            title = first_para.find(".//leg-title")
            if title is not None and title.text:
                return title.text.strip()

    # 8. Any paragraph saying "This Act amends the", using the citation's
    #    direct <leg-title> child or else the citation's whole text.
    for para in root.findall(".//para"):
        if "this act amends the" not in lowered_text(para):
            continue
        citation = para.find(".//citation")
        if citation is None:
            continue
        title = citation.find("leg-title")
        if title is not None and title.text:
            return title.text.strip()
        return "".join(citation.itertext()).strip()

    return None




# Phrases whose presence in a paragraph makes the relation type "CIT".
substitution_keywords = {"substituted", "replaced with", "to read as", "replacement for"}


def determine_relation_type_from_para(text, default="AMD_S"):
    """Return ``"CIT"`` if ``text`` contains a substitution phrase, else ``default``.

    Args:
        text: Paragraph text; matching is case-insensitive.
        default: Value returned when no phrase from ``substitution_keywords``
            occurs in ``text``.

    Returns:
        ``"CIT"`` or ``default``.
    """
    lowered = text.lower()
    for phrase in substitution_keywords:
        if phrase in lowered:
            return "CIT"
    return default


def get_amends_affect_citations(root):
    """Collect elements that belong to amendment listings.

    The result holds every ``<citation>`` found under an ``<amends-affect>``
    element, plus every ``<para>`` under a ``<schedule-amendments>`` element
    whose text contains "amendment to" or "amendments to" (case-insensitive).

    Args:
        root: Root XML element to search.

    Returns:
        A set of the collected element objects.
    """
    markers = ("amendment to", "amendments to")

    collected = {
        citation
        for container in root.findall(".//amends-affect")
        for citation in container.findall(".//citation")
    }

    for schedule in root.findall(".//schedule-amendments"):
        for para in schedule.findall(".//para"):
            lowered = "".join(para.itertext()).lower()
            if any(marker in lowered for marker in markers):
                collected.add(para)

    return collected


def get_history_note_elements(root):
    """Return every ``<history-note>`` element under ``root`` and all its descendants.

    Args:
        root: Root XML element to search.

    Returns:
        A set of element objects (each history note itself is included).
    """
    return {
        node
        for note in root.findall(".//history-note")
        for node in note.iter()
    }


    




    
def extract_amended_acts_1(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for Acts listed under an "enactments amended" heading.

    This function used to be a line-for-line copy of ``extract_amended_acts``;
    it now delegates to that function so the two cannot drift apart.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    extract_amended_acts(all_elements, source_act_name, compiled_act_patterns, relations)



def extract_consequential_amendments_schedule_group2(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for Acts in "consequential amendments" schedules.

    The most recent ``heading`` element's text is remembered while scanning.
    When a ``schedule.amendments`` element follows a heading containing
    "consequential amendments", the direct ``heading`` child of each
    ``schedule.amendments.group2`` descendant is searched for Act names.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    current_heading = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading":
            current_heading = node.text.strip().lower() if node.text else None
            continue

        if kind != "schedule.amendments" or not current_heading:
            continue
        if "consequential amendments" not in current_heading:
            continue

        for group in node.findall(".//schedule.amendments.group2"):
            group_heading = group.find("heading")
            if group_heading is None or not group_heading.text:
                continue
            found = extract_all_act_names(group_heading.text.strip(), compiled_act_patterns)
            for act in found:
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("AMD_S")


def extract_repealed_acts_1(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "R_S" relations for Acts listed under an "enactments repealed" heading.

    This function used to be a line-for-line copy of ``extract_repealed_acts``;
    it now delegates to that function so the two cannot drift apart.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    extract_repealed_acts(all_elements, source_act_name, compiled_act_patterns, relations)


   
def extract_amended_acts(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for Acts listed under an "enactments amended" heading.

    The most recent ``heading`` element's text is remembered while scanning.
    When a ``schedule.amendments`` element follows a heading containing
    "enactments amended", the direct ``heading`` child of each
    ``schedule.amendments.group2`` descendant is searched for Act names.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    preceding_heading = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading":
            preceding_heading = node.text.strip().lower() if node.text else None
            continue

        in_amended_schedule = (
            kind == "schedule.amendments"
            and preceding_heading
            and "enactments amended" in preceding_heading
        )
        if not in_amended_schedule:
            continue

        for group in node.findall(".//schedule.amendments.group2"):
            group_heading = group.find("./heading")
            if group_heading is None or not group_heading.text:
                continue
            for act in extract_all_act_names(group_heading.text, compiled_act_patterns):
                if act == source_act_name:
                    continue
                relations[(source_act_name, act)]["relation_types"].add("AMD_S")

def extract_repealed_acts(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "R_S" relations for Acts listed under an "enactments repealed" heading.

    The most recent ``heading`` element's text is remembered while scanning.
    When a ``schedule.amendments`` element follows a heading containing
    "enactments repealed", the direct ``heading`` child of each
    ``schedule.amendments.group2`` descendant is searched for Act names.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    preceding_heading = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading":
            preceding_heading = node.text.strip().lower() if node.text else None
            continue

        in_repealed_schedule = (
            kind == "schedule.amendments"
            and preceding_heading
            and "enactments repealed" in preceding_heading
        )
        if not in_repealed_schedule:
            continue

        for group in node.findall(".//schedule.amendments.group2"):
            group_heading = group.find("./heading")
            if group_heading is None or not group_heading.text:
                continue
            for act in extract_all_act_names(group_heading.text, compiled_act_patterns):
                if act == source_act_name:
                    continue
                relations[(source_act_name, act)]["relation_types"].add("R_S")
def extract_amended_acts_from_headings(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for Acts named in headings that contain "amended".

    Args:
        all_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    for node in all_elements:
        if node.tag.lower() != "heading" or not node.text:
            continue

        heading = node.text.strip()
        if "amended" not in heading.lower():
            continue

        for act in extract_all_act_names(heading, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("AMD_S")


def extract_consequential_amendments1(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for "consequential amendments to other enactments".

    The most recent ``heading`` element's text is remembered while scanning.
    When a ``schedule.amendments`` element follows a heading containing
    "consequential amendments to other enactments", the direct ``heading``
    child of each ``schedule.amendments.group2`` descendant is searched for
    Act names.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    wanted_phrase = "consequential amendments to other enactments"
    latest_heading = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading":
            # Remember the heading so a following schedule can be classified.
            latest_heading = node.text.strip().lower() if node.text else None
        elif kind == "schedule.amendments" and latest_heading and wanted_phrase in latest_heading:
            for group in node.findall(".//schedule.amendments.group2"):
                group_heading = group.find("./heading")
                if group_heading is None or not group_heading.text:
                    continue
                targets = [
                    act
                    for act in extract_all_act_names(group_heading.text, compiled_act_patterns)
                    if act != source_act_name
                ]
                for act in targets:
                    relations[(source_act_name, act)]["relation_types"].add("AMD_S")





def extract_legtable_amendments(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for Acts named in the heading of each ``amend`` element.

    Only the direct ``heading`` child of an ``amend`` element is inspected;
    the tag comparison is case-sensitive.

    Args:
        all_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    for node in all_elements:
        if node.tag != "amend":
            continue

        heading = node.find("heading")
        if heading is None or not heading.text:
            continue

        for act in extract_all_act_names(heading.text.strip(), compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("AMD_S")




def extract_other_enactments_amended(all_elements, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" relations for Acts under an "other enactments amended" heading.

    The most recent ``heading`` element's text is remembered while scanning.
    When a ``schedule.amendments`` element follows a heading containing
    "other enactments amended", the direct ``heading`` child of each
    ``schedule.amendments.group2`` descendant is searched for Act names.

    Args:
        all_elements: Elements to scan, in document order.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    heading_seen = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading":
            heading_seen = node.text.strip().lower() if node.text else None
            continue

        if not (kind == "schedule.amendments" and heading_seen):
            continue
        if "other enactments amended" not in heading_seen:
            continue

        for group in node.findall(".//schedule.amendments.group2"):
            group_heading = group.find("./heading")
            if group_heading is None or not group_heading.text:
                continue
            heading_label = group_heading.text.strip()
            for act in extract_all_act_names(heading_label, compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("AMD_S")


def extract_citations(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for Acts named in ``citation`` elements.

    The first ``leg-title`` descendant's text is used when it has any;
    otherwise the text of every ``extref``, ``leg-title`` and ``intref``
    descendant is joined.  Citations whose full text contains "amendment to"
    or "amendments to" are skipped.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref", "leg-title", "intref"}

    for node in filtered_elements:
        if node.tag.lower() != "citation":
            continue

        lowered_context = "".join(node.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        title = node.find(".//leg-title")
        if title is not None and title.text:
            pieces = [title.text.strip()]
        else:
            pieces = [
                child.text.strip()
                for child in node.iter()
                if child.tag in text_tags and child.text
            ]

        candidate = " ".join(pieces).strip()
        for act in extract_all_act_names(candidate, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")





                    

def extract_para_citations(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for Acts cited inside ``para`` elements.

    Paragraphs whose text contains "amendment to" or "amendments to" are
    skipped.  For each ``citation`` descendant, the text and tail of its
    ``extref`` and ``leg-title`` nodes are joined and searched for Act names.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref", "leg-title"}

    for para in filtered_elements:
        if para.tag != "para":
            continue

        lowered_para = "".join(para.itertext()).lower()
        if "amendment to" in lowered_para or "amendments to" in lowered_para:
            continue

        for citation in para.findall(".//citation"):
            pieces = []
            for child in citation.iter():
                if child.tag not in text_tags:
                    continue
                if child.text:
                    pieces.append(child.text.strip())
                if child.tail:
                    pieces.append(child.tail.strip())

            candidate = " ".join(pieces).strip()
            for act in extract_all_act_names(candidate, compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("CIT")








def extract_cf_and_def_term_citations1(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for Acts named in ``cf`` and ``def-term`` elements.

    Three sources are searched: the element's own leading text, the text of
    a direct ``text`` child, and the ``extref``/``leg-title``/``intref``
    text and tails inside the first ``citation`` descendant.  Elements whose
    text contains "amendment to" or "amendments to" are skipped.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref", "leg-title", "intref"}

    for node in filtered_elements:
        if node.tag not in {"cf", "def-term"}:
            continue

        # The ancestor check only runs for element types that provide
        # iterancestors().
        if hasattr(node, "iterancestors") and any(
            ancestor.tag == "history-note" for ancestor in node.iterancestors()
        ):
            continue

        lowered_context = "".join(node.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        found = set()

        if node.text:
            found.update(extract_all_act_names(node.text.strip(), compiled_act_patterns))

        text_child = node.find("./text")
        if text_child is not None and text_child.text:
            found.update(extract_all_act_names(text_child.text.strip(), compiled_act_patterns))

        citation = node.find(".//citation")
        if citation is not None:
            pieces = []
            for child in citation.iter():
                if child.tag not in text_tags:
                    continue
                if child.text:
                    pieces.append(child.text.strip())
                if child.tail:
                    pieces.append(child.tail.strip())
            found.update(extract_all_act_names(" ".join(pieces).strip(), compiled_act_patterns))

        for act in found:
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")





def extract_cf_and_def_term_citations(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for Acts named in ``cf`` and ``def-term`` elements.

    Elements whose text contains any amendment-style keyword ("inserted",
    "repealed", "substituted", ...) are skipped.  Otherwise three sources are
    searched: the element's own leading text, the text of a direct ``text``
    child, and the ``extref``/``leg-title``/``intref`` text and tails inside
    the first ``citation`` descendant.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref", "leg-title", "intref"}
    skip_keywords = {
        "amendment to", "amendments to", "inserted", "repealed", "amended",
        "substituted", "replaced", "remove", "omit"
    }

    def citation_text(citation):
        # Joined text and tails of the wanted nodes inside a citation.
        pieces = []
        for child in citation.iter():
            if child.tag not in text_tags:
                continue
            if child.text:
                pieces.append(child.text.strip())
            if child.tail:
                pieces.append(child.tail.strip())
        return " ".join(pieces).strip()

    for node in filtered_elements:
        if node.tag not in {"cf", "def-term"}:
            continue

        # The ancestor check only runs for element types that provide
        # iterancestors().
        if hasattr(node, "iterancestors") and any(
            ancestor.tag == "history-note" for ancestor in node.iterancestors()
        ):
            continue

        lowered_context = "".join(node.itertext()).lower()
        if any(keyword in lowered_context for keyword in skip_keywords):
            continue

        found = set()

        if node.text:
            found.update(extract_all_act_names(node.text.strip(), compiled_act_patterns))

        text_child = node.find("./text")
        if text_child is not None and text_child.text:
            found.update(extract_all_act_names(text_child.text.strip(), compiled_act_patterns))

        citation = node.find(".//citation")
        if citation is not None:
            found.update(extract_all_act_names(citation_text(citation), compiled_act_patterns))

        for act in found:
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")




def extract_citations_from_paras_1(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations from ``extref`` text inside paragraph citations.

    Paragraphs whose text contains "amendment to" or "amendments to" are
    skipped.  For each ``citation`` descendant, only the text and tail of
    ``extref`` nodes are joined and searched for Act names.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref"}

    for para in filtered_elements:
        if para.tag != "para":
            continue

        lowered_para = "".join(para.itertext()).lower()
        if "amendment to" in lowered_para or "amendments to" in lowered_para:
            continue

        for citation in para.findall(".//citation"):
            pieces = []
            for child in citation.iter():
                if child.tag not in text_tags:
                    continue
                if child.text:
                    pieces.append(child.text.strip())
                if child.tail:
                    pieces.append(child.tail.strip())

            joined = " ".join(pieces).strip()
            for act in extract_all_act_names(joined, compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("CIT")




def extract_citations_from_headings_with_section(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for Acts named in "under section" headings.

    A heading qualifies when its lower-cased text contains "under section"
    and none of the amendment/repeal keywords.  The lower-cased heading text
    is what gets searched for Act names.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    exclusion_keywords = {"amend", "amendment", "amendments", "repeal"}

    for node in filtered_elements:
        if node.tag != "heading" or not node.text:
            continue

        lowered_heading = node.text.strip().lower()
        if "under section" not in lowered_heading:
            continue
        if any(word in lowered_heading for word in exclusion_keywords):
            continue

        for act in extract_all_act_names(lowered_heading, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")






def extract_citations_from_leg_title1(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for ``leg-title`` elements nested in a citation.

    The nearest ``citation`` ancestor is located through ``iterancestors()``;
    titles without one are ignored, as are citations whose text contains
    "amendment to" or "amendments to".

    NOTE(review): elements without an ``iterancestors`` method (e.g. those
    from ``xml.etree.ElementTree``) never have a detectable ancestor, so this
    function records nothing for them -- confirm which parser the caller uses.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    for node in filtered_elements:
        if node.tag != "leg-title" or not node.text:
            continue

        ancestors = list(node.iterancestors()) if hasattr(node, "iterancestors") else []
        enclosing = next((a for a in ancestors if a.tag == "citation"), None)
        if enclosing is None:
            continue

        lowered_context = "".join(enclosing.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        for act in extract_all_act_names(node.text.strip(), compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")






def extract_citations_from_insertwords(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations for Acts cited inside ``insertwords`` elements.

    Elements whose text contains "amendment to" or "amendments to" are
    skipped.  For each ``citation`` descendant, the text of its direct
    ``leg-title`` child is searched for Act names.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    for node in filtered_elements:
        if node.tag != "insertwords":
            continue

        lowered_context = "".join(node.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        for citation in node.findall(".//citation"):
            title = citation.find("leg-title")
            if title is None or not title.text:
                continue
            for act in extract_all_act_names(title.text.strip(), compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("CIT")




def extract_citations_from_legtitle_citations(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations from the direct ``leg-title`` child of each citation.

    Citations whose text contains "amendment to" or "amendments to" are
    skipped, as are citations without a ``leg-title`` child that has text.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    for node in filtered_elements:
        if node.tag != "citation":
            continue

        lowered_context = "".join(node.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        title = node.find("leg-title")
        if title is None or not title.text:
            continue

        for act in extract_all_act_names(title.text.strip(), compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")



def extract_citations_from_extref_and_text(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations from ``extref`` text and tails inside citations.

    Citations whose text contains "amendment to" or "amendments to" are
    skipped.  Only ``extref`` nodes contribute text.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref"}

    for citation in filtered_elements:
        if citation.tag != "citation":
            continue

        lowered_context = "".join(citation.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        pieces = []
        for child in citation.iter():
            if child.tag not in text_tags:
                continue
            if child.text:
                pieces.append(child.text.strip())
            if child.tail:
                pieces.append(child.tail.strip())

        joined = " ".join(pieces).strip()
        for act in extract_all_act_names(joined, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")


def extract_multiple_citations_in_para(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations from ``extref``/``leg-title`` text inside citations.

    Citations whose text contains "amendment to" or "amendments to" are
    skipped.  The text and tail of each ``extref`` and ``leg-title`` node
    are joined and searched for Act names.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"extref", "leg-title"}

    for citation in filtered_elements:
        if citation.tag != "citation":
            continue

        lowered_context = "".join(citation.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        pieces = []
        for child in citation.iter():
            if child.tag not in text_tags:
                continue
            if child.text:
                pieces.append(child.text.strip())
            if child.tail:
                pieces.append(child.tail.strip())

        joined = " ".join(pieces).strip()
        for act in extract_all_act_names(joined, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")




def extract_citations_with_intref_and_legtitle(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations from ``leg-title``/``extref``/``intref`` text inside citations.

    Citations whose text contains "amendment to" or "amendments to" are
    skipped.  The text and tail of each ``leg-title``, ``extref`` and
    ``intref`` node are joined and searched for Act names.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    text_tags = {"leg-title", "extref", "intref"}

    for citation in filtered_elements:
        if citation.tag != "citation":
            continue

        lowered_context = "".join(citation.itertext()).lower()
        if "amendment to" in lowered_context or "amendments to" in lowered_context:
            continue

        pieces = []
        for child in citation.iter():
            if child.tag not in text_tags:
                continue
            if child.text:
                pieces.append(child.text.strip())
            if child.tail:
                pieces.append(child.tail.strip())

        joined = " ".join(pieces).strip()
        for act in extract_all_act_names(joined, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")




def extract_citations_with_insertwords_and_multiple_extrefs(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Record "CIT" relations from ``extref``/``leg-title`` text inside citations.

    This function used to be a line-for-line copy of
    ``extract_multiple_citations_in_para``; it now delegates to that function
    so the two cannot drift apart.

    Args:
        filtered_elements: Elements to scan.
        source_act_name: Name of the Act being processed; never recorded as
            its own target.
        compiled_act_patterns: ``(name, compiled_regex)`` pairs used to
            recognise Act names.
        relations: Mapping updated in place via
            ``relations[(source, target)]["relation_types"].add(...)``.
    """
    extract_multiple_citations_in_para(
        filtered_elements, source_act_name, compiled_act_patterns, relations
    )







def extract_citations_from_def_para(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Add "CIT" relations for citations that sit inside a <def-para>.

    A <citation> is considered only when one of its ancestors is a
    <def-para> and none of them is a <history-note>. Citations whose full
    text contains "amendment to" or "amendments to" (case-insensitive) are
    skipped. The text and tail of <leg-title>, <extref> and <intref> nodes
    under the citation are joined and handed to extract_all_act_names.

    Ancestors come from ``iterancestors()`` when the element offers it
    (lxml). Elements from xml.etree.ElementTree have no such method, so a
    child-to-parent map is built from ``filtered_elements`` instead; the
    chain can therefore only pass through elements present in that
    iterable. Previously the stdlib case produced an empty ancestor chain,
    which made every citation fail the <def-para> test.

    Args:
        filtered_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    allowed_tags = {"leg-title", "extref", "intref"}
    elements = list(filtered_elements)

    # Stdlib elements carry no parent pointer, so ancestry is reconstructed
    # from the elements we were given.
    parent_of = {child: parent for parent in elements for child in parent}

    def ancestor_tags(node):
        """Return the set of tags found on the way up from *node*."""
        if hasattr(node, "iterancestors"):
            return {ancestor.tag for ancestor in node.iterancestors()}
        tags = set()
        current = parent_of.get(node)
        while current is not None:
            tags.add(current.tag)
            current = parent_of.get(current)
        return tags

    for element in elements:
        if element.tag != "citation":
            continue

        enclosing = ancestor_tags(element)
        if "history-note" in enclosing or "def-para" not in enclosing:
            continue

        context_text = "".join(element.itertext()).lower()
        if "amendment to" in context_text or "amendments to" in context_text:
            continue

        citation_text_parts = []
        for node in element.iter():
            if node.tag not in allowed_tags:
                continue
            if node.text:
                citation_text_parts.append(node.text.strip())
            if node.tail:
                citation_text_parts.append(node.tail.strip())

        full_text = " ".join(citation_text_parts).strip()
        for act in extract_all_act_names(full_text, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")





def extract_citations_from_leg_title(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Add "CIT" relations for <leg-title> elements nested in a <citation>.

    Each <leg-title> with non-empty text is checked for an enclosing
    <citation>. When the nearest such citation's full text contains
    "amendment to" or "amendments to" (case-insensitive) the title is
    skipped; otherwise the title's own text is handed to
    extract_all_act_names.

    Ancestors come from ``iterancestors()`` when the element offers it
    (lxml). Elements from xml.etree.ElementTree have no such method, so a
    child-to-parent map is built from ``filtered_elements`` instead; a
    citation that is absent from that iterable is consequently not seen as
    an ancestor. Previously the stdlib case produced an empty ancestor
    chain, so no <leg-title> was ever processed.

    Args:
        filtered_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    elements = list(filtered_elements)

    # Stdlib elements carry no parent pointer, so ancestry is reconstructed
    # from the elements we were given.
    parent_of = {child: parent for parent in elements for child in parent}

    def climb(node):
        """Yield the ancestors of *node*, nearest first, via the parent map."""
        current = parent_of.get(node)
        while current is not None:
            yield current
            current = parent_of.get(current)

    def nearest_citation(node):
        """Return the closest enclosing <citation>, or None."""
        if hasattr(node, "iterancestors"):
            lineage = node.iterancestors()
        else:
            lineage = climb(node)
        return next((anc for anc in lineage if anc.tag == "citation"), None)

    for element in elements:
        if element.tag != "leg-title" or not element.text:
            continue

        citation_parent = nearest_citation(element)
        if citation_parent is None:
            continue

        context_text = "".join(citation_parent.itertext()).lower()
        if "amendment to" in context_text or "amendments to" in context_text:
            continue

        act_text = element.text.strip()
        for act in extract_all_act_names(act_text, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")





def extract_citations_from_extref_with_trailing_act_name(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Add "CIT" relations from reference nodes (and their tails) in citations.

    For each <citation>, the text and tail of every <extref>, <intref> and
    <leg-title> node beneath it are joined with spaces and passed to
    extract_all_act_names. Citations whose full text mentions
    "amendment to" / "amendments to" (case-insensitive) are left out.

    Args:
        filtered_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    reference_tags = {"extref", "intref", "leg-title"}

    for item in filtered_elements:
        if item.tag != "citation":
            continue

        surrounding = "".join(item.itertext()).lower()
        if "amendment to" in surrounding or "amendments to" in surrounding:
            continue

        # Per node: text first, then tail, empty values dropped.
        snippets = [
            piece.strip()
            for node in item.iter()
            if node.tag in reference_tags
            for piece in (node.text, node.tail)
            if piece
        ]

        searchable = " ".join(snippets).strip()
        found = extract_all_act_names(searchable, compiled_act_patterns)
        for act in found:
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")






def extract_citations_with_extref_and_intref(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Add "CIT" relations for acts referenced within <citation> elements.

    The text and tail of <extref>, <intref> and <leg-title> nodes under
    each citation are gathered, joined with spaces and searched through
    extract_all_act_names. A citation is skipped when its full text
    contains "amendment to" or "amendments to" (case-insensitive).

    Args:
        filtered_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    tags_of_interest = frozenset(("extref", "intref", "leg-title"))

    for citation in filtered_elements:
        if citation.tag != "citation":
            continue

        whole_text = "".join(citation.itertext()).lower()
        mentions_amendment = "amendment to" in whole_text or "amendments to" in whole_text
        if mentions_amendment:
            continue

        collected = []
        for node in citation.iter():
            if node.tag in tags_of_interest:
                collected.extend(value.strip() for value in (node.text, node.tail) if value)

        candidate_text = " ".join(collected).strip()
        for act in extract_all_act_names(candidate_text, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")






def extract_dual_citations_leg_title_and_extref(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Add "CIT" relations from <leg-title>/<extref>/<intref> nodes in citations.

    Every <citation> in ``filtered_elements`` is inspected unless its full
    text contains "amendment to" or "amendments to" (case-insensitive).
    The text and tail of the listed reference nodes are joined with spaces
    and searched via extract_all_act_names.

    Args:
        filtered_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    allowed_tags = {"leg-title", "extref", "intref"}

    def reference_pieces(citation_element):
        """Yield stripped text then tail of each allowed node, in document order."""
        for node in citation_element.iter():
            if node.tag not in allowed_tags:
                continue
            if node.text:
                yield node.text.strip()
            if node.tail:
                yield node.tail.strip()

    for citation in filtered_elements:
        if citation.tag != "citation":
            continue

        lowered_context = "".join(citation.itertext()).lower()
        if "amendment to" in lowered_context:
            continue
        if "amendments to" in lowered_context:
            continue

        combined = " ".join(reference_pieces(citation)).strip()
        for act in extract_all_act_names(combined, compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("CIT")




def extract_plaintext_act_references(filtered_elements, source_act_name, compiled_act_patterns, relations):
    """Add "CIT" relations for act names in the leading text of <text> nodes.

    Each <para> is examined unless its full text contains "amendment to" or
    "amendments to" (case-insensitive). For every descendant <text> element
    only its ``.text`` (the content before any child element) is searched
    through extract_all_act_names.

    Args:
        filtered_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    paragraphs = (el for el in filtered_elements if el.tag == "para")

    for para in paragraphs:
        flattened = "".join(para.itertext()).lower()
        if "amendment to" in flattened or "amendments to" in flattened:
            continue

        for text_node in para.findall(".//text"):
            raw = text_node.text
            if not raw:
                continue
            for act in extract_all_act_names(raw.strip(), compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("CIT")











def extract_amendments_from_crossheadings(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "AMD_S" relations for acts named in "amendments" crossheads.

    A <crosshead> (tag compared case-insensitively) qualifies when its
    leading text contains "amendments" in any letter case; that text is
    then searched through extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    for node in all_elements:
        caption = node.text
        if node.tag.lower() != "crosshead" or not caption:
            continue
        if "amendments" not in caption.lower():
            continue

        for act in extract_all_act_names(caption.strip(), compiled_act_patterns):
            if act == source_act_name:
                continue
            relations[(source_act_name, act)]["relation_types"].add("AMD_S")


def extract_schedule_repeals_after_amendments_crosshead(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "PR_S" relations for "repeal" headings after an amendments crosshead.

    The elements are walked in order. Once a <crosshead> whose text
    contains "amendments" has been seen, every later <heading> whose text
    contains "repeal" is searched through extract_all_act_names. The
    "seen" state is never cleared for the remainder of the walk. Tag and
    text comparisons are case-insensitive.

    Args:
        all_elements: Iterable of XML elements, in the order to be walked.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    seen_amendments = False

    for node in all_elements:
        kind = node.tag.lower()
        label = node.text
        lowered = label.lower() if label else ""

        if kind == "crosshead" and label and "amendments" in lowered:
            seen_amendments = True
            continue

        is_repeal_heading = seen_amendments and kind == "heading" and label and "repeal" in lowered
        if not is_repeal_heading:
            continue

        for act in extract_all_act_names(label.strip(), compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("PR_S")


def extract_consequential_amendments(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "AMD_S" relations for acts named in "consequential amendments" headings.

    A <heading> (tag compared case-insensitively) qualifies when its
    leading text contains "consequential amendments" in any letter case;
    that text is searched through extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    marker = "consequential amendments"

    for node in all_elements:
        if node.tag.lower() != "heading":
            continue
        title = node.text
        if not title or marker not in title.lower():
            continue

        named_acts = extract_all_act_names(title.strip(), compiled_act_patterns)
        for act in named_acts:
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("AMD_S")


            


def extract_schedule_repeals_and_citations(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "R_S" relations from "repeals relating to" schedule groups.

    For each <schedule.amendments.group1> whose direct <heading> text
    contains "repeals relating to" (case-insensitive), the direct <heading>
    of every direct <schedule.amendments.group2> child is searched through
    extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    for group1 in all_elements:
        if group1.tag != "schedule.amendments.group1":
            continue

        outer_heading = group1.find("./heading")
        if outer_heading is None or not outer_heading.text:
            continue
        if "repeals relating to" not in outer_heading.text.strip().lower():
            continue

        for group2 in group1.findall("./schedule.amendments.group2"):
            inner_heading = group2.find("./heading")
            if inner_heading is None or not inner_heading.text:
                continue

            inner_title = inner_heading.text.strip()
            for act in extract_all_act_names(inner_title, compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("R_S")



def extract_amendments(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "PR" or "AMD" relations for acts named in <history-note> elements.

    The full text of each history note (tag compared case-insensitively) is
    searched through extract_all_act_names. When the note's first
    <amending-operation> descendant has text containing "repealed"
    (case-insensitive) the relation type is "PR"; otherwise it is "AMD".

    Args:
        all_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    for note in all_elements:
        if note.tag.lower() != "history-note":
            continue

        note_text = "".join(note.itertext()).strip()
        named_acts = extract_all_act_names(note_text, compiled_act_patterns)

        operation = note.find(".//amending-operation")
        if operation is not None and operation.text:
            operation_text = operation.text.strip().lower()
        else:
            operation_text = ""

        # The same code applies to every act found in this note.
        relation_code = "PR" if "repealed" in operation_text else "AMD"

        for act in named_acts:
            if act == source_act_name:
                continue
            relations[(source_act_name, act)]["relation_types"].add(relation_code)



def extract_amendment_headings(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "AMD_S" relations for acts named in headings mentioning "amendments".

    A <heading> (tag compared case-insensitively) qualifies when its
    leading text contains "amendments" in any letter case; that text is
    searched through extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    def is_amendment_heading(node):
        return bool(
            node.tag.lower() == "heading"
            and node.text
            and "amendments" in node.text.lower()
        )

    for heading in filter(is_amendment_heading, all_elements):
        for act in extract_all_act_names(heading.text.strip(), compiled_act_patterns):
            if act != source_act_name:
                relations[(source_act_name, act)]["relation_types"].add("AMD_S")




def extract_consequential_amends_affect_amendments(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "AMD_S" relations from <amends-affect> under a consequential heading.

    The walk remembers the lower-cased text of the most recent <heading> or
    <crosshead> (None when that element had no text). An <amends-affect>
    met while that remembered text contains "consequential amendments" has
    the direct <leg-title> children of each descendant <citation> searched
    through extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements, in the order to be walked.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    current_heading = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading" or kind == "crosshead":
            current_heading = node.text.strip().lower() if node.text else None
            continue

        if kind != "amends-affect":
            continue
        if not current_heading or "consequential amendments" not in current_heading:
            continue

        titles = (
            title
            for citation in node.findall(".//citation")
            for title in citation.findall("leg-title")
        )
        for title in titles:
            if not title.text:
                continue
            for act in extract_all_act_names(title.text.strip(), compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("AMD_S")




def extract_schedule_amendments_with_amend_block(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "AMD_S" relations for schedule groups that contain an <amend>.

    A <schedule.amendments.group2> (tag compared case-insensitively)
    qualifies when it has a direct <heading> child with text and an <amend>
    element somewhere beneath it; the heading text is searched through
    extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    for group in all_elements:
        if group.tag.lower() != "schedule.amendments.group2":
            continue

        heading = group.find("heading")
        if heading is None or not heading.text:
            continue
        if group.find(".//amend") is None:
            continue

        title = heading.text.strip()
        for act in extract_all_act_names(title, compiled_act_patterns):
            if act == source_act_name:
                continue
            relations[(source_act_name, act)]["relation_types"].add("AMD_S")





def extract_consequential_repeals_from_schedule_misc(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "R_S" relations from <schedule.misc> under "consequential repeals".

    The walk remembers the lower-cased text of the most recent <heading>
    (None when it had no text). A <schedule.misc> met while that text
    contains "consequential repeals" has the direct <heading> child of each
    descendant <head5> searched through extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements, in the order to be walked.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    latest_heading = None

    for node in all_elements:
        kind = node.tag.lower()

        if kind == "heading":
            latest_heading = node.text.strip().lower() if node.text else None
            continue

        under_repeals = bool(latest_heading) and "consequential repeals" in latest_heading
        if kind != "schedule.misc" or not under_repeals:
            continue

        for head5 in node.findall(".//head5"):
            sub_heading = head5.find("heading")
            if sub_heading is None or not sub_heading.text:
                continue
            for act in extract_all_act_names(sub_heading.text.strip(), compiled_act_patterns):
                if act != source_act_name:
                    relations[(source_act_name, act)]["relation_types"].add("R_S")







def extract_enactments_amended_from_schedule_misc(all_elements, source_act_name, compiled_act_patterns, relations):
    """Add "AMD_S" relations from <schedule.misc> under "enactments amended".

    The walk remembers the lower-cased text of the most recent <heading>
    (None when it had no text). A <schedule.misc> met while that text
    contains "enactments amended" has the direct <heading> child of each
    descendant <head5> searched through extract_all_act_names.

    Args:
        all_elements: Iterable of XML elements, in the order to be walked.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Passed unchanged to extract_all_act_names.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    heading_seen = None

    for item in all_elements:
        item_tag = item.tag.lower()

        if item_tag == "heading":
            if item.text:
                heading_seen = item.text.strip().lower()
            else:
                heading_seen = None
        elif item_tag == "schedule.misc":
            if not heading_seen or "enactments amended" not in heading_seen:
                continue

            act_headings = (head5.find("heading") for head5 in item.findall(".//head5"))
            for act_heading in act_headings:
                if act_heading is None or not act_heading.text:
                    continue
                found = extract_all_act_names(act_heading.text.strip(), compiled_act_patterns)
                for act in found:
                    if act != source_act_name:
                        relations[(source_act_name, act)]["relation_types"].add("AMD_S")







def extract_amds_amendment_with_substitution_plaintext(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record repeal/substitution relations stated in paragraph text.

    Elements whose tag ends with "para" are classified by their lower-cased
    full text: "is hereby amended by repealing" gives "PR_S";
    "is hereby amended by substituting" or "is hereby substituted" gives
    "AMD_S"; anything else is ignored. For every <citation> beneath a
    classified paragraph the relation is added for the principal act (as
    returned by find_principal_act_name) and for each act whose pattern
    matches the citation text.

    The principal act is looked up at most once per call, on the first
    citation that needs it, instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element, forwarded to find_principal_act_name.
        filtered_elements_for_history_sensitive: Elements to scan; also
            forwarded to find_principal_act_name.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    principal_cache = []

    def principal_act():
        # The arguments never change, so one lookup is enough.
        if not principal_cache:
            principal_cache.append(
                find_principal_act_name(root, filtered_elements_for_history_sensitive)
            )
        return principal_cache[0]

    for elem in filtered_elements_for_history_sensitive:
        if not elem.tag or not elem.tag.endswith("para"):
            continue

        para_text = "".join(elem.itertext()).strip().lower()

        if "is hereby amended by repealing" in para_text:
            relation_type = "PR_S"
        elif "is hereby amended by substituting" in para_text or "is hereby substituted" in para_text:
            relation_type = "AMD_S"
        else:
            continue

        for citation in elem.findall(".//citation"):
            citation_text = "".join(citation.itertext()).strip().lower()

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add(relation_type)

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add(relation_type)




def extract_repeal_relations_consolidated_safe(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record relations for "hereby consequentially repealed" paragraphs.

    Elements whose tag ends with "para" are considered when their
    lower-cased full text contains one of the repeal phrases. If the text
    contains "the following enactments are hereby consequentially
    repealed", only citations found under para elements nested in
    <label-para> descendants are used; otherwise every citation beneath the
    paragraph is used. Citations with empty text are skipped.

    The relation type comes from determine_relation_type_from_para, with a
    default of "PR_S" when the citation text contains "section", "part" or
    "schedule" and "R_S" otherwise. It is added for the principal act (from
    find_principal_act_name) and for each act whose pattern matches the
    citation text.

    The principal act is looked up at most once per call, on the first
    citation that needs it, instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element, forwarded to find_principal_act_name.
        filtered_elements_for_history_sensitive: Elements to scan; also
            forwarded to find_principal_act_name.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    trigger_words = {"section", "part", "schedule"}
    list_phrase = "the following enactments are hereby consequentially repealed"
    repeal_phrases = {
        "is hereby consequentially repealed",
        "are hereby consequentially repealed",
        list_phrase,
    }

    principal_cache = []

    def principal_act():
        # The arguments never change, so one lookup is enough.
        if not principal_cache:
            principal_cache.append(
                find_principal_act_name(root, filtered_elements_for_history_sensitive)
            )
        return principal_cache[0]

    for elem in filtered_elements_for_history_sensitive:
        if not elem.tag or not elem.tag.endswith("para"):
            continue

        para_text = "".join(elem.itertext()).lower()
        if not any(phrase in para_text for phrase in repeal_phrases):
            continue

        if list_phrase in para_text:
            # List form: only citations in the labelled items count.
            target_citation_blocks = [
                citation
                for label_para in elem.findall(".//label-para")
                for inner_para in label_para.findall(".//para")
                for citation in inner_para.findall(".//citation")
            ]
        else:
            target_citation_blocks = elem.findall(".//citation")

        for citation in target_citation_blocks:
            citation_text = " ".join(citation.itertext()).strip().lower()
            if not citation_text:
                continue

            relation_type = determine_relation_type_from_para(
                para_text,
                default="PR_S" if any(w in citation_text for w in trigger_words) else "R_S"
            )

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add(relation_type)

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add(relation_type)
                    



def extract_amendment_with_relates_to_plain(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record amendment relations stated in the first para of a subprov.

    For each element whose tag ends with "subprov", its first descendant
    <para> is read. When that paragraph's lower-cased text contains
    "is hereby consequentially amended" or "substituted", the relation type
    is obtained from determine_relation_type_from_para (default "AMD_S")
    and, for every <citation> beneath the paragraph, added for the
    principal act (from find_principal_act_name) and for each act whose
    pattern matches the citation text.

    The principal act is looked up at most once per call, on the first
    citation that needs it, instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element, forwarded to find_principal_act_name.
        filtered_elements_for_history_sensitive: Elements to scan; also
            forwarded to find_principal_act_name.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    trigger_phrases = ("is hereby consequentially amended", "substituted")
    principal_cache = []

    def principal_act():
        # The arguments never change, so one lookup is enough.
        if not principal_cache:
            principal_cache.append(
                find_principal_act_name(root, filtered_elements_for_history_sensitive)
            )
        return principal_cache[0]

    for elem in filtered_elements_for_history_sensitive:
        if not (elem.tag and elem.tag.endswith("subprov")):
            continue

        para = elem.find(".//para")
        if para is None:
            continue

        para_text = "".join(text or "" for text in para.itertext()).lower()
        if not any(phrase in para_text for phrase in trigger_phrases):
            continue

        relation_type = determine_relation_type_from_para(para_text, default="AMD_S")

        for citation in para.findall(".//citation"):
            citation_text = "".join(citation.itertext()).strip().lower()

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add(relation_type)

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add(relation_type)
                                




def extract_amendment_special_from_para_plain(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record "AMD_S" for paragraphs saying "hereby consequentially amended".

    Elements whose tag ends with "para" are considered when their
    lower-cased full text contains "hereby consequentially amended". For
    every <citation> beneath such a paragraph, "AMD_S" is added for the
    principal act (from find_principal_act_name) and for each act whose
    pattern matches the citation text.

    The principal act is looked up at most once per call, on the first
    citation that needs it, instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element, forwarded to find_principal_act_name.
        filtered_elements_for_history_sensitive: Elements to scan; also
            forwarded to find_principal_act_name.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    principal_cache = []

    def principal_act():
        # The arguments never change, so one lookup is enough.
        if not principal_cache:
            principal_cache.append(
                find_principal_act_name(root, filtered_elements_for_history_sensitive)
            )
        return principal_cache[0]

    for elem in filtered_elements_for_history_sensitive:
        if not (elem.tag and elem.tag.endswith("para")):
            continue

        para_text = "".join(text or "" for text in elem.itertext()).lower()
        if "hereby consequentially amended" not in para_text:
            continue

        for citation in elem.findall(".//citation"):
            citation_text = "".join(citation.itertext()).strip().lower()

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add("AMD_S")

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add("AMD_S")
                            




def extract_amend_or_repeal_amendments_from_para_safe(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record amendment/repeal relations for "is amended by" paragraphs.

    Elements whose tag ends with "para" are considered when their
    lower-cased full text contains "is amended by".

    * If the text contains both "the principal act is amended by repealing"
      and "substituting the following part", "PR_S" is added for the
      principal act and nothing else is done for that paragraph.
    * Otherwise the relation type comes from
      determine_relation_type_from_para (default "PR_S" when the text
      contains "repeal", else "AMD_S") and, for every <citation> beneath
      the paragraph, is added for the principal act and for each act whose
      pattern matches the citation text.

    The principal act is resolved from all elements under ``root``. That
    lookup, including the ``list(root.iter())`` it needs, now happens at
    most once per call instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element of the parsed document.
        filtered_elements_for_history_sensitive: Elements to scan.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    principal_cache = []

    def principal_act():
        # Materialising root.iter() per citation made this quadratic.
        if not principal_cache:
            principal_cache.append(find_principal_act_name(root, list(root.iter())))
        return principal_cache[0]

    for element in filtered_elements_for_history_sensitive:
        if not (element.tag and element.tag.endswith("para")):
            continue

        para_text = "".join(text or "" for text in element.itertext()).lower()
        if "is amended by" not in para_text:
            continue

        if "the principal act is amended by repealing" in para_text and "substituting the following part" in para_text:
            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add("PR_S")
            continue

        relation_type = determine_relation_type_from_para(
            para_text,
            default="PR_S" if "repeal" in para_text else "AMD_S"
        )

        for citation in element.findall(".//citation"):
            citation_text = "".join(citation.itertext()).strip().lower()

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add(relation_type)

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add(relation_type)
                            




def extract_amds_from_amendment_heading_and_para_safe(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record amendment relations from "amendment to" headings and insert/add paras.

    Tags are compared in lower case.

    * An element whose tag ends with "heading" and whose leading text
      contains "amendment to" yields "AMD_S" for each <citation> beneath
      it.
    * Otherwise, an element whose tag ends with "para" and whose full text
      contains "is amended by inserting" or "is amended by adding" yields
      the type returned by determine_relation_type_from_para (default
      "AMD_S") for each <citation> beneath it.

    For every such citation the relation is added for the principal act
    (from find_principal_act_name) and for each act whose pattern matches
    the citation text.

    The principal act is looked up at most once per call, on the first
    citation that needs it, instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element, forwarded to find_principal_act_name.
        filtered_elements_for_history_sensitive: Elements to scan; also
            forwarded to find_principal_act_name.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    principal_cache = []

    def principal_act():
        # The arguments never change, so one lookup is enough.
        if not principal_cache:
            principal_cache.append(
                find_principal_act_name(root, filtered_elements_for_history_sensitive)
            )
        return principal_cache[0]

    def record_citations(container, relation_type):
        """Add *relation_type* for each citation found beneath *container*."""
        for citation in container.findall(".//citation"):
            citation_text = "".join(citation.itertext()).strip().lower()

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add(relation_type)

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add(relation_type)

    for element in filtered_elements_for_history_sensitive:
        tag = element.tag.lower()

        if tag.endswith("heading") and element.text and "amendment to" in element.text.lower():
            record_citations(element, "AMD_S")

        elif tag.endswith("para"):
            para_text = "".join(element.itertext()).strip().lower()

            if "is amended by inserting" in para_text or "is amended by adding" in para_text:
                relation_type = determine_relation_type_from_para(para_text, default="AMD_S")
                record_citations(element, relation_type)
                           


def extract_intro_amendment_relation(root, filtered_elements_for_history_sensitive, source_act_name, compiled_act_patterns, relations):
    """Record amendment relations for "this Act amends the ..." paragraphs.

    Elements whose lower-cased tag ends with "para" are considered when
    their lower-cased full text contains "this act amends the". The
    relation type comes from determine_relation_type_from_para (default
    "AMD_S") and, for every <citation> beneath the paragraph, is added for
    the principal act (from find_principal_act_name) and for each act whose
    pattern matches the citation text.

    The principal act is looked up at most once per call, on the first
    citation that needs it, instead of once per citation; this assumes
    find_principal_act_name returns the same value for the same arguments.

    Args:
        root: Root element, forwarded to find_principal_act_name.
        filtered_elements_for_history_sensitive: Elements to scan; also
            forwarded to find_principal_act_name.
        source_act_name: Name of the act being processed; never recorded as
            its own target.
        compiled_act_patterns: Iterable of (name, compiled pattern) pairs.
        relations: Mapping keyed by (source, target) whose values hold a
            "relation_types" set; updated in place.
    """
    principal_cache = []

    def principal_act():
        # The arguments never change, so one lookup is enough.
        if not principal_cache:
            principal_cache.append(
                find_principal_act_name(root, filtered_elements_for_history_sensitive)
            )
        return principal_cache[0]

    for element in filtered_elements_for_history_sensitive:
        if not element.tag.lower().endswith("para"):
            continue

        para_text = "".join(element.itertext()).strip().lower()
        if "this act amends the" not in para_text:
            continue

        relation_type = determine_relation_type_from_para(para_text, default="AMD_S")

        for citation in element.findall(".//citation"):
            citation_text = "".join(citation.itertext()).strip().lower()

            principal_name = principal_act()
            if principal_name and principal_name != source_act_name:
                relations[(source_act_name, principal_name)]["relation_types"].add(relation_type)

            for name, pattern in compiled_act_patterns:
                if name != source_act_name and pattern.search(citation_text):
                    relations[(source_act_name, name)]["relation_types"].add(relation_type)
                           









def extract_acts_and_relations(xml_file, official_act_names):
    
    relations = defaultdict(lambda: {"relation_types": set(), "dates": set()})
    act_names = set()

    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        all_elements = list(root.iter())  
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        return set(), relations, None
    
    


    
    source_act_name = extract_source_act_name(root)


    
    if not source_act_name:
        source_act_name = os.path.splitext(os.path.basename(xml_file))[0]

    act_names.add(source_act_name)

   


    
    
    skip_elements = get_amends_affect_citations(root)
    filtered_elements = [el for el in root.iter() if el not in skip_elements]
    history_skip_elements = get_history_note_elements(root)
    filtered_elements_for_history_sensitive = [el for el in root.iter() if el not in history_skip_elements]



    principal_name = find_principal_act_name(root, filtered_elements_for_history_sensitive)





    if principal_name:
         print(f"📘 Principal act detected: {principal_name}")  

         extract_consequential_repeals_from_schedule_misc(all_elements, source_act_name, official_act_names, relations)
         extract_repealed_acts(all_elements, source_act_name, official_act_names, relations)
         extract_schedule_repeals_after_amendments_crosshead(all_elements, source_act_name, official_act_names, relations)
         extract_repealed_acts_1(all_elements, source_act_name, official_act_names, relations)
         extract_schedule_repeals_and_citations(all_elements, source_act_name, official_act_names, relations)
         extract_amended_acts_1(all_elements, source_act_name, official_act_names, relations)
         extract_other_enactments_amended(all_elements, source_act_name, official_act_names, relations)
         extract_consequential_amendments(all_elements, source_act_name, official_act_names, relations)
         extract_amended_acts(all_elements, source_act_name, official_act_names, relations)
         extract_enactments_amended_from_schedule_misc(all_elements, source_act_name, official_act_names, relations)
         extract_amendment_headings(all_elements, source_act_name, official_act_names, relations)
         extract_amendments_from_crossheadings(all_elements, source_act_name, official_act_names, relations)
         extract_consequential_amends_affect_amendments(all_elements, source_act_name, official_act_names, relations)
         extract_consequential_amendments_schedule_group2(all_elements, source_act_name, official_act_names, relations)
         extract_legtable_amendments(all_elements, source_act_name, official_act_names, relations)
         extract_schedule_amendments_with_amend_block(all_elements, source_act_name, official_act_names, relations)
         extract_amended_acts_from_headings(all_elements, source_act_name, official_act_names, relations)
         extract_amendments(all_elements, source_act_name, official_act_names, relations)
         extract_consequential_amendments1(all_elements, source_act_name, official_act_names, relations)



         extract_citations_from_legtitle_citations(filtered_elements, source_act_name, official_act_names, relations)
         extract_para_citations(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_insertwords(filtered_elements, source_act_name, official_act_names, relations)
         extract_cf_and_def_term_citations(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_paras_1(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_leg_title(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_headings_with_section(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_extref_and_text(filtered_elements, source_act_name, official_act_names, relations)
         extract_multiple_citations_in_para(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_with_insertwords_and_multiple_extrefs(filtered_elements, source_act_name, official_act_names, relations)
         extract_cf_and_def_term_citations1(filtered_elements, source_act_name, official_act_names, relations)
    

         extract_citations_with_intref_and_legtitle(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_def_para(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_extref_with_trailing_act_name(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_from_leg_title1(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations_with_extref_and_intref(filtered_elements, source_act_name, official_act_names, relations)
         extract_dual_citations_leg_title_and_extref(filtered_elements, source_act_name, official_act_names, relations)
         extract_plaintext_act_references(filtered_elements, source_act_name, official_act_names, relations)
         extract_citations(filtered_elements, source_act_name, official_act_names, relations)

         
         extract_amds_amendment_with_substitution_plaintext(root, filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)
         extract_repeal_relations_consolidated_safe(root, filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)
         extract_amendment_with_relates_to_plain(root, filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)
         extract_amend_or_repeal_amendments_from_para_safe(root,filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)
         extract_amendment_special_from_para_plain(root, filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)
         extract_amds_from_amendment_heading_and_para_safe(root, filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)
         extract_intro_amendment_relation(root, filtered_elements_for_history_sensitive, source_act_name, official_act_names, relations)

  
    else:
        extract_consequential_repeals_from_schedule_misc(all_elements, source_act_name, official_act_names, relations)
        extract_repealed_acts(all_elements, source_act_name, official_act_names, relations)
        extract_schedule_repeals_after_amendments_crosshead(all_elements, source_act_name, official_act_names, relations)
        extract_repealed_acts_1(all_elements, source_act_name, official_act_names, relations)
        extract_schedule_repeals_and_citations(all_elements, source_act_name, official_act_names, relations)
        extract_amended_acts_1(all_elements, source_act_name, official_act_names, relations)
        extract_other_enactments_amended(all_elements, source_act_name, official_act_names, relations)
        extract_consequential_amendments(all_elements, source_act_name, official_act_names, relations)
        extract_amended_acts(all_elements, source_act_name, official_act_names, relations)
        extract_enactments_amended_from_schedule_misc(all_elements, source_act_name, official_act_names, relations)
        extract_amendment_headings(all_elements, source_act_name, official_act_names, relations)
        extract_amendments_from_crossheadings(all_elements, source_act_name, official_act_names, relations)
        extract_consequential_amends_affect_amendments(all_elements, source_act_name, official_act_names, relations)
        extract_consequential_amendments_schedule_group2(all_elements, source_act_name, official_act_names, relations)
        extract_legtable_amendments(all_elements, source_act_name, official_act_names, relations)
        extract_schedule_amendments_with_amend_block(all_elements, source_act_name, official_act_names, relations)
        extract_amended_acts_from_headings(all_elements, source_act_name, official_act_names, relations)
        extract_amendments(all_elements, source_act_name, official_act_names, relations)
        extract_consequential_amendments1(all_elements, source_act_name, official_act_names, relations)



        extract_citations_from_legtitle_citations(filtered_elements, source_act_name, official_act_names, relations)
        extract_para_citations(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_insertwords(filtered_elements, source_act_name, official_act_names, relations)
        extract_cf_and_def_term_citations(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_paras_1(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_leg_title(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_headings_with_section(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_extref_and_text(filtered_elements, source_act_name, official_act_names, relations)
        extract_multiple_citations_in_para(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_with_insertwords_and_multiple_extrefs(filtered_elements, source_act_name, official_act_names, relations)
        extract_cf_and_def_term_citations1(filtered_elements, source_act_name, official_act_names, relations)
    

        extract_citations_with_intref_and_legtitle(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_def_para(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_extref_with_trailing_act_name(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_from_leg_title1(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations_with_extref_and_intref(filtered_elements, source_act_name, official_act_names, relations)
        extract_dual_citations_leg_title_and_extref(filtered_elements, source_act_name, official_act_names, relations)
        extract_plaintext_act_references(filtered_elements, source_act_name, official_act_names, relations)
        extract_citations(filtered_elements, source_act_name, official_act_names, relations)

    




    for key, data in relations.items():
        if "R_S" in data["relation_types"] and "PR_S" in data["relation_types"]:
            data["relation_types"].discard("PR_S")

    
    
    return act_names, relations



import os
import xml.etree.ElementTree as ET
from collections import defaultdict
import traceback

def process_folder(folder_path, compiled_act_patterns):
    """Run the extractor over every ``.xml`` file in a folder and merge results.

    Each matching file is passed to ``extract_acts_and_relations``. The act
    names it returns are unioned into one set, and the relation types are
    merged per relation key.

    Args:
        folder_path: Directory to scan. Only its direct entries are listed.
        compiled_act_patterns: Forwarded unchanged to
            ``extract_acts_and_relations``.

    Returns:
        A tuple ``(all_acts, all_relations)`` where ``all_acts`` is a set and
        ``all_relations`` is a ``defaultdict`` mapping each relation key to
        ``{"relation_types": set(...)}``.

    A file whose processing raises is reported (message on stdout, traceback
    on stderr) and skipped; the remaining files are still processed.
    """
    all_relations = defaultdict(lambda: {"relation_types": set()})
    all_acts = set()

    # Sorted so the processing (and log) order does not depend on the
    # arbitrary order returned by os.listdir.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".xml"):
            continue

        print(f"🔍 Processing: {file_name}")  

        xml_file_path = os.path.join(folder_path, file_name)

        try:
            act_names, relations = extract_acts_and_relations(xml_file_path, compiled_act_patterns)
            all_acts.update(act_names)

            for key, value in relations.items():
                all_relations[key]["relation_types"].update(value["relation_types"])

        except Exception:
            # Best-effort: one bad file must not abort the whole run, but the
            # failure has to be visible, otherwise a broken extractor silently
            # yields an empty network.
            print(f"[WARN] Skipped {file_name} because processing raised an error:")
            traceback.print_exc()

    return all_acts, all_relations




    

import csv
import networkx as nx

def export_network_to_csv_only(act_names, relations, output_csv):
    """Write the relation edge list followed by the act list to one CSV file.

    Layout of the file:
      1. a ``Source,Target,RelationType`` header,
      2. one row per ``(source, target)`` key of ``relations``, with the
         sorted relation types joined by ``", "``,
      3. an empty row, a ``List of Acts`` row, then one row per act name in
         sorted order.

    Args:
        act_names: Iterable of act names.
        relations: Mapping of ``(source, target)`` to a dict holding a
            ``"relation_types"`` collection.
        output_csv: Path of the CSV file to create or overwrite.
    """
    with open(output_csv, mode="w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)

        # Edge section.
        out.writerow(["Source", "Target", "RelationType"])
        out.writerows(
            [src, dst, ", ".join(sorted(info["relation_types"]))]
            for (src, dst), info in relations.items()
        )

        # Blank separator row, then the act section in the same file.
        out.writerow([])
        out.writerow(["List of Acts"])
        out.writerows([name] for name in sorted(act_names))




if __name__ == "__main__":
    # Entry point: load the act-name patterns, run the extractor over every
    # XML file in the folder, and dump the resulting network to CSV.
    folder_path = "C:/path/to/your/folder/C_Network_Acts" # Replace this with the actual path to your folder
    core_act = "Mental Health (Compulsory Assessment and Treatment) Act 1992"  # not used by the calls below
    official_acts_csv = "outputlist_All_2.csv"

    compiled_act_patterns = load_official_act_patterns(official_acts_csv)
    act_names, relations = process_folder(folder_path, compiled_act_patterns)

    # NOTE(review): the analysis cells further down read "act_network.csv",
    # not this file name -- confirm whether a rename/copy step is intended.
    export_network_to_csv_only(act_names, relations, output_csv="act_network_version.csv")

    

Katz Prestige Centrality

In [None]:
import pandas as pd
import networkx as nx


# Load the exported network and keep only rows that describe a complete edge.
input_csv = "act_network.csv"
edges_df = pd.read_csv(input_csv)
edges_df_cleaned = edges_df.dropna(subset=["Source", "Target"])

# Persist the cleaned edge list next to the results.
edges_df_cleaned.to_csv("cleaned_edge_list.csv", index=False)

# Directed graph: one edge per (Source, Target) row.
G = nx.from_pandas_edgelist(
    edges_df_cleaned,
    source="Source",
    target="Target",
    create_using=nx.DiGraph(),
)

# Katz centrality with a fixed attenuation factor.
alpha = 0.005
katz_centrality = nx.katz_centrality_numpy(G, alpha=alpha)

# One row per act, highest score first, with a fresh 0..n-1 index.
katz_df = (
    pd.DataFrame(katz_centrality.items(), columns=["Act", "Katz_Prestige"])
    .dropna(subset=["Act"])
    .sort_values(by="Katz_Prestige", ascending=False)
    .reset_index(drop=True)
)

output_csv = "katz_prestige.csv"
katz_df.to_csv(output_csv, index=False)

print(f"Katz Prestige Centrality (cleaned) saved to: {output_csv}")
print("Top 15 Acts by Katz Prestige:")
print(katz_df.head(15).to_string(index=False))


Pearson Correlation

In [None]:
import pandas as pd
import networkx as nx
import random
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr


# Robustness check for Katz centrality: repeatedly delete a random fraction of
# edges, recompute the centrality, and correlate the resulting node ranks with
# the ranks obtained on the full graph.
input_csv = "act_network.csv"  
output_csv = "katz_robustness_results.csv"
delete_fractions = [0.01, 0.05, 0.10, 0.20]
num_repeats = 100

# Fixed seed so the sampled edge deletions (and therefore the saved CSV and
# plot) are the same on every run.
random.seed(42)

# Drop rows without a Source or Target, exactly as the Katz cell does, so the
# baseline here is computed on the same graph as the reported centrality.
edges_df = pd.read_csv(input_csv).dropna(subset=["Source", "Target"])
G_original = nx.from_pandas_edgelist(edges_df, source='Source', target='Target', create_using=nx.DiGraph())


alpha = 0.005  
katz_original = nx.katz_centrality_numpy(G_original, alpha=alpha)
katz_orig_rank = pd.Series(katz_original).rank(method="min")

# The candidate edges are the same for every run; build the list once.
all_edges = list(G_original.edges())

results = []
for frac in delete_fractions:
    num_edges = int(frac * len(all_edges))
    for run in range(num_repeats):
        G_temp = G_original.copy()
        removed_edges = random.sample(all_edges, num_edges)
        G_temp.remove_edges_from(removed_edges)

        try:
            katz_temp = nx.katz_centrality_numpy(G_temp, alpha=alpha)
            # Align on the original node order before ranking so both rank
            # vectors refer to the same acts position by position.
            katz_temp_rank = pd.Series(katz_temp).reindex(katz_orig_rank.index).rank(method="min")
            corr, _ = pearsonr(katz_orig_rank, katz_temp_rank)
        except Exception as exc:
            # Keep the run in the table as NaN, but do not swallow
            # KeyboardInterrupt/SystemExit and do not hide the reason.
            print(f"[WARN] fraction={frac}, run={run}: {exc!r}")
            corr = np.nan  

        results.append({
            "Fraction_Removed": frac,
            "Run": run,
            "Pearson_Correlation": corr
        })


results_df = pd.DataFrame(results)
results_df.to_csv(output_csv, index=False)
print("Results saved to", output_csv)

# Pass the axes explicitly: DataFrame.boxplot(by=...) otherwise opens its own
# figure, leaving an empty 8x5 figure behind and ignoring the requested size.
fig, ax = plt.subplots(figsize=(8, 5))
results_df.boxplot(column="Pearson_Correlation", by="Fraction_Removed", ax=ax)
ax.set_title("Robustness of Katz Prestige Centrality")
fig.suptitle("")  # remove the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel("Edge Deletion Fraction")
ax.set_ylabel("Pearson Correlation")
ax.grid(True)
fig.tight_layout()
fig.savefig("katz_robustness_plot.png")
plt.show()


Louvain Community Detection

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
from collections import defaultdict
import community as community_louvain  


# Louvain community detection on the undirected version of the act network,
# followed by three figures: the coloured network, a colour legend, and a
# community-level projection graph.
csv_input = "act_network.csv"
core_act = "Mental Health (Compulsory Assessment and Treatment) Act 1992"

# NOTE(review): unlike the Katz cell, rows with a missing Source/Target are
# not dropped here. The partition CSV written below is read back by the
# agglomerative-clustering cell, so any change to the node set here should be
# checked against that cell.
df = pd.read_csv(csv_input)
# nx.Graph() discards edge direction; the remaining CSV columns become edge attributes.
G = nx.from_pandas_edgelist(df, "Source", "Target", edge_attr=True, create_using=nx.Graph())


# Fixed random_state: the partition is written to disk and consumed by later
# cells, so community ids must not change from one run to the next.
partition = community_louvain.best_partition(G, resolution=1.0, random_state=42)


modularity = community_louvain.modularity(partition, G)
print(f"Modularity Score: {modularity:.4f}")


core_comm = partition.get(core_act, "Not Found")
print(f"Core Act '{core_act}' is in Community: {core_comm}")


# Invert node -> community into community -> [nodes].
communities = defaultdict(list)
for node, cid in partition.items():
    communities[cid].append(node)

community_df = pd.DataFrame([(cid, act) for cid, acts in communities.items() for act in acts],
                            columns=["Community", "ActName"])
community_df.to_csv("community_ids_with_act_names.csv", index=False)


# --- Figure 1: full network coloured by community ---
plt.figure(figsize=(12, 9))
pos = nx.spring_layout(G, seed=42)
base_cmap = plt.get_cmap("tab20")


def colors(i):
    """Return the colour for community id ``i``; ids wrap around the colormap size."""
    return base_cmap(i % base_cmap.N)


node_colors = [colors(partition[node]) for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=20, alpha=0.8)
nx.draw_networkx_edges(G, pos, edge_color="gray", width=0.5, alpha=0.4)
plt.title("Louvain Community Detection for NZ Mental Health Act Network", fontsize=14)


plt.savefig("louvain_community_network.png", dpi=300, bbox_inches='tight')
plt.show()


# --- Figure 2: stand-alone legend, height scaled with the number of communities ---
fig, ax = plt.subplots(figsize=(4, len(communities) * 0.3))  
fig.subplots_adjust(left=0.2)

patches = [mpatches.Patch(color=colors(i), label=f"Community {i}") for i in sorted(communities.keys())]
legend = ax.legend(handles=patches, loc='center left', fontsize=10, frameon=False)

ax.axis('off')  
plt.title("Community Color Legend", fontsize=12)
plt.savefig("community_legend.png", dpi=300, bbox_inches='tight')
plt.show()


# --- Figure 3: projection graph, one node per community ---
# Node attribute "size" = number of acts; edge "weight" = number of
# inter-community edges in G.
G_proj = nx.Graph()
for cid, nodes in communities.items():
    G_proj.add_node(cid, size=len(nodes))

for u, v, data in G.edges(data=True):
    cu = partition[u]
    cv = partition[v]
    if cu != cv:
        if G_proj.has_edge(cu, cv):
            G_proj[cu][cv]['weight'] += 1
        else:
            G_proj.add_edge(cu, cv, weight=1)


plt.figure(figsize=(5.5, 6))
proj_pos = nx.spring_layout(G_proj, seed=42, k=1.2)
proj_colors = [colors(n) for n in G_proj.nodes()]
proj_sizes = [G_proj.nodes[n]['size'] * 10 for n in G_proj.nodes()]
edge_widths = [G_proj[u][v]['weight'] / 10 for u, v in G_proj.edges()]
nx.draw(G_proj, proj_pos, with_labels=True, node_color=proj_colors,
        node_size=proj_sizes, width=edge_widths, edge_color='black', alpha=0.8)
plt.title("Community Projection Graph of NZ Mental Health Act Network", fontsize=13)
plt.savefig("community_projection_graph.png", dpi=300, bbox_inches='tight')
plt.show()


Agglomerative Clustering

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from sklearn.cluster import AgglomerativeClustering
import community as community_louvain 


# Group the previously saved communities into macro-communities by
# agglomerative clustering on the community-level adjacency, then plot the
# network, a legend, and the macro-level projection graph.
community_df = pd.read_csv("community_ids_with_act_names.csv")
# NOTE(review): rows with a missing Source/Target are not dropped here (the
# Katz cell does drop them); kept as-is so the graph matches the node set of
# the partition file read above -- confirm whether both should be cleaned.
edges_df = pd.read_csv("act_network.csv")
G = nx.from_pandas_edgelist(edges_df, "Source", "Target")



core_act = "Mental Health (Compulsory Assessment and Treatment) Act 1992"

# act name -> community id, as stored in the partition file.
partition = {row["ActName"]: row["Community"] for _, row in community_df.iterrows()}
modularity = community_louvain.modularity(partition, G)
print(f"Modularity Score: {modularity:.4f}")



# community id -> set of act names.
community_nodes = {}
for _, row in community_df.iterrows():
    community_nodes.setdefault(row["Community"], set()).add(row["ActName"])

# Community-level graph: two communities are linked when at least one edge of
# G joins them. Built in a single pass over the edges instead of testing every
# pair of acts across every pair of communities.
# Assumes each ActName occurs once in community_df -- TODO confirm.
G_proj = nx.Graph()
for u, v in G.edges():
    cu = partition.get(u)
    cv = partition.get(v)
    if cu is None or cv is None or cu == cv:
        continue
    G_proj.add_edge(min(cu, cv), max(cu, cv))


# Rows/columns follow the sorted community ids; distance is 0 for linked
# communities and 1 otherwise.
adj_matrix = nx.to_numpy_array(G_proj, nodelist=sorted(G_proj.nodes()))
distance_matrix = 1 - adj_matrix
community_ids = sorted(G_proj.nodes())

# Cap at the number of communities so a small network does not request more
# clusters than there are samples.
n_macro_clusters = min(8, len(community_ids))
clustering = AgglomerativeClustering(n_clusters=n_macro_clusters, metric='precomputed', linkage='average')
macro_labels = clustering.fit_predict(distance_matrix)
macro_comm_map = {comm: macro for comm, macro in zip(community_ids, macro_labels)}
# Communities absent from G_proj (no inter-community edge) get NaN here.
community_df["Macro_Community"] = community_df["Community"].map(macro_comm_map)



core_macro_comm = community_df.loc[community_df["ActName"] == core_act, "Macro_Community"].values
if len(core_macro_comm) > 0:
    print(f"Core Act '{core_act}' is in Macro Community: {core_macro_comm[0]}")
else:
    print(f"Core Act '{core_act}' not found in the dataset.")


community_df.to_csv("macro_communities.csv", index=False)
macro_groups = community_df.groupby("Macro_Community")["ActName"].apply(list)
pd.DataFrame([
    {"Macro_Community": m, "Acts": "; ".join(str(a) for a in acts if pd.notna(a))}
    for m, acts in macro_groups.items()
]).to_csv("macro_community_groups.csv", index=False)


# === Color Mapping
macro_ids = sorted(set(macro_labels))
cmap = cm.tab10
macro_color_dict = {m: cmap(i % 10) for i, m in enumerate(macro_ids)}
act_macro_map = dict(zip(community_df["ActName"], community_df["Macro_Community"]))

# === Plot: Full Network by Macro-Community ===
plt.figure(figsize=(9, 7))
pos = nx.spring_layout(G, seed=42)
# Nodes without a macro-community fall back to grey.
node_colors = [macro_color_dict.get(act_macro_map.get(n, -1), "grey") for n in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=8)
nx.draw_networkx_edges(G, pos, alpha=0.04)
plt.title("Macro-Community Detection for NZ Mental Health Act Network", fontsize=11)
plt.axis("off")
plt.tight_layout()
plt.savefig("Macro-Community.png", dpi=300)
plt.show()



# === Plot: stand-alone legend ===
fig, ax = plt.subplots(figsize=(4, len(macro_ids) * 0.3))  
fig.subplots_adjust(left=0.2)

patches = [
    mpatches.Patch(color=macro_color_dict[i], label=f"Macro Community {i}")
    for i in sorted(macro_ids)
]

legend = ax.legend(handles=patches, loc='center left', fontsize=10, frameon=False)

ax.axis('off')  
plt.title("Macro-Community Color Legend", fontsize=12)
plt.savefig("macro_community_legend.png", dpi=300, bbox_inches='tight')
plt.show()


# === Macro-level projection graph ===
macro_nodes = community_df.groupby("Macro_Community")["ActName"].apply(set).to_dict()
G_proj_macro = nx.Graph()

# Count the edges of G that join two different macro-communities in one pass
# over the edges. Looking ids up through macro_ids keeps the node objects
# (and hence the drawn labels) identical to the ids produced by the clustering.
canonical_macro = {m: m for m in macro_ids}
pair_counts = {}
for u, v in G.edges():
    mu = canonical_macro.get(act_macro_map.get(u))
    mv = canonical_macro.get(act_macro_map.get(v))
    if mu is None or mv is None or mu == mv:
        continue
    pair = (mu, mv) if mu < mv else (mv, mu)
    pair_counts[pair] = pair_counts.get(pair, 0) + 1

# Sorted insertion keeps the node order (and so the seeded layout) the same
# as iterating the sorted macro-id pairs.
for (mi, mj), edge_count in sorted(pair_counts.items()):
    G_proj_macro.add_edge(mi, mj, weight=edge_count)



macro_sizes = [len(macro_nodes[m]) * 2 for m in G_proj_macro.nodes()]
macro_colors = [macro_color_dict[m] for m in G_proj_macro.nodes()]
edge_widths = [G_proj_macro[u][v]['weight'] /22 for u, v in G_proj_macro.edges()]

plt.figure(figsize=(5.5, 6))
pos = nx.spring_layout(G_proj_macro, seed=42, scale=5, k=1.2)

nx.draw(
    G_proj_macro,
    pos,
    with_labels=True,
    node_size=macro_sizes,
    node_color=macro_colors,
    width=edge_widths,
    edge_color="black",
    font_size=8
)

plt.title("Community Projection Graph of NZ Mental Health Act Network", fontsize=10)
plt.axis("off")
plt.margins(0.15)
plt.tight_layout()
plt.savefig("macro_community_projection.png", dpi=300, bbox_inches="tight")
plt.show()






Content Similarity

In [None]:
import pandas as pd
from collections import defaultdict


# Score how strongly each macro-community's LDA keywords overlap with a fixed
# keyword list per parliamentary committee, and save the ranked table.
lda_df = pd.read_csv("lda_keyword_top20_per_macro_community.csv")


# Hand-curated keyword sets, one per parliamentary committee.
committee_keywords = {
    "Social Services and Community": {
        "social", "housing", "income", "women", "children", "youth", "seniors",
        "pacific", "ethnic", "arts", "culture", "heritage", "sport", "recreation", "voluntary"
    },
    "Economic Development, Science and Innovation": {
        "business", "tourism", "minerals", "commerce", "consumer", "research", "science",
        "innovation", "intellectual", "broadcasting", "communications", "technology", "trade",
        "company", "patent", "cooperative", "store", "institute", "policy"
    },
    "Finance and Expenditure": {
        "fiscal", "tax", "revenue", "bank", "superannuation", "insurance", "audit", "finance",
        "fund", "financial", "accountant", "reporting", "estate", "coverage"
    },
    "Governance and Administration": {
        "legislation", "prime", "statistics", "internal", "affairs", "civil", "local",
        "regulations", "ombudsman", "parliamentary", "government", "commissioner", "council", "ministry"
    },
    "Education and Workforce": {
        "education", "training", "employment", "immigration", "workplace", "industrial", "safety",
        "worker", "employ", "parental", "leave", "protection", "relation","wage", "contractor", "lien", "earnings"
    },
    "Justice": {
        "justice", "court", "crime", "police", "corrections", "legal", "human", "rights",
        "criminal", "proceeding", "offense", "tribunal", "person"
    },
    "Primary Production": {
        "agriculture", "biosecurity", "fisheries", "forestry", "land", "farming"
    },
    "Transport and Infrastructure": {
        "transport", "road", "rail", "infrastructure", "energy", "building", "construction",
        "vehicle", "management", "conveyance", "direction"
    },
    "Foreign Affairs, Defence and Trade": {
        "customs", "defence", "foreign", "trade", "arms", "veterans", "excise",
        "duty", "tariff", "countervailing"
    },
    "Environment": {
        "conservation", "environment", "climate"
    },
    "Health": {
        "health", "hospital", "medicine", "disability", "mental", "wellness",
        "infirmary", "disablement"
    },
    "Māori Affairs": {
        "māori", "settlement", "land", "maori", "reserve", "river"
    }
}


# Collect the lower-cased, stripped LDA keywords of every macro-community
# from the Keyword_1 .. Keyword_20 columns, ignoring empty cells.
keyword_columns = [f"Keyword_{i}" for i in range(1, 21)]
community_keywords = defaultdict(set)

for _, lda_row in lda_df.iterrows():
    macro_id = lda_row["Macro_Community"]
    cleaned_terms = [
        str(lda_row[column]).lower().strip()
        for column in keyword_columns
        if pd.notna(lda_row[column])
    ]
    community_keywords[macro_id].update(cleaned_terms)


def _similarity_row(comm_id, committee, committee_terms, lda_words):
    """Build one output row: share of the committee's terms found in the LDA words."""
    normalised = {term.lower().strip() for term in committee_terms}
    shared = normalised & lda_words
    if normalised:
        score = len(shared) / len(normalised)
    else:
        score = 0
    return {
        "Macro Community ID": comm_id,
        "Parliamentary Committee": committee,
        "Similarity Score": round(score, 3),
        "Overlap Terms": ", ".join(sorted(shared))
    }


# One row per (macro-community, committee) pair.
output_rows = [
    _similarity_row(comm_id, committee, terms, lda_words)
    for comm_id, lda_words in community_keywords.items()
    for committee, terms in committee_keywords.items()
]


# Best-matching committee first within each macro-community.
df_output = pd.DataFrame(output_rows).sort_values(
    by=["Macro Community ID", "Similarity Score"],
    ascending=[True, False],
)
df_output.to_csv("macro_committee_similarity_LDA.csv", index=False)
print(df_output)
