In [None]:
from langchain_openai import AzureChatOpenAI
AZURE_ENDPOINT = ""
AZURE_API_KEY = ""
AZURE_API_VERSION = ""
AZURE_DEPLOYMENT_NAME =""
AZURE_MODEL_NAME = ""

llm = AzureChatOpenAI(
    api_version=AZURE_API_VERSION,
    azure_endpoint=AZURE_ENDPOINT,
    azure_deployment=AZURE_DEPLOYMENT_NAME,
    model=AZURE_MODEL_NAME,
    api_key=AZURE_API_KEY,
    streaming=True,
    temperature=0.0,
)
llm.invoke("What is the capital of France?")

AIMessage(content='The capital of France is Paris.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_7a53abb7a2'}, id='run--6c324ef3-d158-42f0-b5ca-79fedabeaf39-0')

In [None]:
import pdfplumber
import re
import os

DATA_FOLDER = "./data"
OUTPUT_FOLDER = "./output/text_data"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def clean_text(text):
    """Clean the extracted text by removing noise and normalizing whitespace."""
    lines = text.split("\n")
    cleaned_lines = [
        line.strip()
        for line in lines
        if line.strip() and not re.match(r"^strana \d+$", line.strip())
    ]
    cleaned_text = "\n".join(cleaned_lines)
    cleaned_text = re.sub(r"\n{2,}", "\n\n", cleaned_text)
    return cleaned_text.strip()


def pdf_to_text(file_path, output_path):
    """Extract text from a PDF file, clean it, and save to a text file."""
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text + "\n"

        cleaned_text = clean_text(text)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        print(f"Text successfully saved to {output_path}")
        return cleaned_text
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return None

pdf_files = [
    os.path.join(DATA_FOLDER, fname)
    for fname in os.listdir(DATA_FOLDER)
    if fname.endswith(".pdf")
]

for pdf_file in pdf_files:
    output_path = os.path.join(OUTPUT_FOLDER, os.path.basename(pdf_file).replace(".pdf", ".txt"))
    pdf_to_text(pdf_file, output_path)

Text successfully saved to ./output/Sb_2000_128_2024-01-01_IZ.txt
Text successfully saved to ./output/Sb_2006_183_2023-12-31_IZ.txt
Text successfully saved to ./output/Sb_2016_250_2024-01-01_IZ.txt
Text successfully saved to ./output/Sb_2009_40_2024-04-01_IZ.txt
Text successfully saved to ./output/Sb_2012_90_2024-07-19_IZ.txt
Text successfully saved to ./output/Sb_2008_125_2024-07-19_IZ.txt
Text successfully saved to ./output/Sb_2021_283_2024-07-01_IZ.txt
Text successfully saved to ./output/Sb_1991_455_2024-01-01_IZ.txt
Text successfully saved to ./output/Sb_2000_361_2024-07-01_IZ.txt
Text successfully saved to ./output/Sb_2012_89_2024-04-01_IZ.txt
Text successfully saved to ./output/Sb_2006_262_2024-07-01_IZ.txt
Text successfully saved to ./output/Sb_1992_586_2024-07-19_IZ.txt
Text successfully saved to ./output/Sb_2012_89_2024-04-01_IZ (1).txt


In [18]:
from pydantic import BaseModel, Field
from typing import Annotated
import tqdm
import re
import json


part_pattern = r"^ČÁST\s+.*"
chapter_pattern = r"^HLAVA\s+.*"
section_pattern = r"^DÍL\s+.*"
subsection_pattern = r"^ODDÍL\s+.*"
paragraph_pattern = r"^§\s+\d+.*"
subsubsection_pattern = r"^\((\d+)\)\s*(.*)"
appendix_pattern = r"^Příloha\s+č\.\s*\d+\s*k zákonu\s*č\.\s*\d+/\d+\s*Sb\."
from typing import Literal
CONTENT_TYPES = {
    "law": "law",
    "part": "part",
    "chapter": "chapter",
    "section": "section",
    "subsection": "subsection",
    "paragraph": "paragraph",
    "appendix": "appendix",
    "item": "item",
}


class NodeReference(BaseModel):
    type: Literal["law", "part", "chapter", "section", "subsection", "paragraph", "appendix", "item"]
    title: str


class Metadata(BaseModel):
    title: str
    effective_date: Annotated[str, Field(description="Effective date of the law")]
    references: Annotated[
        list[NodeReference], Field(description="References to other laws or sections")
    ]
    agencies: Annotated[
        list[str], Field(description="Agencies or organizations mentioned")
    ]


def extract_metadata(content, node_type, title):
    """
    Extract metadata from content for Neo4j graph loading.
    """

    prompt = (
        "You are a metadata extraction assistant. Your task is to extract metadata from a legal document's content. "
        "Please extract the following fields if they are present in the text. If a field is missing, return 'None' for that field.\n\n"
        "Required metadata fields:\n"
        "- title: The full title of the legal document.\n"
        "- effective_date: The date on which the document becomes legally effective.\n"
        "- references: Any other laws, regulations, or documents explicitly mentioned in the text.\n"
        "- agencies: The names of government agencies, departments, or regulatory bodies that issued or are involved in the document.\n\n"
        "The content is: {content}."
    )
    response = llm.with_structured_output(Metadata).invoke(prompt.format(content=content))
    return response.model_dump_json()


# Determine the level of a heading (1: part/appendix, 2: chapter, 3: section, 4: subsection, 5: paragraph)
def get_level(line):
    if re.match(part_pattern, line):
        return 1  # Part
    elif re.match(appendix_pattern, line):
        return 1  # Appendix
    elif re.match(chapter_pattern, line):
        return 2  # Chapter
    elif re.match(section_pattern, line):
        return 3  # Section
    elif re.match(subsection_pattern, line):
        return 4  # Subsection (ODDÍL)
    elif re.match(paragraph_pattern, line):
        return 5  # Paragraph
    return -1  # Not a heading


def convert_law_to_json(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        lines = [line.rstrip() for line in f.readlines() if line.strip()]

    # Find the index of the first "ČÁST" or "Příloha"
    first_part_index = next(
        (
            i
            for i, line in enumerate(lines)
            if re.match(part_pattern, line) or re.match(appendix_pattern, line)
        ),
        len(lines),
    )
    law_content = "\n".join(lines[:first_part_index])

    # Create the top-level "law" node
    law_node = {
        "type": "law",
        "title": "ZÁKON",
        "content": law_content,
        "metadata": extract_metadata(law_content, "law", "ZÁKON"),
        "children": [],
    }

    # Track current nodes at each level: 0=law, 1=part/appendix, 2=chapter, 3=section, 4=subsection, 5=paragraph
    current_nodes = [law_node, None, None, None, None, None]
    last_node = law_node
    current_content = ""
    in_subsection = False
    current_subsection = None
    in_appendix = False

    # Process lines starting from the first "ČÁST" or "Příloha"
    for line in tqdm.tqdm(lines[first_part_index:], desc="Processing lines"):
        level = get_level(line)
        if level >= 1:  # Heading detected
            # Finalize any ongoing subsection
            if in_subsection and current_subsection:
                current_subsection["content"] = current_content.strip()
                current_subsection["metadata"] = extract_metadata(
                    current_subsection["content"],
                    "subsection",
                    current_subsection["title"],
                )
                current_content = ""
                in_subsection = False
                current_subsection = None

            # Handle appendix items
            if in_appendix and current_content and last_node["type"] == "appendix":
                # Split current_content into items (assuming one per line)
                items = current_content.strip().split("\n")
                for item in items:
                    if item and not re.match(
                        subsubsection_pattern, item
                    ):  # Exclude subsubsection-like lines
                        item_node = {
                            "type": "item",
                            "title": item,
                            "content": "",
                            "metadata": extract_metadata("", "item", item),
                            "children": [],
                        }
                        last_node["children"].append(item_node)
                current_content = ""

            # Attach accumulated content to the last node
            if current_content and last_node and last_node["type"] != "appendix":
                last_node["content"] = current_content.strip()
                last_node["metadata"] = extract_metadata(
                    current_content, last_node["type"], last_node["title"]
                )
                current_content = ""

            # Create new node
            node_type = (
                "appendix"
                if re.match(appendix_pattern, line)
                else ["part", "chapter", "section", "subsection", "paragraph"][
                    level - 1
                ]
            )
            new_node = {
                "type": node_type,
                "title": line,
                "content": "",
                "metadata": {},
                "children": [],
            }

            # Find parent: highest level < current level with a node
            parent_level = level - 1
            while parent_level >= 0 and current_nodes[parent_level] is None:
                parent_level -= 1
            parent = current_nodes[parent_level] if parent_level >= 0 else law_node

            # Add new node to parent's children
            parent["children"].append(new_node)
            current_nodes[level] = new_node
            for m in range(level + 1, 6):  # Reset lower levels
                current_nodes[m] = None

            last_node = new_node
            in_appendix = node_type == "appendix"
        else:
            # Check for subsection (subsubsection) start
            match = re.match(subsubsection_pattern, line)
            if match and current_nodes[5]:  # Inside a paragraph
                if in_subsection and current_subsection:
                    # Finalize previous subsection
                    current_subsection["content"] = current_content.strip()
                    current_subsection["metadata"] = extract_metadata(
                        current_subsection["content"],
                        "subsection",
                        current_subsection["title"],
                    )
                    current_content = ""

                # Start new subsubsection
                number, first_line_content = match.group(1), match.group(2) or ""
                current_subsection = {
                    "type": "subsection",
                    "title": f"({number})",
                    "content": first_line_content,
                    "metadata": extract_metadata(
                        first_line_content, "subsection", f"({number})"
                    ),
                    "children": [],
                }
                current_nodes[5]["children"].append(current_subsection)
                in_subsection = True
                current_content = (
                    "" if not first_line_content else first_line_content + "\n"
                )
            else:
                # Accumulate content for current subsection, appendix, or node
                if in_subsection:
                    current_content += line + "\n"
                elif in_appendix:
                    current_content += line + "\n"
                else:
                    current_content += line + "\n"

    # Finalize any remaining content
    if in_subsection and current_subsection:
        current_subsection["content"] = current_content.strip()
        current_subsection["metadata"] = extract_metadata(
            current_subsection["content"], "subsection", current_subsection["title"]
        )
    elif in_appendix and current_content and last_node["type"] == "appendix":
        # Process remaining appendix items
        items = current_content.strip().split("\n")
        for item in items:
            if item and not re.match(subsubsection_pattern, item):
                item_node = {
                    "type": "item",
                    "title": item,
                    "content": "",
                    "metadata": extract_metadata("", "item", item),
                    "children": [],
                }
                last_node["children"].append(item_node)
    elif current_content and last_node:
        last_node["content"] = current_content.strip()
        last_node["metadata"] = extract_metadata(
            current_content, last_node["type"], last_node["title"]
        )

    # Save to JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(law_node, f, ensure_ascii=False, indent=2)
    print(f"JSON saved to '{output_file}'")


INPUT_FOLDER = "./output/text_data"
OUTPUT_FOLDER = "./output/json_data"

pdf_files = [
    os.path.join(INPUT_FOLDER, fname)
    for fname in os.listdir(INPUT_FOLDER)
    if fname.endswith(".txt")
]

# for pdf_file in pdf_files:
#     output_path = os.path.join(
#         OUTPUT_FOLDER, os.path.basename(pdf_file).replace(".txt", ".json")
#     )
#     convert_law_to_json(pdf_file, output_path)

convert_law_to_json(
    "./output/text_data/Sb_1991_455_2024-01-01_IZ.txt",
    "./output/json_data/Sb_1991_455_2024-01-01_IZ.json",
)

Processing lines: 100%|██████████| 3050/3050 [26:31<00:00,  1.92it/s]


JSON saved to './output/json_data/Sb_1991_455_2024-01-01_IZ.json'
