From d57c367eec0d8edee70d40f2687e91cfe9b2e24e Mon Sep 17 00:00:00 2001 From: daavoo Date: Wed, 22 Jan 2025 11:31:55 +0100 Subject: [PATCH] enh(preprocessing): Add split_markdown_by_headings. --- docs/step-by-step-guide.md | 4 +- pyproject.toml | 1 - src/structured_qa/preprocessing.py | 74 ++++++++++++++++++++++++------ tests/unit/test_preprocessing.py | 46 ++++++++++++++++++- 4 files changed, 107 insertions(+), 18 deletions(-) diff --git a/docs/step-by-step-guide.md b/docs/step-by-step-guide.md index 27f2c82..6f7d06f 100644 --- a/docs/step-by-step-guide.md +++ b/docs/step-by-step-guide.md @@ -34,9 +34,7 @@ The document is first converted to markdown and then split into sections based o **Section Splitting** - - Uses [langchain-text-splitters](https://pypi.org/project/langchain-text-splitters/) - - - Splits on `("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")` + - Uses [split_markdown_by_headings](api.md/#structured_qa.preprocessing.split_markdown_by_headings) - Each section is saved to a separate file. diff --git a/pyproject.toml b/pyproject.toml index d5f5bb9..75d203c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,6 @@ dependencies = [ "huggingface-hub", "llama-cpp-python", "loguru", - "langchain-text-splitters", "pydantic", "pymupdf4llm", "pyyaml", diff --git a/src/structured_qa/preprocessing.py b/src/structured_qa/preprocessing.py index c673f25..f85d701 100644 --- a/src/structured_qa/preprocessing.py +++ b/src/structured_qa/preprocessing.py @@ -1,18 +1,70 @@ +import re +from collections import defaultdict from pathlib import Path import pymupdf4llm -from langchain_text_splitters import MarkdownHeaderTextSplitter from loguru import logger +def split_markdown_by_headings( + markdown_text, heading_patterns: list[str] | None = None +) -> dict[str, str]: + """Splits a markdown document into sections based on specified heading patterns. + + Args: + markdown_text (str): The markdown document as a single string. 
+        heading_patterns (list[str], optional): A list of regex patterns representing heading markers
+            in the markdown document.
+            Defaults to None.
+            If None, the default patterns are used:
+
+            ```python
+            [
+                r"^#\s+(.+)$",
+                r"^##\s+(.+)$",
+                r"^###\s+(.+)$",
+                r"^####\s+(.+)$",
+                r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$"
+            ]
+            ```
+
+    Returns:
+        dict[str, str]: A dictionary where the keys are the section names and the values are the section contents.
+    """
+    if heading_patterns is None:
+        heading_patterns = [
+            r"^#\s+(.+)$",
+            r"^##\s+(.+)$",
+            r"^###\s+(.+)$",
+            r"^####\s+(.+)$",
+            r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
+        ]
+
+    sections = defaultdict(str)
+
+    heading_text = "INTRO"
+    for line in markdown_text.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        for pattern in heading_patterns:
+            match = re.match(pattern, line)
+            if match:
+                heading_text = match.group(1)[:100]
+                break
+        sections[heading_text] += f"{line}\n"
+
+    return sections
+
+
 @logger.catch(reraise=True)
 def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
     """
     Convert a document to a directory of sections.
 
     Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) to convert input_file to markdown.
-    Then uses [langchain_text_splitters](https://pypi.org/project/langchain-text-splitters/) to split the markdown into sections based on the headers.
+    Then uses [`split_markdown_by_headings`][structured_qa.preprocessing.split_markdown_by_headings] to split the markdown into sections based on the headers.
 
     Args:
         input_file: Path to the input document.
@@ -32,27 +84,22 @@ def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
 
     logger.info(f"Converting {input_file}")
     md_text = pymupdf4llm.to_markdown(input_file)
     logger.success("Converted")
 
     logger.info("Extracting sections")
-    splitter = MarkdownHeaderTextSplitter(
-        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
+    sections = split_markdown_by_headings(
+        md_text,
     )
-    sections = splitter.split_text(md_text)
     logger.success(f"Found {len(sections)} sections")
 
     logger.info(f"Writing sections to {output_dir}")
     output_dir = Path(output_dir)
     output_dir.mkdir(exist_ok=True, parents=True)
-    section_names = []
-    for section in sections:
-        if not section.metadata:
-            continue
-        section_name = list(section.metadata.values())[-1].lower()
-        section_names.append(section_name)
+
+    for section_name, section_content in sections.items():
         (output_dir / f"{section_name.replace('/', '_')}.txt").write_text(
-            section.page_content
+            section_content
         )
     logger.success("Done")
 
-    return section_names
+    return list(sections.keys())
diff --git a/tests/unit/test_preprocessing.py b/tests/unit/test_preprocessing.py
index 627bdd1..8b3fd05 100644
--- a/tests/unit/test_preprocessing.py
+++ b/tests/unit/test_preprocessing.py
@@ -1,3 +1,6 @@
+import pytest
+
+from structured_qa.preprocessing import split_markdown_by_headings
 from structured_qa.preprocessing import document_to_sections_dir
 
 
@@ -6,4 +9,45 @@ def test_document_to_sections_dir(tmp_path, example_data):
     document_to_sections_dir(example_data / "1706.03762v7.pdf", output_dir)
     sections = list(output_dir.iterdir())
     assert all(section.is_file() and section.suffix == ".txt" for section in sections)
-    assert len(sections) == 10
+    assert len(sections) == 12
+
+
+DEFAULT_HEADINGS = """
+# Introduction
+
+This is the introduction.
+
+## Related Work
+
+This is the related work.
+
+### Method
+
+This is the method.
+""" + +NUMERIC_HEADINGS = """ +**1.** **Introduction** + +This is the introduction. + +**2.** **Related Work** + +This is the related work. + +**2.1** **Method** + +This is the method. +""" + + +@pytest.mark.parametrize( + ("markdown_text", "n_sections"), + ( + (DEFAULT_HEADINGS, 3), + (NUMERIC_HEADINGS, 2), + ), +) +def test_split_markdown_by_headings(markdown_text, n_sections): + sections = split_markdown_by_headings(markdown_text) + assert len(sections) == n_sections