Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions docs/step-by-step-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ The document is first converted to markdown and then split into sections based o

**Section Splitting**

- Uses [langchain-text-splitters](https://pypi.org/project/langchain-text-splitters/)

- Splits on `("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")`
- Uses [split_markdown_by_headings](api.md/#structured_qa.preprocessing.split_markdown_by_headings)

- Each section is saved to a separate file.

Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ dependencies = [
"huggingface-hub",
"llama-cpp-python",
"loguru",
"langchain-text-splitters",
"pydantic",
"pymupdf4llm",
"pyyaml",
Expand Down
74 changes: 61 additions & 13 deletions src/structured_qa/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,70 @@
import re
from collections import defaultdict
from pathlib import Path

import pymupdf4llm
from langchain_text_splitters import MarkdownHeaderTextSplitter

from loguru import logger


def split_markdown_by_headings(
markdown_text, heading_patterns: list[str] | None = None
) -> dict[str, str]:
"""Splits a markdown document into sections based on specified heading patterns.

Args:
markdown_text (str): The markdown document as a single string.
heading_patterns (str, optional): A list of regex patterns representing heading markers
in the markdown document.
Defaults to None.
If None, the default patterns are used:

```python
[
r"^#\s+(.+)$",
r"^##\s+(.+)$",
r"^###\s+(.+)$",
r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
r"^\*\*[\d\.]+\.\*\*\s+(.+)$"
]
```

Returns:
dict[str, str]: A dictionary where the keys are the section names and the values are the section contents.
"""
if heading_patterns is None:
heading_patterns = [
r"^#\s+(.+)$",
r"^##\s+(.+)$",
r"^###\s+(.+)$",
r"^####\s+(.+)$",
r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
]

sections = defaultdict(str)

heading_text = "INTRO"
for line in markdown_text.splitlines():
line = line.strip()
if not line:
continue
for pattern in heading_patterns:
match = re.match(pattern, line)
if match:
heading_text = match.group(1)[:100]
break
sections[heading_text] += f"{line}\n"

return sections


@logger.catch(reraise=True)
def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
"""
Convert a document to a directory of sections.

Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) to convert input_file to markdown.
Then uses [langchain_text_splitters](https://pypi.org/project/langchain-text-splitters/) to split the markdown into sections based on the headers.
Then uses [`split_markdown_by_headings`][structured_qa.preprocessing.split_markdown_by_headings] to split the markdown into sections based on the headers.

Args:
input_file: Path to the input document.
Expand All @@ -32,27 +84,23 @@ def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:

logger.info(f"Converting {input_file}")
md_text = pymupdf4llm.to_markdown(input_file)
Path("debug.md").write_text(md_text)
logger.success("Converted")

logger.info("Extracting sections")
splitter = MarkdownHeaderTextSplitter(
headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
sections = split_markdown_by_headings(
md_text,
)
sections = splitter.split_text(md_text)
logger.success(f"Found {len(sections)} sections")

logger.info(f"Writing sections to {output_dir}")
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True, parents=True)
section_names = []
for section in sections:
if not section.metadata:
continue
section_name = list(section.metadata.values())[-1].lower()
section_names.append(section_name)

for section_name, section_content in sections.items():
(output_dir / f"{section_name.replace('/', '_')}.txt").write_text(
section.page_content
section_content
)
logger.success("Done")

return section_names
return sections.keys()
46 changes: 45 additions & 1 deletion tests/unit/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest

from structured_qa.preprocessing import split_markdown_by_headings
from structured_qa.preprocessing import document_to_sections_dir


Expand All @@ -6,4 +9,45 @@ def test_document_to_sections_dir(tmp_path, example_data):
document_to_sections_dir(example_data / "1706.03762v7.pdf", output_dir)
sections = list(output_dir.iterdir())
assert all(section.is_file() and section.suffix == ".txt" for section in sections)
assert len(sections) == 10
assert len(sections) == 12


DEFAULT_HEADINGS = """
# Introduction

This is the introduction.

## Related Work

This is the related work.

### Method

This is the method.
"""

NUMERIC_HEADINGS = """
**1.** **Introduction**

This is the introduction.

**2.** **Related Work**

This is the related work.

**2.1** **Method**

This is the method.
"""


@pytest.mark.parametrize(
    "markdown_text, n_sections",
    [
        # "#", "##" and "###" headings: one section per heading.
        (DEFAULT_HEADINGS, 3),
        # Only "**1.**" and "**2.**" are expected to open sections; the
        # "**2.1** **Method**" line is counted as part of the previous one.
        (NUMERIC_HEADINGS, 2),
    ],
)
def test_split_markdown_by_headings(markdown_text, n_sections):
    """Check how many sections the default heading patterns produce."""
    found_sections = split_markdown_by_headings(markdown_text)
    assert len(found_sections) == n_sections
Loading