diff --git a/.gitignore b/.gitignore index 78e07f63..f84925da 100644 --- a/.gitignore +++ b/.gitignore @@ -454,3 +454,4 @@ $RECYCLE.BIN/ !pyrightconfig.json /coverage/ +/coverage.json diff --git a/examples/README.md b/examples/README.md index 6d168504..76571230 100644 --- a/examples/README.md +++ b/examples/README.md @@ -18,3 +18,10 @@ that via `.env` if desired), upload the checked-in sample assets under `examples/resources/`, and exercise the async client end-to-end. Use `uvx nox -s examples` when you want to execute every example across the supported interpreter matrix. + +## Available Examples + +- `examples/delete/delete_example.py` – demonstrate file deletion (sync + async + variants). +- `examples/extract_text/extract_pdf_text_example.py` – run `extract_pdf_text` + with word coordinates/style enabled and render the output as a Rich table. diff --git a/examples/extract_text/extract_pdf_text_example.py b/examples/extract_text/extract_pdf_text_example.py new file mode 100644 index 00000000..15e55dee --- /dev/null +++ b/examples/extract_text/extract_pdf_text_example.py @@ -0,0 +1,111 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = ["pdfrest", "python-dotenv", "rich"] +# /// +"""Render extracted words with coordinates and style metadata. + +This sample demonstrates how to: + +1. Upload the bundled ``examples/resources/report.pdf`` resource. +2. Request JSON output from :func:`PdfRestClient.extract_pdf_text` while turning on + word-level coordinates and styling data. +3. Display the returned metadata as a Rich table. + +Run with ``uv run --project ../.. python extract_pdf_text_example.py`` after +setting ``PDFREST_API_KEY`` (``python-dotenv`` will also load `.env` if present). 
+""" + +from __future__ import annotations + +from pathlib import Path + +from dotenv import load_dotenv +from rich.console import Console +from rich.table import Table + +from pdfrest import PdfRestClient +from pdfrest.models import ( + ExtractedTextDocument, + ExtractedTextPoint, + ExtractedTextWord, + ExtractedTextWordCoordinates, +) + +RESOURCE = Path(__file__).resolve().parents[1] / "resources" / "report.pdf" + + +def _format_point(point: ExtractedTextPoint | None) -> str: + if point is None: + return "—" + return f"({point.x:.2f}, {point.y:.2f})" + + +def _format_color(word: ExtractedTextWord) -> str: + style = word.style + if style is None or style.color is None: + return "—" + color = style.color + values = ", ".join(str(value) for value in color.values) + return f"{color.space}: {values}" + + +def _format_font(word: ExtractedTextWord) -> str: + style = word.style + if style is None: + return "—" + font = style.font + return f"{font.name} ({font.size:.1f} pt)" + + +def _build_word_table(document: ExtractedTextDocument) -> Table: + table = Table(title="Extracted Words with Coordinates and Style") + table.add_column("Word", style="bold") + table.add_column("Page", justify="right") + table.add_column("Top Left") + table.add_column("Top Right") + table.add_column("Bottom Left") + table.add_column("Bottom Right") + table.add_column("Color") + table.add_column("Font") + + for word in document.words or []: + coords: ExtractedTextWordCoordinates | None = word.coordinates + table.add_row( + word.text, + str(word.page), + _format_point(coords.top_left if coords else None), + _format_point(coords.top_right if coords else None), + _format_point(coords.bottom_left if coords else None), + _format_point(coords.bottom_right if coords else None), + _format_color(word), + _format_font(word), + ) + return table + + +def list_words_with_coordinates() -> None: + load_dotenv() + console = Console() + + with PdfRestClient() as client: + uploaded = 
client.files.create_from_paths([RESOURCE])[0] + document = client.extract_pdf_text( + uploaded, + full_text="by_page", + preserve_line_breaks=True, + word_style=True, + word_coordinates=True, + ) + + words = document.words or [] + console.print(f"Extracted {len(words)} words from [bold]{uploaded.name}[/bold].") + if not words: + console.print("[yellow]This document did not include word metadata.[/yellow]") + return + + table = _build_word_table(document) + console.print(table) + + +if __name__ == "__main__": # pragma: no cover - manual example + list_words_with_coordinates() diff --git a/pyproject.toml b/pyproject.toml index c7c92469..b09c8c05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dev = [ "basedpyright>=1.34.0", "python-dotenv>=1.0.1", "diff-cover>=10.2.0", + "rich>=14.1.0", ] [tool.pytest.ini_options] diff --git a/src/pdfrest/client.py b/src/pdfrest/client.py index 0035227c..ecac10d4 100644 --- a/src/pdfrest/client.py +++ b/src/pdfrest/client.py @@ -60,6 +60,7 @@ translate_httpx_error, ) from .models import ( + ExtractedTextDocument, PdfRestDeletionResponse, PdfRestErrorResponse, PdfRestFile, @@ -2398,6 +2399,48 @@ def extract_images( timeout=timeout, ) + def extract_pdf_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + full_text: Literal["off", "by_page", "document"] = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> ExtractedTextDocument: + """Extract text content from a PDF and return parsed JSON results.""" + + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "json", + } + if pages is not None: + 
payload["pages"] = pages + + validated_payload = ExtractTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/extracted-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = self._send_request(request) + return ExtractedTextDocument.model_validate(raw_payload) + def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -3393,6 +3436,48 @@ async def extract_images( timeout=timeout, ) + async def extract_pdf_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + full_text: Literal["off", "by_page", "document"] = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> ExtractedTextDocument: + """Extract text content from a PDF and return parsed JSON results.""" + + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + + validated_payload = ExtractTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/extracted-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return ExtractedTextDocument.model_validate(raw_payload) + async def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], 
diff --git a/src/pdfrest/models/__init__.py b/src/pdfrest/models/__init__.py index ef10e565..2ee3d5a2 100644 --- a/src/pdfrest/models/__init__.py +++ b/src/pdfrest/models/__init__.py @@ -1,4 +1,15 @@ from .public import ( + ExtractedTextDocument, + ExtractedTextFullText, + ExtractedTextFullTextPage, + ExtractedTextFullTextPages, + ExtractedTextPoint, + ExtractedTextWord, + ExtractedTextWordColor, + ExtractedTextWordCoordinates, + ExtractedTextWordFont, + ExtractedTextWordStyle, + ExtractTextResponse, PdfRestDeletionResponse, PdfRestErrorResponse, PdfRestFile, @@ -12,6 +23,17 @@ ) __all__ = [ + "ExtractTextResponse", + "ExtractedTextDocument", + "ExtractedTextFullText", + "ExtractedTextFullTextPage", + "ExtractedTextFullTextPages", + "ExtractedTextPoint", + "ExtractedTextWord", + "ExtractedTextWordColor", + "ExtractedTextWordCoordinates", + "ExtractedTextWordFont", + "ExtractedTextWordStyle", "PdfRestDeletionResponse", "PdfRestErrorResponse", "PdfRestFile", diff --git a/src/pdfrest/models/public.py b/src/pdfrest/models/public.py index e4dc8a3a..99ef5257 100644 --- a/src/pdfrest/models/public.py +++ b/src/pdfrest/models/public.py @@ -14,12 +14,24 @@ ConfigDict, Field, HttpUrl, + RootModel, ) from pydantic.json_schema import JsonSchemaValue from pydantic_core import CoreSchema from typing_extensions import override __all__ = ( + "ExtractTextResponse", + "ExtractedTextDocument", + "ExtractedTextFullText", + "ExtractedTextFullTextPage", + "ExtractedTextFullTextPages", + "ExtractedTextPoint", + "ExtractedTextWord", + "ExtractedTextWordColor", + "ExtractedTextWordCoordinates", + "ExtractedTextWordFont", + "ExtractedTextWordStyle", "PdfRestDeletionResponse", "PdfRestErrorResponse", "PdfRestFile", @@ -402,6 +414,307 @@ class TranslatePdfTextFileResponse(PdfRestFileBasedResponse): ] = None +class ExtractTextResponse(BaseModel): + """Response returned by the extracted-text tool.""" + + model_config = ConfigDict(extra="allow") + + full_text: Annotated[ + str | None, + Field( + 
alias="fullText", + validation_alias=AliasChoices("full_text", "fullText"), + description="Inline extracted text when output_type is json.", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + warning: Annotated[ + str | None, + Field(description="A warning that was generated during text extraction."), + ] = None + + +class ExtractedTextPoint(BaseModel): + """A point in PDF coordinate space expressed in points.""" + + model_config = ConfigDict(extra="allow") + + x: Annotated[ + float, + Field(description="Horizontal position in PDF points."), + ] + y: Annotated[ + float, + Field(description="Vertical position in PDF points."), + ] + + +class ExtractedTextWordCoordinates(BaseModel): + """Bounding box describing where a word appears on the page.""" + + model_config = ConfigDict(extra="allow") + + top_left: Annotated[ + ExtractedTextPoint, + Field( + alias="topLeft", + validation_alias=AliasChoices("top_left", "topLeft"), + description="Upper-left corner of the word bounds.", + ), + ] + top_right: Annotated[ + ExtractedTextPoint, + Field( + alias="topRight", + validation_alias=AliasChoices("top_right", "topRight"), + description="Upper-right corner of the word bounds.", + ), + ] + bottom_left: Annotated[ + ExtractedTextPoint, + Field( + alias="bottomLeft", + validation_alias=AliasChoices("bottom_left", "bottomLeft"), + description="Lower-left corner of the word bounds.", + ), + ] + bottom_right: Annotated[ + ExtractedTextPoint, + Field( + alias="bottomRight", + validation_alias=AliasChoices("bottom_right", "bottomRight"), + description="Lower-right corner of the word bounds.", + ), + ] + + +class ExtractedTextWordColor(BaseModel): + """Font color applied to an extracted word.""" + + model_config = ConfigDict(extra="allow") + + space: Annotated[ + str, + Field(description="Color space name reported by pdfRest (e.g., DeviceRGB)."), + 
] + values: Annotated[ + list[float], + Field( + description="Numeric components in the reported color space.", + min_length=1, + ), + ] + + +class ExtractedTextWordFont(BaseModel): + """Font metadata applied to an extracted word.""" + + model_config = ConfigDict(extra="allow") + + name: Annotated[ + str, + Field(description="Reported font face name."), + ] + size: Annotated[ + float, + Field(description="Font size in points."), + ] + + +class ExtractedTextWordStyle(BaseModel): + """Style information for an extracted word.""" + + model_config = ConfigDict(extra="allow") + + color: Annotated[ + ExtractedTextWordColor, + Field(description="Color information for the word."), + ] + font: Annotated[ + ExtractedTextWordFont, + Field(description="Font information for the word."), + ] + + +class ExtractedTextWord(BaseModel): + """A single word extracted from a PDF page.""" + + model_config = ConfigDict(extra="allow") + + text: Annotated[ + str, + Field(description="Word content as rendered by the PDF."), + ] + page: Annotated[ + int, + Field(description="1-indexed page number containing the word.", ge=1), + ] + coordinates: Annotated[ + ExtractedTextWordCoordinates | None, + Field( + description="Bounding box for the word when positional data is requested.", + default=None, + ), + ] = None + style: Annotated[ + ExtractedTextWordStyle | None, + Field( + description="Font/color details captured for the word.", + default=None, + ), + ] = None + + +class ExtractedTextFullTextPage(BaseModel): + """Per-page representation of the aggregated text content.""" + + model_config = ConfigDict(extra="allow") + + page: Annotated[ + int, + Field(description="1-indexed page number.", ge=1), + ] + text: Annotated[ + str, + Field(description="Concatenated text for the page."), + ] + + +class ExtractedTextFullTextPages(BaseModel): + """Container for per-page text output.""" + + model_config = ConfigDict(extra="allow") + + pages: Annotated[ + list[ExtractedTextFullTextPage], + Field( + 
description="Ordered text for each page present in the document.", + min_length=1, + ), + ] + + +class ExtractedTextFullText(RootModel[str | ExtractedTextFullTextPages]): + """ + Represents full-text extraction in either "document" (str) or "by_page" (object) + modes while providing convenience accessors for both forms. + """ + + root: str | ExtractedTextFullTextPages + + @property + def document_text(self) -> str | None: + """ + Return the document-level string. Falls back to space-joining per-page text + when only the page-structured payload is available. + """ + if isinstance(self.root, str): + return self.root + return " ".join(page.text for page in self.root.pages) + + @property + def pages(self) -> list[ExtractedTextFullTextPage]: + """ + Return page entries when pdfRest emits per-page text. + Raises ValueError when the payload is in document-string mode. + """ + if isinstance(self.root, ExtractedTextFullTextPages): + return self.root.pages + msg = "full text payload was emitted in document mode; page data unavailable" + raise ValueError(msg) + + def iter_pages(self) -> list[ExtractedTextFullTextPage]: + """ + Convenience helper that provides a stable iterable without requiring + callers to guard against the document-only representation. 
+ """ + try: + return self.pages + except ValueError: + return [] + + +class ExtractedTextDocument(BaseModel): + """Structured representation of the JSON output returned by extract_pdf_text.""" + + model_config = ConfigDict(extra="allow") + + input_id: Annotated[ + PdfRestFileID, + Field( + alias="inputId", + validation_alias=AliasChoices("input_id", "inputId"), + description="Identifier of the uploaded PDF.", + ), + ] + words: Annotated[ + list[ExtractedTextWord] | None, + Field( + description="Individual word records when word-level extraction is enabled.", + default=None, + ), + ] = None + full_text: Annotated[ + ExtractedTextFullText | None, + Field( + alias="fullText", + validation_alias=AliasChoices("full_text", "fullText"), + description="Full text output (document string or per-page content).", + default=None, + ), + ] = None + + +class ConvertToMarkdownResponse(BaseModel): + """Response returned by the markdown conversion tool.""" + + model_config = ConfigDict(extra="allow") + + markdown: Annotated[ + str | None, + Field( + description="Inline markdown content when output_type is json.", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + output_url: Annotated[ + HttpUrl | None, + Field( + alias="outputUrl", + validation_alias=AliasChoices("output_url", "outputUrl"), + description="Download URL for file output.", + default=None, + ), + ] = None + output_id: Annotated[ + PdfRestFileID | None, + Field( + alias="outputId", + validation_alias=AliasChoices("output_id", "outputId"), + description="The id of the generated output when output_type is file.", + default=None, + ), + ] = None + warning: Annotated[ + str | None, + Field(description="A warning that was generated during markdown conversion."), + ] = None + + class PdfRestInfoResponse(BaseModel): """A response containing the output from the /info route.""" diff 
--git a/tests/live/test_live_extract_pdf_text.py b/tests/live/test_live_extract_pdf_text.py new file mode 100644 index 00000000..67b69f1a --- /dev/null +++ b/tests/live/test_live_extract_pdf_text.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +from itertools import product + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import ExtractedTextDocument + +from ..resources import get_test_resource_path + +FULL_TEXT_OPTIONS = ("off", "by_page", "document") +BOOL_OPTION_SETS = list(product([False, True], repeat=3)) + +LIVE_OPTION_SETS = [ + pytest.param( + { + "full_text": full_text, + "preserve_line_breaks": preserve, + "word_style": word_style, + "word_coordinates": word_coordinates, + }, + id=f"{full_text}-plb-{int(preserve)}-ws-{int(word_style)}-wc-{int(word_coordinates)}", + ) + for full_text in FULL_TEXT_OPTIONS + for preserve, word_style, word_coordinates in BOOL_OPTION_SETS +] + + +def _assert_live_full_text( + response: ExtractedTextDocument, + *, + full_text_mode: str, +) -> None: + if full_text_mode == "off": + assert response.full_text is None + elif full_text_mode == "document": + assert response.full_text is not None + assert response.full_text.document_text is not None + else: + assert response.full_text is not None + assert response.full_text.pages is not None + + +@pytest.mark.parametrize("options", LIVE_OPTION_SETS) +def test_live_extract_pdf_text_success( + options: dict[str, bool | str], + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_pdf_text(uploaded, **options) + + assert isinstance(response, ExtractedTextDocument) + assert response.input_id == uploaded.id + _assert_live_full_text(response, full_text_mode=options["full_text"]) 
+ if options["word_style"] or options["word_coordinates"]: + assert response.words is not None + assert response.words + + +@pytest.mark.asyncio +@pytest.mark.parametrize("options", LIVE_OPTION_SETS) +async def test_live_async_extract_pdf_text_success( + options: dict[str, bool | str], + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.extract_pdf_text(uploaded, **options) + + assert isinstance(response, ExtractedTextDocument) + assert response.input_id == uploaded.id + _assert_live_full_text(response, full_text_mode=options["full_text"]) + if options["word_style"] or options["word_coordinates"]: + assert response.words is not None + assert response.words + + +def test_live_extract_pdf_text_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_pdf_text( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_pdf_text( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/test_extract_pdf_text.py b/tests/test_extract_pdf_text.py new file mode 
100644 index 00000000..4286d92c --- /dev/null +++ b/tests/test_extract_pdf_text.py @@ -0,0 +1,432 @@ +from __future__ import annotations + +import json +from collections.abc import Mapping +from itertools import product + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import ExtractedTextDocument, PdfRestFileID +from pdfrest.models._internal import ExtractTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_extracted_text_document_payload(input_id: str) -> dict[str, object]: + return { + "inputId": input_id, + "words": [ + { + "text": "Hello", + "page": 1, + "coordinates": { + "topLeft": {"x": 1, "y": 2}, + "topRight": {"x": 3, "y": 4}, + "bottomLeft": {"x": 5, "y": 6}, + "bottomRight": {"x": 7, "y": 8}, + }, + "style": { + "color": {"space": "DeviceRGB", "values": [0, 0, 0]}, + "font": {"name": "Calibri", "size": 12}, + }, + } + ], + "fullText": { + "pages": [ + {"page": 1, "text": "Hello world"}, + {"page": 2, "text": "Bye"}, + ] + }, + } + + +FULL_TEXT_OPTIONS = ("off", "by_page", "document") +BOOL_OPTION_SETS = list(product([False, True], repeat=3)) + +EXTRACT_TEXT_OPTION_SETS = [ + pytest.param( + { + "full_text": full_text, + "preserve_line_breaks": preserve, + "word_style": word_style, + "word_coordinates": word_coordinates, + }, + id=f"{full_text}-plb-{int(preserve)}-ws-{int(word_style)}-wc-{int(word_coordinates)}", + ) + for full_text in FULL_TEXT_OPTIONS + for preserve, word_style, word_coordinates in BOOL_OPTION_SETS +] + +PAGES_OPTION_SETS = [ + pytest.param(None, id="without-pages"), + pytest.param(["1-2"], id="with-pages"), +] + + +@pytest.mark.parametrize("options", EXTRACT_TEXT_OPTION_SETS) +def test_extract_pdf_text_success( + options: Mapping[str, bool | str], + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = 
make_pdf_file(PdfRestFileID.generate(1)) + base_payload: dict[str, object] = { + "files": [input_file], + "pages": ["1-2"], + "output_type": "json", + } + payload_input = base_payload | dict(options) + payload_dump = ExtractTextPayload.model_validate(payload_input).model_dump( + mode="json", + by_alias=True, + exclude_none=True, + exclude_unset=True, + ) + + expected_response = _make_extracted_text_document_payload(str(input_file.id)) + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response(200, json=expected_response) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text( + input_file, + pages=["1-2"], + full_text=options["full_text"], + preserve_line_breaks=options["preserve_line_breaks"], + word_style=options["word_style"], + word_coordinates=options["word_coordinates"], + ) + + assert seen == {"post": 1} + assert isinstance(response, ExtractedTextDocument) + assert response.input_id == input_file.id + assert response.model_dump(by_alias=True, exclude_none=True) == expected_response + + +def test_extract_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "full_text": "document", + "preserve_line_breaks": False, + "word_style": False, + "word_coordinates": False, + "output_type": "json", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + expected_response = 
_make_extracted_text_document_payload(str(input_file.id)) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync-json" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response(200, json=expected_response) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync-json"}, + extra_body={"debug": True}, + timeout=0.25, + ) + + assert isinstance(response, ExtractedTextDocument) + post_timeout = captured_timeout["post"] + assert post_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.25) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.25) + assert response.model_dump(by_alias=True, exclude_none=True) == expected_response + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "full_text": "document", + "preserve_line_breaks": False, + "word_style": False, + "word_coordinates": False, + "output_type": "json", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + expected_response = _make_extracted_text_document_payload(str(input_file.id)) + 
captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async-json" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response(200, json=expected_response) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient( + api_key=ASYNC_API_KEY, + transport=transport, + ) as client: + response = await client.extract_pdf_text( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async-json"}, + extra_body={"debug": True}, + timeout=0.25, + ) + + assert isinstance(response, ExtractedTextDocument) + post_timeout = captured_timeout["post"] + assert post_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.25) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.25) + assert response.model_dump(by_alias=True, exclude_none=True) == expected_response + + +@pytest.mark.asyncio +@pytest.mark.parametrize("options", EXTRACT_TEXT_OPTION_SETS) +@pytest.mark.parametrize("pages", PAGES_OPTION_SETS) +async def test_async_extract_pdf_text_success( + options: Mapping[str, bool | str], + pages: list[str] | None, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + base_payload: dict[str, object] = { + "files": [input_file], + "output_type": "json", + } + if pages is not None: + base_payload["pages"] = pages + payload_input = base_payload | dict(options) + payload_dump = 
ExtractTextPayload.model_validate(payload_input).model_dump( + mode="json", + by_alias=True, + exclude_none=True, + exclude_unset=True, + ) + expected_response = _make_extracted_text_document_payload(str(input_file.id)) + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response(200, json=expected_response) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient( + api_key=ASYNC_API_KEY, + transport=transport, + ) as client: + request_kwargs: dict[str, object] = { + "full_text": options["full_text"], + "preserve_line_breaks": options["preserve_line_breaks"], + "word_style": options["word_style"], + "word_coordinates": options["word_coordinates"], + } + if pages is not None: + request_kwargs["pages"] = pages + + response = await client.extract_pdf_text(input_file, **request_kwargs) + + assert seen == {"post": 1} + assert isinstance(response, ExtractedTextDocument) + assert response.input_id == input_file.id + assert response.model_dump(by_alias=True, exclude_none=True) == expected_response + + +def test_extract_pdf_text_multi_file_guard(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + files = [ + make_pdf_file(PdfRestFileID.generate(1)), + make_pdf_file(PdfRestFileID.generate(2)), + ] + transport = httpx.MockTransport(lambda request: httpx.Response(500)) + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="at most 1 item"), + ): + client.extract_pdf_text(files) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_multi_file_guard( + monkeypatch: pytest.MonkeyPatch, +) -> None: + 
monkeypatch.delenv("PDFREST_API_KEY", raising=False) + files = [ + make_pdf_file(PdfRestFileID.generate(1)), + make_pdf_file(PdfRestFileID.generate(2)), + ] + transport = httpx.MockTransport(lambda request: httpx.Response(500)) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="at most 1 item"): + await client.extract_pdf_text(files) + + +def test_extract_pdf_text_invalid_pages(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + transport = httpx.MockTransport(lambda request: httpx.Response(500)) + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, + match="The start page must be less than or equal to the end", + ), + ): + client.extract_pdf_text(input_file, pages=["5-1"]) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_invalid_pages( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + transport = httpx.MockTransport(lambda request: httpx.Response(500)) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match="The start page must be less than or equal to the end", + ): + await client.extract_pdf_text(input_file, pages=["5-1"]) + + +def test_extract_pdf_text_server_error(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + return httpx.Response(400, json={"message": "Invalid option"}) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = 
httpx.MockTransport(handler) + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(PdfRestApiError, match="Invalid option"), + ): + client.extract_pdf_text(input_file, full_text="off") + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_server_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + return httpx.Response(400, json={"message": "Invalid option"}) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(PdfRestApiError, match="Invalid option"): + await client.extract_pdf_text(input_file, full_text="off") + + +@pytest.mark.parametrize( + ("invalid_kwargs", "match"), + [ + pytest.param({"full_text": "pages"}, "full_text", id="bad-full-text"), + pytest.param( + {"preserve_line_breaks": "maybe"}, + "preserve_line_breaks", + id="bad-preserve-line-breaks", + ), + pytest.param({"word_style": "maybe"}, "word_style", id="bad-word-style"), + pytest.param( + {"word_coordinates": "maybe"}, "word_coordinates", id="bad-word-coordinates" + ), + ], +) +def test_extract_pdf_text_invalid_option_values( + invalid_kwargs: Mapping[str, object], + match: str, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + transport = httpx.MockTransport(lambda request: httpx.Response(500)) + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match=match), + ): + client.extract_pdf_text(input_file, **invalid_kwargs) + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize( + ("invalid_kwargs", "match"), + [ + pytest.param({"full_text": "pages"}, "full_text", id="bad-full-text"), + pytest.param( + {"preserve_line_breaks": "maybe"}, + "preserve_line_breaks", + id="bad-preserve-line-breaks", + ), + pytest.param({"word_style": "maybe"}, "word_style", id="bad-word-style"), + pytest.param( + {"word_coordinates": "maybe"}, "word_coordinates", id="bad-word-coordinates" + ), + ], +) +async def test_async_extract_pdf_text_invalid_option_values( + invalid_kwargs: Mapping[str, object], + match: str, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + transport = httpx.MockTransport(lambda request: httpx.Response(500)) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match=match): + await client.extract_pdf_text(input_file, **invalid_kwargs) diff --git a/tests/test_extracted_text_document.py b/tests/test_extracted_text_document.py new file mode 100644 index 00000000..b9110672 --- /dev/null +++ b/tests/test_extracted_text_document.py @@ -0,0 +1,244 @@ +"""Tests for ExtractedTextDocument validation and serialization.""" + +from __future__ import annotations + +import pytest + +from pdfrest.models import ExtractedTextDocument + + +def test_extract_text_document_round_trip_document_mode() -> None: + data = { + "inputId": "153ec1a0f-07e4-4f42-bc64-05180f72a06c", + "fullText": "The lamb walks My Cow Eats!", + } + + document = ExtractedTextDocument.model_validate(data) + + assert document.input_id == data["inputId"] + assert document.full_text is not None + assert document.full_text.document_text == "The lamb walks My Cow Eats!" 
+ assert document.full_text.iter_pages() == [] + assert document.words is None + + with pytest.raises( + ValueError, + match="full text payload was emitted in document mode; page data unavailable", + ): + _ = document.full_text.pages + + assert document.model_dump(by_alias=True, exclude_none=True) == data + + +def test_extract_text_document_round_trip_page_mode() -> None: + data = { + "inputId": "10559e808-4073-488b-b660-a0b1106dd98e", + "words": [ + { + "text": "The", + "page": 1, + "coordinates": { + "topLeft": {"x": 72, "y": 720.7918090820312}, + "topRight": {"x": 90.12725830078125, "y": 720.7918090820312}, + "bottomLeft": {"x": 72, "y": 704.72412109375}, + "bottomRight": {"x": 90.12725830078125, "y": 704.72412109375}, + }, + "style": { + "color": {"space": "DeviceRGB", "values": [0, 0, 0]}, + "font": {"name": "Calibri", "size": 12}, + }, + } + ], + "fullText": { + "pages": [ + {"page": 1, "text": "The lamb walks"}, + {"page": 2, "text": "My Cow Eats!"}, + ] + }, + } + + document = ExtractedTextDocument.model_validate(data) + + assert document.input_id == data["inputId"] + assert document.full_text is not None + assert document.words is not None + assert len(document.words) == 1 + assert document.full_text.document_text == "The lamb walks My Cow Eats!" + pages = document.full_text.pages + assert len(pages) == 2 + assert pages[0].page == 1 + assert pages[0].text == "The lamb walks" + assert pages[1].page == 2 + assert pages[1].text == "My Cow Eats!" 
+ assert document.full_text.iter_pages() == pages + + word = document.words[0] + assert word.text == "The" + assert word.page == 1 + assert word.style is not None + assert word.style.color.space == "DeviceRGB" + assert word.style.color.values == [0, 0, 0] + assert word.style.font.name == "Calibri" + assert word.style.font.size == 12 + assert word.coordinates is not None + assert word.coordinates.top_left.x == 72 + assert word.coordinates.top_left.y == 720.7918090820312 + assert word.coordinates.top_right.x == 90.12725830078125 + assert word.coordinates.top_right.y == 720.7918090820312 + assert word.coordinates.bottom_left.x == 72 + assert word.coordinates.bottom_left.y == 704.72412109375 + assert word.coordinates.bottom_right.x == 90.12725830078125 + assert word.coordinates.bottom_right.y == 704.72412109375 + + assert document.model_dump(by_alias=True, exclude_none=True) == data + + +def test_extract_text_document_round_trip_without_words_or_full_text() -> None: + data = { + "inputId": "3f59e808-4073-488b-b660-a0b1106dd9aa", + } + + document = ExtractedTextDocument.model_validate(data) + + assert document.input_id == data["inputId"] + assert document.full_text is None + assert document.words is None + assert document.model_dump(by_alias=True, exclude_none=True) == data + + +@pytest.mark.parametrize( + ("word_payload", "has_coordinates", "has_style"), + [ + pytest.param( + {"text": "Simple", "page": 1}, + False, + False, + id="minimal-word", + ), + pytest.param( + { + "text": "CoordsOnly", + "page": 2, + "coordinates": { + "topLeft": {"x": 1, "y": 2}, + "topRight": {"x": 3, "y": 4}, + "bottomLeft": {"x": 5, "y": 6}, + "bottomRight": {"x": 7, "y": 8}, + }, + }, + True, + False, + id="coordinates-only", + ), + pytest.param( + { + "text": "StyleOnly", + "page": 3, + "style": { + "color": {"space": "DeviceRGB", "values": [0.1, 0.2, 0.3]}, + "font": {"name": "Calibri", "size": 10}, + }, + }, + False, + True, + id="style-only", + ), + pytest.param( + { + "text": "Both", + 
"page": 4, + "coordinates": { + "topLeft": {"x": 10, "y": 11}, + "topRight": {"x": 12, "y": 13}, + "bottomLeft": {"x": 14, "y": 15}, + "bottomRight": {"x": 16, "y": 17}, + }, + "style": { + "color": {"space": "DeviceCMYK", "values": [0, 0, 0, 1]}, + "font": {"name": "Times", "size": 8.5}, + }, + }, + True, + True, + id="coordinates-and-style", + ), + ], +) +def test_extracted_text_words_optional_fields( + word_payload: dict[str, object], has_coordinates: bool, has_style: bool +) -> None: + data = { + "inputId": "6f59e808-4073-488b-b660-a0b1106dd9bb", + "words": [word_payload], + } + + document = ExtractedTextDocument.model_validate(data) + + assert document.input_id == data["inputId"] + assert document.words is not None + word = document.words[0] + assert word.text == word_payload["text"] + assert word.page == word_payload["page"] + + if has_coordinates: + assert word.coordinates is not None + coord_payload = word_payload.get("coordinates") + assert isinstance(coord_payload, dict) + top_left = coord_payload["topLeft"] + top_right = coord_payload["topRight"] + bottom_left = coord_payload["bottomLeft"] + bottom_right = coord_payload["bottomRight"] + assert word.coordinates.top_left.x == top_left["x"] + assert word.coordinates.top_left.y == top_left["y"] + assert word.coordinates.top_right.x == top_right["x"] + assert word.coordinates.top_right.y == top_right["y"] + assert word.coordinates.bottom_left.x == bottom_left["x"] + assert word.coordinates.bottom_left.y == bottom_left["y"] + assert word.coordinates.bottom_right.x == bottom_right["x"] + assert word.coordinates.bottom_right.y == bottom_right["y"] + assert word.coordinates.model_dump(by_alias=True) == coord_payload + else: + assert word.coordinates is None + + if has_style: + assert word.style is not None + style_payload = word_payload.get("style") + assert isinstance(style_payload, dict) + color_payload = style_payload["color"] + font_payload = style_payload["font"] + assert isinstance(color_payload, dict) + 
assert isinstance(font_payload, dict) + assert word.style.color.space == color_payload["space"] + assert word.style.color.values == color_payload["values"] + assert word.style.font.name == font_payload["name"] + assert word.style.font.size == font_payload["size"] + else: + assert word.style is None + + assert document.model_dump(by_alias=True, exclude_none=True) == data + + +def test_extract_text_document_page_mode_without_words() -> None: + data = { + "inputId": "9f59e808-4073-488b-b660-a0b1106dd9cc", + "fullText": { + "pages": [ + {"page": 1, "text": "One"}, + {"page": 2, "text": "Two"}, + ] + }, + } + + document = ExtractedTextDocument.model_validate(data) + + assert document.input_id == data["inputId"] + assert document.words is None + assert document.full_text is not None + pages = document.full_text.pages + assert len(pages) == 2 + assert pages[0].page == 1 + assert pages[0].text == "One" + assert pages[1].page == 2 + assert pages[1].text == "Two" + assert document.full_text.document_text == "One Two" + assert document.model_dump(by_alias=True, exclude_none=True) == data diff --git a/uv.lock b/uv.lock index d78991ea..402cabeb 100644 --- a/uv.lock +++ b/uv.lock @@ -751,6 +751,7 @@ dev = [ { name = "pytest-rerunfailures" }, { name = "pytest-xdist" }, { name = "python-dotenv" }, + { name = "rich" }, { name = "ruff" }, ] @@ -778,6 +779,7 @@ dev = [ { name = "pytest-rerunfailures", specifier = ">=16.0.1" }, { name = "pytest-xdist", specifier = ">=3.8.0" }, { name = "python-dotenv", specifier = ">=1.0.1" }, + { name = "rich", specifier = ">=14.1.0" }, { name = "ruff", specifier = ">=0.6.9" }, ]