Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -454,3 +454,4 @@ $RECYCLE.BIN/

!pyrightconfig.json
/coverage/
/coverage.json
7 changes: 7 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,10 @@ that via `.env` if desired), upload the checked-in sample assets under
`examples/resources/`, and exercise the async client end-to-end. Use
`uvx nox -s examples` when you want to execute every example across the
supported interpreter matrix.

## Available Examples

- `examples/delete/delete_example.py` – demonstrate file deletion (sync + async
variants).
- `examples/extract_text/extract_pdf_text_example.py` – run `extract_pdf_text`
with word coordinates/style enabled and render the output as a Rich table.
111 changes: 111 additions & 0 deletions examples/extract_text/extract_pdf_text_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# /// script
# requires-python = ">=3.10"
# dependencies = ["pdfrest", "python-dotenv", "rich"]
# ///
"""Render extracted words with coordinates and style metadata.

This sample demonstrates how to:

1. Upload the bundled ``examples/resources/report.pdf`` resource.
2. Request JSON output from :func:`PdfRestClient.extract_pdf_text` while turning on
word-level coordinates and styling data.
3. Display the returned metadata as a Rich table.

Run with ``uv run --project ../.. python extract_pdf_text_example.py`` after
setting ``PDFREST_API_KEY`` (``python-dotenv`` will also load ``.env`` if present).
"""

from __future__ import annotations

from pathlib import Path

from dotenv import load_dotenv
from rich.console import Console
from rich.table import Table

from pdfrest import PdfRestClient
from pdfrest.models import (
ExtractedTextDocument,
ExtractedTextPoint,
ExtractedTextWord,
ExtractedTextWordCoordinates,
)

# Sample PDF used by this example. ``parents[1]`` is the directory one level
# above the folder containing this script; the PDF lives in its ``resources``
# subdirectory.
RESOURCE = Path(__file__).resolve().parents[1] / "resources" / "report.pdf"


def _format_point(point: ExtractedTextPoint | None) -> str:
    """Render a point as ``(x, y)`` with two decimals, or an em dash if absent."""
    return "—" if point is None else f"({point.x:.2f}, {point.y:.2f})"


def _format_color(word: ExtractedTextWord) -> str:
    """Describe a word's color as ``<space>: <v1>, <v2>, ...``.

    Returns an em dash when the word has no style or the style has no color.
    """
    styling = word.style
    swatch = styling.color if styling is not None else None
    if swatch is None:
        return "—"
    components = ", ".join(map(str, swatch.values))
    return f"{swatch.space}: {components}"


def _format_font(word: ExtractedTextWord) -> str:
    """Describe a word's font as ``<name> (<size> pt)`` with one decimal.

    Returns an em dash when the word carries no style information.
    """
    styling = word.style
    if styling is not None:
        typeface = styling.font
        return f"{typeface.name} ({typeface.size:.1f} pt)"
    return "—"


def _build_word_table(document: ExtractedTextDocument) -> Table:
    """Assemble a Rich table with one row per word in ``document.words``.

    Each row holds the word text, its page, the four corner points of its
    coordinates, and its color and font descriptions. When a word has no
    coordinates, all four corner cells are formatted from ``None``. A
    document without words produces a table that has only the header row.
    """
    grid = Table(title="Extracted Words with Coordinates and Style")
    grid.add_column("Word", style="bold")
    grid.add_column("Page", justify="right")
    # The remaining columns use Rich's default styling.
    for heading in (
        "Top Left",
        "Top Right",
        "Bottom Left",
        "Bottom Right",
        "Color",
        "Font",
    ):
        grid.add_column(heading)

    for entry in document.words or []:
        box: ExtractedTextWordCoordinates | None = entry.coordinates
        if box:
            corners = (
                box.top_left,
                box.top_right,
                box.bottom_left,
                box.bottom_right,
            )
        else:
            corners = (None, None, None, None)
        grid.add_row(
            entry.text,
            str(entry.page),
            *map(_format_point, corners),
            _format_color(entry),
            _format_font(entry),
        )
    return grid


def list_words_with_coordinates() -> None:
    """Upload the sample PDF, extract word metadata, and print it as a table.

    Environment variables are loaded via ``load_dotenv`` before the client is
    created. The word count is always printed; the table is printed only when
    the response contains at least one word, otherwise a notice is shown.
    """
    load_dotenv()
    out = Console()

    with PdfRestClient() as api:
        source_file = api.files.create_from_paths([RESOURCE])[0]
        result = api.extract_pdf_text(
            source_file,
            full_text="by_page",
            preserve_line_breaks=True,
            word_style=True,
            word_coordinates=True,
        )

    found = result.words or []
    out.print(f"Extracted {len(found)} words from [bold]{source_file.name}[/bold].")
    if found:
        out.print(_build_word_table(result))
    else:
        out.print("[yellow]This document did not include word metadata.[/yellow]")


if __name__ == "__main__": # pragma: no cover - manual example
list_words_with_coordinates()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dev = [
"basedpyright>=1.34.0",
"python-dotenv>=1.0.1",
"diff-cover>=10.2.0",
"rich>=14.1.0",
]

[tool.pytest.ini_options]
Expand Down
85 changes: 85 additions & 0 deletions src/pdfrest/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
translate_httpx_error,
)
from .models import (
ExtractedTextDocument,
PdfRestDeletionResponse,
PdfRestErrorResponse,
PdfRestFile,
Expand Down Expand Up @@ -2398,6 +2399,48 @@ def extract_images(
timeout=timeout,
)

def extract_pdf_text(
    self,
    file: PdfRestFile | Sequence[PdfRestFile],
    *,
    pages: PdfPageSelection | None = None,
    full_text: Literal["off", "by_page", "document"] = "document",
    preserve_line_breaks: bool = False,
    word_style: bool = False,
    word_coordinates: bool = False,
    extra_query: Query | None = None,
    extra_headers: AnyMapping | None = None,
    extra_body: Body | None = None,
    timeout: TimeoutTypes | None = None,
) -> ExtractedTextDocument:
    """Extract text from a PDF and return the response as a parsed model.

    The options are validated through ``ExtractTextPayload`` with
    ``output_type`` fixed to ``"json"``, sent as a ``POST`` to
    ``/extracted-text``, and the response is validated into an
    ``ExtractedTextDocument``.

    Args:
        file: Uploaded file (or sequence of files) sent as ``files``.
        pages: Optional page selection; left out of the payload when ``None``.
        full_text: Value for the ``full_text`` option.
        preserve_line_breaks: Value for the ``preserve_line_breaks`` option.
        word_style: Value for the ``word_style`` option.
        word_coordinates: Value for the ``word_coordinates`` option.
        extra_query: Passed through to ``prepare_request``.
        extra_headers: Passed through to ``prepare_request``.
        extra_body: Passed through to ``prepare_request``.
        timeout: Passed through to ``prepare_request``.

    Returns:
        The response payload validated as ``ExtractedTextDocument``.
    """
    # ``pages`` is only included when the caller supplied it, so that the
    # ``exclude_unset`` dump below treats it as unset otherwise.
    page_option: dict[str, Any] = {} if pages is None else {"pages": pages}
    fields: dict[str, Any] = {
        "files": file,
        "full_text": full_text,
        "preserve_line_breaks": preserve_line_breaks,
        "word_style": word_style,
        "word_coordinates": word_coordinates,
        "output_type": "json",
        **page_option,
    }

    json_body = ExtractTextPayload.model_validate(fields).model_dump(
        mode="json", by_alias=True, exclude_none=True, exclude_unset=True
    )
    request = self.prepare_request(
        "POST",
        "/extracted-text",
        json_body=json_body,
        extra_query=extra_query,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout,
    )
    return ExtractedTextDocument.model_validate(self._send_request(request))

def extract_pdf_text_to_file(
self,
file: PdfRestFile | Sequence[PdfRestFile],
Expand Down Expand Up @@ -3393,6 +3436,48 @@ async def extract_images(
timeout=timeout,
)

async def extract_pdf_text(
    self,
    file: PdfRestFile | Sequence[PdfRestFile],
    *,
    pages: PdfPageSelection | None = None,
    full_text: Literal["off", "by_page", "document"] = "document",
    preserve_line_breaks: bool = False,
    word_style: bool = False,
    word_coordinates: bool = False,
    extra_query: Query | None = None,
    extra_headers: AnyMapping | None = None,
    extra_body: Body | None = None,
    timeout: TimeoutTypes | None = None,
) -> ExtractedTextDocument:
    """Extract text from a PDF and return the response as a parsed model.

    The options are validated through ``ExtractTextPayload`` with
    ``output_type`` fixed to ``"json"``, sent as a ``POST`` to
    ``/extracted-text``, and the awaited response is validated into an
    ``ExtractedTextDocument``.

    Args:
        file: Uploaded file (or sequence of files) sent as ``files``.
        pages: Optional page selection; left out of the payload when ``None``.
        full_text: Value for the ``full_text`` option.
        preserve_line_breaks: Value for the ``preserve_line_breaks`` option.
        word_style: Value for the ``word_style`` option.
        word_coordinates: Value for the ``word_coordinates`` option.
        extra_query: Passed through to ``prepare_request``.
        extra_headers: Passed through to ``prepare_request``.
        extra_body: Passed through to ``prepare_request``.
        timeout: Passed through to ``prepare_request``.

    Returns:
        The response payload validated as ``ExtractedTextDocument``.
    """
    # ``pages`` is only included when the caller supplied it, so that the
    # ``exclude_unset`` dump below treats it as unset otherwise.
    page_option: dict[str, Any] = {} if pages is None else {"pages": pages}
    fields: dict[str, Any] = {
        "files": file,
        "full_text": full_text,
        "preserve_line_breaks": preserve_line_breaks,
        "word_style": word_style,
        "word_coordinates": word_coordinates,
        "output_type": "json",
        **page_option,
    }

    json_body = ExtractTextPayload.model_validate(fields).model_dump(
        mode="json", by_alias=True, exclude_none=True, exclude_unset=True
    )
    request = self.prepare_request(
        "POST",
        "/extracted-text",
        json_body=json_body,
        extra_query=extra_query,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout,
    )
    response_data = await self._send_request(request)
    return ExtractedTextDocument.model_validate(response_data)

async def extract_pdf_text_to_file(
self,
file: PdfRestFile | Sequence[PdfRestFile],
Expand Down
22 changes: 22 additions & 0 deletions src/pdfrest/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
from .public import (
ExtractedTextDocument,
ExtractedTextFullText,
ExtractedTextFullTextPage,
ExtractedTextFullTextPages,
ExtractedTextPoint,
ExtractedTextWord,
ExtractedTextWordColor,
ExtractedTextWordCoordinates,
ExtractedTextWordFont,
ExtractedTextWordStyle,
ExtractTextResponse,
PdfRestDeletionResponse,
PdfRestErrorResponse,
PdfRestFile,
Expand All @@ -12,6 +23,17 @@
)

__all__ = [
"ExtractTextResponse",
"ExtractedTextDocument",
"ExtractedTextFullText",
"ExtractedTextFullTextPage",
"ExtractedTextFullTextPages",
"ExtractedTextPoint",
"ExtractedTextWord",
"ExtractedTextWordColor",
"ExtractedTextWordCoordinates",
"ExtractedTextWordFont",
"ExtractedTextWordStyle",
"PdfRestDeletionResponse",
"PdfRestErrorResponse",
"PdfRestFile",
Expand Down
Loading