Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/guide/python-getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,15 @@ async def upload(upload: UploadFile):
)
```

### URL
Allows sending a URL directly.

**Note**: No local operations can be performed on the input (such as removing pages from a PDF).

```python
input_doc = mindee_client.doc_from_url(url="https://www.example.com/invoice.pdf")
```

## Sending a File
To send a file to the API, we need to specify how to process the document.
This determines which API endpoint is used and how the API response is handled internally by the library.
Expand Down
19 changes: 13 additions & 6 deletions mindee/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,19 @@ class CommandConfig(Generic[TypeDoc]):
def _get_input_doc(client, args) -> DocumentClient:
if args.input_type == "file":
with open(args.path, "rb", buffering=30) as file_handle:
return client.doc_from_file(file_handle)
return client.doc_from_file(input_file=file_handle)
elif args.input_type == "base64":
with open(args.path, "rt", encoding="ascii") as base64_handle:
return client.doc_from_b64string(base64_handle.read(), "test.jpg")
return client.doc_from_b64string(
input_string=base64_handle.read(), filename="test.jpg"
)
elif args.input_type == "bytes":
with open(args.path, "rb") as bytes_handle:
return client.doc_from_bytes(bytes_handle.read(), bytes_handle.name)
return client.doc_from_bytes(
input_bytes=bytes_handle.read(), filename=bytes_handle.name
)
elif args.input_type == "url":
return client.doc_from_url(url=args.path)
return client.doc_from_path(args.path)


Expand Down Expand Up @@ -181,13 +187,14 @@ def _parse_args() -> Namespace:
"-i",
"--input-type",
dest="input_type",
choices=["path", "file", "base64", "bytes"],
choices=["path", "file", "base64", "bytes", "url"],
default="path",
help="Specify how to handle the input.\n"
"- path: open a path (default).\n"
"- file: open as a file handle.\n"
"- base64: load the from a base64 encoded text file.\n"
"- bytes: load the contents as raw bytes.",
"- base64: open a base64 encoded text file.\n"
"- bytes: open the contents as raw bytes.\n"
"- url: open an URL.",
)
subp.add_argument(
"-o",
Expand Down
43 changes: 32 additions & 11 deletions mindee/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from typing import BinaryIO, Dict, List, NamedTuple, Optional, Type
from typing import BinaryIO, Dict, List, NamedTuple, Optional, Type, Union

from mindee import documents
from mindee.documents.base import Document, TypeDocument
Expand All @@ -10,8 +10,9 @@
Base64Input,
BytesInput,
FileInput,
InputSource,
LocalInputSource,
PathInput,
UrlInputSource,
)
from mindee.logger import logger
from mindee.response import PredictResponse
Expand All @@ -23,13 +24,13 @@ def get_bound_classname(type_var) -> str:


class DocumentClient:
input_doc: InputSource
input_doc: Union[LocalInputSource, UrlInputSource]
doc_configs: DocumentConfigDict
raise_on_error: bool = True

def __init__(
self,
input_doc: InputSource,
input_doc: Union[LocalInputSource, UrlInputSource],
doc_configs: DocumentConfigDict,
raise_on_error: bool,
):
Expand Down Expand Up @@ -108,12 +109,13 @@ def parse(

doc_config = self.doc_configs[config_key]
doc_config.check_api_keys()
if page_options and self.input_doc.is_pdf():
self.input_doc.process_pdf(
page_options.operation,
page_options.on_min_pages,
page_options.page_indexes,
)
if not isinstance(self.input_doc, UrlInputSource):
if page_options and self.input_doc.is_pdf():
self.input_doc.process_pdf(
page_options.operation,
page_options.on_min_pages,
page_options.page_indexes,
)
return self._make_request(
document_class, doc_config, include_words, close_file, cropper
)
Expand Down Expand Up @@ -152,7 +154,8 @@ def _make_request(

def close(self) -> None:
"""Close the file object."""
self.input_doc.file_object.close()
if not isinstance(self.input_doc, UrlInputSource):
self.input_doc.file_object.close()


class ConfigSpec(NamedTuple):
Expand Down Expand Up @@ -397,3 +400,21 @@ def doc_from_bytes(
doc_configs=self._doc_configs,
raise_on_error=self.raise_on_error,
)

def doc_from_url(
self,
url: str,
) -> DocumentClient:
"""
Load a document from a URL.

The URL is wrapped in a ``UrlInputSource`` and handed to a new
``DocumentClient`` together with this client's document configurations
and ``raise_on_error`` setting.

:param url: URL of the document to load
:return: A ``DocumentClient`` whose input is the given URL
"""
input_doc = UrlInputSource(
url,
)
return DocumentClient(
input_doc=input_doc,
doc_configs=self._doc_configs,
raise_on_error=self.raise_on_error,
)
17 changes: 10 additions & 7 deletions mindee/documents/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import datetime
import re
from typing import Any, Dict, List, Optional, TypeVar
from typing import Any, Dict, List, Optional, TypeVar, Union

from mindee.endpoints import Endpoint
from mindee.fields.orientation import OrientationField
from mindee.fields.position import PositionField
from mindee.input.sources import InputSource
from mindee.input.sources import LocalInputSource, UrlInputSource

TypeApiPrediction = Dict[str, Any]

Expand Down Expand Up @@ -46,15 +46,18 @@ class Document:

def __init__(
self,
input_source: InputSource,
input_source: Union[LocalInputSource, UrlInputSource],
document_type: Optional[str],
api_prediction: TypeApiPrediction,
page_n: Optional[int] = None,
):
if input_source:
self.filepath = input_source.filepath
self.filename = input_source.filename
self.file_extension = input_source.file_mimetype
if isinstance(input_source, UrlInputSource):
self.filename = input_source.url
else:
self.filepath = input_source.filepath
self.filename = input_source.filename
self.file_extension = input_source.file_mimetype
self.checklist = {}
self.type = document_type

Expand All @@ -67,7 +70,7 @@ def __init__(
@staticmethod
def request(
endpoints: List[Endpoint],
input_source: InputSource,
input_source: Union[LocalInputSource, UrlInputSource],
include_words: bool = False,
close_file: bool = True,
cropper: bool = False,
Expand Down
9 changes: 6 additions & 3 deletions mindee/documents/financial/financial_v1.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, TypeVar
from typing import List, Optional, TypeVar, Union

from mindee.documents.base import Document, TypeApiPrediction, clean_out_string
from mindee.documents.invoice.invoice_v3 import InvoiceV3
Expand All @@ -11,7 +11,7 @@
from mindee.fields.payment_details import PaymentDetails
from mindee.fields.tax import TaxField
from mindee.fields.text import TextField
from mindee.input.sources import InputSource
from mindee.input.sources import LocalInputSource, UrlInputSource


class FinancialV1(Document):
Expand Down Expand Up @@ -152,7 +152,7 @@ def __str__(self) -> str:
@staticmethod
def request(
endpoints: List[Endpoint],
input_source: InputSource,
input_source: Union[LocalInputSource, UrlInputSource],
include_words: bool = False,
close_file: bool = True,
cropper: bool = False,
Expand All @@ -166,6 +166,9 @@ def request(
:param close_file: Whether to `close()` the file after parsing it.
:param cropper: Including Mindee cropper results.
"""
if isinstance(input_source, UrlInputSource):
raise AssertionError("URL input is not supported for this API endpoint.")

if "pdf" in input_source.file_mimetype:
# invoices is index 0, receipts 1 (this should be cleaned up)
index = 0
Expand Down
36 changes: 23 additions & 13 deletions mindee/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import requests

from mindee.input.sources import InputSource
from mindee.input.sources import LocalInputSource, UrlInputSource
from mindee.logger import logger
from mindee.versions import __version__, get_platform, python_version

Expand Down Expand Up @@ -97,7 +97,7 @@ def set_api_key_from_env(self) -> None:

def predict_req_post(
self,
input_source: InputSource,
input_source: Union[LocalInputSource, UrlInputSource],
include_words: bool = False,
close_file: bool = True,
cropper: bool = False,
Expand All @@ -111,7 +111,6 @@ def predict_req_post(
:param cropper: Including Mindee cropping results.
:return: requests response
"""
files = {"document": input_source.read_contents(close_file)}
data = {}
if include_words:
data["include_mvision"] = "true"
Expand All @@ -120,20 +119,31 @@ def predict_req_post(
if cropper:
params["cropper"] = "true"

response = requests.post(
f"{self._url_root}/predict",
files=files,
headers=self.base_headers,
data=data,
params=params,
timeout=self._request_timeout,
)
if isinstance(input_source, UrlInputSource):
data["document"] = input_source.url
response = requests.post(
f"{self._url_root}/predict",
headers=self.base_headers,
data=data,
params=params,
timeout=self._request_timeout,
)
else:
files = {"document": input_source.read_contents(close_file)}
response = requests.post(
f"{self._url_root}/predict",
files=files,
headers=self.base_headers,
data=data,
params=params,
timeout=self._request_timeout,
)
return response


class CustomEndpoint(Endpoint):
def training_req_post(
self, input_source: InputSource, close_file: bool = True
self, input_source: LocalInputSource, close_file: bool = True
) -> requests.Response:
"""
Make a request to POST a document for training.
Expand All @@ -155,7 +165,7 @@ def training_req_post(
return response

def training_async_req_post(
self, input_source: InputSource, close_file: bool = True
self, input_source: LocalInputSource, close_file: bool = True
) -> requests.Response:
"""
Make a request to POST a document for training without processing.
Expand Down
Loading