Merged
30 commits
9fc4717
base layout for functions, doesn't work yet
sebastianMindee May 2, 2023
fc2a1a1
added all basic needed support for async (untested)
sebastianMindee May 2, 2023
ef8fb56
fixed queue polling functions
sebastianMindee May 2, 2023
9768787
revamped inheritance and fixed AsyncPredictionResponse generation
sebastianMindee May 3, 2023
4ff8bfc
fixed polling for jobs, document access not working yet
sebastianMindee May 3, 2023
518d732
added some safety checks
sebastianMindee May 3, 2023
bc2b816
Fixes for draft PR
sebastianMindee May 3, 2023
e49c259
added test & fixed InvoiceSplitterV1 class
sebastianMindee May 4, 2023
61a0d1f
InvoiceSplitterV1 class cleanup
sebastianMindee May 4, 2023
8c5579c
Clarified DocString & added documentation support
sebastianMindee May 4, 2023
e969c28
Clarified DocString & added documentation support
sebastianMindee May 4, 2023
a9c21f4
Added unit testing for Async & fixed some obsolete AsyncPrediction no…
sebastianMindee May 4, 2023
2daaf18
renamed async unittest file to avoid confusion
sebastianMindee May 4, 2023
09ac89c
put api tests together
ianardee May 5, 2023
80df96e
rework a bit the structure
ianardee May 5, 2023
7920f6b
add empty files for docs
ianardee May 5, 2023
8aaf753
added doc support for async, refactored AsyncPredict class to fit wit…
sebastianMindee May 5, 2023
0fa7c37
tweaked import for retrocompatibility
sebastianMindee May 5, 2023
9c8c361
retrocompatibility tweak
sebastianMindee May 5, 2023
fbe2e45
further tweaking
sebastianMindee May 5, 2023
c37e8ec
fixed typo
sebastianMindee May 5, 2023
0d7347a
fixed typo... again...
sebastianMindee May 5, 2023
497d647
fixed txt test, again.
sebastianMindee May 5, 2023
f579852
fixed code sample for invoicesplitter
sebastianMindee May 5, 2023
2b46983
fixed typing in invoicesplitter
sebastianMindee May 5, 2023
5e38352
revamped sample code
sebastianMindee May 5, 2023
d9b6ffb
Update docs/extras/code_samples/invoice_splitter_v1_async.txt
sebastianMindee May 5, 2023
8119184
Update docs/extras/code_samples/invoice_splitter_v1_async.txt
sebastianMindee May 5, 2023
fc4c6aa
fixed code sample & added comments
sebastianMindee May 5, 2023
8c92a7a
removed needless import
sebastianMindee May 5, 2023
7 changes: 7 additions & 0 deletions docs/client.rst
@@ -21,3 +21,10 @@ PredictResponse
---------------
.. autoclass:: mindee.response.PredictResponse
:members:

AsyncPredictResponse
--------------------
.. autoclass:: mindee.response.AsyncPredictResponse
:members:
.. autoclass:: mindee.response.Job
:members:
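
For orientation, the new response classes surface in the async flow roughly as follows; this is an illustrative sketch only, reusing the attribute names from the code sample added further down in this PR.

# Illustrative sketch -- attribute names mirror the async code sample below.
response = input_doc.enqueue(documents.TypeInvoiceSplitterV1)
print(response.job.job_id)  # identifier later passed to parse_queued()
print(response.job.status)  # polled until it reads "completed"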
55 changes: 55 additions & 0 deletions docs/extras/code_samples/invoice_splitter_v1_async.txt
@@ -0,0 +1,55 @@
from mindee import Client, documents
from time import sleep

# Init a new client
mindee_client = Client(api_key="my-api-key")

# Load a file from disk
input_doc = mindee_client.doc_from_path("/path/to/the/file.ext")

# Put the document class in a local variable to keep the code DRY

doc_class = documents.TypeInvoiceSplitterV1

# Limit the amount of API calls to retrieve your document
MAX_RETRIES = 10

# How many seconds to wait in-between tries
INTERVAL_SECS = 6

# Counter to keep track of how many times we try to retrieve the document
times_tried = 1


queue_result = input_doc.enqueue(doc_class)

# Get the id of the queue (job)
queue_id = queue_result.job.job_id

# Recursive function that tries to retrieve the completed document.
# If the document is not "complete", try again
def get_doc_from_async_queue(queue_id, times_tried=0):

# Have we exceeded our retry count?
if times_tried >= MAX_RETRIES:
raise Exception(f"Maximum retries reached {times_tried}")

# Wait for a few seconds before fetching
sleep(INTERVAL_SECS)

# Fetch and parse the result, using the same type
parsed_result = input_doc.parse_queued(doc_class, queue_id)

# Check whether the result is ready
if parsed_result.job.status == "completed":

# Print a brief summary of the parsed data
print(parsed_result.document.document)
return

# Otherwise, try again...
else:
get_doc_from_async_queue(queue_id, times_tried+1)

# Start the recursion...
get_doc_from_async_queue(queue_id)
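
The recursive helper above is bounded by MAX_RETRIES, but the same polling can also be written as a plain loop, which avoids recursion entirely. A minimal loop-based sketch using exactly the same client calls and attribute names as the sample above, with nothing assumed beyond what the sample already uses:

# Loop-based variant of the polling shown above; a sketch, same assumptions as the sample.
from time import sleep

from mindee import Client, documents

mindee_client = Client(api_key="my-api-key")
input_doc = mindee_client.doc_from_path("/path/to/the/file.ext")
doc_class = documents.TypeInvoiceSplitterV1

MAX_RETRIES = 10     # maximum number of polling attempts
INTERVAL_SECS = 6    # seconds to wait between attempts

# Enqueue the document and keep the job identifier to poll with
queue_id = input_doc.enqueue(doc_class).job.job_id

for _ in range(MAX_RETRIES):
    sleep(INTERVAL_SECS)
    parsed_result = input_doc.parse_queued(doc_class, queue_id)
    if parsed_result.job.status == "completed":
        # Print a brief summary of the parsed data, as in the sample above
        print(parsed_result.document.document)
        break
else:
    raise RuntimeError(f"Maximum retries reached {MAX_RETRIES}")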
10 changes: 10 additions & 0 deletions docs/predictions/standard/documents/invoice_splitter_v1.rst
@@ -0,0 +1,10 @@
Invoice Splitter V1
-------------------

**Sample Code:**

.. literalinclude:: /extras/code_samples/invoice_splitter_v1_async.txt
:language: Python

.. autoclass:: mindee.documents.InvoiceSplitterV1
:members:
2 changes: 1 addition & 1 deletion mindee/__init__.py
@@ -1,2 +1,2 @@
from mindee.client import Client, PageOptions
from mindee.response import PredictResponse
from mindee.response import AsyncPredictResponse, Job, PredictResponse
208 changes: 184 additions & 24 deletions mindee/client.py
@@ -15,7 +15,7 @@
UrlInputSource,
)
from mindee.logger import logger
from mindee.response import PredictResponse
from mindee.response import AsyncPredictResponse, PredictResponse


def get_bound_classname(type_var) -> str:
@@ -84,41 +84,105 @@ def parse(

logger.debug("Parsing document as '%s'", endpoint_name)

found = []
for k in self.doc_configs.keys():
if k[1] == endpoint_name:
found.append(k)
doc_config = self._check_config(endpoint_name, account_name)
if not isinstance(self.input_doc, UrlInputSource):
if page_options and self.input_doc.is_pdf():
self.input_doc.process_pdf(
page_options.operation,
page_options.on_min_pages,
page_options.page_indexes,
)
return self._make_request(
document_class, doc_config, include_words, close_file, cropper
)

if len(found) == 0:
raise RuntimeError(f"Document type not configured: {endpoint_name}")
def enqueue(
self,
document_class: TypeDocument,
endpoint_name: Optional[str] = None,
account_name: Optional[str] = None,
include_words: bool = False,
close_file: bool = True,
page_options: Optional[PageOptions] = None,
cropper: bool = False,
) -> AsyncPredictResponse[TypeDocument]:
"""
Enqueueing to an async endpoint.

if account_name:
config_key = (account_name, endpoint_name)
elif len(found) == 1:
config_key = found[0]
else:
usernames = [k[0] for k in found]
:param document_class: The document class to use.
The response object will be instantiated based on this parameter.

:param endpoint_name: For custom endpoints, the "API name" field in the "Settings" page of the API Builder.
Do not set for standard (off the shelf) endpoints.

:param account_name: For custom endpoints, your account or organization username on the API Builder.
This is normally not required unless you have a custom endpoint which has the
same name as standard (off the shelf) endpoint.
Do not set for standard (off the shelf) endpoints.

:param include_words: Whether to include the full text for each page.
This performs a full OCR operation on the server and will increase response time.

:param close_file: Whether to ``close()`` the file after parsing it.
Set to ``False`` if you need to access the file after this operation.

:param page_options: If set, remove pages from the document as specified.
This is done before sending the file to the server and is useful to avoid page limitations.

:param cropper: Whether to include cropper results for each page.
This performs a cropping operation on the server and will increase response time.
"""
bound_classname = get_bound_classname(document_class)
if bound_classname != documents.CustomV1.__name__:
endpoint_name = get_bound_classname(document_class)
elif endpoint_name is None:
raise RuntimeError(
(
"Duplicate configuration detected.\n"
f"You specified a document_type '{endpoint_name}' in your custom config.\n"
"To avoid confusion, please add the 'account_name' attribute to "
f"the parse method, one of {usernames}."
)
f"endpoint_name is required when using {bound_classname} class"
)

doc_config = self.doc_configs[config_key]
doc_config.check_api_keys()
logger.debug("Enqueuing document as '%s'", endpoint_name)

doc_config = self._check_config(endpoint_name, account_name)
if not isinstance(self.input_doc, UrlInputSource):
if page_options and self.input_doc.is_pdf():
self.input_doc.process_pdf(
page_options.operation,
page_options.on_min_pages,
page_options.page_indexes,
)
return self._make_request(
document_class, doc_config, include_words, close_file, cropper
)
return self._predict_async(doc_config, include_words, close_file, cropper)

def parse_queued(
self,
document_class: TypeDocument,
queue_id: str,
endpoint_name: Optional[str] = None,
account_name: Optional[str] = None,
) -> AsyncPredictResponse[TypeDocument]:
"""
Parses a queued document.

:param queue_id: queue_id received from the API
:param endpoint_name: For custom endpoints, the "API name" field in the "Settings" page of the API Builder.
Do not set for standard (off the shelf) endpoints.
:param account_name: For custom endpoints, your account or organization username on the API Builder.
This is normally not required unless you have a custom endpoint which has the
same name as standard (off the shelf) endpoint.
Do not set for standard (off the shelf) endpoints.
"""
bound_classname = get_bound_classname(document_class)
if bound_classname != documents.CustomV1.__name__:
endpoint_name = get_bound_classname(document_class)
elif endpoint_name is None:
raise RuntimeError(
f"endpoint_name is required when using {bound_classname} class"
)

logger.debug("Fetching queued document as '%s'", endpoint_name)

doc_config = self._check_config(endpoint_name, account_name)

return self._get_queued_document(doc_config, queue_id)

def _make_request(
self,
@@ -145,18 +209,108 @@ def _make_request(
raise HTTPException(
f"API {response.status_code} HTTP error: {json.dumps(dict_response)}"
)

return PredictResponse[TypeDocument](
http_response=dict_response,
doc_config=doc_config,
input_source=self.input_doc,
response_ok=response.ok,
)

def _predict_async(
self,
doc_config: DocumentConfig,
include_words: bool = False,
close_file: bool = True,
cropper: bool = False,
) -> AsyncPredictResponse[TypeDocument]:
"""
Sends a document to the queue, and sends back an asynchronous predict response.

:param doc_config: Configuration of the document.
"""
response = doc_config.endpoints[0].predict_async_req_post(
self.input_doc, include_words, close_file, cropper
)

dict_response = response.json()

if not response.ok and self.raise_on_error:
raise HTTPException(
f"API {response.status_code} HTTP error: {json.dumps(dict_response)}"
)

return AsyncPredictResponse[TypeDocument](
http_response=dict_response,
doc_config=doc_config,
input_source=self.input_doc,
response_ok=response.ok,
)

def _get_queued_document(
self,
doc_config: DocumentConfig,
queue_id: str,
) -> AsyncPredictResponse[TypeDocument]:
"""
Fetches a document or a Job from a given queue.

:param queue_id: Queue_id received from the API
:param doc_config: Pre-checked document configuration.
"""
queue_response = doc_config.endpoints[0].document_queue_req_get(
queue_id=queue_id
)

if (
not queue_response.status_code
or queue_response.status_code < 200
or queue_response.status_code > 302
):
raise HTTPException(
f"API {queue_response.status_code} HTTP error: {json.dumps(queue_response)}"
)

return AsyncPredictResponse[TypeDocument](
http_response=queue_response.json(),
doc_config=doc_config,
input_source=self.input_doc,
response_ok=queue_response.ok,
)

def close(self) -> None:
"""Close the file object."""
if not isinstance(self.input_doc, UrlInputSource):
self.input_doc.file_object.close()

def _check_config(self, endpoint_name, account_name) -> DocumentConfig:
found = []
for k in self.doc_configs.keys():
if k[1] == endpoint_name:
found.append(k)

if len(found) == 0:
raise RuntimeError(f"Document type not configured: {endpoint_name}")

if account_name:
config_key = (account_name, endpoint_name)
elif len(found) == 1:
config_key = found[0]
else:
usernames = [k[0] for k in found]
raise RuntimeError(
(
"Duplicate configuration detected.\n"
f"You specified a document_type '{endpoint_name}' in your custom config.\n"
"To avoid confusion, please add the 'account_name' attribute to "
f"the parse method, one of {usernames}."
)
)

doc_config = self.doc_configs[config_key]
doc_config.check_api_keys()
return doc_config


class ConfigSpec(NamedTuple):
doc_class: Type[Document]
@@ -281,7 +435,13 @@ def _init_default_endpoints(self) -> None:
url_name="license_plates",
version="1",
),
ConfigSpec(
doc_class=documents.InvoiceSplitterV1,
url_name="invoice_splitter",
version="1",
),
]

for config in configs:
config_key = (OTS_OWNER, config.doc_class.__name__)
self._doc_configs[config_key] = self._standard_doc_config(
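
The enqueue and parse_queued docstrings above also cover custom (API Builder) endpoints, which the PR's code sample does not exercise. A hedged sketch of such a call, reusing input_doc from the sample and assuming the custom endpoint is already configured on the client; "my_endpoint" and "my-org" are placeholders, and documents.TypeCustomV1 is assumed to be exposed like the other document type vars:

# Illustrative only: enqueue on a hypothetical custom endpoint.
async_response = input_doc.enqueue(
    documents.TypeCustomV1,
    endpoint_name="my_endpoint",  # "API name" field in the API Builder settings
    account_name="my-org",        # only needed if the endpoint name is ambiguous
)
print(async_response.job.job_id)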
1 change: 1 addition & 0 deletions mindee/documents/__init__.py
@@ -8,6 +8,7 @@
TypeFinancialV1,
)
from mindee.documents.invoice import InvoiceV3, InvoiceV4, TypeInvoiceV3, TypeInvoiceV4
from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1
from mindee.documents.passport import PassportV1, TypePassportV1
from mindee.documents.proof_of_address import ProofOfAddressV1, TypeProofOfAddressV1
from mindee.documents.receipt import (
2 changes: 1 addition & 1 deletion mindee/documents/config.py
@@ -28,7 +28,7 @@ def check_api_keys(self) -> None:
raise RuntimeError(
(
f"Missing API key for '{endpoint.url_name} v{endpoint.version}',"
"check your Client configuration.\n"
" check your Client configuration.\n"
"You can set this using the "
f"'{API_KEY_ENV_NAME}' environment variable."
)
1 change: 1 addition & 0 deletions mindee/documents/invoice_splitter/__init__.py
@@ -0,0 +1 @@
from .invoice_splitter_v1 import InvoiceSplitterV1, TypeInvoiceSplitterV1