From 9fc4717beb36d701376cb964991fc5fb8e5ca33f Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Tue, 2 May 2023 15:50:47 +0200 Subject: [PATCH 01/30] base layout for functions, doesn't work yet --- mindee/client.py | 166 +++++++++++++++++++++++++++++++++++++++++++- mindee/endpoints.py | 77 ++++++++++++++++++++ mindee/response.py | 69 +++++++++++++++++- 3 files changed, 309 insertions(+), 3 deletions(-) diff --git a/mindee/client.py b/mindee/client.py index 2e177d4e..8e635792 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -15,7 +15,7 @@ UrlInputSource, ) from mindee.logger import logger -from mindee.response import PredictResponse +from mindee.response import AsyncPredictResponse, PredictResponse def get_bound_classname(type_var) -> str: @@ -120,6 +120,136 @@ def parse( document_class, doc_config, include_words, close_file, cropper ) + def enqueue( + self, + document_class: TypeDocument, + endpoint_name: Optional[str] = None, + account_name: Optional[str] = None, + include_words: bool = False, + close_file: bool = True, + page_options: Optional[PageOptions] = None, + cropper: bool = False, + ) -> PredictResponse[TypeDocument]: + """ + Enqueueing to an async endpoint. + + :param document_class: The document class to use. + The response object will be instantiated based on this parameter. + + :param endpoint_name: For custom endpoints, the "API name" field in the "Settings" page of the API Builder. + Do not set for standard (off the shelf) endpoints. + + :param account_name: For custom endpoints, your account or organization username on the API Builder. + This is normally not required unless you have a custom endpoint which has the + same name as standard (off the shelf) endpoint. + Do not set for standard (off the shelf) endpoints. + + :param include_words: Whether to include the full text for each page. + This performs a full OCR operation on the server and will increase response time. + + :param close_file: Whether to ``close()`` the file after parsing it. + Set to ``False`` if you need to access the file after this operation. + + :param page_options: If set, remove pages from the document as specified. + This is done before sending the file to the server and is useful to avoid page limitations. + + :param cropper: Whether to include cropper results for each page. + This performs a cropping operation on the server and will increase response time. + """ + bound_classname = get_bound_classname(document_class) + if bound_classname != documents.CustomV1.__name__: + endpoint_name = get_bound_classname(document_class) + elif endpoint_name is None: + raise RuntimeError( + f"endpoint_name is required when using {bound_classname} class" + ) + + logger.debug("Enqueuing document as '%s'", endpoint_name) + + found = [] + for k in self.doc_configs.keys(): + if k[1] == endpoint_name: + found.append(k) + + if len(found) == 0: + raise RuntimeError(f"Document type not configured: {endpoint_name}") + + if account_name: + config_key = (account_name, endpoint_name) + elif len(found) == 1: + config_key = found[0] + else: + usernames = [k[0] for k in found] + raise RuntimeError( + ( + "Duplicate configuration detected.\n" + f"You specified a document_type '{endpoint_name}' in your custom config.\n" + "To avoid confusion, please add the 'account_name' attribute to " + f"the parse method, one of {usernames}." + ) + ) + + doc_config = self.doc_configs[config_key] + doc_config.check_api_keys() + if not isinstance(self.input_doc, UrlInputSource): + if page_options and self.input_doc.is_pdf(): + self.input_doc.process_pdf( + page_options.operation, + page_options.on_min_pages, + page_options.page_indexes, + ) + return self._make_request( + document_class, doc_config, include_words, close_file, cropper + ) + + def parse_queued( + self, + queue_id: str, + endpoint_name: str, + account_name: Optional[str] = None, + include_words: bool = False, + cropper: bool = False, + ) -> AsyncPredictResponse: + """ + Parses a queued document. + + :param queue_id: queue_id received from the API + :param endpoint_name: For custom endpoints, the "API name" field in the "Settings" page of the API Builder. + Do not set for standard (off the shelf) endpoints. + :param account_name: For custom endpoints, your account or organization username on the API Builder. + This is normally not required unless you have a custom endpoint which has the + same name as standard (off the shelf) endpoint. + Do not set for standard (off the shelf) endpoints. + + """ + found = [] + for k in self.doc_configs.keys(): + if k[1] == endpoint_name: + found.append(k) + + if len(found) == 0: + raise RuntimeError(f"Document type not configured: {endpoint_name}") + + if account_name: + config_key = (account_name, endpoint_name) + elif len(found) == 1: + config_key = found[0] + else: + usernames = [k[0] for k in found] + raise RuntimeError( + ( + "Duplicate configuration detected.\n" + f"You specified a document_type '{endpoint_name}' in your custom config.\n" + "To avoid confusion, please add the 'account_name' attribute to " + f"the parse method, one of {usernames}." + ) + ) + + doc_config = self.doc_configs[config_key] + doc_config.check_api_keys() + + return self._get_queued_document(queue_id, doc_config, include_words, cropper) + def _make_request( self, document_class: TypeDocument, @@ -145,6 +275,7 @@ def _make_request( raise HTTPException( f"API {response.status_code} HTTP error: {json.dumps(dict_response)}" ) + return PredictResponse[TypeDocument]( http_response=dict_response, doc_config=doc_config, @@ -152,6 +283,39 @@ def _make_request( response_ok=response.ok, ) + def _get_queued_document( + self, + queue_id: str, + doc_config: DocumentConfig, + include_words: bool = False, + cropper: bool = False, + ) -> AsyncPredictResponse[TypeDocument]: + """ + Fetches a document or a Job from a given queue. + + :param queue_id: Queue_id received from the API + :param doc_config: Pre-checked document configuration. + """ + queue_response = doc_config.endpoints[0].document_queue_req_get( + queue_id=queue_id, include_words=include_words, cropper=cropper + ) + + if ( + not queue_response.status_code + or queue_response.status_code < 200 + or queue_response.status_code > 302 + ): + raise HTTPException( + f"API {queue_response.status_code} HTTP error: {json.dumps(queue_response)}" + ) + + return AsyncPredictResponse[TypeDocument]( + doc_config=doc_config, + http_response=queue_response.json(), + input_source=self.input_doc, + response_ok=queue_response.ok, + ) + def close(self) -> None: """Close the file object.""" if not isinstance(self.input_doc, UrlInputSource): diff --git a/mindee/endpoints.py b/mindee/endpoints.py index 56a0f3a7..86b63c5f 100644 --- a/mindee/endpoints.py +++ b/mindee/endpoints.py @@ -140,6 +140,83 @@ def predict_req_post( ) return response + def predict_async_req_post( + self, + input_source: Union[LocalInputSource, UrlInputSource], + include_words: bool = False, + close_file: bool = True, + cropper: bool = False, + ) -> requests.Response: + """ + Make an asynchronous request to POST a document for prediction. + + :param input_source: Input object + :param include_words: Include raw OCR words in the response + :param close_file: Whether to `close()` the file after parsing it. + :param cropper: Including Mindee cropping results. + :return: requests response + """ + data = {} + if include_words: + data["include_mvision"] = "true" + + params = {} + if cropper: + params["cropper"] = "true" + + if isinstance(input_source, UrlInputSource): + data["document"] = input_source.url + response = requests.post( + f"{self._url_root}/predict_async", + headers=self.base_headers, + data=data, + params=params, + timeout=self._request_timeout, + ) + else: + files = {"document": input_source.read_contents(close_file)} + response = requests.post( + f"{self._url_root}/predict_async", + files=files, + headers=self.base_headers, + data=data, + params=params, + timeout=self._request_timeout, + ) + return response + + def document_queue_req_get( + self, + queue_id: str, + include_words: bool = False, + cropper: bool = False, + ) -> requests.Response: + """ + Sends a request matching a given queue_id. Returns either a Job or a Document. + + :param queue_id: queue_id received from the API + :param include_words: Whether to include the full text for each page. + This performs a full OCR operation on the server and will increase response time. + :param cropper: Whether to include cropper results for each page. + This performs a cropping operation on the server and will increase response time. + + """ + data = {} + if include_words: + data["include_mvision"] = "true" + + params = {} + if cropper: + params["cropper"] = "true" + response = requests.get( + f"{self._url_root}/documents/queue/{queue_id}", + headers=self.base_headers, + data=data, + params=params, + timeout=self._request_timeout, + ) + return response + class CustomEndpoint(Endpoint): def training_req_post( diff --git a/mindee/response.py b/mindee/response.py index facf9831..bc0daf5c 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Generic, List, Optional, Union +from datetime import datetime +from typing import Any, Dict, Generic, List, Literal, Optional, Union from mindee.documents.base import TypeDocument from mindee.documents.config import DocumentConfig @@ -6,6 +7,45 @@ from mindee.logger import logger +class Job: + """ + Job wrapper for a request sent to the API. + + Only relevant in the case of async work. + """ + + issued_at: datetime + """Timestamp of the request reception by the API.""" + available_at: Optional[datetime] + """Timestamp of the request after it has been completed.""" + job_id: Optional[str] + """ID of the job.""" + status: Optional[str] + """Status of the request, as seen by the API.""" + milli_secs_taken: int # Check if that one is fine as a simple int + """Time (ms) taken for the request to be processed by the API.""" + + def __init__(self, json_response: dict) -> None: + """ + Wrapper for the HTTP response sent from the API when a document is enqueued. + + :param json_response: JSON response sent by the server + """ + self.issued_at = datetime.strptime( + json_response["issued_at"], "%Y-%m-%dT%H:%M:%S.%fZ" + ) # check date formatting later + if json_response.get("available_at"): + self.available_at = datetime.strptime( + json_response["available_at"], "%Y-%m-%dT%H:%M:%S.%fZ" + ) + self.job_id = json_response.get("id") + self.status = json_response.get("status") + if self.available_at: + self.milli_secs_taken = int( + (self.available_at.microsecond - self.issued_at.microsecond) / 1000 + ) + + class PredictResponse(Generic[TypeDocument]): """ Response of a prediction request. @@ -31,7 +71,7 @@ class PredictResponse(Generic[TypeDocument]): def __init__( self, doc_config: DocumentConfig, - http_response: dict, + http_response: Dict, input_source: Union[LocalInputSource, UrlInputSource], response_ok: bool, ) -> None: @@ -96,3 +136,28 @@ def _load_response( input_source=input_source, page_n=None, ) + + +class AsyncPredictResponse(PredictResponse[TypeDocument]): + """ + Response of a prediction request. + + Certain properties will depend on the document type. + """ + + job: Job + + def __init__( + self, + doc_config: DocumentConfig, + http_response: Dict, + input_source: Union[LocalInputSource, UrlInputSource], + response_ok: bool, + ) -> None: + super().__init__( + doc_config=doc_config, + http_response=http_response, + input_source=input_source, + response_ok=response_ok, + ) + self.job = Job(http_response["job"]) From fc2a1a1050cc982231352e7a228b6775dcf39dfd Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Tue, 2 May 2023 17:18:20 +0200 Subject: [PATCH 02/30] added all basic needed support for async (untested) --- mindee/client.py | 40 +++++++++++++++++++++++++++++++++------- mindee/endpoints.py | 13 +------------ 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/mindee/client.py b/mindee/client.py index 8e635792..badb06bd 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -198,10 +198,11 @@ def enqueue( page_options.on_min_pages, page_options.page_indexes, ) - return self._make_request( - document_class, doc_config, include_words, close_file, cropper + return self._predict_async( + doc_config, include_words, close_file, cropper ) + def parse_queued( self, queue_id: str, @@ -282,13 +283,39 @@ def _make_request( input_source=self.input_doc, response_ok=response.ok, ) + + def _predict_async( + self, + doc_config: DocumentConfig, + include_words: bool = False, + close_file: bool = True, + cropper: bool = False + ) -> PredictResponse[TypeDocument]: + response = doc_config.endpoints[0].predict_async_req_post( + self.input_doc, + include_words, + close_file, + cropper + ) # TODO: refactor this into the document class + + + dict_response = response.json() + + if not response.ok and self.raise_on_error: + raise HTTPException( + f"API {response.status_code} HTTP error: {json.dumps(dict_response)}" + ) + return PredictResponse[TypeDocument]( + http_response=dict_response, + doc_config=doc_config, + input_source=self.input_doc, + response_ok=response.ok, + ) def _get_queued_document( self, - queue_id: str, doc_config: DocumentConfig, - include_words: bool = False, - cropper: bool = False, + queue_id: str, ) -> AsyncPredictResponse[TypeDocument]: """ Fetches a document or a Job from a given queue. @@ -297,8 +324,7 @@ def _get_queued_document( :param doc_config: Pre-checked document configuration. """ queue_response = doc_config.endpoints[0].document_queue_req_get( - queue_id=queue_id, include_words=include_words, cropper=cropper - ) + queue_id=queue_id) if ( not queue_response.status_code diff --git a/mindee/endpoints.py b/mindee/endpoints.py index 86b63c5f..1c721f1c 100644 --- a/mindee/endpoints.py +++ b/mindee/endpoints.py @@ -187,9 +187,7 @@ def predict_async_req_post( def document_queue_req_get( self, - queue_id: str, - include_words: bool = False, - cropper: bool = False, + queue_id: str ) -> requests.Response: """ Sends a request matching a given queue_id. Returns either a Job or a Document. @@ -201,18 +199,9 @@ def document_queue_req_get( This performs a cropping operation on the server and will increase response time. """ - data = {} - if include_words: - data["include_mvision"] = "true" - - params = {} - if cropper: - params["cropper"] = "true" response = requests.get( f"{self._url_root}/documents/queue/{queue_id}", headers=self.base_headers, - data=data, - params=params, timeout=self._request_timeout, ) return response From ef8fb5656988bd180f5966d4b069fa8270807841 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Tue, 2 May 2023 17:21:31 +0200 Subject: [PATCH 03/30] fixed queue polling functions --- mindee/client.py | 24 +++++++++--------------- mindee/endpoints.py | 5 +---- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/mindee/client.py b/mindee/client.py index badb06bd..763fac92 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -198,10 +198,7 @@ def enqueue( page_options.on_min_pages, page_options.page_indexes, ) - return self._predict_async( - doc_config, include_words, close_file, cropper - ) - + return self._predict_async(doc_config, include_words, close_file, cropper) def parse_queued( self, @@ -249,7 +246,7 @@ def parse_queued( doc_config = self.doc_configs[config_key] doc_config.check_api_keys() - return self._get_queued_document(queue_id, doc_config, include_words, cropper) + return self._get_queued_document(doc_config, queue_id) def _make_request( self, @@ -283,22 +280,18 @@ def _make_request( input_source=self.input_doc, response_ok=response.ok, ) - + def _predict_async( self, doc_config: DocumentConfig, include_words: bool = False, close_file: bool = True, - cropper: bool = False + cropper: bool = False, ) -> PredictResponse[TypeDocument]: response = doc_config.endpoints[0].predict_async_req_post( - self.input_doc, - include_words, - close_file, - cropper - ) # TODO: refactor this into the document class - - + self.input_doc, include_words, close_file, cropper + ) # TODO: refactor this into the document class + dict_response = response.json() if not response.ok and self.raise_on_error: @@ -324,7 +317,8 @@ def _get_queued_document( :param doc_config: Pre-checked document configuration. """ queue_response = doc_config.endpoints[0].document_queue_req_get( - queue_id=queue_id) + queue_id=queue_id + ) if ( not queue_response.status_code diff --git a/mindee/endpoints.py b/mindee/endpoints.py index 1c721f1c..ef23398f 100644 --- a/mindee/endpoints.py +++ b/mindee/endpoints.py @@ -185,10 +185,7 @@ def predict_async_req_post( ) return response - def document_queue_req_get( - self, - queue_id: str - ) -> requests.Response: + def document_queue_req_get(self, queue_id: str) -> requests.Response: """ Sends a request matching a given queue_id. Returns either a Job or a Document. From 976878776ab926581a6d73905612e8c463efccc7 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Wed, 3 May 2023 10:48:37 +0200 Subject: [PATCH 04/30] revamped inheritance and fixed AsyncPredictionResponse generation --- mindee/client.py | 34 ++++++++++++++++++---------------- mindee/response.py | 26 ++++++++++++++------------ 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/mindee/client.py b/mindee/client.py index 763fac92..9ebd856d 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -129,7 +129,7 @@ def enqueue( close_file: bool = True, page_options: Optional[PageOptions] = None, cropper: bool = False, - ) -> PredictResponse[TypeDocument]: + ) -> AsyncPredictResponse[TypeDocument]: """ Enqueueing to an async endpoint. @@ -201,13 +201,8 @@ def enqueue( return self._predict_async(doc_config, include_words, close_file, cropper) def parse_queued( - self, - queue_id: str, - endpoint_name: str, - account_name: Optional[str] = None, - include_words: bool = False, - cropper: bool = False, - ) -> AsyncPredictResponse: + self, queue_id: str, endpoint_name: str, account_name: Optional[str] = None + ) -> AsyncPredictResponse[TypeDocument]: """ Parses a queued document. @@ -218,7 +213,6 @@ def parse_queued( This is normally not required unless you have a custom endpoint which has the same name as standard (off the shelf) endpoint. Do not set for standard (off the shelf) endpoints. - """ found = [] for k in self.doc_configs.keys(): @@ -287,10 +281,15 @@ def _predict_async( include_words: bool = False, close_file: bool = True, cropper: bool = False, - ) -> PredictResponse[TypeDocument]: + ) -> AsyncPredictResponse[TypeDocument]: + """ + Sends a document to the queue, and sends back an asynchronous predict response. + + :param doc_config: Configuration of the document. + """ response = doc_config.endpoints[0].predict_async_req_post( self.input_doc, include_words, close_file, cropper - ) # TODO: refactor this into the document class + ) dict_response = response.json() @@ -298,11 +297,9 @@ def _predict_async( raise HTTPException( f"API {response.status_code} HTTP error: {json.dumps(dict_response)}" ) - return PredictResponse[TypeDocument]( + + return AsyncPredictResponse[TypeDocument]( http_response=dict_response, - doc_config=doc_config, - input_source=self.input_doc, - response_ok=response.ok, ) def _get_queued_document( @@ -329,9 +326,14 @@ def _get_queued_document( f"API {queue_response.status_code} HTTP error: {json.dumps(queue_response)}" ) + if queue_response.status_code != 302: + return AsyncPredictResponse[TypeDocument]( + http_response=queue_response.json() + ) + return AsyncPredictResponse[TypeDocument]( - doc_config=doc_config, http_response=queue_response.json(), + doc_config=doc_config, input_source=self.input_doc, response_ok=queue_response.ok, ) diff --git a/mindee/response.py b/mindee/response.py index bc0daf5c..f21c9504 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Dict, Generic, List, Literal, Optional, Union +from typing import Any, Dict, Generic, List, Optional, Union from mindee.documents.base import TypeDocument from mindee.documents.config import DocumentConfig @@ -140,24 +140,26 @@ def _load_response( class AsyncPredictResponse(PredictResponse[TypeDocument]): """ - Response of a prediction request. + Async Response Wrapper class for a Predict response. - Certain properties will depend on the document type. + Since Optional inheritance is convoluted, this is used as a simple wrapper for PredictResponse. """ job: Job + # Inheritance of Optional isn't a possibility, so this technically just a wrapper def __init__( self, - doc_config: DocumentConfig, http_response: Dict, - input_source: Union[LocalInputSource, UrlInputSource], - response_ok: bool, + doc_config: Optional[DocumentConfig] = None, + input_source: Optional[Union[LocalInputSource, UrlInputSource]] = None, + response_ok: Optional[bool] = None, ) -> None: - super().__init__( - doc_config=doc_config, - http_response=http_response, - input_source=input_source, - response_ok=response_ok, - ) + if doc_config and input_source and response_ok is not None: + super().__init__( + http_response=http_response, + doc_config=doc_config, + input_source=input_source, + response_ok=response_ok, + ) self.job = Job(http_response["job"]) From 4ff8bfcb9e8c5f7f4283e60d98a61080e6314c0a Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Wed, 3 May 2023 11:57:06 +0200 Subject: [PATCH 05/30] fixed polling for jobs, document access not working yet --- mindee/client.py | 19 ++++- mindee/documents/__init__.py | 1 + mindee/documents/invoice_splitter/__init__.py | 1 + .../invoice_splitter/invoice_splitter_v1.py | 70 +++++++++++++++++++ mindee/response.py | 38 +++++++--- 5 files changed, 117 insertions(+), 12 deletions(-) create mode 100644 mindee/documents/invoice_splitter/__init__.py create mode 100644 mindee/documents/invoice_splitter/invoice_splitter_v1.py diff --git a/mindee/client.py b/mindee/client.py index 9ebd856d..4efc4d77 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -201,7 +201,11 @@ def enqueue( return self._predict_async(doc_config, include_words, close_file, cropper) def parse_queued( - self, queue_id: str, endpoint_name: str, account_name: Optional[str] = None + self, + document_class: TypeDocument, + queue_id: str, + endpoint_name: Optional[str] = None, + account_name: Optional[str] = None ) -> AsyncPredictResponse[TypeDocument]: """ Parses a queued document. @@ -214,6 +218,14 @@ def parse_queued( same name as standard (off the shelf) endpoint. Do not set for standard (off the shelf) endpoints. """ + bound_classname = get_bound_classname(document_class) + if bound_classname != documents.CustomV1.__name__: + endpoint_name = get_bound_classname(document_class) + elif endpoint_name is None: + raise RuntimeError( + f"endpoint_name is required when using {bound_classname} class" + ) + found = [] for k in self.doc_configs.keys(): if k[1] == endpoint_name: @@ -467,6 +479,11 @@ def _init_default_endpoints(self) -> None: url_name="license_plates", version="1", ), + ConfigSpec( + doc_class=documents.InvoiceSplitterV1, + url_name="invoice_splitter", + version="1" + ) ] for config in configs: config_key = (OTS_OWNER, config.doc_class.__name__) diff --git a/mindee/documents/__init__.py b/mindee/documents/__init__.py index dcee7a31..03cfcf6d 100644 --- a/mindee/documents/__init__.py +++ b/mindee/documents/__init__.py @@ -8,6 +8,7 @@ TypeFinancialV1, ) from mindee.documents.invoice import InvoiceV3, InvoiceV4, TypeInvoiceV3, TypeInvoiceV4 +from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1 from mindee.documents.passport import PassportV1, TypePassportV1 from mindee.documents.proof_of_address import ProofOfAddressV1, TypeProofOfAddressV1 from mindee.documents.receipt import ( diff --git a/mindee/documents/invoice_splitter/__init__.py b/mindee/documents/invoice_splitter/__init__.py new file mode 100644 index 00000000..4c5b33fd --- /dev/null +++ b/mindee/documents/invoice_splitter/__init__.py @@ -0,0 +1 @@ +from .invoice_splitter_v1 import InvoiceSplitterV1, TypeInvoiceSplitterV1 \ No newline at end of file diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py new file mode 100644 index 00000000..d47a02e6 --- /dev/null +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -0,0 +1,70 @@ +from typing import Any, Dict, List, Optional, TypeVar, Union + +from mindee.documents.base import Document, TypeApiPrediction +from mindee.input.sources import LocalInputSource, UrlInputSource + + +class PageGroup: + page_indexes: List[int] = [] + confidence: int + + def __init__(self, prediction: Dict[str, Any]): + self.page_indexes = prediction["page_indexes"] + self.confidence = prediction["confidence"] + + def __str__(self) -> str: + return f"page indexes: {', '.join([str(page_index) for page_index in self.page_indexes])}" + + +class InvoiceSplitterV1(Document): + invoice_page_groups: List[PageGroup] = [] + + def __init__( + self, + api_prediction: Dict[str, Any], + input_source: Union[LocalInputSource, UrlInputSource], + page_n: int, + ): + super().__init__( + input_source=input_source, + document_type="shipping_container", + api_prediction=api_prediction, + page_n=page_n, + ) + self._build_from_api_prediction(api_prediction["prediction"], page_n=page_n) + + def _build_from_api_prediction( + self, api_prediction: TypeApiPrediction, page_n: Optional[int] = None + ) -> None: + """ + Build the object from the prediction API JSON. + + :param api_prediction: Raw prediction from HTTP response + :param page_n: Page number + """ + if ( + api_prediction["invoice_page_groups"] + and len(api_prediction["invoice_page_groups"]) > 0 + ): + self.invoice_page_groups = [ + PageGroup(prediction) + for prediction in api_prediction["invoice_page_groups"] + ] + + def __str__(self) -> str: + invoice_page_groups = "\n" + if len(self.invoice_page_groups) > 0: + invoice_page_groups += "\n ".join( + [str(ivp) for ivp in self.invoice_page_groups] + ) + + out_str = ( + f"----- Invoice Splitter V1 -----" + f"Filename: {self.filename}" + f"Invoice Page Groups: {invoice_page_groups}" + f"----------------------" + ) + return out_str + + +TypeInvoiceSplitterV1 = TypeVar("TypeInvoiceSplitterV1", bound=InvoiceSplitterV1) diff --git a/mindee/response.py b/mindee/response.py index f21c9504..0df48bf7 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,4 +1,5 @@ from datetime import datetime +import json from typing import Any, Dict, Generic, List, Optional, Union from mindee.documents.base import TypeDocument @@ -16,11 +17,11 @@ class Job: issued_at: datetime """Timestamp of the request reception by the API.""" - available_at: Optional[datetime] + available_at: Optional[datetime] = None """Timestamp of the request after it has been completed.""" - job_id: Optional[str] + job_id: Optional[str] = None """ID of the job.""" - status: Optional[str] + status: Optional[str] = None """Status of the request, as seen by the API.""" milli_secs_taken: int # Check if that one is fine as a simple int """Time (ms) taken for the request to be processed by the API.""" @@ -31,19 +32,18 @@ def __init__(self, json_response: dict) -> None: :param json_response: JSON response sent by the server """ - self.issued_at = datetime.strptime( - json_response["issued_at"], "%Y-%m-%dT%H:%M:%S.%fZ" - ) # check date formatting later + self.issued_at = datetime.fromisoformat(json_response["issued_at"]) if json_response.get("available_at"): - self.available_at = datetime.strptime( - json_response["available_at"], "%Y-%m-%dT%H:%M:%S.%fZ" - ) + self.available_at = datetime.fromisoformat(json_response["available_at"]) self.job_id = json_response.get("id") self.status = json_response.get("status") if self.available_at: self.milli_secs_taken = int( (self.available_at.microsecond - self.issued_at.microsecond) / 1000 ) + + def __str__(self) -> str: + return json.dumps(self.__dict__, indent=4, sort_keys=True, default=str) class PredictResponse(Generic[TypeDocument]): @@ -146,8 +146,18 @@ class AsyncPredictResponse(PredictResponse[TypeDocument]): """ job: Job - # Inheritance of Optional isn't a possibility, so this technically just a wrapper - + document: Optional[TypeDocument] = None + http_response: Dict[str, Any] = {} + """Raw HTTP response JSON""" + document_type: Optional[str] = None + """Document type""" + input_path: Optional[str] = None + """Path of the input file""" + input_filename: Optional[str] = None + """Name of the input file""" + input_mimetype: Optional[str] = None + """MIME type of the input file""" + def __init__( self, http_response: Dict, @@ -162,4 +172,10 @@ def __init__( input_source=input_source, response_ok=response_ok, ) + self.document = super().document + self.http_response = super().http_response + self.document_type = super().document_type + self.input_path = super().input_path + self.input_filename = super().input_filename + self.input_mimetype = super().input_mimetype self.job = Job(http_response["job"]) From 518d73290d97b1576fc6fa40a6ec4372361fdf5f Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Wed, 3 May 2023 16:02:01 +0200 Subject: [PATCH 06/30] added some safety checks --- mindee/client.py | 18 ++++++++--------- mindee/documents/config.py | 4 ++-- mindee/documents/invoice_splitter/__init__.py | 2 +- .../invoice_splitter/invoice_splitter_v1.py | 17 ++++++++-------- mindee/response.py | 20 +++++++++---------- 5 files changed, 28 insertions(+), 33 deletions(-) diff --git a/mindee/client.py b/mindee/client.py index 4efc4d77..8f96388f 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -205,7 +205,7 @@ def parse_queued( document_class: TypeDocument, queue_id: str, endpoint_name: Optional[str] = None, - account_name: Optional[str] = None + account_name: Optional[str] = None, ) -> AsyncPredictResponse[TypeDocument]: """ Parses a queued document. @@ -225,7 +225,7 @@ def parse_queued( raise RuntimeError( f"endpoint_name is required when using {bound_classname} class" ) - + found = [] for k in self.doc_configs.keys(): if k[1] == endpoint_name: @@ -248,7 +248,6 @@ def parse_queued( f"the parse method, one of {usernames}." ) ) - doc_config = self.doc_configs[config_key] doc_config.check_api_keys() @@ -312,6 +311,9 @@ def _predict_async( return AsyncPredictResponse[TypeDocument]( http_response=dict_response, + doc_config=doc_config, + input_source=self.input_doc, + response_ok=response.ok, ) def _get_queued_document( @@ -338,11 +340,6 @@ def _get_queued_document( f"API {queue_response.status_code} HTTP error: {json.dumps(queue_response)}" ) - if queue_response.status_code != 302: - return AsyncPredictResponse[TypeDocument]( - http_response=queue_response.json() - ) - return AsyncPredictResponse[TypeDocument]( http_response=queue_response.json(), doc_config=doc_config, @@ -482,9 +479,10 @@ def _init_default_endpoints(self) -> None: ConfigSpec( doc_class=documents.InvoiceSplitterV1, url_name="invoice_splitter", - version="1" - ) + version="1", + ), ] + for config in configs: config_key = (OTS_OWNER, config.doc_class.__name__) self._doc_configs[config_key] = self._standard_doc_config( diff --git a/mindee/documents/config.py b/mindee/documents/config.py index 97af1d42..e03f9d0c 100644 --- a/mindee/documents/config.py +++ b/mindee/documents/config.py @@ -27,8 +27,8 @@ def check_api_keys(self) -> None: if not endpoint.api_key: raise RuntimeError( ( - f"Missing API key for '{endpoint.url_name} v{endpoint.version}'," - "check your Client configuration.\n" + f"Missing API key for '{endpoint.url_name} v{endpoint.version.strip()}'," + " check your Client configuration.\n" "You can set this using the " f"'{API_KEY_ENV_NAME}' environment variable." ) diff --git a/mindee/documents/invoice_splitter/__init__.py b/mindee/documents/invoice_splitter/__init__.py index 4c5b33fd..0cc2749c 100644 --- a/mindee/documents/invoice_splitter/__init__.py +++ b/mindee/documents/invoice_splitter/__init__.py @@ -1 +1 @@ -from .invoice_splitter_v1 import InvoiceSplitterV1, TypeInvoiceSplitterV1 \ No newline at end of file +from .invoice_splitter_v1 import InvoiceSplitterV1, TypeInvoiceSplitterV1 diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index d47a02e6..8bd9883b 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional, TypeVar, Union -from mindee.documents.base import Document, TypeApiPrediction +from mindee.documents.base import Document, TypeApiPrediction, clean_out_string from mindee.input.sources import LocalInputSource, UrlInputSource @@ -27,7 +27,7 @@ def __init__( ): super().__init__( input_source=input_source, - document_type="shipping_container", + document_type="invoice_splitter", api_prediction=api_prediction, page_n=page_n, ) @@ -43,7 +43,7 @@ def _build_from_api_prediction( :param page_n: Page number """ if ( - api_prediction["invoice_page_groups"] + "invoice_page_groups" in api_prediction and len(api_prediction["invoice_page_groups"]) > 0 ): self.invoice_page_groups = [ @@ -52,16 +52,15 @@ def _build_from_api_prediction( ] def __str__(self) -> str: - invoice_page_groups = "\n" if len(self.invoice_page_groups) > 0: - invoice_page_groups += "\n ".join( + invoice_page_groups = f"\n { ' ' * 20 }".join( [str(ivp) for ivp in self.invoice_page_groups] ) - out_str = ( - f"----- Invoice Splitter V1 -----" - f"Filename: {self.filename}" - f"Invoice Page Groups: {invoice_page_groups}" + out_str = clean_out_string( + f"----- Invoice Splitter V1 -----\n" + f"Filename: {self.filename}\n" + f"Invoice Page Groups: {invoice_page_groups}\n" f"----------------------" ) return out_str diff --git a/mindee/response.py b/mindee/response.py index 0df48bf7..0d1008b9 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,5 +1,5 @@ -from datetime import datetime import json +from datetime import datetime from typing import Any, Dict, Generic, List, Optional, Union from mindee.documents.base import TypeDocument @@ -41,7 +41,7 @@ def __init__(self, json_response: dict) -> None: self.milli_secs_taken = int( (self.available_at.microsecond - self.issued_at.microsecond) / 1000 ) - + def __str__(self) -> str: return json.dumps(self.__dict__, indent=4, sort_keys=True, default=str) @@ -83,7 +83,6 @@ def __init__( :param http_response: json response from HTTP call """ logger.debug("Handling API response") - self.http_response = http_response self.document_type = doc_config.document_type self.pages = [] @@ -157,7 +156,7 @@ class AsyncPredictResponse(PredictResponse[TypeDocument]): """Name of the input file""" input_mimetype: Optional[str] = None """MIME type of the input file""" - + def __init__( self, http_response: Dict, @@ -165,17 +164,16 @@ def __init__( input_source: Optional[Union[LocalInputSource, UrlInputSource]] = None, response_ok: Optional[bool] = None, ) -> None: - if doc_config and input_source and response_ok is not None: + if ( + doc_config + and input_source + and http_response["job"]["status"] == "completed" + and response_ok is not None + ): super().__init__( http_response=http_response, doc_config=doc_config, input_source=input_source, response_ok=response_ok, ) - self.document = super().document - self.http_response = super().http_response - self.document_type = super().document_type - self.input_path = super().input_path - self.input_filename = super().input_filename - self.input_mimetype = super().input_mimetype self.job = Job(http_response["job"]) From bc2b81678cbba819dd1e4f38f96862ba6f20ca1a Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Wed, 3 May 2023 17:05:22 +0200 Subject: [PATCH 07/30] Fixes for draft PR --- .../invoice_splitter/invoice_splitter_v1.py | 18 +++++++++-- mindee/response.py | 30 +++++++++---------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index 8bd9883b..72ef78ce 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -5,19 +5,33 @@ class PageGroup: + """Page Group class for Invoice splitter.""" + page_indexes: List[int] = [] - confidence: int + """Index of each page""" + confidence: float = 0.0 + """Confidence score""" def __init__(self, prediction: Dict[str, Any]): self.page_indexes = prediction["page_indexes"] - self.confidence = prediction["confidence"] + try: + self.confidence = float(prediction["confidence"]) + except (KeyError, TypeError): + pass def __str__(self) -> str: return f"page indexes: {', '.join([str(page_index) for page_index in self.page_indexes])}" class InvoiceSplitterV1(Document): + """ + Invoice Splitter prediction results. + + Currently uses the API's async endpoints. + """ + invoice_page_groups: List[PageGroup] = [] + """Page groups linked to an invoice.""" def __init__( self, diff --git a/mindee/response.py b/mindee/response.py index 0d1008b9..b7b91f47 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -10,9 +10,9 @@ class Job: """ - Job wrapper for a request sent to the API. + Job class for asynchronous requests. - Only relevant in the case of async work. + Will hold information on the queue a document has been submitted to. """ issued_at: datetime @@ -71,7 +71,7 @@ class PredictResponse(Generic[TypeDocument]): def __init__( self, doc_config: DocumentConfig, - http_response: Dict, + http_response: Dict[str, Any], input_source: Union[LocalInputSource, UrlInputSource], response_ok: bool, ) -> None: @@ -145,25 +145,25 @@ class AsyncPredictResponse(PredictResponse[TypeDocument]): """ job: Job - document: Optional[TypeDocument] = None - http_response: Dict[str, Any] = {} - """Raw HTTP response JSON""" - document_type: Optional[str] = None - """Document type""" - input_path: Optional[str] = None - """Path of the input file""" - input_filename: Optional[str] = None - """Name of the input file""" - input_mimetype: Optional[str] = None - """MIME type of the input file""" + """Job linked to an Async prediction.""" def __init__( self, - http_response: Dict, + http_response: Dict[str, Any], doc_config: Optional[DocumentConfig] = None, input_source: Optional[Union[LocalInputSource, UrlInputSource]] = None, response_ok: Optional[bool] = None, ) -> None: + """ + Container wrapper for a raw API response. + + Inherits and instantiates a normal PredictResponse if the parsing of + the current queue is both requested and done. + + :param doc_config: DocumentConfig + :param input_source: Input object + :param http_response: json response from HTTP call + """ if ( doc_config and input_source From e49c259da2262ea02fb4165aba268447eca4c49c Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Thu, 4 May 2023 12:09:47 +0200 Subject: [PATCH 08/30] added test & fixed InvoiceSplitterV1 class --- mindee/documents/__init__.py | 2 +- .../invoice_splitter/invoice_splitter_v1.py | 16 +++---- tests/documents/test_invoice_splitter_v1.py | 44 +++++++++++++++++++ 3 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 tests/documents/test_invoice_splitter_v1.py diff --git a/mindee/documents/__init__.py b/mindee/documents/__init__.py index 03cfcf6d..a9312eb9 100644 --- a/mindee/documents/__init__.py +++ b/mindee/documents/__init__.py @@ -8,7 +8,6 @@ TypeFinancialV1, ) from mindee.documents.invoice import InvoiceV3, InvoiceV4, TypeInvoiceV3, TypeInvoiceV4 -from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1 from mindee.documents.passport import PassportV1, TypePassportV1 from mindee.documents.proof_of_address import ProofOfAddressV1, TypeProofOfAddressV1 from mindee.documents.receipt import ( @@ -23,3 +22,4 @@ ShippingContainerV1, TypeShippingContainerV1, ) +from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1 diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index 72ef78ce..fd8fac0e 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -36,8 +36,8 @@ class InvoiceSplitterV1(Document): def __init__( self, api_prediction: Dict[str, Any], - input_source: Union[LocalInputSource, UrlInputSource], - page_n: int, + input_source: Optional[Union[LocalInputSource, UrlInputSource]] = None, + page_n: Optional[int] = None, ): super().__init__( input_source=input_source, @@ -45,10 +45,10 @@ def __init__( api_prediction=api_prediction, page_n=page_n, ) - self._build_from_api_prediction(api_prediction["prediction"], page_n=page_n) + self._build_from_api_prediction(api_prediction["prediction"]) def _build_from_api_prediction( - self, api_prediction: TypeApiPrediction, page_n: Optional[int] = None + self, api_prediction: TypeApiPrediction ) -> None: """ Build the object from the prediction API JSON. @@ -67,17 +67,17 @@ def _build_from_api_prediction( def __str__(self) -> str: if len(self.invoice_page_groups) > 0: - invoice_page_groups = f"\n { ' ' * 20 }".join( + invoice_page_groups = "\n " + invoice_page_groups += f"\n{ ' ' * 2 }".join( [str(ivp) for ivp in self.invoice_page_groups] ) - out_str = clean_out_string( + return clean_out_string( f"----- Invoice Splitter V1 -----\n" - f"Filename: {self.filename}\n" + f"Filename: {self.filename or ''}\n" f"Invoice Page Groups: {invoice_page_groups}\n" f"----------------------" ) - return out_str TypeInvoiceSplitterV1 = TypeVar("TypeInvoiceSplitterV1", bound=InvoiceSplitterV1) diff --git a/tests/documents/test_invoice_splitter_v1.py b/tests/documents/test_invoice_splitter_v1.py new file mode 100644 index 00000000..a3d9bd37 --- /dev/null +++ b/tests/documents/test_invoice_splitter_v1.py @@ -0,0 +1,44 @@ +import json + +import pytest + +from mindee.documents import InvoiceSplitterV1 + +INVOICE_SPLITTER_DATA_DIR = "./tests/data/invoice_splitter" + +FILE_PATH_INVOICE_SPLITTER_V1_COMPLETE = ( + f"{ INVOICE_SPLITTER_DATA_DIR }/response_v1/complete.json" +) +FILE_PATH_INVOICE_SPLITTER_V1_EMPTY = ( + f"{ INVOICE_SPLITTER_DATA_DIR }/response_v1/empty.json" +) + + +@pytest.fixture +def invoice_splitter_v1_doc() -> InvoiceSplitterV1: + json_data = json.load(open(FILE_PATH_INVOICE_SPLITTER_V1_COMPLETE)) + return InvoiceSplitterV1(json_data["document"]["inference"], page_n=None) + + +@pytest.fixture +def invoice_splitter_v1_doc_empty() -> InvoiceSplitterV1: + json_data = json.load(open(FILE_PATH_INVOICE_SPLITTER_V1_EMPTY)) + return InvoiceSplitterV1(json_data["document"]["inference"], page_n=None) + + +@pytest.fixture +def invoice_splitter_v1_page_object(): + json_data = json.load(open(FILE_PATH_INVOICE_SPLITTER_V1_COMPLETE)) + return InvoiceSplitterV1(json_data["document"]["inference"]["pages"][0], page_n=0) + +@pytest.fixture +def invoice_splitter_v1_doc_object(): + json_data = json.load(open(FILE_PATH_INVOICE_SPLITTER_V1_COMPLETE)) + return InvoiceSplitterV1(json_data["document"]["inference"], page_n=0) + + +def test_doc_constructor(invoice_splitter_v1_doc): + file_path = f"{ INVOICE_SPLITTER_DATA_DIR }/response_v1/doc_to_string.txt" + reference_str = open(file_path, "r", encoding="utf-8").read().strip() + assert str(invoice_splitter_v1_doc) == reference_str + From 61a0d1ff58a5c001b62b76fdb9034e3a99d58683 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Thu, 4 May 2023 14:50:07 +0200 Subject: [PATCH 09/30] InvoiceSplitterV1 class cleanup --- mindee/documents/__init__.py | 2 +- mindee/documents/invoice_splitter/invoice_splitter_v1.py | 9 ++++----- tests/documents/test_invoice_splitter_v1.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mindee/documents/__init__.py b/mindee/documents/__init__.py index a9312eb9..03cfcf6d 100644 --- a/mindee/documents/__init__.py +++ b/mindee/documents/__init__.py @@ -8,6 +8,7 @@ TypeFinancialV1, ) from mindee.documents.invoice import InvoiceV3, InvoiceV4, TypeInvoiceV3, TypeInvoiceV4 +from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1 from mindee.documents.passport import PassportV1, TypePassportV1 from mindee.documents.proof_of_address import ProofOfAddressV1, TypeProofOfAddressV1 from mindee.documents.receipt import ( @@ -22,4 +23,3 @@ ShippingContainerV1, TypeShippingContainerV1, ) -from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1 diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index fd8fac0e..79242843 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -1,7 +1,6 @@ -from typing import Any, Dict, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar from mindee.documents.base import Document, TypeApiPrediction, clean_out_string -from mindee.input.sources import LocalInputSource, UrlInputSource class PageGroup: @@ -35,8 +34,8 @@ class InvoiceSplitterV1(Document): def __init__( self, - api_prediction: Dict[str, Any], - input_source: Optional[Union[LocalInputSource, UrlInputSource]] = None, + api_prediction: TypeApiPrediction, + input_source=None, page_n: Optional[int] = None, ): super().__init__( @@ -48,7 +47,7 @@ def __init__( self._build_from_api_prediction(api_prediction["prediction"]) def _build_from_api_prediction( - self, api_prediction: TypeApiPrediction + self, api_prediction: TypeApiPrediction, page_n: Optional[int] = None ) -> None: """ Build the object from the prediction API JSON. diff --git a/tests/documents/test_invoice_splitter_v1.py b/tests/documents/test_invoice_splitter_v1.py index a3d9bd37..07b932a8 100644 --- a/tests/documents/test_invoice_splitter_v1.py +++ b/tests/documents/test_invoice_splitter_v1.py @@ -31,6 +31,7 @@ def invoice_splitter_v1_page_object(): json_data = json.load(open(FILE_PATH_INVOICE_SPLITTER_V1_COMPLETE)) return InvoiceSplitterV1(json_data["document"]["inference"]["pages"][0], page_n=0) + @pytest.fixture def invoice_splitter_v1_doc_object(): json_data = json.load(open(FILE_PATH_INVOICE_SPLITTER_V1_COMPLETE)) @@ -41,4 +42,3 @@ def test_doc_constructor(invoice_splitter_v1_doc): file_path = f"{ INVOICE_SPLITTER_DATA_DIR }/response_v1/doc_to_string.txt" reference_str = open(file_path, "r", encoding="utf-8").read().strip() assert str(invoice_splitter_v1_doc) == reference_str - From 8c5579c8e23c0472d238c22fe7ebb605c4273272 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Thu, 4 May 2023 15:17:07 +0200 Subject: [PATCH 10/30] Clarified DocString & added documentation support --- docs/client.rst | 7 +++++++ mindee/__init__.py | 2 +- mindee/response.py | 8 ++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/client.rst b/docs/client.rst index 3c8c9f50..42c26524 100644 --- a/docs/client.rst +++ b/docs/client.rst @@ -21,3 +21,10 @@ PredictResponse --------------- .. autoclass:: mindee.response.PredictResponse :members: + +AsyncPredictResponse +-------------------- +.. autoclass:: mindee.response.AsyncPredictResponse + :members: +.. autoclass:: mindee.response.Job + :members: \ No newline at end of file diff --git a/mindee/__init__.py b/mindee/__init__.py index 6d67d4a2..16f1dc14 100644 --- a/mindee/__init__.py +++ b/mindee/__init__.py @@ -1,2 +1,2 @@ from mindee.client import Client, PageOptions -from mindee.response import PredictResponse +from mindee.response import AsyncPredictResponse, Job, PredictResponse diff --git a/mindee/response.py b/mindee/response.py index b7b91f47..8e17dfb0 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -15,12 +15,12 @@ class Job: Will hold information on the queue a document has been submitted to. """ + job_id: Optional[str] = None + """ID of the job sent by the API in response to an enqueue request.""" issued_at: datetime """Timestamp of the request reception by the API.""" available_at: Optional[datetime] = None """Timestamp of the request after it has been completed.""" - job_id: Optional[str] = None - """ID of the job.""" status: Optional[str] = None """Status of the request, as seen by the API.""" milli_secs_taken: int # Check if that one is fine as a simple int @@ -141,11 +141,11 @@ class AsyncPredictResponse(PredictResponse[TypeDocument]): """ Async Response Wrapper class for a Predict response. - Since Optional inheritance is convoluted, this is used as a simple wrapper for PredictResponse. + Links a Job to a PredictResponse. """ job: Job - """Job linked to an Async prediction.""" + """Job object link to the prediction. As long as it isn't complete, the prediction doesn't exist.""" def __init__( self, From e969c28050011cb0b4fcf293b48fee8593f14360 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Thu, 4 May 2023 15:18:29 +0200 Subject: [PATCH 11/30] Clarified DocString & added documentation support Clarified Docstring --- mindee/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindee/response.py b/mindee/response.py index 8e17dfb0..c199a85d 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -141,7 +141,7 @@ class AsyncPredictResponse(PredictResponse[TypeDocument]): """ Async Response Wrapper class for a Predict response. - Links a Job to a PredictResponse. + Links a Job to a future PredictResponse. """ job: Job From a9c21f4d7fb5318224bd998e5476e0ac7c13c95d Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Thu, 4 May 2023 18:41:19 +0200 Subject: [PATCH 12/30] Added unit testint for Async & fixed some obsolete AsyncPrediction notations --- mindee/client.py | 105 +++++------------ mindee/documents/config.py | 2 +- .../invoice_splitter/invoice_splitter_v1.py | 6 +- mindee/endpoints.py | 48 +++----- mindee/response.py | 28 ++--- tests/test_async_response_post.py | 109 ++++++++++++++++++ 6 files changed, 172 insertions(+), 126 deletions(-) create mode 100644 tests/test_async_response_post.py diff --git a/mindee/client.py b/mindee/client.py index 8f96388f..d5dd2c7e 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -84,31 +84,7 @@ def parse( logger.debug("Parsing document as '%s'", endpoint_name) - found = [] - for k in self.doc_configs.keys(): - if k[1] == endpoint_name: - found.append(k) - - if len(found) == 0: - raise RuntimeError(f"Document type not configured: {endpoint_name}") - - if account_name: - config_key = (account_name, endpoint_name) - elif len(found) == 1: - config_key = found[0] - else: - usernames = [k[0] for k in found] - raise RuntimeError( - ( - "Duplicate configuration detected.\n" - f"You specified a document_type '{endpoint_name}' in your custom config.\n" - "To avoid confusion, please add the 'account_name' attribute to " - f"the parse method, one of {usernames}." - ) - ) - - doc_config = self.doc_configs[config_key] - doc_config.check_api_keys() + doc_config = self._check_config(endpoint_name, account_name) if not isinstance(self.input_doc, UrlInputSource): if page_options and self.input_doc.is_pdf(): self.input_doc.process_pdf( @@ -166,31 +142,7 @@ def enqueue( logger.debug("Enqueuing document as '%s'", endpoint_name) - found = [] - for k in self.doc_configs.keys(): - if k[1] == endpoint_name: - found.append(k) - - if len(found) == 0: - raise RuntimeError(f"Document type not configured: {endpoint_name}") - - if account_name: - config_key = (account_name, endpoint_name) - elif len(found) == 1: - config_key = found[0] - else: - usernames = [k[0] for k in found] - raise RuntimeError( - ( - "Duplicate configuration detected.\n" - f"You specified a document_type '{endpoint_name}' in your custom config.\n" - "To avoid confusion, please add the 'account_name' attribute to " - f"the parse method, one of {usernames}." - ) - ) - - doc_config = self.doc_configs[config_key] - doc_config.check_api_keys() + doc_config = self._check_config(endpoint_name, account_name) if not isinstance(self.input_doc, UrlInputSource): if page_options and self.input_doc.is_pdf(): self.input_doc.process_pdf( @@ -226,30 +178,9 @@ def parse_queued( f"endpoint_name is required when using {bound_classname} class" ) - found = [] - for k in self.doc_configs.keys(): - if k[1] == endpoint_name: - found.append(k) + logger.debug("Fetching queued document as '%s'", endpoint_name) - if len(found) == 0: - raise RuntimeError(f"Document type not configured: {endpoint_name}") - - if account_name: - config_key = (account_name, endpoint_name) - elif len(found) == 1: - config_key = found[0] - else: - usernames = [k[0] for k in found] - raise RuntimeError( - ( - "Duplicate configuration detected.\n" - f"You specified a document_type '{endpoint_name}' in your custom config.\n" - "To avoid confusion, please add the 'account_name' attribute to " - f"the parse method, one of {usernames}." - ) - ) - doc_config = self.doc_configs[config_key] - doc_config.check_api_keys() + doc_config = self._check_config(endpoint_name, account_name) return self._get_queued_document(doc_config, queue_id) @@ -352,6 +283,34 @@ def close(self) -> None: if not isinstance(self.input_doc, UrlInputSource): self.input_doc.file_object.close() + def _check_config(self, endpoint_name, account_name) -> DocumentConfig: + found = [] + for k in self.doc_configs.keys(): + if k[1] == endpoint_name: + found.append(k) + + if len(found) == 0: + raise RuntimeError(f"Document type not configured: {endpoint_name}") + + if account_name: + config_key = (account_name, endpoint_name) + elif len(found) == 1: + config_key = found[0] + else: + usernames = [k[0] for k in found] + raise RuntimeError( + ( + "Duplicate configuration detected.\n" + f"You specified a document_type '{endpoint_name}' in your custom config.\n" + "To avoid confusion, please add the 'account_name' attribute to " + f"the parse method, one of {usernames}." + ) + ) + + doc_config = self.doc_configs[config_key] + doc_config.check_api_keys() + return doc_config + class ConfigSpec(NamedTuple): doc_class: Type[Document] diff --git a/mindee/documents/config.py b/mindee/documents/config.py index e03f9d0c..946e2738 100644 --- a/mindee/documents/config.py +++ b/mindee/documents/config.py @@ -27,7 +27,7 @@ def check_api_keys(self) -> None: if not endpoint.api_key: raise RuntimeError( ( - f"Missing API key for '{endpoint.url_name} v{endpoint.version.strip()}'," + f"Missing API key for '{endpoint.url_name} v{endpoint.version}'," " check your Client configuration.\n" "You can set this using the " f"'{API_KEY_ENV_NAME}' environment variable." diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index 79242843..2279d729 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -23,11 +23,7 @@ def __str__(self) -> str: class InvoiceSplitterV1(Document): - """ - Invoice Splitter prediction results. - - Currently uses the API's async endpoints. - """ + """Invoice Splitter v1 prediction results.""" invoice_page_groups: List[PageGroup] = [] """Page groups linked to an invoice.""" diff --git a/mindee/endpoints.py b/mindee/endpoints.py index ef23398f..0d5c7cde 100644 --- a/mindee/endpoints.py +++ b/mindee/endpoints.py @@ -111,34 +111,9 @@ def predict_req_post( :param cropper: Including Mindee cropping results. :return: requests response """ - data = {} - if include_words: - data["include_mvision"] = "true" - - params = {} - if cropper: - params["cropper"] = "true" - - if isinstance(input_source, UrlInputSource): - data["document"] = input_source.url - response = requests.post( - f"{self._url_root}/predict", - headers=self.base_headers, - data=data, - params=params, - timeout=self._request_timeout, - ) - else: - files = {"document": input_source.read_contents(close_file)} - response = requests.post( - f"{self._url_root}/predict", - files=files, - headers=self.base_headers, - data=data, - params=params, - timeout=self._request_timeout, - ) - return response + return self._custom_request( + "predict", input_source, include_words, close_file, cropper + ) def predict_async_req_post( self, @@ -156,6 +131,18 @@ def predict_async_req_post( :param cropper: Including Mindee cropping results. :return: requests response """ + return self._custom_request( + "predict_async", input_source, include_words, close_file, cropper + ) + + def _custom_request( + self, + route: str, + input_source: Union[LocalInputSource, UrlInputSource], + include_words: bool = False, + close_file: bool = True, + cropper: bool = False, + ): data = {} if include_words: data["include_mvision"] = "true" @@ -167,7 +154,7 @@ def predict_async_req_post( if isinstance(input_source, UrlInputSource): data["document"] = input_source.url response = requests.post( - f"{self._url_root}/predict_async", + f"{self._url_root}/{route}", headers=self.base_headers, data=data, params=params, @@ -176,13 +163,14 @@ def predict_async_req_post( else: files = {"document": input_source.read_contents(close_file)} response = requests.post( - f"{self._url_root}/predict_async", + f"{self._url_root}/{route}", files=files, headers=self.base_headers, data=data, params=params, timeout=self._request_timeout, ) + return response def document_queue_req_get(self, queue_id: str) -> requests.Response: diff --git a/mindee/response.py b/mindee/response.py index c199a85d..8bbabc6e 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -23,7 +23,7 @@ class Job: """Timestamp of the request after it has been completed.""" status: Optional[str] = None """Status of the request, as seen by the API.""" - milli_secs_taken: int # Check if that one is fine as a simple int + millisecs_taken: int """Time (ms) taken for the request to be processed by the API.""" def __init__(self, json_response: dict) -> None: @@ -38,7 +38,7 @@ def __init__(self, json_response: dict) -> None: self.job_id = json_response.get("id") self.status = json_response.get("status") if self.available_at: - self.milli_secs_taken = int( + self.millisecs_taken = int( (self.available_at.microsecond - self.issued_at.microsecond) / 1000 ) @@ -150,9 +150,9 @@ class AsyncPredictResponse(PredictResponse[TypeDocument]): def __init__( self, http_response: Dict[str, Any], - doc_config: Optional[DocumentConfig] = None, - input_source: Optional[Union[LocalInputSource, UrlInputSource]] = None, - response_ok: Optional[bool] = None, + doc_config: DocumentConfig, + input_source: Union[LocalInputSource, UrlInputSource], + response_ok: bool, ) -> None: """ Container wrapper for a raw API response. @@ -164,16 +164,10 @@ def __init__( :param input_source: Input object :param http_response: json response from HTTP call """ - if ( - doc_config - and input_source - and http_response["job"]["status"] == "completed" - and response_ok is not None - ): - super().__init__( - http_response=http_response, - doc_config=doc_config, - input_source=input_source, - response_ok=response_ok, - ) + super().__init__( + http_response=http_response, + doc_config=doc_config, + input_source=input_source, + response_ok=response_ok and http_response["job"]["status"] == "completed", + ) self.job = Job(http_response["job"]) diff --git a/tests/test_async_response_post.py b/tests/test_async_response_post.py new file mode 100644 index 00000000..73000bb6 --- /dev/null +++ b/tests/test_async_response_post.py @@ -0,0 +1,109 @@ +import json +from datetime import datetime + +import pytest + +from mindee import Client +from mindee.documents.base import Document +from mindee.documents.invoice_splitter import InvoiceSplitterV1 +from mindee.endpoints import OTS_OWNER +from mindee.input.sources import PathInput +from mindee.response import AsyncPredictResponse + +ASYNC_DIR = "./tests/data/async" + +FILE_PATH_POST_SUCCESS = f"{ ASYNC_DIR }/post_success.json" +FILE_PATH_POST_FAIL = f"{ ASYNC_DIR }/post_fail_forbidden.json" +FILE_PATH_GET_PROCESSING = f"{ ASYNC_DIR }/get_processing.json" +FILE_PATH_GET_COMPLETED = f"{ ASYNC_DIR }/get_completed.json" + + +@pytest.fixture +def dummy_file_input(): + file_input = PathInput("./tests/data/invoice_splitter/2_invoices.pdf") + return file_input + + +@pytest.fixture +def dummy_config(): + client = Client(api_key="dummy").add_endpoint( + endpoint_name="dummy", + account_name="dummy", + ) + return client._doc_configs + + +def test_constructor(dummy_file_input): + with pytest.raises(KeyError): + Document( + dummy_file_input, + document_type="invoice_splitter", + api_prediction={}, + page_n=0, + ) + + +def test_async_response_post_success(dummy_file_input, dummy_config): + response = json.load(open(FILE_PATH_POST_SUCCESS)) + parsed_response = AsyncPredictResponse[InvoiceSplitterV1]( + doc_config=dummy_config[(OTS_OWNER, InvoiceSplitterV1.__name__)], + http_response=response, + input_source=dummy_file_input, + response_ok=True, + ) + assert parsed_response.job is not None + assert ( + parsed_response.job.issued_at.isoformat() == "2023-02-16T12:33:49.602947+00:00" + ) + assert parsed_response.job.available_at is None + assert parsed_response.job.status == "waiting" + assert parsed_response.job.job_id == "76c90710-3a1b-4b91-8a39-31a6543e347c" + assert not parsed_response.http_response["api_request"]["error"] + + +def test_async_response_post_fail(dummy_file_input, dummy_config): + response = json.load(open(FILE_PATH_POST_FAIL)) + parsed_response = AsyncPredictResponse[InvoiceSplitterV1]( + doc_config=dummy_config[(OTS_OWNER, InvoiceSplitterV1.__name__)], + http_response=response, + input_source=dummy_file_input, + response_ok=True, + ) + assert parsed_response.job is not None + assert parsed_response.job.issued_at.isoformat() == "2023-01-01T00:00:00+00:00" + assert parsed_response.job.available_at is None + assert parsed_response.job.status is None + assert parsed_response.job.job_id is None + assert parsed_response.http_response["api_request"]["error"] + assert parsed_response.http_response["api_request"]["error"]["code"] == "Forbidden" + + +def test_async_get_processing(dummy_file_input, dummy_config): + response = json.load(open(FILE_PATH_GET_PROCESSING)) + parsed_response = AsyncPredictResponse[InvoiceSplitterV1]( + doc_config=dummy_config[(OTS_OWNER, InvoiceSplitterV1.__name__)], + http_response=response, + input_source=dummy_file_input, + response_ok=True, + ) + assert parsed_response.job is not None + assert parsed_response.job.issued_at.isoformat() == "2023-03-16T12:33:49.602947" + assert parsed_response.job.available_at is None + assert parsed_response.job.status == "processing" + assert parsed_response.job.job_id == "76c90710-3a1b-4b91-8a39-31a6543e347c" + assert not parsed_response.http_response["api_request"]["error"] + + +def test_async_response_get_completed(dummy_file_input, dummy_config): + response = json.load(open(FILE_PATH_GET_COMPLETED)) + parsed_response = AsyncPredictResponse[InvoiceSplitterV1]( + doc_config=dummy_config[(OTS_OWNER, InvoiceSplitterV1.__name__)], + http_response=response, + input_source=dummy_file_input, + response_ok=True, + ) + assert parsed_response.job is not None + assert parsed_response.job.issued_at.isoformat() == "2023-03-21T13:52:56.326107" + assert parsed_response.job.available_at.isoformat() == "2023-03-21T13:53:00.990339" + assert parsed_response.job.status == "completed" + assert parsed_response.http_response["api_request"]["error"] == {} From 2daaf182be302bb61764b0b0643dc98e1e2c0a62 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Thu, 4 May 2023 18:45:02 +0200 Subject: [PATCH 13/30] renamed async unittest file to avoid confusion --- tests/{test_async_response_post.py => test_async_response.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_async_response_post.py => test_async_response.py} (100%) diff --git a/tests/test_async_response_post.py b/tests/test_async_response.py similarity index 100% rename from tests/test_async_response_post.py rename to tests/test_async_response.py From 09ac89c4c58c34b24806417e0207f20f29b416d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 5 May 2023 10:22:01 +0200 Subject: [PATCH 14/30] put api tests together --- tests/{ => api}/test_async_response.py | 1 - tests/{ => api}/test_response.py | 0 2 files changed, 1 deletion(-) rename tests/{ => api}/test_async_response.py (99%) rename tests/{ => api}/test_response.py (100%) diff --git a/tests/test_async_response.py b/tests/api/test_async_response.py similarity index 99% rename from tests/test_async_response.py rename to tests/api/test_async_response.py index 73000bb6..c4eaebac 100644 --- a/tests/test_async_response.py +++ b/tests/api/test_async_response.py @@ -1,5 +1,4 @@ import json -from datetime import datetime import pytest diff --git a/tests/test_response.py b/tests/api/test_response.py similarity index 100% rename from tests/test_response.py rename to tests/api/test_response.py From 80df96ef3f9a914e1de0d01920522949418e5940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 5 May 2023 10:40:19 +0200 Subject: [PATCH 15/30] rework a bit the structure --- mindee/response.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mindee/response.py b/mindee/response.py index 8bbabc6e..8b0a79ef 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,6 +1,6 @@ import json from datetime import datetime -from typing import Any, Dict, Generic, List, Optional, Union +from typing import Any, Dict, Generic, List, Literal, Optional, Union from mindee.documents.base import TypeDocument from mindee.documents.config import DocumentConfig @@ -8,6 +8,18 @@ from mindee.logger import logger +class ApiRequest: + error: Dict[str, Any] + resources: List[str] + status: Literal["failure", "success"] + status_code: int + """HTTP status code.""" + url: str + + def __init__(self, json_response: dict) -> None: + self.url = json_response["url"] + + class Job: """ Job class for asynchronous requests. @@ -137,15 +149,17 @@ def _load_response( ) -class AsyncPredictResponse(PredictResponse[TypeDocument]): +class AsyncPredictResponse(Generic[TypeDocument]): """ Async Response Wrapper class for a Predict response. Links a Job to a future PredictResponse. """ + api_request: ApiRequest job: Job """Job object link to the prediction. As long as it isn't complete, the prediction doesn't exist.""" + document: PredictResponse[TypeDocument] def __init__( self, @@ -164,10 +178,11 @@ def __init__( :param input_source: Input object :param http_response: json response from HTTP call """ - super().__init__( + self.document = PredictResponse[TypeDocument]( http_response=http_response, doc_config=doc_config, input_source=input_source, response_ok=response_ok and http_response["job"]["status"] == "completed", ) self.job = Job(http_response["job"]) + self.api_request = ApiRequest(http_response["api_request"]) From 7920f6bd58523caa3ad23204a010d4a3be74e89a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 5 May 2023 10:45:55 +0200 Subject: [PATCH 16/30] add empty files for docs --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 0 docs/predictions/standard/documents/invoice_splitter_v1.rst | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/extras/code_samples/invoice_splitter_v1_async.txt create mode 100644 docs/predictions/standard/documents/invoice_splitter_v1.rst diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt new file mode 100644 index 00000000..e69de29b diff --git a/docs/predictions/standard/documents/invoice_splitter_v1.rst b/docs/predictions/standard/documents/invoice_splitter_v1.rst new file mode 100644 index 00000000..e69de29b From 8aaf753d195562312767c5a616cf308ba031334e Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:20:11 +0200 Subject: [PATCH 17/30] added doc support for async, refactored AsyncPredict class to fit with other sdks & updated unittest --- .../invoice_splitter_v1_async.txt | 22 +++++++++++++++++++ .../documents/invoice_splitter_v1.rst | 10 +++++++++ mindee/response.py | 4 ++++ tests/api/test_async_response.py | 10 ++++----- 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index e69de29b..e9c266f3 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -0,0 +1,22 @@ +from mindee import Client, documents +from time import wait + +# Init a new client +mindee_client = Client(api_key="my-api-key") + +# Load a file from disk +input_doc = mindee_client.doc_from_path("/path/to/the/file.ext") + +# Queue the async Document by passing the appropriate type (e.g. InvoiceSplitterV1) +queue_result = input_doc.enqueue(documents.InvoiceSplitterV1) + +# Wait for the result to be ready (will take a few seconds) +wait(10) + +# Fetch and parse the result, using the same type +parsed_result = input_doc.parse_queued(documents.InvoiceSplitterV1) + +# Check whether the result is ready +if parsed_result.status="completed" + # Print a brief summary of the parsed data + print(result.document) \ No newline at end of file diff --git a/docs/predictions/standard/documents/invoice_splitter_v1.rst b/docs/predictions/standard/documents/invoice_splitter_v1.rst index e69de29b..fedee953 100644 --- a/docs/predictions/standard/documents/invoice_splitter_v1.rst +++ b/docs/predictions/standard/documents/invoice_splitter_v1.rst @@ -0,0 +1,10 @@ +Invoice Splitter V1 +------------------- + +**Sample Code:** + +.. literalinclude:: /extras/code_samples/invoice_splitter_v1_async.txt + :language: Python + +.. autoclass:: mindee.documents.InvoiceSplitterV1 + :members: \ No newline at end of file diff --git a/mindee/response.py b/mindee/response.py index 8b0a79ef..39c20ffe 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -18,6 +18,10 @@ class ApiRequest: def __init__(self, json_response: dict) -> None: self.url = json_response["url"] + self.error = json_response["error"] + self.resources = json_response["resources"] + self.status = json_response["status"] + self.status_code = json_response["status_code"] class Job: diff --git a/tests/api/test_async_response.py b/tests/api/test_async_response.py index c4eaebac..41e5106d 100644 --- a/tests/api/test_async_response.py +++ b/tests/api/test_async_response.py @@ -57,7 +57,7 @@ def test_async_response_post_success(dummy_file_input, dummy_config): assert parsed_response.job.available_at is None assert parsed_response.job.status == "waiting" assert parsed_response.job.job_id == "76c90710-3a1b-4b91-8a39-31a6543e347c" - assert not parsed_response.http_response["api_request"]["error"] + assert not parsed_response.api_request.error def test_async_response_post_fail(dummy_file_input, dummy_config): @@ -73,8 +73,8 @@ def test_async_response_post_fail(dummy_file_input, dummy_config): assert parsed_response.job.available_at is None assert parsed_response.job.status is None assert parsed_response.job.job_id is None - assert parsed_response.http_response["api_request"]["error"] - assert parsed_response.http_response["api_request"]["error"]["code"] == "Forbidden" + assert parsed_response.api_request.error + assert parsed_response.api_request.error["code"] == "Forbidden" def test_async_get_processing(dummy_file_input, dummy_config): @@ -90,7 +90,7 @@ def test_async_get_processing(dummy_file_input, dummy_config): assert parsed_response.job.available_at is None assert parsed_response.job.status == "processing" assert parsed_response.job.job_id == "76c90710-3a1b-4b91-8a39-31a6543e347c" - assert not parsed_response.http_response["api_request"]["error"] + assert not parsed_response.api_request.error def test_async_response_get_completed(dummy_file_input, dummy_config): @@ -105,4 +105,4 @@ def test_async_response_get_completed(dummy_file_input, dummy_config): assert parsed_response.job.issued_at.isoformat() == "2023-03-21T13:52:56.326107" assert parsed_response.job.available_at.isoformat() == "2023-03-21T13:53:00.990339" assert parsed_response.job.status == "completed" - assert parsed_response.http_response["api_request"]["error"] == {} + assert parsed_response.api_request.error == {} From 0fa7c37ec86a00aa5700dc762e7c429364eaad77 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:25:14 +0200 Subject: [PATCH 18/30] tweaked import for retrocompatibility --- mindee/response.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mindee/response.py b/mindee/response.py index 39c20ffe..343a2532 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,6 +1,8 @@ import json from datetime import datetime -from typing import Any, Dict, Generic, List, Literal, Optional, Union +from typing import Any, Dict, Generic, List, Optional, Union + +from typing_extensions import Literal from mindee.documents.base import TypeDocument from mindee.documents.config import DocumentConfig From 9c8c361c3ebe6bb7238433ead82f9752312f8031 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:33:05 +0200 Subject: [PATCH 19/30] retrocompatibility tweak --- mindee/response.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mindee/response.py b/mindee/response.py index 343a2532..6acc9b29 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,19 +1,23 @@ import json from datetime import datetime +from enum import Enum from typing import Any, Dict, Generic, List, Optional, Union -from typing_extensions import Literal - from mindee.documents.base import TypeDocument from mindee.documents.config import DocumentConfig from mindee.input.sources import LocalInputSource, UrlInputSource from mindee.logger import logger +class RequestStatus(Enum): + FAILURE = "failure" + SUCCESS = "success" + + class ApiRequest: error: Dict[str, Any] resources: List[str] - status: Literal["failure", "success"] + status: RequestStatus status_code: int """HTTP status code.""" url: str From fbe2e4559c626a0f9d1fc0822e3d1bd0d48fe3ac Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:36:42 +0200 Subject: [PATCH 20/30] further tweaking --- mindee/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindee/response.py b/mindee/response.py index 6acc9b29..7968441d 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -26,7 +26,7 @@ def __init__(self, json_response: dict) -> None: self.url = json_response["url"] self.error = json_response["error"] self.resources = json_response["resources"] - self.status = json_response["status"] + self.status = RequestStatus(json_response["status"]) self.status_code = json_response["status_code"] From c37e8ec3e9422ea7d4866ad59ae9347095097bd3 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:39:37 +0200 Subject: [PATCH 21/30] fixed typo --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index e9c266f3..c56fb0a1 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -17,6 +17,6 @@ wait(10) parsed_result = input_doc.parse_queued(documents.InvoiceSplitterV1) # Check whether the result is ready -if parsed_result.status="completed" +if parsed_result.status=="completed" # Print a brief summary of the parsed data print(result.document) \ No newline at end of file From 0d7347aebe270351c29c4868005630c1677d380b Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:41:03 +0200 Subject: [PATCH 22/30] fixed typo... again... --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index c56fb0a1..3c7cb7c4 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -17,6 +17,6 @@ wait(10) parsed_result = input_doc.parse_queued(documents.InvoiceSplitterV1) # Check whether the result is ready -if parsed_result.status=="completed" +if parsed_result.status=="completed": # Print a brief summary of the parsed data print(result.document) \ No newline at end of file From 497d64721f518d98ec9cafedf7d52e18038650d4 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 11:44:15 +0200 Subject: [PATCH 23/30] fixed txt test, again. --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index 3c7cb7c4..58d3032f 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -1,5 +1,5 @@ from mindee import Client, documents -from time import wait +from time import sleep # Init a new client mindee_client = Client(api_key="my-api-key") @@ -11,7 +11,7 @@ input_doc = mindee_client.doc_from_path("/path/to/the/file.ext") queue_result = input_doc.enqueue(documents.InvoiceSplitterV1) # Wait for the result to be ready (will take a few seconds) -wait(10) +sleep(10) # Fetch and parse the result, using the same type parsed_result = input_doc.parse_queued(documents.InvoiceSplitterV1) From f5798528b13dad6ad8309e14a73914c744ae30d2 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 12:40:02 +0200 Subject: [PATCH 24/30] fixed code sample for invoicesplitter --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 9 ++++++--- mindee/documents/invoice_splitter/invoice_splitter_v1.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index 58d3032f..cb5e3380 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -8,15 +8,18 @@ mindee_client = Client(api_key="my-api-key") input_doc = mindee_client.doc_from_path("/path/to/the/file.ext") # Queue the async Document by passing the appropriate type (e.g. InvoiceSplitterV1) -queue_result = input_doc.enqueue(documents.InvoiceSplitterV1) +queue_result = input_doc.enqueue(documents.TypeInvoiceSplitterV1) + +# Get the id of the queue (job) +queue_id = queue_result.job.job_id # Wait for the result to be ready (will take a few seconds) sleep(10) # Fetch and parse the result, using the same type -parsed_result = input_doc.parse_queued(documents.InvoiceSplitterV1) +parsed_result = input_doc.parse_queued(documents.TypeInvoiceSplitterV1, queue_id) # Check whether the result is ready -if parsed_result.status=="completed": +if parsed_result.api_request.status=="completed": # Print a brief summary of the parsed data print(result.document) \ No newline at end of file diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index 2279d729..1eda1a93 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -30,7 +30,7 @@ class InvoiceSplitterV1(Document): def __init__( self, - api_prediction: TypeApiPrediction, + api_prediction: TypeApiPrediction=None, input_source=None, page_n: Optional[int] = None, ): From 2b46983bce138e3d3e1ee02489011d58ce63ebba Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 12:42:44 +0200 Subject: [PATCH 25/30] fixed typing in invoicesplitter --- mindee/documents/invoice_splitter/invoice_splitter_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindee/documents/invoice_splitter/invoice_splitter_v1.py b/mindee/documents/invoice_splitter/invoice_splitter_v1.py index 1eda1a93..2279d729 100644 --- a/mindee/documents/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/documents/invoice_splitter/invoice_splitter_v1.py @@ -30,7 +30,7 @@ class InvoiceSplitterV1(Document): def __init__( self, - api_prediction: TypeApiPrediction=None, + api_prediction: TypeApiPrediction, input_source=None, page_n: Optional[int] = None, ): From 5e38352686ee0e4e5436a4360f28808144fdc4e5 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 16:10:14 +0200 Subject: [PATCH 26/30] revamped sample code --- .../invoice_splitter_v1_async.txt | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index cb5e3380..cc3059f1 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -1,5 +1,6 @@ from mindee import Client, documents from time import sleep +import os # Init a new client mindee_client = Client(api_key="my-api-key") @@ -7,19 +8,41 @@ mindee_client = Client(api_key="my-api-key") # Load a file from disk input_doc = mindee_client.doc_from_path("/path/to/the/file.ext") -# Queue the async Document by passing the appropriate type (e.g. InvoiceSplitterV1) -queue_result = input_doc.enqueue(documents.TypeInvoiceSplitterV1) +# Put the document class in a local variable to keep the code DRY + +doc_class = documents.TypeInvoiceSplitterV1 + +# Limit the amount of API calls to retrieve your document +MAX_RETRIES = 10 + +# How many seconds to wait in-between tries +INTERVAL_SECS = 6 + +# Counter to keep track of how many times we try to retrieve the document +times_tried = 1 + + +queue_result = input_doc.enqueue(doc_class) # Get the id of the queue (job) queue_id = queue_result.job.job_id -# Wait for the result to be ready (will take a few seconds) -sleep(10) - -# Fetch and parse the result, using the same type -parsed_result = input_doc.parse_queued(documents.TypeInvoiceSplitterV1, queue_id) +# Recursive function that tries to retrieve the completed document. +# If the document is not "complete", try again +def get_doc_from_async_queue(queue_id, times_tried=0): + sleep(1) + + if times_tried >= MAX_RETRIES: + raise Exception(f"Maximum retries reached {times_tried}") + # Fetch and parse the result, using the same type + parsed_result = input_doc.parse_queued(doc_class, queue_id) + # Check whether the result is ready + if parsed_result.job.status=="completed": + # Print a brief summary of the parsed data + print(parsed_result.document.document) + return + # Otherwise, try again + else: + get_doc_from_async_queue(queue_id, times_tried+1) -# Check whether the result is ready -if parsed_result.api_request.status=="completed": - # Print a brief summary of the parsed data - print(result.document) \ No newline at end of file +get_doc_from_async_queue(queue_id) \ No newline at end of file From d9b6ffbebbf993571efb99e8f8e8eed26b6f95c2 Mon Sep 17 00:00:00 2001 From: Sebastian <130448732+seboliveramindee@users.noreply.github.com> Date: Fri, 5 May 2023 16:19:23 +0200 Subject: [PATCH 27/30] Update docs/extras/code_samples/invoice_splitter_v1_async.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: ianaré <97107275+ianardee@users.noreply.github.com> --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index cc3059f1..cc208b7d 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -37,7 +37,7 @@ def get_doc_from_async_queue(queue_id, times_tried=0): # Fetch and parse the result, using the same type parsed_result = input_doc.parse_queued(doc_class, queue_id) # Check whether the result is ready - if parsed_result.job.status=="completed": + if parsed_result.job.status == "completed": # Print a brief summary of the parsed data print(parsed_result.document.document) return From 81191847a631049247965f4d2d0fac9a76b1b6e5 Mon Sep 17 00:00:00 2001 From: Sebastian <130448732+seboliveramindee@users.noreply.github.com> Date: Fri, 5 May 2023 16:20:17 +0200 Subject: [PATCH 28/30] Update docs/extras/code_samples/invoice_splitter_v1_async.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: ianaré <97107275+ianardee@users.noreply.github.com> --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index cc208b7d..398af7b8 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -30,7 +30,7 @@ queue_id = queue_result.job.job_id # Recursive function that tries to retrieve the completed document. # If the document is not "complete", try again def get_doc_from_async_queue(queue_id, times_tried=0): - sleep(1) + sleep(INTERVAL_SECS) if times_tried >= MAX_RETRIES: raise Exception(f"Maximum retries reached {times_tried}") From fc4c6aa56449c7454dc2ee45df552f308de57590 Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 16:28:18 +0200 Subject: [PATCH 29/30] fixed code sample & added comments --- .../code_samples/invoice_splitter_v1_async.txt | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index 398af7b8..126fd294 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -30,19 +30,27 @@ queue_id = queue_result.job.job_id # Recursive function that tries to retrieve the completed document. # If the document is not "complete", try again def get_doc_from_async_queue(queue_id, times_tried=0): - sleep(INTERVAL_SECS) - + + # Have we exceeded our retry count? if times_tried >= MAX_RETRIES: raise Exception(f"Maximum retries reached {times_tried}") + + # Wait for a few seconds before fetching + sleep(INTERVAL_SECS) + # Fetch and parse the result, using the same type parsed_result = input_doc.parse_queued(doc_class, queue_id) + # Check whether the result is ready if parsed_result.job.status == "completed": + # Print a brief summary of the parsed data print(parsed_result.document.document) return - # Otherwise, try again + + # Otherwise, try again... else: get_doc_from_async_queue(queue_id, times_tried+1) +# Start the recursion... get_doc_from_async_queue(queue_id) \ No newline at end of file From 8c92a7aefc0f439bb2413d239fb44c1ff53a27cf Mon Sep 17 00:00:00 2001 From: Sebastian Olivera Date: Fri, 5 May 2023 16:29:39 +0200 Subject: [PATCH 30/30] removed needless import --- docs/extras/code_samples/invoice_splitter_v1_async.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index 126fd294..fc025265 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -1,6 +1,5 @@ from mindee import Client, documents from time import sleep -import os # Init a new client mindee_client = Client(api_key="my-api-key")