From 5b88447265bc910797808a6dcd9ecf675195dd08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 10 Mar 2023 19:38:02 +0100 Subject: [PATCH] :sparkles: add an URL input source --- docs/guide/python-getting-started.md | 9 ++++ mindee/cli.py | 19 +++++--- mindee/client.py | 43 +++++++++++++----- mindee/documents/base.py | 17 ++++--- mindee/documents/financial/financial_v1.py | 9 ++-- mindee/endpoints.py | 36 +++++++++------ mindee/input/sources.py | 53 ++++++++++++++++------ mindee/response.py | 10 ++-- tests/test_inputs.py | 6 +++ 9 files changed, 142 insertions(+), 60 deletions(-) diff --git a/docs/guide/python-getting-started.md b/docs/guide/python-getting-started.md index ac04e9c1..f7f0c9a0 100644 --- a/docs/guide/python-getting-started.md +++ b/docs/guide/python-getting-started.md @@ -153,6 +153,15 @@ async def upload(upload: UploadFile): ) ``` +### URL +Allows sending a URL directly instead of uploading a file. + +**Note**: No local operations can be performed on the input (such as removing pages from a PDF). + +```python +input_doc = mindee_client.doc_from_url(url="https://www.example.com/invoice.pdf") +``` + ## Sending a File To send a file to the API, we need to specify how to process the document. This will determine which API endpoint is used and how the API return will be handled internally by the library.
diff --git a/mindee/cli.py b/mindee/cli.py index bd2fa662..8b4c6114 100644 --- a/mindee/cli.py +++ b/mindee/cli.py @@ -76,13 +76,19 @@ class CommandConfig(Generic[TypeDoc]): def _get_input_doc(client, args) -> DocumentClient: if args.input_type == "file": with open(args.path, "rb", buffering=30) as file_handle: - return client.doc_from_file(file_handle) + return client.doc_from_file(input_file=file_handle) elif args.input_type == "base64": with open(args.path, "rt", encoding="ascii") as base64_handle: - return client.doc_from_b64string(base64_handle.read(), "test.jpg") + return client.doc_from_b64string( + input_string=base64_handle.read(), filename="test.jpg" + ) elif args.input_type == "bytes": with open(args.path, "rb") as bytes_handle: - return client.doc_from_bytes(bytes_handle.read(), bytes_handle.name) + return client.doc_from_bytes( + input_bytes=bytes_handle.read(), filename=bytes_handle.name + ) + elif args.input_type == "url": + return client.doc_from_url(url=args.path) return client.doc_from_path(args.path) @@ -181,13 +187,14 @@ def _parse_args() -> Namespace: "-i", "--input-type", dest="input_type", - choices=["path", "file", "base64", "bytes"], + choices=["path", "file", "base64", "bytes", "url"], default="path", help="Specify how to handle the input.\n" "- path: open a path (default).\n" "- file: open as a file handle.\n" - "- base64: load the from a base64 encoded text file.\n" - "- bytes: load the contents as raw bytes.", + "- base64: open a base64 encoded text file.\n" + "- bytes: open the contents as raw bytes.\n" + "- url: open an URL.", ) subp.add_argument( "-o", diff --git a/mindee/client.py b/mindee/client.py index f5a0d52f..8f1b63f0 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -1,5 +1,5 @@ import json -from typing import BinaryIO, Dict, List, NamedTuple, Optional, Type +from typing import BinaryIO, Dict, List, NamedTuple, Optional, Type, Union from mindee import documents from mindee.documents.base import Document, TypeDocument @@ 
-10,8 +10,9 @@ Base64Input, BytesInput, FileInput, - InputSource, + LocalInputSource, PathInput, + UrlInputSource, ) from mindee.logger import logger from mindee.response import PredictResponse @@ -23,13 +24,13 @@ def get_bound_classname(type_var) -> str: class DocumentClient: - input_doc: InputSource + input_doc: Union[LocalInputSource, UrlInputSource] doc_configs: DocumentConfigDict raise_on_error: bool = True def __init__( self, - input_doc: InputSource, + input_doc: Union[LocalInputSource, UrlInputSource], doc_configs: DocumentConfigDict, raise_on_error: bool, ): @@ -108,12 +109,13 @@ def parse( doc_config = self.doc_configs[config_key] doc_config.check_api_keys() - if page_options and self.input_doc.is_pdf(): - self.input_doc.process_pdf( - page_options.operation, - page_options.on_min_pages, - page_options.page_indexes, - ) + if not isinstance(self.input_doc, UrlInputSource): + if page_options and self.input_doc.is_pdf(): + self.input_doc.process_pdf( + page_options.operation, + page_options.on_min_pages, + page_options.page_indexes, + ) return self._make_request( document_class, doc_config, include_words, close_file, cropper ) @@ -152,7 +154,8 @@ def _make_request( def close(self) -> None: """Close the file object.""" - self.input_doc.file_object.close() + if not isinstance(self.input_doc, UrlInputSource): + self.input_doc.file_object.close() class ConfigSpec(NamedTuple): @@ -397,3 +400,21 @@ def doc_from_bytes( doc_configs=self._doc_configs, raise_on_error=self.raise_on_error, ) + + def doc_from_url( + self, + url: str, + ) -> DocumentClient: + """ + Load a document from an URL. 
+ + :param url: URL of the document to send, must be HTTPS + """ + input_doc = UrlInputSource( + url, + ) + return DocumentClient( + input_doc=input_doc, + doc_configs=self._doc_configs, + raise_on_error=self.raise_on_error, + ) diff --git a/mindee/documents/base.py b/mindee/documents/base.py index eca2a419..75663a17 100644 --- a/mindee/documents/base.py +++ b/mindee/documents/base.py @@ -1,11 +1,11 @@ import datetime import re -from typing import Any, Dict, List, Optional, TypeVar +from typing import Any, Dict, List, Optional, TypeVar, Union from mindee.endpoints import Endpoint from mindee.fields.orientation import OrientationField from mindee.fields.position import PositionField -from mindee.input.sources import InputSource +from mindee.input.sources import LocalInputSource, UrlInputSource TypeApiPrediction = Dict[str, Any] @@ -46,15 +46,18 @@ class Document: def __init__( self, - input_source: InputSource, + input_source: Union[LocalInputSource, UrlInputSource], document_type: Optional[str], api_prediction: TypeApiPrediction, page_n: Optional[int] = None, ): if input_source: - self.filepath = input_source.filepath - self.filename = input_source.filename - self.file_extension = input_source.file_mimetype + if isinstance(input_source, UrlInputSource): + self.filename = input_source.url + else: + self.filepath = input_source.filepath + self.filename = input_source.filename + self.file_extension = input_source.file_mimetype self.checklist = {} self.type = document_type @@ -67,7 +70,7 @@ def __init__( @staticmethod def request( endpoints: List[Endpoint], - input_source: InputSource, + input_source: Union[LocalInputSource, UrlInputSource], include_words: bool = False, close_file: bool = True, cropper: bool = False, diff --git a/mindee/documents/financial/financial_v1.py b/mindee/documents/financial/financial_v1.py index fe9d9e5e..3e9e9218 100644 --- a/mindee/documents/financial/financial_v1.py +++ b/mindee/documents/financial/financial_v1.py @@ -1,4 +1,4 @@ -from typing import List, Optional,
TypeVar +from typing import List, Optional, TypeVar, Union from mindee.documents.base import Document, TypeApiPrediction, clean_out_string from mindee.documents.invoice.invoice_v3 import InvoiceV3 @@ -11,7 +11,7 @@ from mindee.fields.payment_details import PaymentDetails from mindee.fields.tax import TaxField from mindee.fields.text import TextField -from mindee.input.sources import InputSource +from mindee.input.sources import LocalInputSource, UrlInputSource class FinancialV1(Document): @@ -152,7 +152,7 @@ def __str__(self) -> str: @staticmethod def request( endpoints: List[Endpoint], - input_source: InputSource, + input_source: Union[LocalInputSource, UrlInputSource], include_words: bool = False, close_file: bool = True, cropper: bool = False, @@ -166,6 +166,9 @@ def request( :param close_file: Whether to `close()` the file after parsing it. :param cropper: Including Mindee cropper results. """ + if isinstance(input_source, UrlInputSource): + raise AssertionError("URL input is not supported for this API endpoint.") + if "pdf" in input_source.file_mimetype: # invoices is index 0, receipts 1 (this should be cleaned up) index = 0 diff --git a/mindee/endpoints.py b/mindee/endpoints.py index 4c041cac..56a0f3a7 100644 --- a/mindee/endpoints.py +++ b/mindee/endpoints.py @@ -3,7 +3,7 @@ import requests -from mindee.input.sources import InputSource +from mindee.input.sources import LocalInputSource, UrlInputSource from mindee.logger import logger from mindee.versions import __version__, get_platform, python_version @@ -97,7 +97,7 @@ def set_api_key_from_env(self) -> None: def predict_req_post( self, - input_source: InputSource, + input_source: Union[LocalInputSource, UrlInputSource], include_words: bool = False, close_file: bool = True, cropper: bool = False, @@ -111,7 +111,6 @@ def predict_req_post( :param cropper: Including Mindee cropping results. 
:return: requests response """ - files = {"document": input_source.read_contents(close_file)} data = {} if include_words: data["include_mvision"] = "true" @@ -120,20 +119,31 @@ def predict_req_post( if cropper: params["cropper"] = "true" - response = requests.post( - f"{self._url_root}/predict", - files=files, - headers=self.base_headers, - data=data, - params=params, - timeout=self._request_timeout, - ) + if isinstance(input_source, UrlInputSource): + data["document"] = input_source.url + response = requests.post( + f"{self._url_root}/predict", + headers=self.base_headers, + data=data, + params=params, + timeout=self._request_timeout, + ) + else: + files = {"document": input_source.read_contents(close_file)} + response = requests.post( + f"{self._url_root}/predict", + files=files, + headers=self.base_headers, + data=data, + params=params, + timeout=self._request_timeout, + ) return response class CustomEndpoint(Endpoint): def training_req_post( - self, input_source: InputSource, close_file: bool = True + self, input_source: LocalInputSource, close_file: bool = True ) -> requests.Response: """ Make a request to POST a document for training. @@ -155,7 +165,7 @@ def training_req_post( return response def training_async_req_post( - self, input_source: InputSource, close_file: bool = True + self, input_source: LocalInputSource, close_file: bool = True ) -> requests.Response: """ Make a request to POST a document for training without processing. 
diff --git a/mindee/input/sources.py b/mindee/input/sources.py index 428dc3ff..6bf34361 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources.py @@ -2,6 +2,7 @@ import io import mimetypes import os +from enum import Enum from typing import BinaryIO, Optional, Sequence, Tuple import pikepdf @@ -22,26 +23,29 @@ "image/webp", ] -INPUT_TYPE_FILE = "file" -INPUT_TYPE_BASE64 = "base64" -INPUT_TYPE_BYTES = "bytes" -INPUT_TYPE_PATH = "path" + +class InputType(Enum): + FILE = "file" + BASE64 = "base64" + BYTES = "bytes" + PATH = "path" + URL = "url" class MimeTypeError(AssertionError): pass -class InputSource: +class LocalInputSource: file_object: BinaryIO filename: str file_mimetype: str - input_type: str + input_type: InputType filepath: Optional[str] = None def __init__( self, - input_type: str, + input_type: InputType, ): self.input_type = input_type self._check_mimetype() @@ -168,7 +172,7 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]: return self.filename, data -class FileInput(InputSource): +class FileInput(LocalInputSource): def __init__(self, file: BinaryIO): """ Input document from a Python binary file object. @@ -182,10 +186,10 @@ def __init__(self, file: BinaryIO): self.file_object = file self.filename = os.path.basename(file.name) self.filepath = file.name - super().__init__(input_type=INPUT_TYPE_FILE) + super().__init__(input_type=InputType.FILE) -class PathInput(InputSource): +class PathInput(LocalInputSource): def __init__(self, filepath: str): """ Input document from a path. 
@@ -195,10 +199,10 @@ def __init__(self, filepath: str): self.file_object = open(filepath, "rb") # pylint: disable=consider-using-with self.filename = os.path.basename(filepath) self.filepath = filepath - super().__init__(input_type=INPUT_TYPE_PATH) + super().__init__(input_type=InputType.PATH) -class BytesInput(InputSource): +class BytesInput(LocalInputSource): def __init__(self, raw_bytes: bytes, filename: str): """ Input document from raw bytes (no buffer). @@ -209,10 +213,10 @@ def __init__(self, raw_bytes: bytes, filename: str): self.file_object = io.BytesIO(raw_bytes) self.filename = filename self.filepath = None - super().__init__(input_type=INPUT_TYPE_BYTES) + super().__init__(input_type=InputType.BYTES) -class Base64Input(InputSource): +class Base64Input(LocalInputSource): def __init__(self, base64_string: str, filename: str): """ Input document from a base64 encoded string. @@ -223,4 +227,23 @@ def __init__(self, base64_string: str, filename: str): self.file_object = io.BytesIO(base64.standard_b64decode(base64_string)) self.filename = filename self.filepath = None - super().__init__(input_type=INPUT_TYPE_BASE64) + super().__init__(input_type=InputType.BASE64) + + +class UrlInputSource: + url: str + + def __init__(self, url: str): + """ + Input document from a URL.
+ + :param url: URL to send, must be HTTPS + """ + if not url.lower().startswith("https"): + raise AssertionError("URL must be HTTPS") + + self.input_type = InputType.URL + + logger.debug("URL input: %s", url) + + self.url = url diff --git a/mindee/response.py b/mindee/response.py index 05393b33..facf9831 100644 --- a/mindee/response.py +++ b/mindee/response.py @@ -1,8 +1,8 @@ -from typing import Any, Dict, Generic, List, Optional +from typing import Any, Dict, Generic, List, Optional, Union from mindee.documents.base import TypeDocument from mindee.documents.config import DocumentConfig -from mindee.input.sources import InputSource +from mindee.input.sources import LocalInputSource, UrlInputSource from mindee.logger import logger @@ -32,7 +32,7 @@ def __init__( self, doc_config: DocumentConfig, http_response: dict, - input_source: InputSource, + input_source: Union[LocalInputSource, UrlInputSource], response_ok: bool, ) -> None: """ @@ -48,7 +48,7 @@ def __init__( self.document_type = doc_config.document_type self.pages = [] - if input_source: + if not isinstance(input_source, UrlInputSource): self.input_path = input_source.filepath self.input_filename = input_source.filename self.input_mimetype = input_source.file_mimetype @@ -61,7 +61,7 @@ def __init__( def _load_response( self, doc_config: DocumentConfig, - input_source: InputSource, + input_source: Union[LocalInputSource, UrlInputSource], ) -> None: # This is some seriously ugly stuff. 
# Simplify all this in V4, as we won't need to pass the document type anymore diff --git a/tests/test_inputs.py b/tests/test_inputs.py index a816f211..7b8a9058 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -10,6 +10,7 @@ FileInput, MimeTypeError, PathInput, + UrlInputSource, ) from tests import INVOICE_DATA_DIR, PDF_DATA_DIR, RECEIPT_DATA_DIR @@ -145,6 +146,11 @@ def test_pdf_input_from_bytes(): assert input_obj.count_doc_pages() == 1 +def test_pdf_input_from_url(): + with pytest.raises(AssertionError): + UrlInputSource(url="http://example.com/invoice.pdf") + + def test_pdf_blank_check(): with pytest.raises(AssertionError): input_obj = PathInput(f"{PDF_DATA_DIR}/blank.pdf")