From 305de78e3d44a8775b6d88b56f22d54cfd0425c0 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Mon, 17 Jul 2023 09:24:25 -0400 Subject: [PATCH] DEVEXP-503 Can now read a batch of documents with metadata Reworked some of the existing "write" tests to now use `client.documents.read` to verify data that was written. --- marklogic/documents.py | 174 +++++++++++++++--- .../security/users/python-not-rest-user.json | 8 + .../main/ml-modules/transforms/envelope.sjs | 6 + tests/conftest.py | 14 +- tests/test_get_documents.py | 54 ------ tests/test_read_documents.py | 122 ++++++++++++ tests/test_search.py | 5 - tests/test_write_documents.py | 23 ++- tests/test_write_documents_with_metadata.py | 130 ++++++------- 9 files changed, 383 insertions(+), 153 deletions(-) create mode 100644 test-app/src/main/ml-config/security/users/python-not-rest-user.json create mode 100644 test-app/src/main/ml-modules/transforms/envelope.sjs delete mode 100644 tests/test_get_documents.py create mode 100644 tests/test_read_documents.py delete mode 100644 tests/test_search.py diff --git a/marklogic/documents.py b/marklogic/documents.py index c00d1c8..74ac8c7 100644 --- a/marklogic/documents.py +++ b/marklogic/documents.py @@ -1,7 +1,9 @@ import json +from collections import OrderedDict from typing import Union from requests import Response, Session +from requests_toolbelt.multipart.decoder import MultipartDecoder from urllib3.fields import RequestField from urllib3.filepost import encode_multipart_formdata @@ -63,27 +65,29 @@ def metadata_to_dict(metadata: Metadata) -> dict: return md +def dict_to_metadata(metadata: dict, target_metadata: Metadata) -> None: + """ + Populates the given Metadata instance based on the metadata dictionary as returned + by the /v1/documents REST endpoint. 
+ """ + target_metadata.collections = metadata.get("collections") + target_metadata.quality = metadata.get("quality") + target_metadata.metadata_values = metadata.get("metadataValues") + target_metadata.properties = metadata.get("properties") + if metadata.get("permissions"): + perms = {} + for perm in metadata["permissions"]: + role = perm["role-name"] + perms[role] = perm["capabilities"] + target_metadata.permissions = perms + else: + target_metadata.permissions = None + + class Document(Metadata): """ - :param uri: the URI of the document; can be None when relying on MarkLogic to - generate a URI. - :param content: the content of the document. - :param collections: see definition in parent class. - :param permissions: see definition in parent class. - :param quality: see definition in parent class. - :param metadata_values: see definition in parent class. - :param properties: see definition in parent class. - :param content_type: the MIME type of the document; use when MarkLogic cannot - determine the MIME type based on the URI. - :param extension: specifies a suffix for a URI generated by MarkLogic. - :param directory: specifies a prefix for a URI generated by MarkLogic. - :param repair: for an XML document, the level of XML repair to perform; can be - "full" or "none", with "none" being the default. - :param version_id: affects updates when optimistic locking is enabled; see - https://docs.marklogic.com/REST/POST/v1/documents for more information. - :param temporal_document: the logical document URI for a document written to a - temporal collection; requires that a "temporal-collection" parameter be included in - the request. + Represents a document, either as read from MarkLogic or as a document to be + written to MarkLogic. 
""" def __init__( @@ -96,24 +100,48 @@ def __init__( metadata_values: dict = None, properties: dict = None, content_type: str = None, + version_id: str = None, extension: str = None, directory: str = None, repair: str = None, extract: str = None, - version_id: str = None, temporal_document: str = None, ): + """ + :param uri: the URI of the document; can be None when relying on MarkLogic to + generate a URI. + :param content: the content of the document. + :param collections: see definition in parent class. + :param permissions: see definition in parent class. + :param quality: see definition in parent class. + :param metadata_values: see definition in parent class. + :param properties: see definition in parent class. + :param content_type: the MIME type of the document; use when MarkLogic cannot + determine the MIME type based on the URI. + :param version_id: affects updates when optimistic locking is enabled; see + https://docs.marklogic.com/REST/POST/v1/documents for more information. + :param temporal_document: the logical document URI for a document written to a + :param extension: specifies a suffix for a URI generated by MarkLogic; only used + when writing a document. + :param directory: specifies a prefix for a URI generated by MarkLogic; only used + when writing a document. + :param repair: for an XML document, the level of XML repair to perform; can be + "full" or "none", with "none" being the default; only used when writing a + document. + temporal collection; requires that a "temporal-collection" parameter be + included in the request; only used when writing a document. + """ super().__init__(collections, permissions, quality, metadata_values, properties) self.uri = uri self.content = content + self.content_type = content_type + self.version_id = version_id # The following are all specific to writing a document. 
- self.content_type = content_type self.extension = extension self.directory = directory self.repair = repair self.extract = extract - self.version_id = version_id self.temporal_document = temporal_document def to_request_field(self) -> RequestField: @@ -208,6 +236,37 @@ def to_metadata_request_field(self) -> RequestField: return field +def _extract_values_from_header(part) -> dict: + """ + Returns a dict containing values about the document content or metadata. + """ + encoding = part.encoding + disposition = part.headers["Content-Disposition".encode(encoding)].decode(encoding) + disposition_values = {} + for item in disposition.split(";"): + tokens = item.split("=") + # The first item will be "attachment" and can be ignored. + if len(tokens) == 2: + disposition_values[tokens[0].strip()] = tokens[1] + + content_type = None + if part.headers.get("Content-Type".encode(encoding)): + content_type = part.headers["Content-Type".encode(encoding)].decode(encoding) + + uri = disposition_values["filename"] + if uri.startswith('"'): + uri = uri[1:] + if uri.endswith('"'): + uri = uri[:-1] + + return { + "uri": uri, + "category": disposition_values["category"], + "content_type": content_type, + "version_id": disposition_values.get("versionId"), + } + + class DocumentManager: """ Provides methods to simplify interacting with the /v1/documents REST endpoint @@ -251,3 +310,74 @@ def write( headers["Accept"] = "application/json" return self._session.post("/v1/documents", data=data, headers=headers, **kwargs) + + def _get_multipart_documents_response( + self, uris: list[str], categories: list[str], **kwargs + ) -> Response: + """ + Constructs and sends a multipart/mixed request to the v1/documents endpoint. + """ + params = kwargs.pop("params", {}) + params["uri"] = uris + params["format"] = "json" # This refers to the metadata format. 
+ if categories: + params["category"] = categories + + headers = kwargs.pop("headers", {}) + headers["Accept"] = "multipart/mixed" + return self._session.get( + "/v1/documents", params=params, headers=headers, **kwargs + ) + + def read( + self, uris: list[str], categories: list[str] = None, **kwargs + ) -> Union[list[Document], Response]: + """ + Read one or many documents via a GET to the endpoint defined at + https://docs.marklogic.com/REST/GET/v1/documents . If a 200 is not returned + by that endpoint, then the Response is returned instead. + + :param uris: list of URIs to read. + :param categories: optional list of the categories of data to return for each + URI. By default, only content will be returned for each URI. See the endpoint + documentation for further information. + """ + response = self._get_multipart_documents_response(uris, categories, **kwargs) + if response.status_code != 200: + return response + + decoder = MultipartDecoder.from_response(response) + + # Use a dict to store URIs to Document objects so that we don't assume any + # order with how the metadata and content parts are returned. An OrderedDict is + # used to ensure that the order of the URIs is maintained, though the REST + # endpoint is not guaranteed to return them in the same order as provided by + # the user. 
+ docs = OrderedDict() + + for part in decoder.parts: + header_values = _extract_values_from_header(part) + uri = header_values["uri"] + if header_values["category"] == "content": + content = ( + json.loads(part.content) + if header_values["content_type"] == "application/json" + else part.content + ) + content_type = header_values["content_type"] + version_id = header_values["version_id"] + if docs.get(uri): + doc: Document = docs[uri] + doc.content = content + doc.content_type = content_type + doc.version_id = version_id + else: + docs[uri] = Document( + uri, content, content_type=content_type, version_id=version_id + ) + else: + doc = docs[uri] if docs.get(uri) else Document(uri, None) + docs[uri] = doc + dict_to_metadata(json.loads(part.content), doc) + + return list(docs.values()) diff --git a/test-app/src/main/ml-config/security/users/python-not-rest-user.json b/test-app/src/main/ml-config/security/users/python-not-rest-user.json new file mode 100644 index 0000000..71fba98 --- /dev/null +++ b/test-app/src/main/ml-config/security/users/python-not-rest-user.json @@ -0,0 +1,8 @@ +{ + "user-name": "python-not-rest-user", + "description": "For tests where the user does not have the privileges required by the REST API.", + "password": "password", + "role": [ + "qconsole-user" + ] +} \ No newline at end of file diff --git a/test-app/src/main/ml-modules/transforms/envelope.sjs b/test-app/src/main/ml-modules/transforms/envelope.sjs new file mode 100644 index 0000000..3b6f6be --- /dev/null +++ b/test-app/src/main/ml-modules/transforms/envelope.sjs @@ -0,0 +1,6 @@ +function transform(context, params, content) { + return { + "envelope": content + } +}; +exports.transform = transform; diff --git a/tests/conftest.py b/tests/conftest.py index 8e796b1..bec236e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,21 +1,29 @@ import pytest + from marklogic import Client +BASE_URL = "http://localhost:8030" + @pytest.fixture def client(): - return 
Client("http://localhost:8030", digest=("python-test-user", "password")) + return Client(BASE_URL, digest=("python-test-user", "password")) @pytest.fixture def admin_client(): - return Client("http://localhost:8030", digest=("python-test-admin", "password")) + return Client(BASE_URL, digest=("python-test-admin", "password")) @pytest.fixture def basic_client(): # requests allows a tuple to be passed when doing basic authentication. - return Client("http://localhost:8030", auth=("python-test-user", "password")) + return Client(BASE_URL, auth=("python-test-user", "password")) + + +@pytest.fixture +def not_rest_user_client(): + return Client(BASE_URL, digest=("python-not-rest-user", "password")) @pytest.fixture diff --git a/tests/test_get_documents.py b/tests/test_get_documents.py deleted file mode 100644 index 0848560..0000000 --- a/tests/test_get_documents.py +++ /dev/null @@ -1,54 +0,0 @@ -from requests_toolbelt.multipart.decoder import MultipartDecoder - - -def test_get_docs(client): - """ - Possible future client interface: - array_of_documents = client.documents.get(uri=[], metadata=True) - - Where each Document in the array would have fields of: - uri/content/collections/permissions/quality/properties/metadata_values. 
- """ - response = client.get( - "/v1/documents", - params={ - "uri": ["/doc1.json", "/doc2.xml"], - "category": ["content", "metadata"], - "format": "json", # Applies only to metadata - }, - headers={"Accept": "multipart/mixed"}, - ) - - assert 200 == response.status_code - - # Could provide a class for converting a multipart/mixed response into an array - # of documents too: - # from marklogic import DocumentDecoder - # array_of_documents = DocumentDecoder.from_response(response) - decoder = MultipartDecoder.from_response(response) - for part in decoder.parts: - print(part.headers) - print(part.text) - - -def test_search_docs(client_with_props): - response = client_with_props.get( - "v1/search", - params={ - "collection": "test-data", - "category": ["content", "metadata"], - "format": "json", # Applies only to metadata - }, - headers={"Accept": "multipart/mixed"}, # Indicates we want documents back. - ) - - for part in MultipartDecoder.from_response(response).parts: - print(part.headers) - print(part.text) - - -def test_get_docs_basic_auth(basic_client): - # Just verifies that basic auth works as expected. 
- response = basic_client.get("/v1/documents", params={"uri": "/doc1.json"}) - assert 200 == response.status_code - assert "world" == response.json()["hello"] diff --git a/tests/test_read_documents.py b/tests/test_read_documents.py new file mode 100644 index 0000000..a8b104c --- /dev/null +++ b/tests/test_read_documents.py @@ -0,0 +1,122 @@ +from requests import Response + +from marklogic import Client +from marklogic.documents import Document + +DEFAULT_PERMS = {"python-tester": ["read", "update"]} + + +def test_write_and_read_binary(client: Client): + content = "MarkLogic and Python".encode("ascii") + response = client.documents.write( + [ + Document( + "/temp/doc1.bin", + content, + permissions=DEFAULT_PERMS, + ) + ] + ) + assert 200 == response.status_code + + docs = client.documents.read(["/temp/doc1.bin"]) + assert len(docs) == 1 + doc = docs[0] + assert doc.uri == "/temp/doc1.bin" + content = doc.content.decode("ascii") + assert content == "MarkLogic and Python" + + +def test_read_uri_with_double_quotes(client: Client): + uri = '/this/"works.json' + response = client.documents.write( + [Document(uri, {"hello": "world"}, permissions=DEFAULT_PERMS)] + ) + assert response.status_code == 200 + + docs = client.documents.read(["/this/%22works.json"]) + assert len(docs) == 1 + assert "/this/%22works.json" == docs[0].uri + + +def test_uri_not_found(client: Client): + docs = client.documents.read(["/doesnt-exist.json"]) + assert docs is not None + assert len(docs) == 0 + + +def test_read_with_transform(client: Client): + """ + Verifies a user can pass in any kwargs and they will be retained as request + parameters, along with the ones added by the client. 
+ """ + docs = client.documents.read( + ["/doc1.json"], + categories=["content", "metadata"], + params={"transform": "envelope"}, + ) + assert 1 == len(docs) + assert docs[0].content == {"envelope": {"hello": "world"}} + + +def test_read_only_collections(client: Client): + docs = client.documents.read( + ["/doc1.json", "/doc2.xml"], categories=["collections"] + ) + assert 2 == len(docs) + + doc1 = docs[0] + assert doc1.uri == "/doc1.json" + assert len(doc1.collections) == 1 + assert doc1.collections[0] == "test-data" + assert doc1.content is None + assert doc1.permissions is None + assert doc1.quality is None + assert doc1.metadata_values is None + assert doc1.properties is None + + doc2 = docs[1] + assert doc2.uri == "/doc2.xml" + assert len(doc2.collections) == 1 + assert doc2.collections[0] == "test-data" + assert doc2.content is None + assert doc2.permissions is None + assert doc2.quality is None + assert doc2.metadata_values is None + assert doc2.properties is None + + +def test_with_accept_header(client: Client): + """ + Verifies that any Accept header provided by the user will be ignored, as it's + expected to be set to multipart/mixed by the client. + """ + docs = client.documents.read( + ["/doc1.json"], + headers={"Accept": "something/invalid"}, + categories=["content", "quality"], + ) + + assert len(docs) == 1 + doc = docs[0] + assert doc.uri == "/doc1.json" + assert doc.content == {"hello": "world"} + assert doc.quality == 0 + assert doc.collections is None + + +def test_read_with_basic_client(basic_client: Client): + # Just verifies that basic auth works as expected. + doc = basic_client.documents.read(["/doc1.json"])[0] + assert {"hello": "world"} == doc.content + + +def test_not_rest_user(not_rest_user_client: Client): + response: Response = not_rest_user_client.documents.read( + ["/doc1.json", "/doc2.xml"] + ) + assert ( + response.status_code == 403 + ), """The user does not have the rest-reader privilege, so MarkLogic is expected + to return a 403. 
And the documents.read method is then expected to return the + Response so that the user has access to everything in it.""" diff --git a/tests/test_search.py b/tests/test_search.py deleted file mode 100644 index 19316da..0000000 --- a/tests/test_search.py +++ /dev/null @@ -1,5 +0,0 @@ -def test_search(client): - response = client.get("v1/search") - assert 200 == response.status_code - assert "application/xml; charset=utf-8" == response.headers["Content-type"] - assert response.text.startswith("be embedded" == props["xml"] assert 1 == props["number"] - assert 1 == metadata["quality"] + assert 1 == doc.quality - values = metadata["metadataValues"] + values = doc.metadata_values assert 2 == len(values.keys()) assert "value1" == values["key1"] assert "value2" == values["key2"]