From 6739fd9119bda04e4ee32a5dcdea762e56eb4df4 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Thu, 13 Jul 2023 16:11:59 -0400 Subject: [PATCH] DEVEXP-498 Can now define default metadata I realized my initial "DocumentBatch" design was not good because it limited the user to defining a single set of default metadata. And the v1/documents endpoint allows for many sets of default metadata. The new design - introducing a DefaultMetadata class, and allowing the user to provide a list of any number of Document and DefaultMetadata instances - is simpler and adheres to what /v1/documents allows too. No docs yet. Going to do a separate PR for those next. --- marklogic/documents.py | 172 +++++++++++++++----- tests/test_write_documents_with_metadata.py | 172 ++++++++++++++------ 2 files changed, 253 insertions(+), 91 deletions(-) diff --git a/marklogic/documents.py b/marklogic/documents.py index 918c107..c00d1c8 100644 --- a/marklogic/documents.py +++ b/marklogic/documents.py @@ -1,21 +1,78 @@ import json +from typing import Union -from requests import Session +from requests import Response, Session from urllib3.fields import RequestField from urllib3.filepost import encode_multipart_formdata +""" +Defines classes to simplify usage of the documents REST endpoint defined at +https://docs.marklogic.com/REST/client/management. +""" -class Document: + +class Metadata: """ - :param uri: the URI of the document; can be None when relying on MarkLogic to - generate a URI. - :param content: the content of the document. + Defines the metadata properties that can be associated with a document and also + used for specifying default metadata when writing many documents. One benefit + of this class - besides encapsulating each bit of what MarkLogic defines as metadata + for a document - is to provide a simpler mechanism for defining permissions via a + dictionary as opposed to an array of dictionaries. + :param collections: array of collection URIs. :param permissions: dict with keys of role names and values of arrays of capabilities such as "read", "update", and "execute". :param quality: document quality, used for scoring in searches. :param metadata_values: dict with string keys and string values. :param properties: dict with string keys and values of any type. + """ + + def __init__( + self, + collections: list[str] = None, + permissions: dict = None, + quality: int = None, + metadata_values: dict = None, + properties: dict = None, + ): + self.collections = collections + self.permissions = permissions + self.quality = quality + self.metadata_values = metadata_values + self.properties = properties + + +def metadata_to_dict(metadata: Metadata) -> dict: + """ + Returns a dictionary with a structure matching what the /v1/documents endpoint + requires. + """ + md = {} + if metadata.permissions: + md["permissions"] = [ + {"role-name": k, "capabilities": v} for k, v in metadata.permissions.items() + ] + if metadata.collections: + md["collections"] = metadata.collections + if metadata.quality: + md["quality"] = metadata.quality + if metadata.properties: + md["properties"] = metadata.properties + if metadata.metadata_values: + md["metadataValues"] = metadata.metadata_values + return md + + +class Document(Metadata): + """ + :param uri: the URI of the document; can be None when relying on MarkLogic to + generate a URI. + :param content: the content of the document. + :param collections: see definition in parent class. + :param permissions: see definition in parent class. + :param quality: see definition in parent class. + :param metadata_values: see definition in parent class. + :param properties: see definition in parent class. :param content_type: the MIME type of the document; use when MarkLogic cannot determine the MIME type based on the URI. :param extension: specifies a suffix for a URI generated by MarkLogic. @@ -46,13 +103,9 @@ def __init__( version_id: str = None, temporal_document: str = None, ): + super().__init__(collections, permissions, quality, metadata_values, properties) self.uri = uri self.content = content - self.collections = collections - self.permissions = permissions - self.quality = quality - self.metadata_values = metadata_values - self.properties = properties # The following are all specific to writing a document. self.content_type = content_type @@ -64,38 +117,25 @@ def __init__( self.temporal_document = temporal_document def to_request_field(self) -> RequestField: + """ + Returns a multipart request field representing the document to be written. + """ data = self.content if type(data) is dict: data = json.dumps(data) field = RequestField(name=self.uri, data=data, filename=self.uri) field.make_multipart( - content_disposition=self._make_disposition(), + content_disposition=self._make_content_disposition(), content_type=self.content_type, ) return field - def to_metadata_dict(self) -> dict: + def to_metadata_request_field(self) -> RequestField: """ - Returns a dictionary with a data structure matching what the /v1/documents - endpoint requires. + Returns a multipart request field if any metadata has been set on this + document; returns None otherwise. """ - metadata = {} - if self.permissions: - metadata["permissions"] = [ - {"role-name": k, "capabilities": v} for k, v in self.permissions.items() - ] - if self.collections: - metadata["collections"] = self.collections - if self.quality: - metadata["quality"] = self.quality - if self.properties: - metadata["properties"] = self.properties - if self.metadata_values: - metadata["metadataValues"] = self.metadata_values - return metadata - - def to_metadata_request_field(self) -> RequestField: - metadata = self.to_metadata_dict() + metadata = metadata_to_dict(self) if len(metadata.keys()) == 0: return None @@ -108,7 +148,12 @@ def to_metadata_request_field(self) -> RequestField: ) return field - def _make_disposition(self) -> str: + def _make_content_disposition(self) -> str: + """ + Returns a content disposition suitable for use when writing documents via + https://docs.marklogic.com/REST/POST/v1/documents . See that page for more + information on each part of the disposition. + """ disposition = "attachment" if not self.uri: @@ -133,17 +178,68 @@ def _make_disposition(self) -> str: return disposition +class DefaultMetadata(Metadata): + """ + Defines default metadata for use when writing many documents at one time. + """ + + def __init__( + self, + collections: list[str] = None, + permissions: dict = None, + quality: int = None, + metadata_values: dict = None, + properties: dict = None, + ): + super().__init__(collections, permissions, quality, metadata_values, properties) + + def to_metadata_request_field(self) -> RequestField: + """ + Returns a multipart request field suitable for use when writing many documents. + """ + metadata = metadata_to_dict(self) + if len(metadata.keys()) == 0: + return None + field = RequestField(name=None, data=json.dumps(metadata), filename=None) + field.make_multipart( + content_disposition="inline; category=metadata", + content_type="application/json", + ) + return field + + class DocumentManager: + """ + Provides methods to simplify interacting with the /v1/documents REST endpoint + defined at https://docs.marklogic.com/REST/client/management. + """ + def __init__(self, session: Session): self._session = session - def write(self, documents: list[Document], **kwargs): + def write( + self, parts: list[Union[DefaultMetadata, Document]], **kwargs + ) -> Response: + """ + Write one or many documents at a time via a POST to the endpoint defined at + https://docs.marklogic.com/REST/POST/v1/documents . + + :param parts: a part can define either a document to be written, which can + include metadata, or a set of default metadata to be applied to each document + after it that does not define its own metadata. See + https://docs.marklogic.com/guide/rest-dev/bulk#id_16015 for more information on + how the REST endpoint uses metadata. + """ fields = [] - for doc in documents: - metadata_field = doc.to_metadata_request_field() - if metadata_field: - fields.append(metadata_field) - fields.append(doc.to_request_field()) + + for part in parts: + if isinstance(part, DefaultMetadata): + fields.append(part.to_metadata_request_field()) + else: + metadata_field = part.to_metadata_request_field() + if metadata_field: + fields.append(metadata_field) + fields.append(part.to_request_field()) data, content_type = encode_multipart_formdata(fields) diff --git a/tests/test_write_documents_with_metadata.py b/tests/test_write_documents_with_metadata.py index 0d9f06a..7e750aa 100644 --- a/tests/test_write_documents_with_metadata.py +++ b/tests/test_write_documents_with_metadata.py @@ -1,65 +1,36 @@ from marklogic import Client -from marklogic.documents import Document +from marklogic.documents import Document, DefaultMetadata + +TEST_METADATA = { + "collections": ["c1", "c2"], + "permissions": { + "python-tester": ["read", "update"], + "qconsole-user": "execute", + }, + "quality": 1, + "metadata_values": {"key1": "value1", "key2": "value2"}, + "properties": { + "hello": "world", + "xml": "be embedded", + "number": 1, + }, +} def test_all_metadata(client: Client): - uri = "/temp/doc1.json" - response = client.documents.write( [ Document( - uri, + "/temp/doc1.json", {"content": "original"}, - collections=["c1", "c2"], - permissions={ - "python-tester": ["read", "update"], - "qconsole-user": "execute", - }, - quality=1, - properties={ - "hello": "world", - "xml": "be embedded", - "number": 1, - }, - metadata_values={"key1": "value1", "key2": "value2"}, + *TEST_METADATA.values(), ), ] ) - assert 200 == response.status_code - # Get and verify all the metadata. - metadata = client.get( - "v1/documents?uri=/temp/doc1.json&category=metadata&format=json" - ).json() - - perms = metadata["permissions"] - assert 2 == len(perms) - perm = next(perm for perm in perms if perm["role-name"] == "python-tester") - assert 2 == len(perm["capabilities"]) - assert "read" in perm["capabilities"] - assert "update" in perm["capabilities"] - perm = next(perm for perm in perms if perm["role-name"] == "qconsole-user") - assert 1 == len(perm["capabilities"]) - assert "execute" == perm["capabilities"][0] - - collections = metadata["collections"] - assert 2 == len(collections) - assert "c1" in collections - assert "c2" in collections - - props = metadata["properties"] - assert 3 == len(props.keys()) - assert "world" == props["hello"] - assert "be embedded" == props["xml"] - assert 1 == props["number"] - - assert 1 == metadata["quality"] - - values = metadata["metadataValues"] - assert 2 == len(values.keys()) - assert "value1" == values["key1"] - assert "value2" == values["key2"] + metadata = _get_metadata(client, "/temp/doc1.json") + _verify_test_metadata_exists(metadata) def test_only_quality_and_permissions(client: Client): @@ -79,10 +50,7 @@ def test_only_quality_and_permissions(client: Client): assert 200 == response.status_code - metadata = client.get( - "v1/documents?uri=/temp/doc1.json&category=metadata&format=json" - ).json() - + metadata = _get_metadata(client, "/temp/doc1.json") assert 2 == metadata["quality"] assert 0 == len(metadata["collections"]) assert 0 == len(metadata["properties"].keys()) @@ -105,3 +73,101 @@ def test_only_quality(client: Client): ), "The response should be sent without permissions and thus fail because a \ non-admin user requires at least one update permission." assert "XDMP-MUSTHAVEUPDATE" in response.text + + +def test_default_metadata(client: Client): + """ + The REST endpoint allows for default metadata to be provided at any point in the + multipart body, and it is expected to be applied to any document after it that does + not have any metadata itself. + """ + response = client.documents.write( + [ + DefaultMetadata(*TEST_METADATA.values()), + Document("/temp/doc1.json", {"doc": 1}), + Document( + "/temp/doc2.json", + {"doc": 2}, + permissions={"python-tester": "update", "rest-extension-user": "read"} + ), + DefaultMetadata( + permissions={"python-tester": "update", "qconsole-user": "read"} + ), + Document("/temp/doc3.json", {"doc": 3}), + ], + ) + + assert 200 == response.status_code + + # doc1 should use the first set of default metadata + metadata = _get_metadata(client, "/temp/doc1.json") + _verify_test_metadata_exists(metadata) + + # doc2 should use its own metadata + metadata = _get_metadata(client, "/temp/doc2.json") + assert 0 == metadata["quality"] + assert 0 == len(metadata["collections"]) + assert 0 == len(metadata["properties"].keys()) + assert 0 == len(metadata["metadataValues"].keys()) + perms = metadata["permissions"] + assert 2 == len(perms) + perm = next(perm for perm in perms if perm["role-name"] == "python-tester") + assert 1 == len(perm["capabilities"]) + assert "update" in perm["capabilities"] + perm = next(perm for perm in perms if perm["role-name"] == "rest-extension-user") + assert 1 == len(perm["capabilities"]) + assert "read" in perm["capabilities"] + + # doc3 should use the second set of default metadata + metadata = _get_metadata(client, "/temp/doc3.json") + assert 0 == metadata["quality"] + assert 0 == len(metadata["collections"]) + assert 0 == len(metadata["properties"].keys()) + assert 0 == len(metadata["metadataValues"].keys()) + perms = metadata["permissions"] + assert 2 == len(perms) + perm = next(perm for perm in perms if perm["role-name"] == "python-tester") + assert 1 == len(perm["capabilities"]) + assert "update" in perm["capabilities"] + perm = next(perm for perm in perms if perm["role-name"] == "qconsole-user") + assert 1 == len(perm["capabilities"]) + assert "read" in perm["capabilities"] + + + +def _get_metadata(client: Client, uri: str): + return client.get(f"v1/documents?uri={uri}&category=metadata&format=json").json() + + +def _verify_test_metadata_exists(metadata: dict): + """ + Convenience function for verifying that document metadata contains the metadata + defined by TEST_METADATA. + """ + perms = metadata["permissions"] + assert 2 == len(perms) + perm = next(perm for perm in perms if perm["role-name"] == "python-tester") + assert 2 == len(perm["capabilities"]) + assert "read" in perm["capabilities"] + assert "update" in perm["capabilities"] + perm = next(perm for perm in perms if perm["role-name"] == "qconsole-user") + assert 1 == len(perm["capabilities"]) + assert "execute" == perm["capabilities"][0] + + collections = metadata["collections"] + assert 2 == len(collections) + assert "c1" in collections + assert "c2" in collections + + props = metadata["properties"] + assert 3 == len(props.keys()) + assert "world" == props["hello"] + assert "be embedded" == props["xml"] + assert 1 == props["number"] + + assert 1 == metadata["quality"] + + values = metadata["metadataValues"] + assert 2 == len(values.keys()) + assert "value1" == values["key1"] + assert "value2" == values["key2"]