Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 152 additions & 22 deletions marklogic/documents.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import json
from collections import OrderedDict
from typing import Union

from requests import Response, Session
from requests_toolbelt.multipart.decoder import MultipartDecoder
from urllib3.fields import RequestField
from urllib3.filepost import encode_multipart_formdata

Expand Down Expand Up @@ -63,27 +65,29 @@ def metadata_to_dict(metadata: Metadata) -> dict:
return md


def dict_to_metadata(metadata: dict, target_metadata: Metadata) -> None:
    """
    Populates the given Metadata instance based on the metadata dictionary as returned
    by the /v1/documents REST endpoint.

    Every metadata attribute on the target is overwritten; a key that is absent from
    the dictionary results in the corresponding attribute being set to None.

    :param metadata: dictionary that may contain the keys "collections", "quality",
    "metadataValues", "properties" and "permissions".
    :param target_metadata: the instance to populate; it is modified in place.
    """
    target_metadata.collections = metadata.get("collections")
    target_metadata.quality = metadata.get("quality")
    target_metadata.metadata_values = metadata.get("metadataValues")
    target_metadata.properties = metadata.get("properties")

    permissions = metadata.get("permissions")
    if permissions:
        # Each entry carries a "role-name" and its "capabilities"; flatten the
        # entries into a single role -> capabilities mapping.
        target_metadata.permissions = {
            perm["role-name"]: perm["capabilities"] for perm in permissions
        }
    else:
        # Reset the real attribute so that permissions left over from an earlier
        # use of the target do not survive.
        target_metadata.permissions = None


class Document(Metadata):
"""
:param uri: the URI of the document; can be None when relying on MarkLogic to
generate a URI.
:param content: the content of the document.
:param collections: see definition in parent class.
:param permissions: see definition in parent class.
:param quality: see definition in parent class.
:param metadata_values: see definition in parent class.
:param properties: see definition in parent class.
:param content_type: the MIME type of the document; use when MarkLogic cannot
determine the MIME type based on the URI.
:param extension: specifies a suffix for a URI generated by MarkLogic.
:param directory: specifies a prefix for a URI generated by MarkLogic.
:param repair: for an XML document, the level of XML repair to perform; can be
"full" or "none", with "none" being the default.
:param version_id: affects updates when optimistic locking is enabled; see
https://docs.marklogic.com/REST/POST/v1/documents for more information.
:param temporal_document: the logical document URI for a document written to a
temporal collection; requires that a "temporal-collection" parameter be included in
the request.
Represents a document, either as read from MarkLogic or as a document to be
written to MarkLogic.
"""

def __init__(
Expand All @@ -96,24 +100,48 @@ def __init__(
metadata_values: dict = None,
properties: dict = None,
content_type: str = None,
version_id: str = None,
extension: str = None,
directory: str = None,
repair: str = None,
extract: str = None,
version_id: str = None,
temporal_document: str = None,
):
"""
:param uri: the URI of the document; can be None when relying on MarkLogic to
generate a URI.
:param content: the content of the document.
:param collections: see definition in parent class.
:param permissions: see definition in parent class.
:param quality: see definition in parent class.
:param metadata_values: see definition in parent class.
:param properties: see definition in parent class.
:param content_type: the MIME type of the document; use when MarkLogic cannot
determine the MIME type based on the URI.
:param version_id: affects updates when optimistic locking is enabled; see
https://docs.marklogic.com/REST/POST/v1/documents for more information.
:param temporal_document: the logical document URI for a document written to a
:param extension: specifies a suffix for a URI generated by MarkLogic; only used
when writing a document.
:param directory: specifies a prefix for a URI generated by MarkLogic; only used
when writing a document.
:param repair: for an XML document, the level of XML repair to perform; can be
"full" or "none", with "none" being the default; only used when writing a
document.
temporal collection; requires that a "temporal-collection" parameter be
included in the request; only used when writing a document.
"""
super().__init__(collections, permissions, quality, metadata_values, properties)
self.uri = uri
self.content = content
self.content_type = content_type
self.version_id = version_id

# The following are all specific to writing a document.
self.content_type = content_type
self.extension = extension
self.directory = directory
self.repair = repair
self.extract = extract
self.version_id = version_id
self.temporal_document = temporal_document

def to_request_field(self) -> RequestField:
Expand Down Expand Up @@ -208,6 +236,37 @@ def to_metadata_request_field(self) -> RequestField:
return field


def _extract_values_from_header(part) -> dict:
    """
    Returns a dict containing values about the document content or metadata, read
    from the headers of the given multipart body part.

    :param part: an object exposing an "encoding" attribute and a "headers" mapping
    whose keys and values are bytes in that encoding.
    :return: a dict with the keys "uri", "category", "content_type" and
    "version_id"; "content_type" and "version_id" are None when not present.
    :raises KeyError: if the Content-Disposition header, or its "filename" or
    "category" parameter, is missing.
    """
    encoding = part.encoding
    disposition = part.headers["Content-Disposition".encode(encoding)].decode(encoding)

    disposition_values = {}
    for item in disposition.split(";"):
        # Split on the first "=" only so that a value which itself contains "="
        # (e.g. a URI such as "/doc?a=b.json") is kept instead of being dropped.
        name, separator, value = item.partition("=")
        # The first item will be "attachment" and can be ignored.
        if separator:
            disposition_values[name.strip()] = value
    # NOTE(review): a quoted filename containing ";" is still split apart by the
    # loop above; confirm whether such URIs need to be supported.

    raw_content_type = part.headers.get("Content-Type".encode(encoding))
    content_type = raw_content_type.decode(encoding) if raw_content_type else None

    # The filename is normally wrapped in double quotes; remove one from each end.
    uri = disposition_values["filename"]
    if uri.startswith('"'):
        uri = uri[1:]
    if uri.endswith('"'):
        uri = uri[:-1]

    return {
        "uri": uri,
        "category": disposition_values["category"],
        "content_type": content_type,
        "version_id": disposition_values.get("versionId"),
    }


class DocumentManager:
"""
Provides methods to simplify interacting with the /v1/documents REST endpoint
Expand Down Expand Up @@ -251,3 +310,74 @@ def write(
headers["Accept"] = "application/json"

return self._session.post("/v1/documents", data=data, headers=headers, **kwargs)

def _get_multipart_documents_response(
    self, uris: list[str], categories: list[str], **kwargs
) -> Response:
    """
    Sends a GET to the v1/documents endpoint that asks for a multipart/mixed
    response covering the given URIs.

    :param uris: the URIs to request; sent as the "uri" parameter.
    :param categories: optional categories to request; the "category" parameter is
    only sent when this is non-empty.
    :param kwargs: passed through to the session's "get" call. Any "params" and
    "headers" entries are extended with the values this method needs; "uri",
    "format", "category" and the "Accept" header supplied by the caller are
    overwritten.
    :return: the response produced by the session.
    """
    # Work on copies so that dicts owned by the caller are never modified, and
    # tolerate an explicit None for either argument.
    params = dict(kwargs.pop("params", None) or {})
    params["uri"] = uris
    params["format"] = "json"  # This refers to the metadata format.
    if categories:
        params["category"] = categories

    headers = dict(kwargs.pop("headers", None) or {})
    headers["Accept"] = "multipart/mixed"
    return self._session.get(
        "/v1/documents", params=params, headers=headers, **kwargs
    )

def read(
    self, uris: list[str], categories: list[str] = None, **kwargs
) -> Union[list[Document], Response]:
    """
    Read one or many documents via a GET to the endpoint defined at
    https://docs.marklogic.com/REST/GET/v1/documents . If a 200 is not returned
    by that endpoint - for example a 4xx because the user lacks the privileges
    required by the REST API - then the Response is returned instead so that the
    caller can inspect the status code and handle the error.

    :param uris: list of URIs to read.
    :param categories: optional list of the categories of data to return for each
    URI. By default, only content will be returned for each URI. See the endpoint
    documentation for further information.
    :param kwargs: passed through to the underlying GET request.
    :return: a list with one Document per URI found in the response, or the
    Response when its status code is not 200.
    """
    response = self._get_multipart_documents_response(uris, categories, **kwargs)
    if response.status_code != 200:
        return response

    decoder = MultipartDecoder.from_response(response)

    # Use a dict to store URIs to Document objects so that we don't assume any
    # order with how the metadata and content parts are returned. An OrderedDict is
    # used to ensure that the order of the URIs is maintained, though the REST
    # endpoint is not guaranteed to return them in the same order as provided by
    # the user.
    docs = OrderedDict()

    for part in decoder.parts:
        header_values = _extract_values_from_header(part)
        uri = header_values["uri"]

        # Content and metadata for one URI arrive as separate parts; whichever
        # part shows up first creates the Document and the other fills it in.
        doc = docs.get(uri)
        if doc is None:
            doc = Document(uri, None)
            docs[uri] = doc

        if header_values["category"] == "content":
            content_type = header_values["content_type"]
            # Compare the bare media type so that a header carrying parameters
            # (e.g. "application/json; charset=utf-8") is still parsed as JSON.
            media_type = (content_type or "").split(";")[0].strip().lower()
            if media_type == "application/json":
                doc.content = json.loads(part.content)
            else:
                doc.content = part.content
            doc.content_type = content_type
            doc.version_id = header_values["version_id"]
        else:
            dict_to_metadata(json.loads(part.content), doc)

    return list(docs.values())
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"user-name": "python-not-rest-user",
"description": "For tests where the user does not have the privileges required by the REST API.",
"password": "password",
"role": [
"qconsole-user"
]
}
6 changes: 6 additions & 0 deletions test-app/src/main/ml-modules/transforms/envelope.sjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/**
 * Wraps the incoming content in a new object under the "envelope" key.
 *
 * NOTE(review): presumably invoked as a MarkLogic REST transform, given the
 * file's location under ml-modules/transforms - confirm.
 *
 * @param context accepted for the transform signature; not used here.
 * @param params accepted for the transform signature; not used here.
 * @param content the value to wrap.
 * @returns an object whose only property, "envelope", holds the content.
 */
function transform(context, params, content) {
  var wrapped = {};
  wrapped["envelope"] = content;
  return wrapped;
}

exports.transform = transform;
14 changes: 11 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,29 @@
import pytest

from marklogic import Client

BASE_URL = "http://localhost:8030"


@pytest.fixture
def client():
    """
    Returns a Client for BASE_URL that passes the "python-test-user" credentials
    via the "digest" argument.
    """
    return Client(BASE_URL, digest=("python-test-user", "password"))


@pytest.fixture
def admin_client():
    """
    Returns a Client for BASE_URL that passes the "python-test-admin" credentials
    via the "digest" argument.
    """
    return Client(BASE_URL, digest=("python-test-admin", "password"))


@pytest.fixture
def basic_client():
    """
    Returns a Client for BASE_URL that passes the "python-test-user" credentials
    via the "auth" argument rather than "digest".
    """
    # requests allows a tuple to be passed when doing basic authentication.
    return Client(BASE_URL, auth=("python-test-user", "password"))


@pytest.fixture
def not_rest_user_client():
    """
    Returns a Client for BASE_URL that passes the "python-not-rest-user"
    credentials via the "digest" argument.
    """
    credentials = ("python-not-rest-user", "password")
    return Client(BASE_URL, digest=credentials)


@pytest.fixture
Expand Down
54 changes: 0 additions & 54 deletions tests/test_get_documents.py

This file was deleted.

Loading