From bbcf2b98351cd2c1039c2405d0f4b102aefdfb53 Mon Sep 17 00:00:00 2001
From: Rob Rudin <rob.rudin@marklogic.com>
Date: Wed, 12 Jul 2023 12:09:45 -0400
Subject: [PATCH] DEVEXP-496 Can now write a batch of documents

I'm going to do docs in a separate PR, after all the metadata stuff is supported too.

There are a lot of new files here only because I needed them in the test app to test things like optimistic locking and temporal writes.
---
 marklogic/client.py                           |   7 +
 marklogic/documents.py                        | 125 ++++++++++++
 test-app/.gitignore                           |   1 +
 .../security/roles/python-tester.json         |  23 +++
 .../security/users/python-test-admin.json     |   7 +
 .../security/users/python-test-user.json      |   4 +-
 .../temporal/axes/temporal-system-axis.json   |  15 ++
 .../temporal/axes/temporal-valid-axis.json    |  15 ++
 .../collections/temporal-collection.json      |   8 +
 .../src/main/ml-data/permissions.properties   |   2 +-
 .../src/main/ml-modules/rest-properties.json  |   3 +
 tests/conftest.py                             |   5 +
 tests/test_write_documents.py                 | 182 ++++++++++++++++++
 13 files changed, 393 insertions(+), 4 deletions(-)
 create mode 100644 marklogic/documents.py
 create mode 100644 test-app/src/main/ml-config/security/roles/python-tester.json
 create mode 100644 test-app/src/main/ml-config/security/users/python-test-admin.json
 create mode 100644 test-app/src/main/ml-config/temporal/axes/temporal-system-axis.json
 create mode 100644 test-app/src/main/ml-config/temporal/axes/temporal-valid-axis.json
 create mode 100644 test-app/src/main/ml-config/temporal/collections/temporal-collection.json
 create mode 100644 test-app/src/main/ml-modules/rest-properties.json
 create mode 100644 tests/test_write_documents.py

diff --git a/marklogic/client.py b/marklogic/client.py
index ed96507..3ba4a10 100644
--- a/marklogic/client.py
+++ b/marklogic/client.py
@@ -1,5 +1,6 @@
 import requests
 from marklogic.cloud_auth import MarkLogicCloudAuth
+from marklogic.documents import DocumentManager
 from requests.auth import HTTPDigestAuth
 from urllib.parse import urljoin
 
@@ -63,3 +64,9 @@ def prepare_request(self, request, *args, **kwargs):
         """
         request.url = urljoin(self.base_url, request.url)
         return super(Client, self).prepare_request(request, *args, **kwargs)
+
+    @property
+    def documents(self):  # DocumentManager bound to this client, built on first use
+        if not hasattr(self, "_documents"):
+            self._documents = DocumentManager(self)
+        return self._documents
diff --git a/marklogic/documents.py b/marklogic/documents.py
new file mode 100644
index 0000000..61c7836
--- /dev/null
+++ b/marklogic/documents.py
@@ -0,0 +1,125 @@
+import json
+from requests import Session
+from urllib3.fields import RequestField
+from urllib3.filepost import encode_multipart_formdata
+
+
+class Document:
+    """
+    A document to write to MarkLogic as one part of a multipart request.
+
+    :param uri: the URI of the document; can be None when relying on MarkLogic to
+    generate a URI.
+    :param content: the content of the document.
+    :param content_type: the MIME type of the document; use when MarkLogic cannot
+    determine the MIME type based on the URI.
+    :param extension: specifies a suffix for a URI generated by MarkLogic.
+    :param directory: specifies a prefix for a URI generated by MarkLogic.
+    :param repair: for an XML document, the level of XML repair to perform; can be
+    "full" or "none", with "none" being the default.
+    :param extract: for a binary document, where to store extracted metadata, e.g.
+    "properties"; see https://docs.marklogic.com/REST/POST/v1/documents.
+    :param version_id: affects updates when optimistic locking is enabled; see
+    https://docs.marklogic.com/REST/POST/v1/documents for more information.
+    :param temporal_document: the logical document URI for a document written to a
+    temporal collection; requires that a "temporal-collection" parameter be included in
+    the request.
+    """
+
+    def __init__(
+        self,
+        uri: str,
+        content,
+        content_type: str = None,
+        extension: str = None,
+        directory: str = None,
+        repair: str = None,
+        extract: str = None,
+        version_id: str = None,
+        temporal_document: str = None,
+    ):
+        self.uri = uri
+        self.content = content
+        self.content_type = content_type
+        self.extension = extension
+        self.directory = directory
+        self.repair = repair
+        self.extract = extract
+        self.version_id = version_id
+        self.temporal_document = temporal_document
+
+    def to_request_field(self) -> RequestField:
+        """Convert this document into a multipart RequestField."""
+        data = self.content
+        # isinstance rather than an exact type check, so dict subclasses serialize too.
+        if isinstance(data, dict):
+            data = json.dumps(data)
+        field = RequestField(name=self.uri, data=data, filename=self.uri)
+        field.make_multipart(
+            content_disposition=self._make_disposition(),
+            content_type=self.content_type,
+        )
+        return field
+
+    def _make_disposition(self) -> str:
+        """Build the Content-Disposition value carrying the per-document options."""
+        parts = ["attachment" if self.uri else "inline"]
+        if not self.uri:
+            # extension/directory are only sent when MarkLogic generates the URI.
+            if self.extension:
+                parts.append(f"extension={self.extension}")
+            if self.directory:
+                parts.append(f"directory={self.directory}")
+        optional = (
+            ("repair", self.repair),
+            ("extract", self.extract),
+            ("versionId", self.version_id),
+            ("temporal-document", self.temporal_document),
+        )
+        parts.extend(f"{name}={value}" for name, value in optional if value)
+        return ";".join(parts)
+
+
+class DocumentManager:
+    """Writes documents to MarkLogic through the given session."""
+
+    def __init__(self, session: Session):
+        self._session = session
+
+    def write(self, documents: list[Document], **kwargs):
+        """
+        POST the documents to /v1/documents as one multipart/mixed request.
+
+        :param documents: the documents to write.
+        :param kwargs: passed to the session's post(); "headers" is copied, given the
+        multipart Content-Type, and given a JSON Accept when none is set.
+        :return: the response returned by the session.
+        """
+        fields = [self._make_default_metadata_field()]
+        fields.extend(doc.to_request_field() for doc in documents)
+        data, content_type = encode_multipart_formdata(fields)
+
+        # Copy so the caller's headers dict is never mutated (None is tolerated too).
+        headers = dict(kwargs.pop("headers", None) or {})
+        # Keep the generated boundary but replace form-data with multipart/mixed.
+        boundary = "".join(content_type.partition(";")[1:])
+        headers["Content-Type"] = f"multipart/mixed{boundary}"
+        if not headers.get("Accept"):
+            headers["Accept"] = "application/json"
+
+        return self._session.post("/v1/documents", data=data, headers=headers, **kwargs)
+
+    def _make_default_metadata_field(self):
+        """
+        Temporary method to ensure the test user can see written documents. Will be
+        removed when this feature is implemented for real.
+        """
+        permission = {"role-name": "python-tester", "capabilities": ["read", "update"]}
+        field = RequestField(
+            name="request-metadata", data=json.dumps({"permissions": [permission]})
+        )
+        field.make_multipart(
+            content_disposition="inline; category=metadata",
+            content_type="application/json",
+        )
+        return field
diff --git a/test-app/.gitignore b/test-app/.gitignore
index 7f6511a..3efdcd5 100644
--- a/test-app/.gitignore
+++ b/test-app/.gitignore
@@ -1,2 +1,3 @@
 .gradle
 gradle-local.properties
+build
diff --git a/test-app/src/main/ml-config/security/roles/python-tester.json b/test-app/src/main/ml-config/security/roles/python-tester.json
new file mode 100644
index 0000000..616bb5d
--- /dev/null
+++ b/test-app/src/main/ml-config/security/roles/python-tester.json
@@ -0,0 +1,23 @@
+{
+    "role-name": "python-tester",
+    "role": [
+        "rest-extension-user"
+    ],
+    "privilege": [
+        {
+            "privilege-name": "rest-reader",
+            "action": "http://marklogic.com/xdmp/privileges/rest-reader",
+            "kind": "execute"
+        },
+        {
+            "privilege-name": "rest-writer",
+            "action": "http://marklogic.com/xdmp/privileges/rest-writer",
+            "kind": "execute"
+        },
+        {
+            "privilege-name": "xdbc:eval",
+            "action": "http://marklogic.com/xdmp/privileges/xdbc-eval",
+            "kind": "execute"
+        }
+    ]
+}
diff --git a/test-app/src/main/ml-config/security/users/python-test-admin.json b/test-app/src/main/ml-config/security/users/python-test-admin.json
new file mode 100644
index 0000000..ff988f3
--- /dev/null
+++ b/test-app/src/main/ml-config/security/users/python-test-admin.json
@@ -0,0 +1,7 @@
+{
+    "user-name": "python-test-admin",
+    "password": "password",
+    "role": [
+        "admin"
+    ]
+}
\ No newline at end of file
diff --git a/test-app/src/main/ml-config/security/users/python-test-user.json b/test-app/src/main/ml-config/security/users/python-test-user.json
index fcacb51..f033fbb 100644
--- a/test-app/src/main/ml-config/security/users/python-test-user.json
+++ b/test-app/src/main/ml-config/security/users/python-test-user.json
@@ -2,9 +2,7 @@
     "user-name": "python-test-user",
     "password": "password",
     "role": [
-        "rest-evaluator",
-        "rest-reader",
-        "rest-writer",
+        "python-tester",
         "qconsole-user"
     ]
 }
\ No newline at end of file
diff --git a/test-app/src/main/ml-config/temporal/axes/temporal-system-axis.json b/test-app/src/main/ml-config/temporal/axes/temporal-system-axis.json
new file mode 100644
index 0000000..7c87e06
--- /dev/null
+++ b/test-app/src/main/ml-config/temporal/axes/temporal-system-axis.json
@@ -0,0 +1,15 @@
+{
+	"axis-name": "system",
+	"axis-start": {
+		"element-reference": {
+			"namespace-uri": "",
+			"localname": "systemStart"
+		}
+	},
+	"axis-end": {
+		"element-reference": {
+			"namespace-uri": "",
+			"localname": "systemEnd"
+		}
+	}
+}
diff --git a/test-app/src/main/ml-config/temporal/axes/temporal-valid-axis.json b/test-app/src/main/ml-config/temporal/axes/temporal-valid-axis.json
new file mode 100644
index 0000000..4781821
--- /dev/null
+++ b/test-app/src/main/ml-config/temporal/axes/temporal-valid-axis.json
@@ -0,0 +1,15 @@
+{
+	"axis-name": "valid",
+	"axis-start": {
+		"element-reference": {
+			"namespace-uri": "",
+			"localname": "validStart"
+		}
+	},
+	"axis-end": {
+		"element-reference": {
+			"namespace-uri": "",
+			"localname": "validEnd"
+		}
+	}
+}
diff --git a/test-app/src/main/ml-config/temporal/collections/temporal-collection.json b/test-app/src/main/ml-config/temporal/collections/temporal-collection.json
new file mode 100644
index 0000000..436f6eb
--- /dev/null
+++ b/test-app/src/main/ml-config/temporal/collections/temporal-collection.json
@@ -0,0 +1,8 @@
+{
+  "collection-name": "temporal-collection",
+  "system-axis": "system",
+  "valid-axis": "valid",
+  "option": [
+    "updates-admin-override"
+  ]
+}
diff --git a/test-app/src/main/ml-data/permissions.properties b/test-app/src/main/ml-data/permissions.properties
index c977854..a181f8b 100644
--- a/test-app/src/main/ml-data/permissions.properties
+++ b/test-app/src/main/ml-data/permissions.properties
@@ -1 +1 @@
-*=rest-reader,read,rest-writer,update
+*=python-tester,read,python-tester,update
diff --git a/test-app/src/main/ml-modules/rest-properties.json b/test-app/src/main/ml-modules/rest-properties.json
new file mode 100644
index 0000000..a0e9758
--- /dev/null
+++ b/test-app/src/main/ml-modules/rest-properties.json
@@ -0,0 +1,3 @@
+{
+    "update-policy": "VERSION_OPTIONAL"
+  }
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 8639a17..570fd9e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,6 +7,11 @@ def client():
     return Client("http://localhost:8030", digest=("python-test-user", "password"))
 
 
+@pytest.fixture
+def admin_client():  # digest-authenticated as the "python-test-admin" user
+    return Client("http://localhost:8030", digest=("python-test-admin", "password"))
+
+
 @pytest.fixture
 def basic_client():
     # requests allows a tuple to be passed when doing basic authentication.
diff --git a/tests/test_write_documents.py b/tests/test_write_documents.py
new file mode 100644
index 0000000..b1cd57d
--- /dev/null
+++ b/tests/test_write_documents.py
@@ -0,0 +1,182 @@
+import pytest
+
+from marklogic import Client
+from marklogic.documents import Document
+
+
+@pytest.fixture(autouse=True)
+def prepare_test_database(admin_client: Client):
+    """
+    Runs before every test and removes each document that is not in the 'test-data'
+    collection, which is intended to hold the documents loaded by the test-app. The
+    eval is sent by a user with the 'admin' role so that temporal documents can be
+    deleted.
+    """
+    # One server-side eval finds the matching URIs and deletes each of them.
+    xquery = "cts:uris((), (), cts:not-query(cts:collection-query('test-data'))) \
+        ! xdmp:document-delete(.)"
+    form_headers = {"Content-type": "application/x-www-form-urlencoded"}
+    eval_response = admin_client.post(
+        "v1/eval", headers=form_headers, data={"xquery": xquery}
+    )
+    assert eval_response.status_code == 200
+
+
+def test_write_json(client: Client):
+    """JSON content can be passed as either a dict or a string."""
+    docs = [
+        Document("/temp/doc1.json", {"doc": 1}),
+        Document("/temp/doc2.json", '{"doc": 2}'),
+    ]
+    response = client.documents.write(docs)
+
+    assert response.status_code == 200
+    assert response.headers["Content-type"].startswith("application/json")
+    written = response.json()["documents"]
+    assert len(written) == 2
+
+    # Read both documents back to confirm each was stored as JSON.
+    first = client.get("v1/documents?uri=/temp/doc1.json").json()
+    assert first["doc"] == 1
+    second = client.get("v1/documents?uri=/temp/doc2.json").json()
+    assert second["doc"] == 2
+
+
+def test_return_xml(client: Client):
+    """
+    A caller-supplied Accept header must not be lost when the client sets the
+    request Content-type to multipart/mixed.
+    """
+    doc1 = Document("/temp/doc1.json", {"doc": 1})
+    doc2 = Document("/temp/doc2.json", {"doc": 2})
+    xml_headers = {"Accept": "application/xml"}
+    response = client.documents.write([doc1, doc2], headers=xml_headers)
+
+    content_type = response.headers["Content-type"]
+    assert content_type.startswith("application/xml")
+    assert response.text.startswith("<rapi:documents")
+
+
+def test_write_json_and_xml(client: Client):
+    """A single batch can mix JSON and XML documents."""
+    docs = [
+        Document("/temp/doc1.json", {"doc": 1}),
+        Document("/temp/doc2.xml", "<doc>2</doc>"),
+    ]
+    response = client.documents.write(docs)
+    assert response.status_code == 200
+
+    doc1 = client.get("v1/documents?uri=/temp/doc1.json").json()
+    assert doc1["doc"] == 1
+    doc2_text = client.get("v1/documents?uri=/temp/doc2.xml").text
+    assert "<doc>2</doc>" in doc2_text
+
+
+def test_content_types(client: Client):
+    """
+    A content type can be given per document for cases where MarkLogic cannot
+    determine the type from the URI.
+    """
+    docs = [
+        Document("/temp/doc1", {"doc": 1}, content_type="application/json"),
+        Document("/temp/doc2", "<doc>2</doc>", content_type="application/xml"),
+    ]
+    response = client.documents.write(docs)
+    assert response.status_code == 200
+
+    # Neither URI has an extension; read each document back to check its content.
+    doc1 = client.get("v1/documents?uri=/temp/doc1").json()
+    assert doc1["doc"] == 1
+    doc2_text = client.get("v1/documents?uri=/temp/doc2").text
+    assert "<doc>2</doc>" in doc2_text
+
+
+def test_single_doc(client):
+    """A batch containing just one document is written successfully."""
+    only_doc = Document("/temp/doc1.json", {"doc": 1})
+    assert client.documents.write([only_doc]).status_code == 200
+    stored = client.get("v1/documents?uri=/temp/doc1.json").json()
+    assert stored["doc"] == 1
+
+
+def test_server_generated_uri(client):
+    """A document with no URI is written with a directory and extension instead."""
+    doc = Document(None, {"doc": "serveruri"}, extension=".json", directory="/temp/")
+    response = client.documents.write([doc])
+    assert response.status_code == 200
+
+    # The generated URI is unknown here, so search for the document to find it.
+    results = client.get("/v1/search?q=serveruri&format=json").json()
+    assert results["total"] == 1
+    generated_uri = results["results"][0]["uri"]
+
+    stored = client.get(f"v1/documents?uri={generated_uri}").json()
+    assert stored["doc"] == "serveruri"
+
+
+def test_repair_xml(client):
+    """Malformed XML is stored in repaired form when repair="full" is requested."""
+    doc = Document("/temp/doc1.xml", "<doc>needs <b>closing tag</doc>", repair="full")
+    response = client.documents.write([doc])
+    assert response.status_code == 200
+
+    xml = client.get("v1/documents?uri=/temp/doc1.xml").text
+    assert "<doc>needs <b>closing tag</b></doc>" in xml
+
+
+@pytest.mark.skip("Will succeed only if MarkLogic converters are installed.")
+def test_extract_binary(client):
+    """Writes a binary document with the extract option set to "properties"."""
+    payload = bytes("MarkLogic and Python", "ascii")
+    doc = Document("/temp/doc1.bin", payload, extract="properties")
+    response = client.documents.write([doc])
+    assert response.status_code == 200
+
+
+def test_optimistic_locking(client):
+    """
+    An update that passes the current ETag as version_id succeeds; repeating it with
+    the same, now stale, ETag is rejected with a 412.
+    """
+    response = client.documents.write([Document("/temp/doc1.json", {"content": "original"})])
+    assert response.status_code == 200
+
+    # The ETag defines the version of the document.
+    etag = client.get("v1/documents?uri=/temp/doc1.json").headers["ETag"]
+
+    # First update passes the current version, so it is accepted.
+    updated = Document("/temp/doc1.json", {"content": "updated!"}, version_id=etag)
+    response = client.documents.write([updated])
+    assert response.status_code == 200
+
+    # Verify the doc was updated.
+    doc = client.get("v1/documents?uri=/temp/doc1.json").json()
+    assert doc["content"] == "updated!"
+
+    # Reusing the same ETag must now fail: it is no longer the current version.
+    stale = Document("/temp/doc1.json", {"this": "should fail"}, version_id=etag)
+    response = client.documents.write([stale])
+    assert response.status_code == 412, "412 is returned when the versionId is invalid."
+    assert "RESTAPI-CONTENTWRONGVERSION" in response.text
+
+
+def test_temporal_doc(client):
+    """Writes a document to the temporal collection with a logical document URI."""
+    start, end = "2014-04-03T11:00:00", "2014-04-03T16:00:00"
+    content = {
+        "text": "hello world",
+        "systemStart": start,
+        "systemEnd": end,
+        "validStart": start,
+        "validEnd": end,
+    }
+    doc = Document("/temp/doc1.json", content, temporal_document="custom1")
+    params = {"temporal-collection": "temporal-collection"}
+    response = client.documents.write([doc], params=params)
+    assert response.status_code == 200
+
+    # The temporal doc should be in the "custom1" collection; checking that via search
+    # is a stopgap until reading documents and their metadata is supported.
+    found = client.get("/v1/search?collection=custom1&format=json").json()
+    assert found["total"] == 1
+    assert found["results"][0]["uri"] == "/temp/doc1.json"