feat: XML string formats for normalizedString and token (#119)

fixes #114 fixes #115 --------- Signed-off-by: Jan Kowalleck <jan.kowalleck@gmail.com>
madpah · Jul 8, 2024 · 3a1728d · 3a1728d
1 parent aabb5e9
commit 3a1728d
Show file tree

Hide file tree

Showing 8 changed files with 371 additions and 23 deletions.
diff --git a/docs/customising-structure.rst b/docs/customising-structure.rst
@@ -176,6 +176,26 @@ For *Example 3*, you would add the following to your class:
 
 Further examples are available in our :ref:`unit tests <unit-tests>`.
 
+Serializing special XML string types
+----------------------------------------------------
+
+In XML, there are special string types, each with a defined set of allowed characters and whitespace handling.
+We can handle this by adding the decorator :obj:`serializable.xml_string()` to the appropriate property in your class.
+
+.. code-block:: python
+
+    @property
+    @serializable.xml_string(serializable.XmlStringSerializationType.TOKEN)
+    def author(self) -> str:
+        return self._author
+
+Further examples are available in our :ref:`unit tests <unit-tests>`.
+
+.. note::
+
+   The actual transformation is done by :func:`serializable.xml.xs_normalizedString()`
+   and :func:`serializable.xml.xs_token()`.
+
 Serialization Views
 ----------------------------------------------------
 

diff --git a/serializable/__init__.py b/serializable/__init__.py
@@ -48,6 +48,7 @@
 
 from .formatters import BaseNameFormatter, CurrentFormatter
 from .helpers import BaseHelper
+from .xml import xs_normalizedString, xs_token
 
 # `Intersection` is still not implemented, so it is interim replaced by Union for any support
 # see section "Intersection" in https://peps.python.org/pep-0483/
@@ -128,6 +129,47 @@ class XmlArraySerializationType(Enum):
     NESTED = 2
 
 
+@unique
+class XmlStringSerializationType(Enum):
+    """
+    Enum to differentiate how string-type properties are serialized.
+
+    A member is attached to a property with the ``xml_string`` decorator and is looked up
+    when string values are written to or read from XML. ``NORMALIZED_STRING`` and ``TOKEN``
+    have a transformation registered for them; ``STRING`` has none, so such values are
+    passed through unchanged.
+    """
+    STRING = 1
+    """
+    as raw string.
+    see https://www.w3.org/TR/xmlschema-2/#string
+    """
+    NORMALIZED_STRING = 2
+    """
+    as `normalizedString`.
+    see http://www.w3.org/TR/xmlschema-2/#normalizedString"""
+    TOKEN = 3
+    """
+    as `token`.
+    see http://www.w3.org/TR/xmlschema-2/#token"""
+
+    # unimplemented cases
+    # - https://www.w3.org/TR/xmlschema-2/#language
+    # - https://www.w3.org/TR/xmlschema-2/#NMTOKEN
+    # - https://www.w3.org/TR/xmlschema-2/#Name
+
+
+# region _xs_string_mod_apply
+
+__XS_STRING_MODS: Dict[XmlStringSerializationType, Callable[[str], str]] = {
+    XmlStringSerializationType.NORMALIZED_STRING: xs_normalizedString,
+    XmlStringSerializationType.TOKEN: xs_token,
+}
+
+
+def _xs_string_mod_apply(v: str, t: Optional[XmlStringSerializationType]) -> str:
+    """Apply the string transformation registered for ``t`` to ``v``.
+
+    :param v: the string to transform.
+    :param t: the configured XML string type; may be ``None``.
+    :return: the transformed string, or ``v`` unchanged when no transformation
+        is registered for ``t``.
+    """
+    transform = __XS_STRING_MODS.get(t)  # type: ignore[arg-type]
+    if not transform:
+        # nothing registered for this type (or no type configured): keep the value as is
+        return v
+    return transform(v)
+
+
+# endregion _xs_string_mod_apply
+
+
 def _allow_property_for_view(prop_info: 'ObjectMetadataLibrary.SerializableProperty', value_: Any,
                              view_: Optional[Type[ViewType]]) -> bool:
     # First check Property is part of the View is given
@@ -394,7 +436,8 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
                     elif prop_info.is_enum:
                         v = v.value
 
-                    this_e_attributes[_namespace_element_name(new_key, xmlns)] = str(v)
+                    this_e_attributes[_namespace_element_name(new_key, xmlns)] = \
+                        _xs_string_mod_apply(str(v), prop_info.xml_string_config)
 
         element_name = _namespace_element_name(
             element_name if element_name else CurrentFormatter.formatter.encode(self.__class__.__name__),
@@ -426,7 +469,8 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
                     continue
 
                 if new_key == '.':
-                    this_e.text = str(v)
+                    this_e.text = _xs_string_mod_apply(str(v),
+                                                       prop_info.xml_string_config)
                     continue
 
                 if CurrentFormatter.formatter:
@@ -445,14 +489,16 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
                             nested_e.append(
                                 j.as_xml(view_=view_, as_string=False, element_name=nested_key, xmlns=xmlns))
                         elif prop_info.is_enum:
-                            SubElement(nested_e, nested_key).text = str(j.value)
+                            SubElement(nested_e, nested_key).text = _xs_string_mod_apply(str(j.value),
+                                                                                         prop_info.xml_string_config)
                         elif prop_info.concrete_type in (float, int):
                             SubElement(nested_e, nested_key).text = str(j)
                         elif prop_info.concrete_type is bool:
                             SubElement(nested_e, nested_key).text = str(j).lower()
                         else:
                             # Assume type is str
-                            SubElement(nested_e, nested_key).text = str(j)
+                            SubElement(nested_e, nested_key).text = _xs_string_mod_apply(str(j),
+                                                                                         prop_info.xml_string_config)
                 elif prop_info.custom_type:
                     if prop_info.is_helper_type():
                         v_ser = prop_info.custom_type.xml_normalize(
@@ -462,11 +508,14 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
                         elif isinstance(v_ser, Element):
                             this_e.append(v_ser)
                         else:
-                            SubElement(this_e, new_key).text = str(v_ser)
+                            SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v_ser),
+                                                                                    prop_info.xml_string_config)
                     else:
-                        SubElement(this_e, new_key).text = str(prop_info.custom_type(v))
+                        SubElement(this_e, new_key).text = _xs_string_mod_apply(str(prop_info.custom_type(v)),
+                                                                                prop_info.xml_string_config)
                 elif prop_info.is_enum:
-                    SubElement(this_e, new_key).text = str(v.value)
+                    SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v.value),
+                                                                            prop_info.xml_string_config)
                 elif not prop_info.is_primitive_type():
                     global_klass_name = f'{prop_info.concrete_type.__module__}.{prop_info.concrete_type.__name__}'
                     if global_klass_name in ObjectMetadataLibrary.klass_mappings:
@@ -475,16 +524,19 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
                     else:
                         # Handle properties that have a type that is not a Python Primitive (e.g. int, float, str)
                         if prop_info.string_format:
-                            SubElement(this_e, new_key).text = f'{v:{prop_info.string_format}}'
+                            SubElement(this_e, new_key).text = _xs_string_mod_apply(f'{v:{prop_info.string_format}}',
+                                                                                    prop_info.xml_string_config)
                         else:
-                            SubElement(this_e, new_key).text = str(v)
+                            SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v),
+                                                                                    prop_info.xml_string_config)
                 elif prop_info.concrete_type in (float, int):
                     SubElement(this_e, new_key).text = str(v)
                 elif prop_info.concrete_type is bool:
                     SubElement(this_e, new_key).text = str(v).lower()
                 else:
                     # Assume type is str
-                    SubElement(this_e, new_key).text = str(v)
+                    SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v),
+                                                                            prop_info.xml_string_config)
 
         if as_string:
             return cast(Element, SafeElementTree.tostring(this_e, 'unicode'))
@@ -542,6 +594,9 @@ def strip_default_namespace(s: str) -> str:
                 raise ValueError(f'Non-primitive types not supported from XML Attributes - see {decoded_k} for '
                                  f'{cls.__module__}.{cls.__qualname__} which has Prop Metadata: {prop_info}')
 
+            if prop_info.xml_string_config:
+                v = _xs_string_mod_apply(v, prop_info.xml_string_config)
+
             if prop_info.custom_type and prop_info.is_helper_type():
                 _data[decoded_k] = prop_info.custom_type.xml_deserialize(v)
             elif prop_info.is_enum:
@@ -555,7 +610,7 @@ def strip_default_namespace(s: str) -> str:
         if data.text:
             for p, pi in klass_properties.items():
                 if pi.custom_names.get(SerializationType.XML) == '.':
-                    _data[p] = data.text.strip()
+                    _data[p] = _xs_string_mod_apply(data.text.strip(), pi.xml_string_config)
 
         # Handle Sub-Elements
         for child_e in data:
@@ -594,6 +649,9 @@ def strip_default_namespace(s: str) -> str:
             try:
                 _logger.debug('Handling %s', prop_info)
 
+                if child_e.text:
+                    child_e.text = _xs_string_mod_apply(child_e.text, prop_info.xml_string_config)
+
                 if prop_info.is_array and prop_info.xml_array_config:
                     array_type, nested_name = prop_info.xml_array_config
 
@@ -602,6 +660,9 @@ def strip_default_namespace(s: str) -> str:
 
                     if array_type == XmlArraySerializationType.NESTED:
                         for sub_child_e in child_e:
+                            if sub_child_e.text:
+                                sub_child_e.text = _xs_string_mod_apply(sub_child_e.text,
+                                                                        prop_info.xml_string_config)
                             if not prop_info.is_primitive_type() and not prop_info.is_enum:
                                 _data[decoded_k].append(prop_info.concrete_type.from_xml(
                                     data=sub_child_e, default_namespace=default_namespace)
@@ -675,6 +736,7 @@ class ObjectMetadataLibrary:
     _deferred_property_type_parsing: Dict[str, Set['ObjectMetadataLibrary.SerializableProperty']] = {}
     _klass_views: Dict[str, Type[ViewType]] = {}
     _klass_property_array_config: Dict[str, Tuple[XmlArraySerializationType, str]] = {}
+    _klass_property_string_config: Dict[str, Optional[XmlStringSerializationType]] = {}
     _klass_property_attributes: Set[str] = set()
     _klass_property_include_none: Dict[str, Set[Tuple[Type[ViewType], Any]]] = {}
     _klass_property_names: Dict[str, Dict[SerializationType, str]] = {}
@@ -738,12 +800,14 @@ class SerializableProperty:
 
         _DEFAULT_XML_SEQUENCE = 100
 
-        def __init__(self, *, prop_name: str, prop_type: Any, custom_names: Dict[SerializationType, str],
+        def __init__(self, *,
+                     prop_name: str, prop_type: Any, custom_names: Dict[SerializationType, str],
                      custom_type: Optional[Any] = None,
                      include_none_config: Optional[Set[Tuple[Type[ViewType], Any]]] = None,
                      is_xml_attribute: bool = False, string_format_: Optional[str] = None,
                      views: Optional[Iterable[Type[ViewType]]] = None,
                      xml_array_config: Optional[Tuple[XmlArraySerializationType, str]] = None,
+                     xml_string_config: Optional[XmlStringSerializationType] = None,
                      xml_sequence_: Optional[int] = None) -> None:
 
             self._name = prop_name
@@ -764,6 +828,7 @@ def __init__(self, *, prop_name: str, prop_type: Any, custom_names: Dict[Seriali
             self._string_format = string_format_
             self._views = set(views or ())
             self._xml_array_config = xml_array_config
+            self._xml_string_config = xml_string_config
             self._xml_sequence = xml_sequence_ or self._DEFAULT_XML_SEQUENCE
 
             self._deferred_type_parsing = False
@@ -834,6 +899,10 @@ def xml_array_config(self) -> Optional[Tuple[XmlArraySerializationType, str]]:
         def is_array(self) -> bool:
             return self._is_array
 
+        @property
+        def xml_string_config(self) -> Optional[XmlStringSerializationType]:
+            """The XML string type given at construction, or ``None`` when none was given."""
+            return self._xml_string_config
+
         @property
         def is_enum(self) -> bool:
             return self._is_enum
@@ -1050,6 +1119,7 @@ def register_klass(cls, klass: Type[_T], custom_name: Optional[str],
                 string_format_=ObjectMetadataLibrary._klass_property_string_formats.get(qualified_property_name),
                 views=ObjectMetadataLibrary._klass_property_views.get(qualified_property_name),
                 xml_array_config=ObjectMetadataLibrary._klass_property_array_config.get(qualified_property_name),
+                xml_string_config=ObjectMetadataLibrary._klass_property_string_config.get(qualified_property_name),
                 xml_sequence_=ObjectMetadataLibrary._klass_property_xml_sequence.get(
                     qualified_property_name,
                     ObjectMetadataLibrary.SerializableProperty._DEFAULT_XML_SEQUENCE)
@@ -1117,6 +1187,11 @@ def register_xml_property_array_config(cls, qual_name: str,
                                            array_type: XmlArraySerializationType, child_name: str) -> None:
         cls._klass_property_array_config[qual_name] = (array_type, child_name)
 
+    @classmethod
+    def register_xml_property_string_config(cls, qual_name: str,
+                                            string_type: Optional[XmlStringSerializationType]) -> None:
+        """Record the XML string type to use for the property identified by ``qual_name``.
+
+        A later registration for the same ``qual_name`` replaces the earlier one.
+        """
+        cls._klass_property_string_config[qual_name] = string_type
+
     @classmethod
     def register_xml_property_attribute(cls, qual_name: str) -> None:
         cls._klass_property_attributes.add(qual_name)
@@ -1305,6 +1380,19 @@ def decorate(f: _F) -> _F:
     return decorate
 
 
+def xml_string(string_type: XmlStringSerializationType) -> Callable[[_F], _F]:
+    """Decorator that declares which XML string type a property is serialized as.
+
+    :param string_type: the string type to register for the decorated callable.
+    :return: a decorator that registers ``string_type`` under the decorated callable's
+        ``<module>.<qualname>`` and returns the callable unchanged.
+    """
+
+    def decorate(f: _F) -> _F:
+        qualified_name = f'{f.__module__}.{f.__qualname__}'
+        _logger.debug('Registering %s.%s as XML StringType: %s', f.__module__, f.__qualname__, string_type)
+        ObjectMetadataLibrary.register_xml_property_string_config(qual_name=qualified_name, string_type=string_type)
+        return f
+
+    return decorate
+
+
 def xml_name(name: str) -> Callable[[_F], _F]:
     """Decorator"""
 

diff --git a/serializable/json.py b/serializable/json.py
@@ -0,0 +1,22 @@
+# encoding: utf-8
+
+# This file is part of py-serializable
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Paul Horton. All Rights Reserved.
+
+"""
+JSON-specific functionality.
+"""
diff --git a/serializable/xml.py b/serializable/xml.py
@@ -0,0 +1,82 @@
+# encoding: utf-8
+
+# This file is part of py-serializable
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Paul Horton. All Rights Reserved.
+
+"""
+XML-specific functionality.
+"""
+
+__all__ = ['xs_normalizedString', 'xs_token']
+
+from re import compile as re_compile
+
+# region normalizedString
+
+__NORMALIZED_STRING_FORBIDDEN_SEARCH = re_compile(r'\r\n|\t|\n|\r')
+__NORMALIZED_STRING_FORBIDDEN_REPLACE = ' '
+
+
+def xs_normalizedString(s: str) -> str:
+    """Make a ``normalizedString``, adhering to the XML spec.
+
+    Every CRLF pair, tab, line feed and carriage return in ``s`` is replaced by a
+    single space; a CRLF pair yields one space, not two.
+
+    .. epigraph::
+       *normalizedString* represents white space normalized strings.
+       The `·value space· <https://www.w3.org/TR/xmlschema-2/#dt-value-space>`_ of normalizedString is the set of
+       strings that do not contain the carriage return (#xD), line feed (#xA) nor tab (#x9) characters.
+       The `·lexical space· <https://www.w3.org/TR/xmlschema-2/#dt-lexical-space>`_ of normalizedString is the set of
+       strings that do not contain the carriage return (#xD), line feed (#xA) nor tab (#x9) characters.
+       The `·base type· <https://www.w3.org/TR/xmlschema-2/#dt-basetype>`_ of normalizedString is
+       `string <https://www.w3.org/TR/xmlschema-2/#string>`_.
+
+       -- the `XML schema spec <http://www.w3.org/TR/xmlschema-2/#normalizedString>`_
+
+    :param s: the string to normalize.
+    :return: ``s`` with the forbidden characters replaced by spaces.
+    """
+    normalized = __NORMALIZED_STRING_FORBIDDEN_SEARCH.sub(__NORMALIZED_STRING_FORBIDDEN_REPLACE, s)
+    return normalized
+
+
+# endregion
+
+# region token
+
+
+__TOKEN_MULTISTRING_SEARCH = re_compile(r' {2,}')
+__TOKEN_MULTISTRING_REPLACE = ' '
+
+
+def xs_token(s: str) -> str:
+    """Make a ``token``, adhering to the XML spec.
+
+    The input is first passed through :func:`xs_normalizedString`; then leading and
+    trailing spaces (#x20) are removed and every run of two or more spaces is collapsed
+    to a single space. Only the space character is treated as whitespace: other Unicode
+    whitespace (e.g. a no-break space) is not covered by the spec's rule and is preserved.
+
+    .. epigraph::
+       *token* represents tokenized strings.
+       The `·value space· <https://www.w3.org/TR/xmlschema-2/#dt-value-space>`_ of token is the set of strings that do
+       not contain the carriage return (#xD), line feed (#xA) nor tab (#x9) characters, that have no leading or
+       trailing spaces (#x20) and that have no internal sequences of two or more spaces.
+       The `·lexical space· <https://www.w3.org/TR/xmlschema-2/#dt-lexical-space>`_ of token is the set of strings that
+       do not contain the carriage return (#xD), line feed (#xA) nor tab (#x9) characters, that have no leading or
+       trailing spaces (#x20) and that have no internal sequences of two or more spaces.
+       The `·base type· <https://www.w3.org/TR/xmlschema-2/#dt-basetype>`_ of token is
+       `normalizedString <https://www.w3.org/TR/xmlschema-2/#normalizedString>`_.
+
+       -- the `XML schema spec <http://www.w3.org/TR/xmlschema-2/#token>`_
+
+    :param s: the string to tokenize.
+    :return: the normalized string without leading/trailing spaces and without runs of spaces.
+    """
+    # strip spaces only: a bare `str.strip()` would also remove non-XML whitespace such as U+00A0
+    return __TOKEN_MULTISTRING_SEARCH.sub(
+        __TOKEN_MULTISTRING_REPLACE,
+        xs_normalizedString(s).strip(' '))
+
+# endregion