Skip to content

Commit

Permalink
feat: XML string formats for normalizedString and token (#119)
Browse files Browse the repository at this point in the history
fixes #114 
fixes #115 

---------

Signed-off-by: Jan Kowalleck <jan.kowalleck@gmail.com>
  • Loading branch information
jkowalleck committed Jul 8, 2024
1 parent aabb5e9 commit 3a1728d
Show file tree
Hide file tree
Showing 8 changed files with 371 additions and 23 deletions.
20 changes: 20 additions & 0 deletions docs/customising-structure.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,26 @@ For *Example 3*, you would add the following to your class:
Further examples are available in our :ref:`unit tests <unit-tests>`.

Serializing special XML string types
----------------------------------------------------

XML defines special string types, each with a defined set of allowed characters and its own whitespace handling.
We can handle this by adding the decorator :obj:`serializable.xml_string()` to the appropriate property in your class.

.. code-block:: python

   @property
   @serializable.xml_string(serializable.XmlStringSerializationType.TOKEN)
   def author(self) -> str:
       return self._author

Further examples are available in our :ref:`unit tests <unit-tests>`.

.. note::

The actual transformation is done by :func:`serializable.xml.xs_normalizedString()`
and :func:`serializable.xml.xs_token()`

Serialization Views
----------------------------------------------------

Expand Down
112 changes: 100 additions & 12 deletions serializable/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@

from .formatters import BaseNameFormatter, CurrentFormatter
from .helpers import BaseHelper
from .xml import xs_normalizedString, xs_token

# `Intersection` is still not implemented, so it is interim replaced by Union for any support
# see section "Intersection" in https://peps.python.org/pep-0483/
Expand Down Expand Up @@ -128,6 +129,47 @@ class XmlArraySerializationType(Enum):
NESTED = 2


@unique
class XmlStringSerializationType(Enum):
    """
    Selects which XML Schema string type a string-type property is treated as.
    """

    STRING = 1
    """
    Raw ``string``.
    See https://www.w3.org/TR/xmlschema-2/#string
    """

    NORMALIZED_STRING = 2
    """
    ``normalizedString``.
    See https://www.w3.org/TR/xmlschema-2/#normalizedString
    """

    TOKEN = 3
    """
    ``token``.
    See https://www.w3.org/TR/xmlschema-2/#token
    """

    # Cases that are not implemented:
    # - https://www.w3.org/TR/xmlschema-2/#language
    # - https://www.w3.org/TR/xmlschema-2/#NMTOKEN
    # - https://www.w3.org/TR/xmlschema-2/#Name


# region _xs_string_mod_apply

# Transformation to run per XML string type.
# Types without an entry here (such as ``STRING``) are passed through untouched.
__XS_STRING_MODS: Dict[XmlStringSerializationType, Callable[[str], str]] = {
    XmlStringSerializationType.NORMALIZED_STRING: xs_normalizedString,
    XmlStringSerializationType.TOKEN: xs_token,
}


def _xs_string_mod_apply(v: str, t: Optional[XmlStringSerializationType]) -> str:
    """
    Run the transformation registered for string type ``t`` on ``v``.

    :param v: the string to transform.
    :param t: the configured XML string type; may be ``None``.
    :return: the transformed string, or ``v`` unchanged when nothing is registered for ``t``.
    """
    transform = __XS_STRING_MODS.get(t)  # type: ignore[arg-type]
    if not transform:
        # Nothing registered for this type (or no type given): hand the value back as-is.
        return v
    return transform(v)


# endregion _xs_string_mod_apply


def _allow_property_for_view(prop_info: 'ObjectMetadataLibrary.SerializableProperty', value_: Any,
view_: Optional[Type[ViewType]]) -> bool:
# First check Property is part of the View is given
Expand Down Expand Up @@ -394,7 +436,8 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
elif prop_info.is_enum:
v = v.value

this_e_attributes[_namespace_element_name(new_key, xmlns)] = str(v)
this_e_attributes[_namespace_element_name(new_key, xmlns)] = \
_xs_string_mod_apply(str(v), prop_info.xml_string_config)

element_name = _namespace_element_name(
element_name if element_name else CurrentFormatter.formatter.encode(self.__class__.__name__),
Expand Down Expand Up @@ -426,7 +469,8 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
continue

if new_key == '.':
this_e.text = str(v)
this_e.text = _xs_string_mod_apply(str(v),
prop_info.xml_string_config)
continue

if CurrentFormatter.formatter:
Expand All @@ -445,14 +489,16 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
nested_e.append(
j.as_xml(view_=view_, as_string=False, element_name=nested_key, xmlns=xmlns))
elif prop_info.is_enum:
SubElement(nested_e, nested_key).text = str(j.value)
SubElement(nested_e, nested_key).text = _xs_string_mod_apply(str(j.value),
prop_info.xml_string_config)
elif prop_info.concrete_type in (float, int):
SubElement(nested_e, nested_key).text = str(j)
elif prop_info.concrete_type is bool:
SubElement(nested_e, nested_key).text = str(j).lower()
else:
# Assume type is str
SubElement(nested_e, nested_key).text = str(j)
SubElement(nested_e, nested_key).text = _xs_string_mod_apply(str(j),
prop_info.xml_string_config)
elif prop_info.custom_type:
if prop_info.is_helper_type():
v_ser = prop_info.custom_type.xml_normalize(
Expand All @@ -462,11 +508,14 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
elif isinstance(v_ser, Element):
this_e.append(v_ser)
else:
SubElement(this_e, new_key).text = str(v_ser)
SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v_ser),
prop_info.xml_string_config)
else:
SubElement(this_e, new_key).text = str(prop_info.custom_type(v))
SubElement(this_e, new_key).text = _xs_string_mod_apply(str(prop_info.custom_type(v)),
prop_info.xml_string_config)
elif prop_info.is_enum:
SubElement(this_e, new_key).text = str(v.value)
SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v.value),
prop_info.xml_string_config)
elif not prop_info.is_primitive_type():
global_klass_name = f'{prop_info.concrete_type.__module__}.{prop_info.concrete_type.__name__}'
if global_klass_name in ObjectMetadataLibrary.klass_mappings:
Expand All @@ -475,16 +524,19 @@ def as_xml(self: Any, view_: Optional[Type[ViewType]] = None,
else:
# Handle properties that have a type that is not a Python Primitive (e.g. int, float, str)
if prop_info.string_format:
SubElement(this_e, new_key).text = f'{v:{prop_info.string_format}}'
SubElement(this_e, new_key).text = _xs_string_mod_apply(f'{v:{prop_info.string_format}}',
prop_info.xml_string_config)
else:
SubElement(this_e, new_key).text = str(v)
SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v),
prop_info.xml_string_config)
elif prop_info.concrete_type in (float, int):
SubElement(this_e, new_key).text = str(v)
elif prop_info.concrete_type is bool:
SubElement(this_e, new_key).text = str(v).lower()
else:
# Assume type is str
SubElement(this_e, new_key).text = str(v)
SubElement(this_e, new_key).text = _xs_string_mod_apply(str(v),
prop_info.xml_string_config)

if as_string:
return cast(Element, SafeElementTree.tostring(this_e, 'unicode'))
Expand Down Expand Up @@ -542,6 +594,9 @@ def strip_default_namespace(s: str) -> str:
raise ValueError(f'Non-primitive types not supported from XML Attributes - see {decoded_k} for '
f'{cls.__module__}.{cls.__qualname__} which has Prop Metadata: {prop_info}')

if prop_info.xml_string_config:
v = _xs_string_mod_apply(v, prop_info.xml_string_config)

if prop_info.custom_type and prop_info.is_helper_type():
_data[decoded_k] = prop_info.custom_type.xml_deserialize(v)
elif prop_info.is_enum:
Expand All @@ -555,7 +610,7 @@ def strip_default_namespace(s: str) -> str:
if data.text:
for p, pi in klass_properties.items():
if pi.custom_names.get(SerializationType.XML) == '.':
_data[p] = data.text.strip()
_data[p] = _xs_string_mod_apply(data.text.strip(), pi.xml_string_config)

# Handle Sub-Elements
for child_e in data:
Expand Down Expand Up @@ -594,6 +649,9 @@ def strip_default_namespace(s: str) -> str:
try:
_logger.debug('Handling %s', prop_info)

if child_e.text:
child_e.text = _xs_string_mod_apply(child_e.text, prop_info.xml_string_config)

if prop_info.is_array and prop_info.xml_array_config:
array_type, nested_name = prop_info.xml_array_config

Expand All @@ -602,6 +660,9 @@ def strip_default_namespace(s: str) -> str:

if array_type == XmlArraySerializationType.NESTED:
for sub_child_e in child_e:
if sub_child_e.text:
sub_child_e.text = _xs_string_mod_apply(sub_child_e.text,
prop_info.xml_string_config)
if not prop_info.is_primitive_type() and not prop_info.is_enum:
_data[decoded_k].append(prop_info.concrete_type.from_xml(
data=sub_child_e, default_namespace=default_namespace)
Expand Down Expand Up @@ -675,6 +736,7 @@ class ObjectMetadataLibrary:
_deferred_property_type_parsing: Dict[str, Set['ObjectMetadataLibrary.SerializableProperty']] = {}
_klass_views: Dict[str, Type[ViewType]] = {}
_klass_property_array_config: Dict[str, Tuple[XmlArraySerializationType, str]] = {}
_klass_property_string_config: Dict[str, Optional[XmlStringSerializationType]] = {}
_klass_property_attributes: Set[str] = set()
_klass_property_include_none: Dict[str, Set[Tuple[Type[ViewType], Any]]] = {}
_klass_property_names: Dict[str, Dict[SerializationType, str]] = {}
Expand Down Expand Up @@ -738,12 +800,14 @@ class SerializableProperty:

_DEFAULT_XML_SEQUENCE = 100

def __init__(self, *, prop_name: str, prop_type: Any, custom_names: Dict[SerializationType, str],
def __init__(self, *,
prop_name: str, prop_type: Any, custom_names: Dict[SerializationType, str],
custom_type: Optional[Any] = None,
include_none_config: Optional[Set[Tuple[Type[ViewType], Any]]] = None,
is_xml_attribute: bool = False, string_format_: Optional[str] = None,
views: Optional[Iterable[Type[ViewType]]] = None,
xml_array_config: Optional[Tuple[XmlArraySerializationType, str]] = None,
xml_string_config: Optional[XmlStringSerializationType] = None,
xml_sequence_: Optional[int] = None) -> None:

self._name = prop_name
Expand All @@ -764,6 +828,7 @@ def __init__(self, *, prop_name: str, prop_type: Any, custom_names: Dict[Seriali
self._string_format = string_format_
self._views = set(views or ())
self._xml_array_config = xml_array_config
self._xml_string_config = xml_string_config
self._xml_sequence = xml_sequence_ or self._DEFAULT_XML_SEQUENCE

self._deferred_type_parsing = False
Expand Down Expand Up @@ -834,6 +899,10 @@ def xml_array_config(self) -> Optional[Tuple[XmlArraySerializationType, str]]:
def is_array(self) -> bool:
return self._is_array

@property
def xml_string_config(self) -> Optional[XmlStringSerializationType]:
    """The XML string type this property was constructed with; ``None`` when none was given."""
    return self._xml_string_config

@property
def is_enum(self) -> bool:
return self._is_enum
Expand Down Expand Up @@ -1050,6 +1119,7 @@ def register_klass(cls, klass: Type[_T], custom_name: Optional[str],
string_format_=ObjectMetadataLibrary._klass_property_string_formats.get(qualified_property_name),
views=ObjectMetadataLibrary._klass_property_views.get(qualified_property_name),
xml_array_config=ObjectMetadataLibrary._klass_property_array_config.get(qualified_property_name),
xml_string_config=ObjectMetadataLibrary._klass_property_string_config.get(qualified_property_name),
xml_sequence_=ObjectMetadataLibrary._klass_property_xml_sequence.get(
qualified_property_name,
ObjectMetadataLibrary.SerializableProperty._DEFAULT_XML_SEQUENCE)
Expand Down Expand Up @@ -1117,6 +1187,11 @@ def register_xml_property_array_config(cls, qual_name: str,
array_type: XmlArraySerializationType, child_name: str) -> None:
cls._klass_property_array_config[qual_name] = (array_type, child_name)

@classmethod
def register_xml_property_string_config(cls, qual_name: str,
                                        string_type: Optional[XmlStringSerializationType]) -> None:
    """
    Record the XML string type for the property identified by ``qual_name``.

    Any entry already stored under the same ``qual_name`` is overwritten.

    :param qual_name: key under which the configuration is stored.
    :param string_type: the XML string type to store; may be ``None``.
    """
    registry = cls._klass_property_string_config
    registry[qual_name] = string_type

@classmethod
def register_xml_property_attribute(cls, qual_name: str) -> None:
cls._klass_property_attributes.add(qual_name)
Expand Down Expand Up @@ -1305,6 +1380,19 @@ def decorate(f: _F) -> _F:
return decorate


def xml_string(string_type: XmlStringSerializationType) -> Callable[[_F], _F]:
    """
    Decorator: register the given XML string type for the decorated callable.

    :param string_type: the XML string type to register.
    :return: a decorator that stores the configuration in :class:`ObjectMetadataLibrary`
        under ``<module>.<qualname>`` and returns the decorated callable unchanged.
    """

    def decorate(f: _F) -> _F:
        _logger.debug('Registering %s.%s as XML StringType: %s', f.__module__, f.__qualname__, string_type)
        qualified_name = f'{f.__module__}.{f.__qualname__}'
        ObjectMetadataLibrary.register_xml_property_string_config(
            qual_name=qualified_name,
            string_type=string_type,
        )
        return f

    return decorate


def xml_name(name: str) -> Callable[[_F], _F]:
"""Decorator"""

Expand Down
22 changes: 22 additions & 0 deletions serializable/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# encoding: utf-8

# This file is part of py-serializable
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Paul Horton. All Rights Reserved.

"""
JSON-specific functionality.
"""
82 changes: 82 additions & 0 deletions serializable/xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# encoding: utf-8

# This file is part of py-serializable
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Paul Horton. All Rights Reserved.

"""
XML-specific functionality.
"""

__all__ = ['xs_normalizedString', 'xs_token']

from re import compile as re_compile

# region normalizedString

__NORMALIZED_STRING_FORBIDDEN_SEARCH = re_compile(r'\r\n|\t|\n|\r')
__NORMALIZED_STRING_FORBIDDEN_REPLACE = ' '


def xs_normalizedString(s: str) -> str:
    """Turn ``s`` into a ``normalizedString`` as described by the XML schema spec.

    A ``normalizedString`` must not contain carriage return (#xD), line feed (#xA)
    or tab (#x9) characters. Each such character is replaced by one space (#x20);
    a carriage return directly followed by a line feed is replaced by a single space.

    See the `XML schema spec <http://www.w3.org/TR/xmlschema-2/#normalizedString>`_.

    :param s: the string to normalize.
    :return: a copy of ``s`` with the forbidden characters replaced by spaces.
    """
    normalized = __NORMALIZED_STRING_FORBIDDEN_SEARCH.sub(__NORMALIZED_STRING_FORBIDDEN_REPLACE, s)
    return normalized


# endregion

# region token


__TOKEN_MULTISTRING_SEARCH = re_compile(r' {2,}')
__TOKEN_MULTISTRING_REPLACE = ' '


def xs_token(s: str) -> str:
    """Turn ``s`` into a ``token`` as described by the XML schema spec.

    A ``token`` is a ``normalizedString`` that additionally has no leading or
    trailing spaces (#x20) and no internal sequences of two or more spaces.
    The input is first passed through :func:`xs_normalizedString`, then leading and
    trailing spaces are removed and every run of spaces is collapsed to a single one.

    Only the space character (#x20) is stripped and collapsed. Other whitespace-like
    characters, such as the no-break space (#xA0), are regular content and are kept.

    See the `XML schema spec <http://www.w3.org/TR/xmlschema-2/#token>`_.

    :param s: the string to tokenize.
    :return: the collapsed string.
    """
    # Strip #x20 explicitly: a bare ``str.strip()`` would also remove every other
    # Unicode whitespace character, which the spec does not treat as whitespace.
    trimmed = xs_normalizedString(s).strip(' ')
    return __TOKEN_MULTISTRING_SEARCH.sub(__TOKEN_MULTISTRING_REPLACE, trimmed)

# endregion
Loading

0 comments on commit 3a1728d

Please sign in to comment.