Permalink
Browse files

Add support for tag="{*}name" in _Element.iter() and friends.

Such methods now support all combinations of wildcards needed to implement
the CSS selectors NS|E, |E, *|E, NS|*, |*, and *|*.

Internally the meaning of the qname.href struct field was changed:

* The NULL pointer is a wildcard, matches any namespace
* The empty Python byte-string matches elements without a namespace
* Other Python byte-strings are namespace URIs, matching elements in
  the same namespace.

qname.c_name is unchanged (NULL for a wildcard, a C string for a
given local name).
  • Loading branch information...
1 parent 7eca2bb commit 4e1e685014743e196fdb07eb7b9cd78132f12c27 @SimonSapin SimonSapin committed Aug 11, 2012
Showing with 92 additions and 42 deletions.
  1. +14 −0 CHANGES.txt
  2. +6 −2 src/lxml/apihelpers.pxi
  3. +8 −1 src/lxml/cleanup.pxi
  4. +32 −39 src/lxml/lxml.etree.pyx
  5. +32 −0 src/lxml/tests/test_etree.py
View
@@ -8,6 +8,20 @@ lxml changelog
Features added
--------------
+
+* The ``.iter()`` method of elements now accepts ``tag`` arguments like
+ ``"{*}name"`` to search for elements with a given local name in any
+ namespace. With this addition, all combinations of wildcards now work
+ as expected:
+ ``"{ns}name"``, ``"{}name"``, ``"{*}name"``, ``"{ns}*"``, ``"{}*"``
+ and ``"{*}*"``. Note that ``"name"`` is equivalent to ``"{}name"``,
+ but ``"*"`` is ``"{*}*"``.
+ The same change applies to the ``.getiterator()``, ``.itersiblings()``,
+ ``.iterancestors()``, ``.iterdescendants()``, ``.iterchildren()``
+ and ``.itertext()`` methods as well as the ``strip_attributes()``,
+ ``strip_elements()`` and ``strip_tags()`` functions.
+
+
Bugs fixed
----------
View
@@ -906,15 +906,19 @@ cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname):
* c_name is NULL
* its name string points to the same address (!) as c_name
"""
+ cdef char* c_href
if c_qname.c_name is not NULL and c_qname.c_name is not c_node.name:
return 0
- c_node_href = _getNs(c_node)
if c_qname.href is NULL:
+ return 1
+ c_node_href = _getNs(c_node)
+ c_href = python.__cstr(c_qname.href)
+ if c_href[0] == '\0':
return c_node_href is NULL or c_node_href[0] == '\0'
elif c_node_href is NULL:
return 0
else:
- return tree.xmlStrcmp(<const_xmlChar*>python.__cstr(c_qname.href), c_node_href) == 0
+ return tree.xmlStrcmp(<const_xmlChar*>c_href, c_node_href) == 0
cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
qname* c_ns_tags, bint force_into_dict) except -1:
View
@@ -16,11 +16,14 @@ def strip_attributes(tree_or_element, *attribute_names):
Delete all attributes with the provided attribute names from an
Element (or ElementTree) and its descendants.
+ Attribute names can contain wildcards as in `_Element.iter`.
+
Example usage::
strip_attributes(root_element,
'simpleattr',
- '{http://some/ns}attrname')
+ '{http://some/ns}attrname',
+ '{http://other/ns}*')
"""
cdef _MultiTagMatcher matcher
cdef _Element element
@@ -57,6 +60,8 @@ def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
will also remove the tail text of the element unless you
explicitly set the ``with_tail`` keyword argument option to False.
+ Tag names can contain wildcards as in `_Element.iter`.
+
Note that this will not delete the element (or ElementTree root
element) that you passed even if it matches. It will only treat
its descendants. If you want to include the root element, check
@@ -133,6 +138,8 @@ def strip_tags(tree_or_element, *tag_names):
merge the text content and children of the element into its
parent.
+ Tag names can contain wildcards as in `_Element.iter`.
+
Note that this will not delete the element (or ElementTree root
element) that you passed even if it matches. It will only treat
its descendants.
View
@@ -1289,8 +1289,8 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
siblings in reverse document order, i.e. starting right before
the current element and going backwards.
- The returned elements can be restricted to a specific tag name by
- passing a tag or a series of tag names.
+ Can be restricted to find only elements with a specific tag,
+ see `iter`.
"""
if tag is not None:
tags += (tag,)
@@ -1301,8 +1301,8 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
Iterate over the ancestors of this element (from parent to parent).
- The returned elements can be restricted to a specific tag name by
- passing a tag or a series of tag names.
+ Can be restricted to find only elements with a specific tag,
+ see `iter`.
"""
if tag is not None:
tags += (tag,)
@@ -1314,8 +1314,8 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
Iterate over the descendants of this element in document order.
As opposed to ``el.iter()``, this iterator does not yield the element
- itself. The returned elements can be restricted to a specific tag
- name by passing a tag or a series of tag names.
+ itself. The returned elements can be restricted to those with a
+ specific tag, see `iter`.
"""
if tag is not None:
tags += (tag,)
@@ -1327,8 +1327,8 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
Iterate over the children of this element.
As opposed to using normal iteration on this element, the returned
- elements can be restricted to a specific tag name by passing a tag
- or a series of tag names, and reversed with the 'reversed' keyword.
+ elements can be reversed with the 'reversed' keyword and restricted
+ to find only elements with a specific tag, see `iter`.
"""
if tag is not None:
tags += (tag,)
@@ -1353,13 +1353,8 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
document order (depth first pre-order), starting with this
element.
- Can be restricted to find only elements with a specific tag
- (pass ``"xyz"`` as tag) or from a namespace (pass ``"{ns}*"`` as tag).
- Passing a sequence of tags will let the iterator return all
- elements matching any of these tags, in document order.
-
- You can also pass the Element, Comment, ProcessingInstruction and
- Entity factory functions to look only for the specific element type.
+ Can be restricted to find only elements with a specific tag,
+ see `iter`.
:deprecated: Note that this method is deprecated as of
ElementTree 1.3 and lxml 2.0. It returns an iterator in
@@ -1379,13 +1374,17 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
Iterate over all elements in the subtree in document order (depth
first pre-order), starting with this element.
- Can be restricted to find only elements with a specific tag
- (pass ``"xyz"`` as tag) or from a namespace (pass ``"{ns}*"`` as tag).
- Passing a sequence of tags will let the iterator return all
- elements matching any of these tags, in document order.
+ Can be restricted to find only elements with a specific tag:
+ pass ``"{ns}localname"`` as tag. Either or both of ``ns`` and
+ ``localname`` can be ``*`` for a wildcard; ``ns`` can be empty
+ for no namespace. ``"localname"`` is equivalent to ``"{}localname"``
+ but ``"*"`` is ``"{*}*"``, not ``"{}*"``.
You can also pass the Element, Comment, ProcessingInstruction and
Entity factory functions to look only for the specific element type.
+
+ Passing a sequence of tags will let the iterator return all
+ elements matching any of these tags, in document order.
"""
if tag is not None:
tags += (tag,)
@@ -1396,9 +1395,8 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
Iterates over the text content of a subtree.
- You can pass a tag name to restrict text content to a specific tag
- name. Passing a sequence of tags will let the iterator consider
- all elements matching any of these tags.
+ You can pass a tag name to restrict text content to specific elements,
+ see `iter`.
You can set the ``with_tail`` keyword argument to ``False`` to skip
over tail text.
@@ -1943,13 +1941,8 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
Returns a sequence or iterator of all elements in document order
(depth first pre-order), starting with the root element.
- Can be restricted to find only elements with a specific tag
- (pass ``"xyz"`` as tag) or from a namespace (pass ``"{ns}*"`` as tag).
- Passing a sequence of tags will let the iterator return all
- elements matching any of these tags, in document order.
-
- You can also pass the Element, Comment, ProcessingInstruction and
- Entity factory functions to look only for the specific element type.
+ Can be restricted to find only elements with a specific tag,
+ see `_Element.iter`.
:deprecated: Note that this method is deprecated as of
ElementTree 1.3 and lxml 2.0. It returns an iterator in
@@ -1971,6 +1964,9 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
Creates an iterator for the root element. The iterator loops over
all elements in this tree, in document order.
+
+ Can be restricted to find only elements with a specific tag,
+ see `_Element.iter`.
"""
root = self.getroot()
if root is None:
@@ -2488,6 +2484,10 @@ cdef class _MultiTagMatcher:
href, name = _getNsTag(tag)
if name == b'*':
name = None
+ if href is None:
+ href = b'' # no namespace
+ elif href == b'*':
+ href = None # wildcard: any namespace, including none
self._py_tags.append((href, name))
else:
# support a sequence of tags
@@ -2530,19 +2530,12 @@ cdef class _MultiTagMatcher:
cdef inline bint matchesAttribute(self, xmlAttr* c_attr):
"""Attribute matches differ from Element matches in that they do
- not care about node types and href NULL values are not wildcards
- but match only unnamespaced attributes.
+ not care about node types.
"""
cdef qname* c_qname
for c_qname in self._cached_tags[:self._tag_count]:
- if c_qname.c_name is NULL or c_qname.c_name is c_attr.name:
- c_href = tree._getNs(<xmlNode*>c_attr)
- if c_qname.href is NULL:
- if c_href is NULL:
- return True
- elif c_href is not NULL:
- if tree.xmlStrcmp(c_href, <const_xmlChar*>python.__cstr(c_qname.href)) == 0:
- return True
+ if _tagMatchesExactly(<xmlNode*>c_attr, c_qname):
+ return True
return False
cdef class _ElementMatchIterator:
@@ -2259,6 +2259,7 @@ def test_getiterator_filter_namespace(self):
d = SubElement(b, '{b}d')
e = SubElement(c, '{a}e')
f = SubElement(c, '{b}f')
+ g = SubElement(c, 'g')
self.assertEquals(
[a],
@@ -2275,6 +2276,37 @@ def test_getiterator_filter_namespace(self):
self.assertEquals(
[d, f],
list(a.getiterator('{b}*')))
+ self.assertEquals(
+ [g],
+ list(a.getiterator('g')))
+ self.assertEquals(
+ [g],
+ list(a.getiterator('{}g')))
+ self.assertEquals(
+ [g],
+ list(a.getiterator('{}*')))
+
+ def test_getiterator_filter_local_name(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('{a}a')
+ b = SubElement(a, '{nsA}b')
+ c = SubElement(b, '{nsB}b')
+ d = SubElement(a, 'b')
+ e = SubElement(a, '{nsA}e')
+ f = SubElement(e, '{nsB}e')
+ g = SubElement(e, 'e')
+
+ self.assertEquals(
+ [b, c, d],
+ list(a.getiterator('{*}b')))
+ self.assertEquals(
+ [e, f, g],
+ list(a.getiterator('{*}e')))
+ self.assertEquals(
+ [a, b, c, d, e, f, g],
+ list(a.getiterator('{*}*')))
def test_getiterator_filter_entities(self):
Element = self.etree.Element

0 comments on commit 4e1e685

Please sign in to comment.