rewrite of node matcher to remove code duplication and make it PyPy compatible
lxml · Apr 21, 2012 · 714024e · 714024e
1 parent 7dcdd7c
commit 714024e
Show file tree

Hide file tree

Showing 4 changed files with 152 additions and 209 deletions.
diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
@@ -946,15 +946,15 @@ cdef inline bint _tagMatches(xmlNode* c_node, char* c_href, char* c_name):
     else:
         return 0
 
-cdef inline bint _tagMatchesExactly(xmlNode* c_node, char* c_href, char* c_name):
+cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname):
     u"""Tests if the node matches namespace URI and tag name.
 
     This differs from _tagMatches() in that it does not consider a
-    NULL value in c_href a wildcard, and that it expects the c_name to
-    be taken from the doc dict, i.e. it only compares the names by
+    NULL value in qname.href a wildcard, and that it expects the c_name
+    to be taken from the doc dict, i.e. it only compares the names by
     address.
 
-    A node matches if it matches both c_href and c_name.
+    A node matches if it matches both href and c_name of the qname.
 
     A node matches c_href if any of the following is true:
     * its namespace is NULL and c_href is the empty string
@@ -965,15 +965,48 @@ cdef inline bint _tagMatchesExactly(xmlNode* c_node, char* c_href, char* c_name)
     * its name string points to the same address (!) as c_name
     """
     cdef char* c_node_href
-    if c_name is not NULL and c_name is not c_node.name:
+    if c_qname.c_name is not NULL and c_qname.c_name is not c_node.name:
         return 0
     c_node_href = _getNs(c_node)
-    if c_href is NULL:
+    if c_qname.href is NULL:
         return c_node_href is NULL or c_node_href[0] == '\0'
     elif c_node_href is NULL:
         return 0
     else:
-        return cstring_h.strcmp(c_href, c_node_href) == 0
+        return cstring_h.strcmp(python.__cstr(c_qname.href), c_node_href) == 0
+
+cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
+                                          qname* c_ns_tags, bint force_into_dict) except -1:
+    u"""Map a sequence of (namespace, name) pairs to a qname array for efficient
+    matching with _tagMatchesExactly() above.
+
+    Note that each qname struct in the array owns its href byte string object
+    if it is not NULL.
+    """
+    cdef Py_ssize_t count = 0
+    cdef char* c_tag
+    cdef bytes ns, tag
+    for ns, tag in ns_tags:
+        if tag is None:
+            c_tag = NULL
+        elif force_into_dict:
+            c_tag = tree.xmlDictLookup(c_doc.dict, _cstr(tag), len(tag))
+            if c_tag is NULL:
+                raise MemoryError()
+        else:
+            c_tag = tree.xmlDictExists(c_doc.dict, _cstr(tag), len(tag))
+            if c_tag is NULL:
+                # not in the dict => not in the document
+                continue
+        c_ns_tags[0].c_name = c_tag
+        if ns is None:
+            c_ns_tags[0].href = NULL
+        else:
+            python.Py_INCREF(ns) # keep an owned reference!
+            c_ns_tags[0].href = <python.PyObject*>ns
+        c_ns_tags += 1
+        count += 1
+    return count
 
 cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
     u"""Unlink and free a node and subnodes if possible.  Otherwise, make sure

diff --git a/src/lxml/cleanup.pxi b/src/lxml/cleanup.pxi
@@ -22,59 +22,30 @@ def strip_attributes(tree_or_element, *attribute_names):
                          'simpleattr',
                          '{http://some/ns}attrname')
     """
+    cdef _MultiTagMatcher matcher
     cdef _Element element
-    cdef list ns_tags
-    cdef char** c_ns_tags
-    cdef Py_ssize_t c_tag_count
 
     element = _rootNodeOrRaise(tree_or_element)
-    if not attribute_names: return
-
-    ns_tags = _sortedTagList([ _getNsTag(attr)
-                               for attr in <tuple>attribute_names ])
-    ns_tags = [ (ns, tag if tag != b'*' else None)
-                for ns, tag in ns_tags ]
-
-    # tag names are passes as C pointers as this allows us to take
-    # them from the doc dict and do pointer comparisons
-    c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
-    if c_ns_tags is NULL:
-        raise MemoryError()
+    if not attribute_names:
+        return
 
-    try:
-        c_tag_count = _mapTagsToCharArray(element._doc._c_doc, ns_tags, c_ns_tags)
-        if c_tag_count > 0:
-            _strip_attributes(element._c_node, c_ns_tags, c_tag_count)
-    finally:
-        stdlib.free(c_ns_tags)
+    matcher = _MultiTagMatcher(attribute_names)
+    matcher.cacheTags(element._doc)
+    if matcher.rejectsAllAttributes():
+        return
+    _strip_attributes(element._c_node, matcher)
 
-cdef _strip_attributes(xmlNode* c_node, char** c_ns_tags, Py_ssize_t c_tag_count):
+cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
     cdef xmlAttr* c_attr
-    cdef Py_ssize_t i
-    cdef char* c_href
-    cdef char* c_name
-
+    cdef xmlAttr* c_next_attr
     tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
     if c_node.type == tree.XML_ELEMENT_NODE:
-        if c_node.properties is not NULL:
-            for i in range(c_tag_count):
-                c_href = c_ns_tags[2*i]
-                c_name = c_ns_tags[2*i+1]
-                # must compare attributes manually to make sure we
-                # only match on wildcard tag names if the attribute
-                # has no namespace
-                c_attr = c_node.properties
-                while c_attr is not NULL:
-                    if c_name is NULL or c_attr.name == c_name:
-                        if c_href is NULL:
-                            if c_attr.ns is NULL or c_attr.ns.href is NULL:
-                                tree.xmlRemoveProp(c_attr)
-                                break
-                        elif c_attr.ns is not NULL and c_attr.ns.href is not NULL:
-                            if cstring_h.strcmp(c_attr.ns.href, c_href) == 0:
-                                tree.xmlRemoveProp(c_attr)
-                                break
-                    c_attr = c_attr.next
+        c_attr = c_node.properties
+        while c_attr is not NULL:
+            c_next_attr = c_attr.next
+            if matcher.matchesAttribute(c_attr):
+                tree.xmlRemoveProp(c_attr)
+            c_attr = c_next_attr
     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 
 def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
@@ -100,48 +71,36 @@ def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
             lxml.etree.Comment           # comments
             )
     """
+    cdef _MultiTagMatcher matcher
     cdef _Element element
     cdef _Document doc
     cdef list ns_tags
-    cdef char** c_ns_tags
+    cdef qname* c_ns_tags
     cdef Py_ssize_t c_tag_count
     cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
 
     doc = _documentOrRaise(tree_or_element)
     element = _rootNodeOrRaise(tree_or_element)
-    if not tag_names: return
+    if not tag_names:
+        return
 
-    ns_tags = _filterSpecialTagNames(
-        tag_names, &strip_comments, &strip_pis, &strip_entities)
+    matcher = _MultiTagMatcher(tag_names)
+    matcher.cacheTags(doc)
+    if matcher.rejectsAll():
+        return
 
-    if (strip_comments or strip_pis) and isinstance(tree_or_element, _ElementTree):
+    if isinstance(tree_or_element, _ElementTree):
         # include PIs and comments next to the root node
-        if strip_comments:
+        if matcher.matchesType(tree.XML_COMMENT_NODE):
             _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
-        if strip_pis:
+        if matcher.matchesType(tree.XML_PI_NODE):
             _removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
+    _strip_elements(doc, element._c_node, matcher, with_tail)
 
-    # tag names are passed as C pointers as this allows us to take
-    # them from the doc dict and do pointer comparisons
-    c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
-    if c_ns_tags is NULL:
-        raise MemoryError()
-
-    try:
-        c_tag_count = _mapTagsToCharArray(doc._c_doc, ns_tags, c_ns_tags)
-        if c_tag_count > 0 or strip_comments or strip_pis or strip_entities:
-            _strip_elements(doc, element._c_node, c_ns_tags, c_tag_count,
-                            strip_comments, strip_pis, strip_entities, with_tail)
-    finally:
-        stdlib.free(c_ns_tags)
-
-cdef _strip_elements(_Document doc, xmlNode* c_node,
-                     char** c_ns_tags, Py_ssize_t c_tag_count,
-                     bint strip_comments, bint strip_pis, bint strip_entities,
+cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
                      bint with_tail):
     cdef xmlNode* c_child
     cdef xmlNode* c_next
-    cdef Py_ssize_t i
 
     tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
     if c_node.type == tree.XML_ELEMENT_NODE:
@@ -151,20 +110,16 @@ cdef _strip_elements(_Document doc, xmlNode* c_node,
         c_child = _findChildForwards(c_node, 0)
         while c_child is not NULL:
             c_next = _nextElement(c_child)
-            if c_child.type == tree.XML_ELEMENT_NODE:
-                for i in range(0, c_tag_count*2, 2):
-                    if _tagMatchesExactly(c_child, c_ns_tags[i], c_ns_tags[i+1]):
-                        if not with_tail:
-                            tree.xmlUnlinkNode(c_child)
-                        _removeNode(doc, c_child)
-                        break
-            elif c_child.type == tree.XML_COMMENT_NODE and strip_comments \
-                     or c_child.type == tree.XML_PI_NODE and strip_pis \
-                     or c_child.type == tree.XML_ENTITY_REF_NODE and strip_entities:
-                if with_tail:
-                    _removeText(c_child.next)
-                tree.xmlUnlinkNode(c_child)
-                attemptDeallocation(c_child)
+            if matcher.matches(c_child):
+                if c_child.type == tree.XML_ELEMENT_NODE:
+                    if not with_tail:
+                        tree.xmlUnlinkNode(c_child)
+                    _removeNode(doc, c_child)
+                else:
+                    if with_tail:
+                        _removeText(c_child.next)
+                    tree.xmlUnlinkNode(c_child)
+                    attemptDeallocation(c_child)
             c_child = c_next
     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 
@@ -191,6 +146,7 @@ def strip_tags(tree_or_element, *tag_names):
             Comment                      # comments (including their text!)
             )
     """
+    cdef _MultiTagMatcher matcher
     cdef _Element element
     cdef _Document doc
     cdef list ns_tags
@@ -200,35 +156,23 @@ def strip_tags(tree_or_element, *tag_names):
 
     doc = _documentOrRaise(tree_or_element)
     element = _rootNodeOrRaise(tree_or_element)
-    if not tag_names: return
+    if not tag_names:
+        return
 
-    ns_tags = _filterSpecialTagNames(
-        tag_names, &strip_comments, &strip_pis, &strip_entities)
+    matcher = _MultiTagMatcher(tag_names)
+    matcher.cacheTags(doc)
+    if matcher.rejectsAll():
+        return
 
-    if (strip_comments or strip_pis) and isinstance(tree_or_element, _ElementTree):
+    if isinstance(tree_or_element, _ElementTree):
         # include PIs and comments next to the root node
-        if strip_comments:
+        if matcher.matchesType(tree.XML_COMMENT_NODE):
             _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
-        if strip_pis:
+        if matcher.matchesType(tree.XML_PI_NODE):
             _removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
+    _strip_tags(doc, element._c_node, matcher)
 
-    # tag names are passes as C pointers as this allows us to take
-    # them from the doc dict and do pointer comparisons
-    c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
-    if c_ns_tags is NULL:
-        raise MemoryError()
-
-    try:
-        c_tag_count = _mapTagsToCharArray(doc._c_doc, ns_tags, c_ns_tags)
-        if c_tag_count > 0 or strip_comments or strip_pis or strip_entities:
-            _strip_tags(doc, element._c_node, c_ns_tags, c_tag_count,
-                        strip_comments, strip_pis, strip_entities)
-    finally:
-        stdlib.free(c_ns_tags)
-
-cdef _strip_tags(_Document doc, xmlNode* c_node,
-                 char** c_ns_tags, Py_ssize_t c_tag_count,
-                 bint strip_comments, bint strip_pis, bint strip_entities):
+cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
     cdef xmlNode* c_child
     cdef xmlNode* c_next
     cdef Py_ssize_t i
@@ -240,82 +184,20 @@ cdef _strip_tags(_Document doc, xmlNode* c_node,
         # c_node itself
         c_child = _findChildForwards(c_node, 0)
         while c_child is not NULL:
+            if not matcher.matches(c_child):
+                c_child = _nextElement(c_child)
+                continue
             if c_child.type == tree.XML_ELEMENT_NODE:
-                for i in range(c_tag_count):
-                    if _tagMatchesExactly(c_child, c_ns_tags[2*i], c_ns_tags[2*i+1]):
-                        c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
-                        _replaceNodeByChildren(doc, c_child)
-                        if not attemptDeallocation(c_child):
-                            if c_child.nsDef is not NULL:
-                                # make namespaces absolute
-                                moveNodeToDocument(doc, doc._c_doc, c_child)
-                        c_child = c_next
-                        break
-                else:
-                    c_child = _nextElement(c_child)
+                c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
+                _replaceNodeByChildren(doc, c_child)
+                if not attemptDeallocation(c_child):
+                    if c_child.nsDef is not NULL:
+                        # make namespaces absolute
+                        moveNodeToDocument(doc, doc._c_doc, c_child)
+                c_child = c_next
             else:
                 c_next = _nextElement(c_child)
-                if c_child.type == tree.XML_COMMENT_NODE and strip_comments \
-                       or c_child.type == tree.XML_PI_NODE and strip_pis \
-                       or c_child.type == tree.XML_ENTITY_REF_NODE and strip_entities:
-                    tree.xmlUnlinkNode(c_child)
-                    attemptDeallocation(c_child)
+                tree.xmlUnlinkNode(c_child)
+                attemptDeallocation(c_child)
                 c_child = c_next
     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
-
-
-# helper functions
-
-cdef list _sortedTagList(list l):
-    # This is required since the namespace may be None (which Py3
-    # can't compare to strings).  A bit of overhead, but at least
-    # portable ...
-    cdef list decorated_list
-    cdef tuple ns_tag
-    cdef Py_ssize_t i
-    decorated_list = [ (ns_tag[0] or b'', ns_tag[1], i, ns_tag)
-                       for i, ns_tag in enumerate(l) ]
-    decorated_list.sort()
-    return [ item[-1] for item in decorated_list ]
-
-cdef list _filterSpecialTagNames(tag_names, bint* comments, bint* pis, bint* entities):
-    cdef list ns_tags
-    comments[0] = 0
-    pis[0] = 0
-    entities[0] = 0
-
-    ns_tags = []
-    for tag in tag_names:
-        if tag is Comment:
-            comments[0] = 1
-        elif tag is ProcessingInstruction:
-            pis[0] = 1
-        elif tag is Entity:
-            entities[0] = 1
-        else:
-            ns_tags.append(_getNsTag(tag))
-
-    return [ (ns, tag if tag != b'*' else None)
-             for ns, tag in _sortedTagList(ns_tags) ]
-
-cdef Py_ssize_t _mapTagsToCharArray(xmlDoc* c_doc, list ns_tags,
-                                    char** c_ns_tags) except -1:
-    cdef Py_ssize_t count = 0
-    cdef char* c_tag
-    for ns, tag in ns_tags:
-        if ns is None:
-            c_ns_tags[0] = NULL
-        else:
-            c_ns_tags[0] = _cstr(ns)
-        if tag is None:
-            c_ns_tags[1] = NULL
-        else:
-            c_tag = _cstr(tag)
-            c_ns_tags[1] = tree.xmlDictExists(
-                c_doc.dict, c_tag, cstring_h.strlen(c_tag))
-            if c_ns_tags[1] == NULL:
-                # not in the dict => not in the document
-                continue
-        c_ns_tags += 2
-        count += 1
-    return count