Skip to content

Commit

Permalink
rewrite of node matcher to remove code duplication and make it PyPy c…
Browse files Browse the repository at this point in the history
…ompatible
  • Loading branch information
scoder committed Apr 21, 2012
1 parent 7dcdd7c commit 714024e
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 209 deletions.
47 changes: 40 additions & 7 deletions src/lxml/apihelpers.pxi
Expand Up @@ -946,15 +946,15 @@ cdef inline bint _tagMatches(xmlNode* c_node, char* c_href, char* c_name):
else:
return 0

cdef inline bint _tagMatchesExactly(xmlNode* c_node, char* c_href, char* c_name):
cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname):
u"""Tests if the node matches namespace URI and tag name.
This differs from _tagMatches() in that it does not consider a
NULL value in c_href a wildcard, and that it expects the c_name to
be taken from the doc dict, i.e. it only compares the names by
NULL value in qname.href a wildcard, and that it expects the c_name
to be taken from the doc dict, i.e. it only compares the names by
address.
A node matches if it matches both c_href and c_name.
A node matches if it matches both href and c_name of the qname.
A node matches c_href if any of the following is true:
* its namespace is NULL and c_href is the empty string
Expand All @@ -965,15 +965,48 @@ cdef inline bint _tagMatchesExactly(xmlNode* c_node, char* c_href, char* c_name)
* its name string points to the same address (!) as c_name
"""
cdef char* c_node_href
if c_name is not NULL and c_name is not c_node.name:
if c_qname.c_name is not NULL and c_qname.c_name is not c_node.name:
return 0
c_node_href = _getNs(c_node)
if c_href is NULL:
if c_qname.href is NULL:
return c_node_href is NULL or c_node_href[0] == '\0'
elif c_node_href is NULL:
return 0
else:
return cstring_h.strcmp(c_href, c_node_href) == 0
return cstring_h.strcmp(python.__cstr(c_qname.href), c_node_href) == 0

cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
                                          qname* c_ns_tags, bint force_into_dict) except -1:
    u"""Map a sequence of (namespace, name) pairs to a qname array for
    efficient matching with _tagMatchesExactly() above.

    Each item of ``ns_tags`` is unpacked as ``(ns, tag)``, both being byte
    strings or None.  For every pair that is kept, one entry of
    ``c_ns_tags`` is filled, in order, starting at index 0:

    * ``c_name`` is NULL if ``tag`` is None, otherwise it is the pointer
      returned by the document dict for the tag name, so that names can
      be compared by address.
    * ``href`` is NULL if ``ns`` is None, otherwise it is an *owned*
      reference to the ``ns`` byte string object.  Whoever holds the
      array must release that reference when done with it.

    If ``force_into_dict`` is true, tag names are looked up with
    xmlDictLookup().  Otherwise they are looked up with xmlDictExists()
    and pairs whose name is not in the dict are skipped, as such a name
    cannot occur in the document.

    At most ``len(ns_tags)`` entries are written, so ``c_ns_tags`` must
    provide room for that many.

    Returns the number of entries that were filled.  Raises MemoryError
    if xmlDictLookup() fails; in that case the references held by the
    entries filled so far are released again and their ``href`` fields
    are reset to NULL, so that nothing is leaked.
    """
    cdef Py_ssize_t count = 0
    cdef Py_ssize_t i
    cdef char* c_tag
    cdef bytes ns, tag
    for ns, tag in ns_tags:
        if tag is None:
            c_tag = NULL
        elif force_into_dict:
            c_tag = tree.xmlDictLookup(c_doc.dict, _cstr(tag), len(tag))
            if c_tag is NULL:
                # Clean up before raising: the entries [0, count) own a
                # reference to their href object, and the caller never
                # learns 'count' when we fail.
                # NOTE(review): assumes python.Py_XDECREF(PyObject*) is
                # declared in the cimported 'python' module - confirm.
                for i in range(count):
                    python.Py_XDECREF(c_ns_tags[i].href)
                    c_ns_tags[i].href = NULL
                raise MemoryError()
        else:
            c_tag = tree.xmlDictExists(c_doc.dict, _cstr(tag), len(tag))
            if c_tag is NULL:
                # not in the dict => not in the document
                continue
        # Indexed writes (rather than advancing c_ns_tags) keep the start
        # of the array reachable for the error cleanup above.
        c_ns_tags[count].c_name = c_tag
        if ns is None:
            c_ns_tags[count].href = NULL
        else:
            python.Py_INCREF(ns) # keep an owned reference!
            c_ns_tags[count].href = <python.PyObject*>ns
        count += 1
    return count

cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
u"""Unlink and free a node and subnodes if possible. Otherwise, make sure
Expand Down
244 changes: 63 additions & 181 deletions src/lxml/cleanup.pxi
Expand Up @@ -22,59 +22,30 @@ def strip_attributes(tree_or_element, *attribute_names):
'simpleattr',
'{http://some/ns}attrname')
"""
cdef _MultiTagMatcher matcher
cdef _Element element
cdef list ns_tags
cdef char** c_ns_tags
cdef Py_ssize_t c_tag_count

element = _rootNodeOrRaise(tree_or_element)
if not attribute_names: return

ns_tags = _sortedTagList([ _getNsTag(attr)
for attr in <tuple>attribute_names ])
ns_tags = [ (ns, tag if tag != b'*' else None)
for ns, tag in ns_tags ]

# tag names are passed as C pointers as this allows us to take
# them from the doc dict and do pointer comparisons
c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
if c_ns_tags is NULL:
raise MemoryError()
if not attribute_names:
return

try:
c_tag_count = _mapTagsToCharArray(element._doc._c_doc, ns_tags, c_ns_tags)
if c_tag_count > 0:
_strip_attributes(element._c_node, c_ns_tags, c_tag_count)
finally:
stdlib.free(c_ns_tags)
matcher = _MultiTagMatcher(attribute_names)
matcher.cacheTags(element._doc)
if matcher.rejectsAllAttributes():
return
_strip_attributes(element._c_node, matcher)

cdef _strip_attributes(xmlNode* c_node, char** c_ns_tags, Py_ssize_t c_tag_count):
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
cdef xmlAttr* c_attr
cdef Py_ssize_t i
cdef char* c_href
cdef char* c_name

cdef xmlAttr* c_next_attr
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
if c_node.properties is not NULL:
for i in range(c_tag_count):
c_href = c_ns_tags[2*i]
c_name = c_ns_tags[2*i+1]
# must compare attributes manually to make sure we
# only match on wildcard tag names if the attribute
# has no namespace
c_attr = c_node.properties
while c_attr is not NULL:
if c_name is NULL or c_attr.name == c_name:
if c_href is NULL:
if c_attr.ns is NULL or c_attr.ns.href is NULL:
tree.xmlRemoveProp(c_attr)
break
elif c_attr.ns is not NULL and c_attr.ns.href is not NULL:
if cstring_h.strcmp(c_attr.ns.href, c_href) == 0:
tree.xmlRemoveProp(c_attr)
break
c_attr = c_attr.next
c_attr = c_node.properties
while c_attr is not NULL:
c_next_attr = c_attr.next
if matcher.matchesAttribute(c_attr):
tree.xmlRemoveProp(c_attr)
c_attr = c_next_attr
tree.END_FOR_EACH_ELEMENT_FROM(c_node)

def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
Expand All @@ -100,48 +71,36 @@ def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
lxml.etree.Comment # comments
)
"""
cdef _MultiTagMatcher matcher
cdef _Element element
cdef _Document doc
cdef list ns_tags
cdef char** c_ns_tags
cdef qname* c_ns_tags
cdef Py_ssize_t c_tag_count
cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0

doc = _documentOrRaise(tree_or_element)
element = _rootNodeOrRaise(tree_or_element)
if not tag_names: return
if not tag_names:
return

ns_tags = _filterSpecialTagNames(
tag_names, &strip_comments, &strip_pis, &strip_entities)
matcher = _MultiTagMatcher(tag_names)
matcher.cacheTags(doc)
if matcher.rejectsAll():
return

if (strip_comments or strip_pis) and isinstance(tree_or_element, _ElementTree):
if isinstance(tree_or_element, _ElementTree):
# include PIs and comments next to the root node
if strip_comments:
if matcher.matchesType(tree.XML_COMMENT_NODE):
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
if strip_pis:
if matcher.matchesType(tree.XML_PI_NODE):
_removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
_strip_elements(doc, element._c_node, matcher, with_tail)

# tag names are passed as C pointers as this allows us to take
# them from the doc dict and do pointer comparisons
c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
if c_ns_tags is NULL:
raise MemoryError()

try:
c_tag_count = _mapTagsToCharArray(doc._c_doc, ns_tags, c_ns_tags)
if c_tag_count > 0 or strip_comments or strip_pis or strip_entities:
_strip_elements(doc, element._c_node, c_ns_tags, c_tag_count,
strip_comments, strip_pis, strip_entities, with_tail)
finally:
stdlib.free(c_ns_tags)

cdef _strip_elements(_Document doc, xmlNode* c_node,
char** c_ns_tags, Py_ssize_t c_tag_count,
bint strip_comments, bint strip_pis, bint strip_entities,
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
bint with_tail):
cdef xmlNode* c_child
cdef xmlNode* c_next
cdef Py_ssize_t i

tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
Expand All @@ -151,20 +110,16 @@ cdef _strip_elements(_Document doc, xmlNode* c_node,
c_child = _findChildForwards(c_node, 0)
while c_child is not NULL:
c_next = _nextElement(c_child)
if c_child.type == tree.XML_ELEMENT_NODE:
for i in range(0, c_tag_count*2, 2):
if _tagMatchesExactly(c_child, c_ns_tags[i], c_ns_tags[i+1]):
if not with_tail:
tree.xmlUnlinkNode(c_child)
_removeNode(doc, c_child)
break
elif c_child.type == tree.XML_COMMENT_NODE and strip_comments \
or c_child.type == tree.XML_PI_NODE and strip_pis \
or c_child.type == tree.XML_ENTITY_REF_NODE and strip_entities:
if with_tail:
_removeText(c_child.next)
tree.xmlUnlinkNode(c_child)
attemptDeallocation(c_child)
if matcher.matches(c_child):
if c_child.type == tree.XML_ELEMENT_NODE:
if not with_tail:
tree.xmlUnlinkNode(c_child)
_removeNode(doc, c_child)
else:
if with_tail:
_removeText(c_child.next)
tree.xmlUnlinkNode(c_child)
attemptDeallocation(c_child)
c_child = c_next
tree.END_FOR_EACH_ELEMENT_FROM(c_node)

Expand All @@ -191,6 +146,7 @@ def strip_tags(tree_or_element, *tag_names):
Comment # comments (including their text!)
)
"""
cdef _MultiTagMatcher matcher
cdef _Element element
cdef _Document doc
cdef list ns_tags
Expand All @@ -200,35 +156,23 @@ def strip_tags(tree_or_element, *tag_names):

doc = _documentOrRaise(tree_or_element)
element = _rootNodeOrRaise(tree_or_element)
if not tag_names: return
if not tag_names:
return

ns_tags = _filterSpecialTagNames(
tag_names, &strip_comments, &strip_pis, &strip_entities)
matcher = _MultiTagMatcher(tag_names)
matcher.cacheTags(doc)
if matcher.rejectsAll():
return

if (strip_comments or strip_pis) and isinstance(tree_or_element, _ElementTree):
if isinstance(tree_or_element, _ElementTree):
# include PIs and comments next to the root node
if strip_comments:
if matcher.matchesType(tree.XML_COMMENT_NODE):
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
if strip_pis:
if matcher.matchesType(tree.XML_PI_NODE):
_removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
_strip_tags(doc, element._c_node, matcher)

# tag names are passed as C pointers as this allows us to take
# them from the doc dict and do pointer comparisons
c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
if c_ns_tags is NULL:
raise MemoryError()

try:
c_tag_count = _mapTagsToCharArray(doc._c_doc, ns_tags, c_ns_tags)
if c_tag_count > 0 or strip_comments or strip_pis or strip_entities:
_strip_tags(doc, element._c_node, c_ns_tags, c_tag_count,
strip_comments, strip_pis, strip_entities)
finally:
stdlib.free(c_ns_tags)

cdef _strip_tags(_Document doc, xmlNode* c_node,
char** c_ns_tags, Py_ssize_t c_tag_count,
bint strip_comments, bint strip_pis, bint strip_entities):
cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
cdef xmlNode* c_child
cdef xmlNode* c_next
cdef Py_ssize_t i
Expand All @@ -240,82 +184,20 @@ cdef _strip_tags(_Document doc, xmlNode* c_node,
# c_node itself
c_child = _findChildForwards(c_node, 0)
while c_child is not NULL:
if not matcher.matches(c_child):
c_child = _nextElement(c_child)
continue
if c_child.type == tree.XML_ELEMENT_NODE:
for i in range(c_tag_count):
if _tagMatchesExactly(c_child, c_ns_tags[2*i], c_ns_tags[2*i+1]):
c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
_replaceNodeByChildren(doc, c_child)
if not attemptDeallocation(c_child):
if c_child.nsDef is not NULL:
# make namespaces absolute
moveNodeToDocument(doc, doc._c_doc, c_child)
c_child = c_next
break
else:
c_child = _nextElement(c_child)
c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
_replaceNodeByChildren(doc, c_child)
if not attemptDeallocation(c_child):
if c_child.nsDef is not NULL:
# make namespaces absolute
moveNodeToDocument(doc, doc._c_doc, c_child)
c_child = c_next
else:
c_next = _nextElement(c_child)
if c_child.type == tree.XML_COMMENT_NODE and strip_comments \
or c_child.type == tree.XML_PI_NODE and strip_pis \
or c_child.type == tree.XML_ENTITY_REF_NODE and strip_entities:
tree.xmlUnlinkNode(c_child)
attemptDeallocation(c_child)
tree.xmlUnlinkNode(c_child)
attemptDeallocation(c_child)
c_child = c_next
tree.END_FOR_EACH_ELEMENT_FROM(c_node)


# helper functions

cdef list _sortedTagList(list l):
    # Return the (namespace, name) tuples of 'l' in sorted order.
    #
    # The tuples cannot be sorted directly because the namespace may be
    # None, which Py3 refuses to compare to strings.  Each tuple is
    # therefore wrapped in a sort key that substitutes b'' for a missing
    # namespace.  The original position is part of the key, so equal
    # entries keep their order and the wrapped tuple itself is never
    # compared.
    cdef list keyed
    cdef tuple entry
    cdef Py_ssize_t pos
    keyed = []
    for pos, entry in enumerate(l):
        keyed.append((entry[0] or b'', entry[1], pos, entry))
    keyed.sort()
    # strip the sort key again, the original tuple sits in the last slot
    return [ wrapped[3] for wrapped in keyed ]

cdef list _filterSpecialTagNames(tag_names, bint* comments, bint* pis, bint* entities):
    # Split 'tag_names' into the special node factories and ordinary tags.
    #
    # The three output flags are reset to 0 first and then set to 1 when
    # Comment, ProcessingInstruction or Entity (compared by identity) is
    # found among the names.  All other names are passed through
    # _getNsTag() and returned as a sorted list of (namespace, name)
    # tuples, in which the name b'*' is replaced by None.
    cdef list plain_tags = []
    cdef list result = []

    comments[0] = 0
    pis[0] = 0
    entities[0] = 0

    for candidate in tag_names:
        if candidate is Comment:
            comments[0] = 1
            continue
        if candidate is ProcessingInstruction:
            pis[0] = 1
            continue
        if candidate is Entity:
            entities[0] = 1
            continue
        plain_tags.append(_getNsTag(candidate))

    for ns, name in _sortedTagList(plain_tags):
        result.append((ns, None if name == b'*' else name))
    return result

cdef Py_ssize_t _mapTagsToCharArray(xmlDoc* c_doc, list ns_tags,
                                    char** c_ns_tags) except -1:
    # Fill 'c_ns_tags' with (href, name) pointer pairs for the
    # (namespace, name) tuples in 'ns_tags' and return the number of
    # pairs that were stored.
    #
    # Pair number k occupies the slots 2*k and 2*k+1.  A None namespace
    # or name is stored as NULL.  Names are looked up in the document
    # dict with xmlDictExists(); a tuple whose name is not in the dict
    # is not counted and its slots are reused for the next tuple.
    cdef Py_ssize_t stored = 0
    cdef char** c_slot
    cdef char* c_name
    for ns, tag in ns_tags:
        # the next free pair of slots
        c_slot = c_ns_tags + 2 * stored
        if ns is not None:
            c_slot[0] = _cstr(ns)
        else:
            c_slot[0] = NULL
        if tag is not None:
            c_name = _cstr(tag)
            c_slot[1] = tree.xmlDictExists(
                c_doc.dict, c_name, cstring_h.strlen(c_name))
            if c_slot[1] is NULL:
                # not in the dict => not in the document
                continue
        else:
            c_slot[1] = NULL
        stored += 1
    return stored

0 comments on commit 714024e

Please sign in to comment.