Browse files

Add a method tree.getelementpath(element) that generates a structural ElementPath expression for an Element
  • Loading branch information...
1 parent 85e65a9 commit 7bf642938121be9d377b19d6e4d047258360446f @scoder scoder committed Apr 6, 2014
Showing 4 changed files with 159 additions and 1 deletion.
  1. +3 −0 CHANGES.txt
  2. +21 −0 doc/tutorial.txt
  3. +72 −1 src/lxml/lxml.etree.pyx
  4. +63 −0 src/lxml/tests/test_etree.py
View
3 CHANGES.txt
@@ -9,6 +9,9 @@ Latest changes
Features added
--------------
+* ``ElementTree.getelementpath(element)`` returns a structural ElementPath
+ expression for the given element, which can be used for lookups later.
+
* ``xmlfile()`` accepts a new argument ``close=True`` to close file(-like)
objects after writing to them.
View
21 doc/tutorial.txt
@@ -1426,6 +1426,27 @@ Find Elements with a certain attribute:
>>> print(root.findall(".//a[@y]"))
[]
+In lxml 3.4, there is a new helper to generate a structural ElementPath
+expression for an Element:
+
+.. sourcecode:: pycon
+
+ >>> tree = etree.ElementTree(root)
+ >>> a = root[0]
+ >>> print(tree.getelementpath(a[0]))
+ a/b[1]
+ >>> print(tree.getelementpath(a[1]))
+ a/c
+ >>> print(tree.getelementpath(a[2]))
+ a/b[2]
+ >>> tree.find(tree.getelementpath(a[2])) == a[2]
+ True
+
+As long as the tree is not modified, this path expression represents an
+identifier for a given element that can be used to find() it in the same
+tree later. Compared to XPath, ElementPath expressions have the advantage
+of being self-contained even for documents that use namespaces.
+
The ``.iter()`` method is a special case that only finds specific tags
in the tree by their name, not based on a path. That means that the
following commands are equivalent in the success case:
View
73 src/lxml/lxml.etree.pyx
@@ -1942,7 +1942,14 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
def getpath(self, _Element element not None):
u"""getpath(self, element)
- Returns a structural, absolute XPath expression to find that element.
+ Returns a structural, absolute XPath expression to find the element.
+
+ For namespaced elements, the expression uses prefixes from the
+ document, which therefore need to be provided in order to make any
+ use of the expression in XPath.
+
+ Also see the method getelementpath(self, element), which returns a
+ self-contained ElementPath expression.
"""
cdef _Document doc
cdef _Element root
@@ -1970,6 +1977,70 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
tree.xmlFree(c_path)
return path
+ def getelementpath(self, _Element element not None):
+ u"""getelementpath(self, element)
+
+ Returns a structural, absolute ElementPath expression to find the
+ element. This path can be used in the .find() method to look up
+ the element, provided that the elements along the path and their
+ list of immediate children were not modified in between.
+
+ ElementPath has the advantage over an XPath expression (as returned
+ by the .getpath() method) that it does not require additional prefix
+ declarations. It is always self-contained.
+
+ Returns '.' if the element is the root of this tree. Raises
+ ValueError if the input is not an Element node or if it does not
+ belong to this tree.
+ """
+ cdef _Element root
+ cdef Py_ssize_t count
+ _assertValidNode(element)
+ if element._c_node.type != tree.XML_ELEMENT_NODE:
+ raise ValueError, u"input is not an Element"
+ if self._context_node is not None:  # prefer the tree's explicit context node as root
+ root = self._context_node
+ elif self._doc is not None:  # otherwise fall back to the document's root element
+ root = self._doc.getroot()
+ else:
+ raise ValueError, u"Element is not in this tree"
+ _assertValidNode(root)
+ if element._doc is not root._doc:  # different document => cannot be in this tree
+ raise ValueError, u"Element is not in this tree"
+
+ path = []  # path steps, collected from the element upwards (leaf first)
+ c_element = element._c_node
+ while c_element is not root._c_node:  # climb parent links until the tree root is reached
+ c_name = c_element.name
+ c_href = _getNs(c_element)
+ tag = _namespacedNameFromNsName(c_href, c_name)
+ if c_href is NULL:
+ c_href = <const_xmlChar*>b'' # no namespace (NULL is wildcard)
+ # use tag[N] if there are preceding siblings with the same tag
+ count = 0
+ c_node = c_element.prev  # scan backwards over the preceding siblings
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _tagMatches(c_node, c_href, c_name):
+ count += 1
+ c_node = c_node.prev
+ if count:
+ tag = '%s[%d]' % (tag, count+1)  # 1-based position among same-tag siblings
+ else:
+ # use tag[1] if there are following siblings with the same tag
+ c_node = c_element.next
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _tagMatches(c_node, c_href, c_name):
+ tag += '[1]'
+ break  # one matching following sibling is enough
+ c_node = c_node.next
+
+ path.append(tag)
+ c_element = c_element.parent
+ if c_element is NULL or c_element.type != tree.XML_ELEMENT_NODE:  # ran past the top without meeting root
+ raise ValueError, u"Element is not in this tree."
+ if not path:  # no steps collected: the element is the root itself
+ return '.'
+ path.reverse()  # steps were collected leaf-to-root; emit them root-to-leaf
+ return '/'.join(path)
+
def getiterator(self, tag=None, *tags):
u"""getiterator(self, *tags, tag=None)
View
63 src/lxml/tests/test_etree.py
@@ -2661,6 +2661,69 @@ def test_getiterator_filter_all_comment_pi(self):
[a, b, c],
list(a.getiterator('*')))
+ def test_elementtree_getelementpath(self):
+ a = etree.Element("a")
+ b = etree.SubElement(a, "b")
+ c = etree.SubElement(a, "c")
+ d1 = etree.SubElement(c, "d")
+ d2 = etree.SubElement(c, "d")
+ c.text = d1.text = 'TEXT'
+
+ tree = etree.ElementTree(a)
+ self.assertEqual('.', tree.getelementpath(a))
+ self.assertEqual('c/d[1]', tree.getelementpath(d1))
+ self.assertEqual('c/d[2]', tree.getelementpath(d2))
+
+ self.assertEqual(d1, tree.find(tree.getelementpath(d1)))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+
+ tree = etree.ElementTree(c)
+ self.assertEqual('.', tree.getelementpath(c))
+ self.assertEqual('d[2]', tree.getelementpath(d2))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+
+ tree = etree.ElementTree(b) # not a parent of a/c/d1/d2
+ self.assertEqual('.', tree.getelementpath(b))
+ self.assertRaises(ValueError, tree.getelementpath, a)
+ self.assertRaises(ValueError, tree.getelementpath, c)
+ self.assertRaises(ValueError, tree.getelementpath, d2)
+
+ def test_elementtree_getelementpath_ns(self):
+ a = etree.Element("{http://ns1/}a")
+ b = etree.SubElement(a, "{http://ns1/}b")
+ c = etree.SubElement(a, "{http://ns1/}c")
+ d1 = etree.SubElement(c, "{http://ns1/}d")
+ d2 = etree.SubElement(c, "{http://ns2/}d")
+ d3 = etree.SubElement(c, "{http://ns1/}d")
+
+ tree = etree.ElementTree(a)
+ self.assertEqual('.', tree.getelementpath(a))
+ self.assertEqual('{http://ns1/}c/{http://ns1/}d[1]',
+ tree.getelementpath(d1))
+ self.assertEqual('{http://ns1/}c/{http://ns2/}d',
+ tree.getelementpath(d2))
+ self.assertEqual('{http://ns1/}c/{http://ns1/}d[2]',
+ tree.getelementpath(d3))
+
+ self.assertEqual(a, tree.find(tree.getelementpath(a)))
+ self.assertEqual(b, tree.find(tree.getelementpath(b)))
+ self.assertEqual(c, tree.find(tree.getelementpath(c)))
+ self.assertEqual(d1, tree.find(tree.getelementpath(d1)))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+ self.assertEqual(d3, tree.find(tree.getelementpath(d3)))
+
+ tree = etree.ElementTree(c)
+ self.assertEqual('{http://ns1/}d[1]', tree.getelementpath(d1))
+ self.assertEqual('{http://ns2/}d', tree.getelementpath(d2))
+ self.assertEqual('{http://ns1/}d[2]', tree.getelementpath(d3))
+ self.assertEqual(d1, tree.find(tree.getelementpath(d1)))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+ self.assertEqual(d3, tree.find(tree.getelementpath(d3)))
+
+ tree = etree.ElementTree(b) # not a parent of d1/d2
+ self.assertRaises(ValueError, tree.getelementpath, d1)
+ self.assertRaises(ValueError, tree.getelementpath, d2)
+
def test_elementtree_find_qname(self):
XML = self.etree.XML
ElementTree = self.etree.ElementTree

0 comments on commit 7bf6429

Please sign in to comment.