Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

use per-document hash tables for XML IDs and allow disabling them completely with collect_ids=False
  • Loading branch information...
commit 35316b052af48921657813bb68563fe4a301d1b8 1 parent 593f24a
@scoder scoder authored
View
8 CHANGES.txt
@@ -8,6 +8,14 @@ Latest changes
Features added
--------------
+* New ``XMLParser`` option ``collect_ids=False`` to disable ID hash table
+ creation. This can substantially speed up parsing of documents with many
+ different IDs that are not used.
+
+* The parser uses per-document hash tables for XML IDs. This reduces the
+ load of the global parser dict and thus speeds up parsing for documents
+ with many different IDs.
+
* ``ElementTree.getelementpath(element)`` returns a structural ElementPath
expression for the given element, which can be used for lookups later.
View
4 doc/parsing.txt
@@ -170,6 +170,10 @@ Available boolean keyword arguments:
* compact - use compact storage for short text content (on by default)
+* collect_ids - collect XML IDs in a hash table while parsing (on by default).
+ Disabling this can substantially speed up parsing of documents with many
+ different IDs if the hash lookup is not used afterwards.
+
Other keyword arguments:
* encoding - override the document encoding
View
5 src/lxml/includes/tree.pxd
@@ -66,6 +66,11 @@ cdef extern from "libxml/hash.h":
ctypedef void (*xmlHashScanner)(void* payload, void* data, const_xmlChar* name) # may require GIL!
void xmlHashScan(xmlHashTable* table, xmlHashScanner f, void* data) nogil
void* xmlHashLookup(xmlHashTable* table, const_xmlChar* name) nogil
+ ctypedef void (*xmlHashDeallocator)(void *payload, xmlChar *name)
+ cdef xmlHashTable* xmlHashCreate(int size)
+ cdef xmlHashTable* xmlHashCreateDict(int size, xmlDict *dict)
+ cdef int xmlHashSize(xmlHashTable* table)
+ cdef void xmlHashFree(xmlHashTable* table, xmlHashDeallocator f)
cdef extern from *: # actually "libxml/dict.h"
# libxml/dict.h appears to be broken to include in C
View
16 src/lxml/includes/xmlparser.pxd
@@ -1,6 +1,7 @@
from libc.string cimport const_char
-from lxml.includes.tree cimport xmlDoc, xmlNode, xmlDict, xmlDtd, const_xmlChar
+from lxml.includes.tree cimport (
+ xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar)
from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc
@@ -82,8 +83,14 @@ cdef extern from "libxml/tree.h":
xmlStructuredErrorFunc serror
void* _private
-cdef extern from "libxml/xmlIO.h":
- cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) nogil
+
+cdef extern from "libxml/SAX2.h" nogil:
+ cdef void xmlSAX2StartDocument(void* ctxt)
+
+
+cdef extern from "libxml/xmlIO.h" nogil:
+ cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
+
cdef extern from "libxml/parser.h":
@@ -92,7 +99,8 @@ cdef extern from "libxml/parser.h":
cdef void xmlDictFree(xmlDict* sub) nogil
cdef int xmlDictReference(xmlDict* dict) nogil
- cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes
+ cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes
+ cdef int XML_SKIP_IDS # SAX option for not building an XML ID dict
ctypedef struct xmlParserCtxt:
xmlDoc* myDoc
View
3  src/lxml/iterparse.pxi
@@ -68,7 +68,7 @@ cdef class iterparse:
load_dtd=False, no_network=True, remove_blank_text=False,
compact=True, resolve_entities=True, remove_comments=False,
remove_pis=False, strip_cdata=True, encoding=None,
- html=False, recover=None, huge_tree=False,
+ html=False, recover=None, huge_tree=False, collect_ids=True,
XMLSchema schema=None):
if not hasattr(source, 'read'):
self._filename = source
@@ -119,6 +119,7 @@ cdef class iterparse:
remove_comments=remove_comments,
remove_pis=remove_pis,
strip_cdata=strip_cdata,
+ collect_ids=True,
target=None, # TODO
compact=compact)
View
81 src/lxml/parser.pxi
@@ -503,9 +503,11 @@ cdef class _ParserContext(_ResolverContext):
cdef xmlparser.xmlParserCtxt* _c_ctxt
cdef python.PyThread_type_lock _lock
cdef _Document _doc
+ cdef bint _collect_ids
def __cinit__(self):
self._c_ctxt = NULL
+ self._collect_ids = True
if not config.ENABLE_THREADING:
self._lock = NULL
else:
@@ -521,6 +523,7 @@ cdef class _ParserContext(_ResolverContext):
cdef _ParserContext _copy(self):
cdef _ParserContext context
context = self.__class__()
+ context._collect_ids = self._collect_ids
context._validator = self._validator.copy()
_initParserContext(context, self._resolvers._copy(), NULL)
return context
@@ -764,6 +767,7 @@ cdef class _BaseParser:
cdef bint _remove_comments
cdef bint _remove_pis
cdef bint _strip_cdata
+ cdef bint _collect_ids
cdef XMLSchema _schema
cdef bytes _filename
cdef readonly object target
@@ -771,7 +775,8 @@ cdef class _BaseParser:
cdef tuple _events_to_collect # (event_types, tag)
def __init__(self, int parse_options, bint for_html, XMLSchema schema,
- remove_comments, remove_pis, strip_cdata, target, encoding):
+ remove_comments, remove_pis, strip_cdata, collect_ids,
+ target, encoding):
cdef tree.xmlCharEncodingHandler* enchandler
cdef int c_encoding
if not isinstance(self, (XMLParser, HTMLParser)):
@@ -783,6 +788,7 @@ cdef class _BaseParser:
self._remove_comments = remove_comments
self._remove_pis = remove_pis
self._strip_cdata = strip_cdata
+ self._collect_ids = collect_ids
self._schema = schema
self._resolvers = _ResolverRegistry()
@@ -812,19 +818,14 @@ cdef class _BaseParser:
cdef xmlparser.xmlParserCtxt* pctxt
if self._parser_context is None:
self._parser_context = self._createContext(self.target, None)
+ self._parser_context._collect_ids = self._collect_ids
if self._schema is not None:
self._parser_context._validator = \
self._schema._newSaxValidator(
self._parse_options & xmlparser.XML_PARSE_DTDATTR)
pctxt = self._newParserCtxt()
_initParserContext(self._parser_context, self._resolvers, pctxt)
- if self._remove_comments:
- pctxt.sax.comment = NULL
- if self._remove_pis:
- pctxt.sax.processingInstruction = NULL
- if self._strip_cdata:
- # hard switch-off for CDATA nodes => makes them plain text
- pctxt.sax.cdataBlock = NULL
+ self._configureSaxContext(pctxt)
return self._parser_context
cdef _ParserContext _getPushParserContext(self):
@@ -832,6 +833,7 @@ cdef class _BaseParser:
if self._push_parser_context is None:
self._push_parser_context = self._createContext(
self.target, self._events_to_collect)
+ self._push_parser_context._collect_ids = self._collect_ids
if self._schema is not None:
self._push_parser_context._validator = \
self._schema._newSaxValidator(
@@ -839,13 +841,7 @@ cdef class _BaseParser:
pctxt = self._newPushParserCtxt()
_initParserContext(
self._push_parser_context, self._resolvers, pctxt)
- if self._remove_comments:
- pctxt.sax.comment = NULL
- if self._remove_pis:
- pctxt.sax.processingInstruction = NULL
- if self._strip_cdata:
- # hard switch-off for CDATA nodes => makes them plain text
- pctxt.sax.cdataBlock = NULL
+ self._configureSaxContext(pctxt)
return self._push_parser_context
cdef _ParserContext _createContext(self, target, events_to_collect):
@@ -863,6 +859,16 @@ cdef class _BaseParser:
sax_context._setEventFilter(events, tag)
return sax_context
+ @cython.final
+ cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
+ if self._remove_comments:
+ pctxt.sax.comment = NULL
+ if self._remove_pis:
+ pctxt.sax.processingInstruction = NULL
+ if self._strip_cdata:
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
+
cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
@@ -891,6 +897,7 @@ cdef class _BaseParser:
c_ctxt = xmlparser.xmlNewParserCtxt()
if c_ctxt is NULL:
raise MemoryError
+ c_ctxt.sax.startDocument = _initSaxDocument
return c_ctxt
cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
@@ -909,6 +916,7 @@ cdef class _BaseParser:
xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
if c_ctxt is NULL:
raise MemoryError()
+ c_ctxt.sax.startDocument = _initSaxDocument
return c_ctxt
property error_log:
@@ -1140,6 +1148,40 @@ cdef class _BaseParser:
finally:
context.cleanup()
+
+cdef void _initSaxDocument(void* ctxt) with gil:
+ xmlparser.xmlSAX2StartDocument(ctxt)
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ c_doc = c_ctxt.myDoc
+
+ # set up document dict
+ if c_doc and c_ctxt.dict and not c_doc.dict:
+ # I have no idea why libxml2 disables this - we need it
+ c_ctxt.dictNames = 1
+ c_doc.dict = c_ctxt.dict
+ xmlparser.xmlDictReference(c_ctxt.dict)
+
+ # set up XML ID hash table
+ if c_ctxt._private and not c_ctxt.html:
+ context = <_ParserContext>c_ctxt._private
+ if context._collect_ids:
+ # keep the global parser dict from filling up with XML IDs
+ if c_doc and not c_doc.ids:
+ # memory errors are not fatal here
+ c_dict = xmlparser.xmlDictCreate()
+ if c_dict:
+ c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
+ xmlparser.xmlDictFree(c_dict)
+ else:
+ c_doc.ids = tree.xmlHashCreate(0)
+ else:
+ c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
+ if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
+ # already initialised but empty => clear
+ tree.xmlHashFree(c_doc.ids, NULL)
+ c_doc.ids = NULL
+
+
############################################################
## ET feed parser
############################################################
@@ -1357,7 +1399,7 @@ _XML_DEFAULT_PARSE_OPTIONS = (
)
cdef class XMLParser(_FeedParser):
- u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, XMLSchema schema=None, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, target=None, compact=True)
+ u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, XMLSchema schema=None, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)
The XML parser.
@@ -1386,6 +1428,7 @@ cdef class XMLParser(_FeedParser):
- remove_pis - discard processing instructions
- strip_cdata - replace CDATA sections by normal text content (default: True)
- compact - save memory for short text content (default: True)
+ - collect_ids - create a hash table of XML IDs (default: True, always True with DTD validation)
- resolve_entities - replace entities by their text value (default: True)
- huge_tree - disable security restrictions and support very deep trees
and very long text content (only affects libxml2 2.7+)
@@ -1405,7 +1448,7 @@ cdef class XMLParser(_FeedParser):
ns_clean=False, recover=False, XMLSchema schema=None,
huge_tree=False, remove_blank_text=False, resolve_entities=True,
remove_comments=False, remove_pis=False, strip_cdata=True,
- target=None, compact=True):
+ collect_ids=True, target=None, compact=True):
cdef int parse_options
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
@@ -1436,7 +1479,7 @@ cdef class XMLParser(_FeedParser):
_BaseParser.__init__(self, parse_options, 0, schema,
remove_comments, remove_pis, strip_cdata,
- target, encoding)
+ collect_ids, target, encoding)
cdef class XMLPullParser(XMLParser):
@@ -1595,7 +1638,7 @@ cdef class HTMLParser(_FeedParser):
parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
_BaseParser.__init__(self, parse_options, 1, schema,
- remove_comments, remove_pis, strip_cdata,
+ remove_comments, remove_pis, strip_cdata, True,
target, encoding)
View
5 src/lxml/saxparser.pxi
@@ -525,11 +525,6 @@ cdef void _handleSaxStartDocument(void* ctxt) with gil:
context = <_SaxParserContext>c_ctxt._private
context._origSaxStartDocument(ctxt)
c_doc = c_ctxt.myDoc
- if c_doc and c_ctxt.dict and not c_doc.dict:
- # I have no idea why libxml2 disables this - we need it
- c_ctxt.dictNames = 1
- c_doc.dict = c_ctxt.dict
- xmlparser.xmlDictReference(c_ctxt.dict)
try:
context.startDocument(c_doc)
except:
View
83 src/lxml/tests/test_etree.py
@@ -866,6 +866,31 @@ def test_feed_parser_recover(self):
# FIXME: would be nice to get some errors logged ...
#self.assertTrue(len(parser.error_log) > 0, "error log is empty")
+ def test_feed_parser_recover_no_id_dict(self):
+ # test that recover mode plays nicely with the no-id-dict setup
+ parser = self.etree.XMLParser(recover=True, collect_ids=False)
+
+ parser.feed('<?xml version=')
+ parser.feed('"1.0"?><ro')
+ parser.feed('ot xml:id="123"><')
+ parser.feed('a test="works" xml:id=')
+ parser.feed('"321"><othertag/></root') # <a> not closed!
+ parser.feed('>')
+
+ root = parser.close()
+
+ self.assertEqual(root.tag, "root")
+ self.assertEqual(len(root), 1)
+ self.assertEqual(root[0].tag, "a")
+ self.assertEqual(root[0].get("test"), "works")
+ self.assertEqual(root[0].attrib, {
+ 'test': 'works',
+ '{http://www.w3.org/XML/1998/namespace}id': '321'})
+ self.assertEqual(len(root[0]), 1)
+ self.assertEqual(root[0][0].tag, "othertag")
+ # FIXME: would be nice to get some errors logged ...
+ #self.assertTrue(len(parser.error_log) > 0, "error log is empty")
+
def test_elementtree_parser_target_type_error(self):
assertEqual = self.assertEqual
assertFalse = self.assertFalse
@@ -944,8 +969,34 @@ def close(self):
done = 'value error received as expected'
self.assertEqual(["start-root", "data-A", "start-a",
- "data-ca", "end-a", "close"],
- events)
+ "data-ca", "end-a", "close"],
+ events)
+
+ def test_parser_target_feed_no_id_dict(self):
+ # test that target parsing works nicely with the no-id-hash setup
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def comment(self, text):
+ events.append("comment-" + text)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target(), collect_ids=False)
+
+ parser.feed(_bytes('<!--a--><root xml:id="123">A<!--b-->'))
+ parser.feed(_bytes('<sub xml:id="321"/>B</root>'))
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["comment-a", "start-root", "data-A", "comment-b",
+ "start-sub", "end-sub", "data-B", "end-root"],
+ events)
def test_parser_target_comment(self):
events = []
@@ -2232,6 +2283,34 @@ def test_XMLDTDID_empty(self):
expected = {}
self._checkIDDict(dic, expected)
+ def test_XMLDTDID_no_id_dict(self):
+ XMLDTDID = self.etree.XMLDTDID
+ XML = self.etree.XML
+ xml_text = _bytes('''
+ <!DOCTYPE document [
+ <!ELEMENT document (h1,p)*>
+ <!ELEMENT h1 (#PCDATA)>
+ <!ATTLIST h1 myid ID #REQUIRED>
+ <!ELEMENT p (#PCDATA)>
+ <!ATTLIST p someid ID #REQUIRED>
+ ]>
+ <document>
+ <h1 myid="chapter1">...</h1>
+ <p id="note1" class="note">...</p>
+ <p>Regular paragraph.</p>
+ <p xml:id="xmlid">XML:ID paragraph.</p>
+ <p someid="warn1" class="warning">...</p>
+ </document>
+ ''')
+
+ parser = etree.XMLParser(collect_ids=False)
+ root, dic = XMLDTDID(xml_text, parser=parser)
+ root2 = XML(xml_text)
+ self.assertEqual(self._writeElement(root),
+ self._writeElement(root2))
+ self.assertFalse(dic)
+ self._checkIDDict(dic, {})
+
def _checkIDDict(self, dic, expected):
self.assertEqual(len(dic),
len(expected))
Please sign in to comment.
Something went wrong with that request. Please try again.