
first stab - parse CSV/Schema.org data and dump in Turtle

1 parent e3bfbe6 · commit b752f1dd7d1e42c5f7d46d6f46f03a8dde6b61c5 · mhausenblas committed Jul 30, 2011
Showing with 29,094 additions and 0 deletions.
  1. +1 −0 .gitignore
  2. +17 −0 lib/html5lib/__init__.py
  3. +3,084 −0 lib/html5lib/constants.py
  4. 0 lib/html5lib/filters/__init__.py
  5. +10 −0 lib/html5lib/filters/_base.py
  6. +127 −0 lib/html5lib/filters/formfiller.py
  7. +62 −0 lib/html5lib/filters/inject_meta_charset.py
  8. +88 −0 lib/html5lib/filters/lint.py
  9. +202 −0 lib/html5lib/filters/optionaltags.py
  10. +8 −0 lib/html5lib/filters/sanitizer.py
  11. +41 −0 lib/html5lib/filters/whitespace.py
  12. +2,708 −0 lib/html5lib/html5parser.py
  13. +177 −0 lib/html5lib/ihatexml.py
  14. +778 −0 lib/html5lib/inputstream.py
  15. +258 −0 lib/html5lib/sanitizer.py
  16. +17 −0 lib/html5lib/serializer/__init__.py
  17. +312 −0 lib/html5lib/serializer/htmlserializer.py
  18. +9 −0 lib/html5lib/serializer/xhtmlserializer.py
  19. +5 −0 lib/html5lib/tests/README
  20. +12 −0 lib/html5lib/tests/__init__.py
  21. +37 −0 lib/html5lib/tests/mockParser.py
  22. +30 −0 lib/html5lib/tests/performance/concatenation.py
  23. +27 −0 lib/html5lib/tests/runparsertests.py
  24. +20 −0 lib/html5lib/tests/runtests.py
  25. +127 −0 lib/html5lib/tests/support.py
  26. +54 −0 lib/html5lib/tests/test_encoding.py
  27. +296 −0 lib/html5lib/tests/test_formfiller.py
  28. +163 −0 lib/html5lib/tests/test_parser.py
  29. +39 −0 lib/html5lib/tests/test_parser2.py
  30. +98 −0 lib/html5lib/tests/test_sanitizer.py
  31. +204 −0 lib/html5lib/tests/test_serializer.py
  32. +97 −0 lib/html5lib/tests/test_stream.py
  33. +205 −0 lib/html5lib/tests/test_tokenizer.py
  34. +323 −0 lib/html5lib/tests/test_treewalkers.py
  35. +123 −0 lib/html5lib/tests/test_whitespace_filter.py
  36. +64 −0 lib/html5lib/tests/tokenizertotree.py
  37. +3 −0 lib/html5lib/tests/us-ascii.html
  38. +3 −0 lib/html5lib/tests/utf-8-bom.html
  39. +1,745 −0 lib/html5lib/tokenizer.py
  40. +96 −0 lib/html5lib/treebuilders/__init__.py
  41. +377 −0 lib/html5lib/treebuilders/_base.py
  42. +286 −0 lib/html5lib/treebuilders/dom.py
  43. +339 −0 lib/html5lib/treebuilders/etree.py
  44. +337 −0 lib/html5lib/treebuilders/etree_lxml.py
  45. +256 −0 lib/html5lib/treebuilders/simpletree.py
  46. +236 −0 lib/html5lib/treebuilders/soup.py
  47. +52 −0 lib/html5lib/treewalkers/__init__.py
  48. +176 −0 lib/html5lib/treewalkers/_base.py
  49. +41 −0 lib/html5lib/treewalkers/dom.py
  50. +137 −0 lib/html5lib/treewalkers/etree.py
  51. +70 −0 lib/html5lib/treewalkers/genshistream.py
  52. +186 −0 lib/html5lib/treewalkers/lxmletree.py
  53. +60 −0 lib/html5lib/treewalkers/pulldom.py
  54. +78 −0 lib/html5lib/treewalkers/simpletree.py
  55. +60 −0 lib/html5lib/treewalkers/soup.py
  56. +175 −0 lib/html5lib/utils.py
  57. +16 −0 lib/rdfextras/__init__.py
  58. +167 −0 lib/rdfextras/sparql/__init__.py
  59. +1,319 −0 lib/rdfextras/sparql/algebra.py
  60. +708 −0 lib/rdfextras/sparql/components.py
  61. +422 −0 lib/rdfextras/sparql/evaluate.py
  62. +556 −0 lib/rdfextras/sparql/graph.py
  63. +509 −0 lib/rdfextras/sparql/operators.py
  64. +872 −0 lib/rdfextras/sparql/parser.py
  65. +43 −0 lib/rdfextras/sparql/processor.py
  66. +3,791 −0 lib/rdfextras/sparql/pyparsing.py
  67. +1,568 −0 lib/rdfextras/sparql/query.py
  68. +58 −0 lib/rdfextras/tools/CSVWriter.py
  69. +384 −0 lib/rdfextras/tools/DatabaseStats.py
  70. +80 −0 lib/rdfextras/tools/EARLPlugin.py
  71. +82 −0 lib/rdfextras/tools/FixTypeViews.py
  72. +227 −0 lib/rdfextras/tools/QueryRunner.py
  73. +218 −0 lib/rdfextras/tools/QueryStats.py
  74. 0 lib/rdfextras/tools/__init__.py
  75. +244 −0 lib/rdfextras/tools/describer.py
  76. +87 −0 lib/rdfextras/tools/pathutils.py
  77. +177 −0 lib/rdfextras/tools/rdfpipe.py
  78. +214 −0 lib/rdfextras/tools/sparqler.py
  79. +32 −0 lib/rdflib/LICENSE
  80. +76 −0 lib/rdflib/__init__.py
  81. +259 −0 lib/rdflib/collection.py
  82. +246 −0 lib/rdflib/compare.py
  83. +90 −0 lib/rdflib/events.py
  84. +69 −0 lib/rdflib/exceptions.py
  85. +1,375 −0 lib/rdflib/graph.py
  86. +330 −0 lib/rdflib/namespace.py
  87. +180 −0 lib/rdflib/parser.py
  88. +152 −0 lib/rdflib/plugin.py
  89. +7 −0 lib/rdflib/plugins/__init__.py
Sorry, we could not display the entire diff because it was too big.
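The commit message's end goal, dumping Turtle, rests on the vendored rdflib. A minimal sketch of that final step, assuming the vendored copy registers its usual Turtle serializer (the triple itself is hypothetical, the kind a CSV/Schema.org parse might produce):

    import rdflib
    from rdflib.namespace import RDF

    g = rdflib.Graph()
    # Hypothetical triple standing in for parsed CSV/Schema.org data
    g.add((rdflib.URIRef("http://example.org/item/1"),
           RDF.type,
           rdflib.URIRef("http://schema.org/Product")))
    print g.serialize(format="turtle")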
1 .gitignore
@@ -0,0 +1 @@
+*.pyc
17 lib/html5lib/__init__.py
@@ -0,0 +1,17 @@
+"""
+HTML parsing library based on the WHATWG "HTML5"
+specification. The parser is designed to be compatible with existing
+HTML found in the wild and implements well-defined error recovery that
+is largely compatible with modern desktop web browsers.
+
+Example usage:
+
+import html5lib
+f = open("my_document.html")
+tree = html5lib.parse(f)
+"""
+__version__ = "0.95-dev"
+from html5parser import HTMLParser, parse, parseFragment
+from treebuilders import getTreeBuilder
+from treewalkers import getTreeWalker
+from serializer import serialize
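Tying the four exports together, a minimal round-trip sketch (assuming the default simpletree tree builder; the filename is the docstring's placeholder):

    import html5lib
    from html5lib.serializer import serialize

    f = open("my_document.html")
    doc = html5lib.parse(f)      # default tree builder: simpletree
    print serialize(doc)         # walk the tree and re-emit HTML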
3,084 lib/html5lib/constants.py
3,084 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
0 lib/html5lib/filters/__init__.py
No changes.
10 lib/html5lib/filters/_base.py
@@ -0,0 +1,10 @@
+
+class Filter(object):
+ def __init__(self, source):
+ self.source = source
+
+ def __iter__(self):
+ return iter(self.source)
+
+ def __getattr__(self, name):
+ return getattr(self.source, name)
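Since the base class delegates everything else to the wrapped source, a concrete filter only has to override __iter__. A hypothetical subclass (not part of this commit, and placed alongside these filters so the relative import works) that drops comment tokens:

    import _base

    class CommentStripper(_base.Filter):
        # Pass every token through except comments.
        def __iter__(self):
            for token in _base.Filter.__iter__(self):
                if token["type"] != "Comment":
                    yield token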
127 lib/html5lib/filters/formfiller.py
@@ -0,0 +1,127 @@
+#
+# The goal is to finally have a form filler where you pass data for
+# each form, using the algorithm for "Seeding a form with initial values"
+# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
+#
+
+import _base
+
+from html5lib.constants import spaceCharacters
+spaceCharacters = u"".join(spaceCharacters)
+
+class SimpleFilter(_base.Filter):
+ def __init__(self, source, fieldStorage):
+ _base.Filter.__init__(self, source)
+ self.fieldStorage = fieldStorage
+
+ def __iter__(self):
+ field_indices = {}
+ field_type = None
+ field_name = None
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type in ("StartTag", "EmptyTag"):
+ name = token["name"].lower()
+ if name == "input":
+ field_name = None
+ field_type = None
+ input_value_index = -1
+ input_checked_index = -1
+ for i,(n,v) in enumerate(token["data"]):
+ n = n.lower()
+ if n == u"name":
+ field_name = v.strip(spaceCharacters)
+ elif n == u"type":
+ field_type = v.strip(spaceCharacters)
+ elif n == u"checked":
+ input_checked_index = i
+ elif n == u"value":
+ input_value_index = i
+
+ value_list = self.fieldStorage.getlist(field_name)
+ field_index = field_indices.setdefault(field_name, 0)
+ if field_index < len(value_list):
+ value = value_list[field_index]
+ else:
+ value = ""
+
+ if field_type in (u"checkbox", u"radio"):
+ if value_list:
+ if token["data"][input_value_index][1] == value:
+ if input_checked_index < 0:
+ token["data"].append((u"checked", u""))
+ field_indices[field_name] = field_index + 1
+ elif input_checked_index >= 0:
+ del token["data"][input_checked_index]
+
+ elif field_type not in (u"button", u"submit", u"reset"):
+ if input_value_index >= 0:
+ token["data"][input_value_index] = (u"value", value)
+ else:
+ token["data"].append((u"value", value))
+ field_indices[field_name] = field_index + 1
+
+ field_type = None
+ field_name = None
+
+ elif name == "textarea":
+ field_type = "textarea"
+ field_name = dict(token["data"][::-1])["name"]
+
+ elif name == "select":
+ field_type = "select"
+ attributes = dict(token["data"][::-1])
+ field_name = attributes.get("name")
+ is_select_multiple = "multiple" in attributes
+ is_selected_option_found = False
+
+ elif field_type == "select" and field_name and name == "option":
+ option_selected_index = -1
+ option_value = None
+ for i,(n,v) in enumerate(token["data"]):
+ n = n.lower()
+ if n == "selected":
+ option_selected_index = i
+ elif n == "value":
+ option_value = v.strip(spaceCharacters)
+ if option_value is None:
+ raise NotImplementedError("<option>s without a value= attribute")
+ else:
+ value_list = self.fieldStorage.getlist(field_name)
+ if value_list:
+ field_index = field_indices.setdefault(field_name, 0)
+ if field_index < len(value_list):
+ value = value_list[field_index]
+ else:
+ value = ""
+ if (is_select_multiple or not is_selected_option_found) and option_value == value:
+ if option_selected_index < 0:
+ token["data"].append((u"selected", u""))
+ field_indices[field_name] = field_index + 1
+ is_selected_option_found = True
+ elif option_selected_index >= 0:
+ del token["data"][option_selected_index]
+
+ elif field_type is not None and field_name and type == "EndTag":
+ name = token["name"].lower()
+ if name == field_type:
+ if name == "textarea":
+ value_list = self.fieldStorage.getlist(field_name)
+ if value_list:
+ field_index = field_indices.setdefault(field_name, 0)
+ if field_index < len(value_list):
+ value = value_list[field_index]
+ else:
+ value = ""
+ yield {"type": "Characters", "data": value}
+ field_indices[field_name] = field_index + 1
+
+ field_name = None
+
+ elif name == "option" and field_type == "select":
+ pass # TODO: part of "option without value= attribute" processing
+
+ elif field_type == "textarea":
+ continue # ignore token
+
+ yield token
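SimpleFilter only assumes that fieldStorage supports getlist(name), as cgi.FieldStorage does, and that start/empty-tag attributes arrive as (name, value) pairs. A rough sketch with a hypothetical stub in place of real form data:

    from html5lib.filters import formfiller

    class StubStorage(object):
        # Stands in for cgi.FieldStorage; only getlist() is used.
        def __init__(self, data):
            self.data = data
        def getlist(self, name):
            return self.data.get(name, [])

    tokens = [{"type": "EmptyTag", "name": u"input",
               "data": [(u"type", u"text"), (u"name", u"q")]}]
    filled = list(formfiller.SimpleFilter(iter(tokens),
                                          StubStorage({u"q": [u"html5"]})))
    # filled[0]["data"] now also carries (u"value", u"html5")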
62 lib/html5lib/filters/inject_meta_charset.py
@@ -0,0 +1,62 @@
+import _base
+
+class Filter(_base.Filter):
+ def __init__(self, source, encoding):
+ _base.Filter.__init__(self, source)
+ self.encoding = encoding
+
+ def __iter__(self):
+ state = "pre_head"
+ meta_found = (self.encoding is None)
+ pending = []
+
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type == "StartTag":
+ if token["name"].lower() == u"head":
+ state = "in_head"
+
+ elif type == "EmptyTag":
+ if token["name"].lower() == u"meta":
+ # replace charset with actual encoding
+ has_http_equiv_content_type = False
+ for (namespace,name),value in token["data"].iteritems():
+ if namespace is not None:
+ continue
+ elif name.lower() == u'charset':
+ token["data"][(namespace,name)] = self.encoding
+ meta_found = True
+ break
+ elif name == u'http-equiv' and value.lower() == u'content-type':
+ has_http_equiv_content_type = True
+ else:
+ if has_http_equiv_content_type and (None, u"content") in token["data"]:
+ token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
+ meta_found = True
+
+ elif token["name"].lower() == u"head" and not meta_found:
+ # insert meta into empty head
+ yield {"type": "StartTag", "name": u"head",
+ "data": token["data"]}
+ yield {"type": "EmptyTag", "name": u"meta",
+ "data": {(None, u"charset"): self.encoding}}
+ yield {"type": "EndTag", "name": u"head"}
+ meta_found = True
+ continue
+
+ elif type == "EndTag":
+ if token["name"].lower() == u"head" and pending:
+ # insert meta into head (if necessary) and flush pending queue
+ yield pending.pop(0)
+ if not meta_found:
+ yield {"type": "EmptyTag", "name": u"meta",
+ "data": {(None, u"charset"): self.encoding}}
+ while pending:
+ yield pending.pop(0)
+ meta_found = True
+ state = "post_head"
+
+ if state == "in_head":
+ pending.append(token)
+ else:
+ yield token
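In this snapshot's token format the filter expects start/empty-tag attributes as a dict keyed by (namespace, name). A hand-rolled sketch showing a meta being injected into an empty head:

    from html5lib.filters import inject_meta_charset

    tokens = [{"type": "StartTag", "name": u"head", "data": {}},
              {"type": "EndTag", "name": u"head", "data": {}}]
    out = list(inject_meta_charset.Filter(iter(tokens), "utf-8"))
    # out gains an EmptyTag meta token with data
    # {(None, u"charset"): "utf-8"} between the head start and end tags.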
88 lib/html5lib/filters/lint.py
@@ -0,0 +1,88 @@
+from gettext import gettext
+_ = gettext
+
+import _base
+from html5lib.constants import cdataElements, rcdataElements, voidElements
+
+from html5lib.constants import spaceCharacters
+spaceCharacters = u"".join(spaceCharacters)
+
+class LintError(Exception): pass
+
+class Filter(_base.Filter):
+ def __iter__(self):
+ open_elements = []
+ contentModelFlag = "PCDATA"
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type in ("StartTag", "EmptyTag"):
+ name = token["name"]
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
+ if not isinstance(name, unicode):
+ raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not name:
+ raise LintError(_(u"Empty tag name"))
+ if type == "StartTag" and name in voidElements:
+ raise LintError(_(u"Void element reported as StartTag token: %s") % name)
+ elif type == "EmptyTag" and name not in voidElements:
+ raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
+ if type == "StartTag":
+ open_elements.append(name)
+ for attr_name, value in token["data"]:
+ if not isinstance(attr_name, unicode):
+ raise LintError(_("Attribute name is not a string: %r") % attr_name)
+ if not attr_name:
+ raise LintError(_(u"Empty attribute name"))
+ if not isinstance(value, unicode):
+ raise LintError(_("Attribute value is not a string: %r") % value)
+ if name in cdataElements:
+ contentModelFlag = "CDATA"
+ elif name in rcdataElements:
+ contentModelFlag = "RCDATA"
+ elif name == "plaintext":
+ contentModelFlag = "PLAINTEXT"
+
+ elif type == "EndTag":
+ name = token["name"]
+ if not isinstance(name, unicode):
+ raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not name:
+ raise LintError(_(u"Empty tag name"))
+ if name in voidElements:
+ raise LintError(_(u"Void element reported as EndTag token: %s") % name)
+ start_name = open_elements.pop()
+ if start_name != name:
+ raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+ contentModelFlag = "PCDATA"
+
+ elif type == "Comment":
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("Comment not in PCDATA content model flag"))
+
+ elif type in ("Characters", "SpaceCharacters"):
+ data = token["data"]
+ if not isinstance(data, unicode):
+ raise LintError(_("Attribute name is not a string: %r") % data)
+ if not data:
+ raise LintError(_(u"%s token with empty data") % type)
+ if type == "SpaceCharacters":
+ data = data.strip(spaceCharacters)
+ if data:
+ raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
+
+ elif type == "Doctype":
+ name = token["name"]
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
+ if not isinstance(name, unicode):
+ raise LintError(_(u"Tag name is not a string: %r") % name)
+ # XXX: what to do with token["data"] ?
+
+ elif type in ("ParseError", "SerializeError"):
+ pass
+
+ else:
+ raise LintError(_(u"Unknown token type: %s") % type)
+
+ yield token
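Lint passes tokens through untouched and raises LintError at the first malformed one, so it is typically spliced into a pipeline while debugging. A sketch with hand-built tokens:

    from html5lib.filters import lint

    good = [{"type": "StartTag", "name": u"div", "data": []},
            {"type": "Characters", "data": u"hi"},
            {"type": "EndTag", "name": u"div", "data": []}]
    assert list(lint.Filter(iter(good))) == good

    bad = [{"type": "EmptyTag", "name": u"div", "data": []}]
    try:
        list(lint.Filter(iter(bad)))   # div is not a void element
    except lint.LintError, e:
        print e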
202 lib/html5lib/filters/optionaltags.py
@@ -0,0 +1,202 @@
+import _base
+
+class Filter(_base.Filter):
+ def slider(self):
+ previous1 = previous2 = None
+ for token in self.source:
+ if previous1 is not None:
+ yield previous2, previous1, token
+ previous2 = previous1
+ previous1 = token
+ yield previous2, previous1, None
+
+ def __iter__(self):
+ for previous, token, next in self.slider():
+ type = token["type"]
+ if type == "StartTag":
+ if (token["data"] or
+ not self.is_optional_start(token["name"], previous, next)):
+ yield token
+ elif type == "EndTag":
+ if not self.is_optional_end(token["name"], next):
+ yield token
+ else:
+ yield token
+
+ def is_optional_start(self, tagname, previous, next):
+ type = next and next["type"] or None
+ if tagname == 'html':
+ # An html element's start tag may be omitted if the first thing
+ # inside the html element is not a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname == 'head':
+ # A head element's start tag may be omitted if the first thing
+ # inside the head element is an element.
+ # XXX: we also omit the start tag if the head element is empty
+ if type in ("StartTag", "EmptyTag"):
+ return True
+ elif type == "EndTag":
+ return next["name"] == "head"
+ elif tagname == 'body':
+ # A body element's start tag may be omitted if the first thing
+ # inside the body element is not a space character or a comment,
+ # except if the first thing inside the body element is a script
+ # or style element and the node immediately preceding the body
+ # element is a head element whose end tag has been omitted.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we do not look at the preceding event, so we never omit
+ # the body element's start tag if it's followed by a script or
+ # a style element.
+ return next["name"] not in ('script', 'style')
+ else:
+ return True
+ elif tagname == 'colgroup':
+ # A colgroup element's start tag may be omitted if the first thing
+ # inside the colgroup element is a col element, and if the element
+ # is not immediately preceded by another colgroup element whose
+ # end tag has been omitted.
+ if type in ("StartTag", "EmptyTag"):
+ # XXX: we do not look at the preceding event, so instead we never
+ # omit the colgroup element's end tag when it is immediately
+ # followed by another colgroup element. See is_optional_end.
+ return next["name"] == "col"
+ else:
+ return False
+ elif tagname == 'tbody':
+ # A tbody element's start tag may be omitted if the first thing
+ # inside the tbody element is a tr element, and if the element is
+ # not immediately preceded by a tbody, thead, or tfoot element
+ # whose end tag has been omitted.
+ if type == "StartTag":
+ # omit the thead and tfoot elements' end tag when they are
+ # immediately followed by a tbody element. See is_optional_end.
+ if previous and previous['type'] == 'EndTag' and \
+ previous['name'] in ('tbody','thead','tfoot'):
+ return False
+ return next["name"] == 'tr'
+ else:
+ return False
+ return False
+
+ def is_optional_end(self, tagname, next):
+ type = next and next["type"] or None
+ if tagname in ('html', 'head', 'body'):
+ # An html element's end tag may be omitted if the html element
+ # is not immediately followed by a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname in ('li', 'optgroup', 'tr'):
+ # A li element's end tag may be omitted if the li element is
+ # immediately followed by another li element or if there is
+ # no more content in the parent element.
+ # An optgroup element's end tag may be omitted if the optgroup
+ # element is immediately followed by another optgroup element,
+ # or if there is no more content in the parent element.
+ # A tr element's end tag may be omitted if the tr element is
+ # immediately followed by another tr element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] == tagname
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('dt', 'dd'):
+ # A dt element's end tag may be omitted if the dt element is
+ # immediately followed by another dt element or a dd element.
+ # A dd element's end tag may be omitted if the dd element is
+ # immediately followed by another dd element or a dt element,
+ # or if there is no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('dt', 'dd')
+ elif tagname == 'dd':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'p':
+ # A p element's end tag may be omitted if the p element is
+ # immediately followed by an address, article, aside,
+ # blockquote, datagrid, dialog, dir, div, dl, fieldset,
+ # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
+ # nav, ol, p, pre, section, table, or ul element, or if
+ # there is no more content in the parent element.
+ if type in ("StartTag", "EmptyTag"):
+ return next["name"] in ('address', 'article', 'aside',
+ 'blockquote', 'datagrid', 'dialog',
+ 'dir', 'div', 'dl', 'fieldset', 'footer',
+ 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+ 'header', 'hr', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname == 'option':
+ # An option element's end tag may be omitted if the option
+ # element is immediately followed by another option element,
+ # or if it is immediately followed by an <code>optgroup</code>
+ # element, or if there is no more content in the parent
+ # element.
+ if type == "StartTag":
+ return next["name"] in ('option', 'optgroup')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('rt', 'rp'):
+ # An rt element's end tag may be omitted if the rt element is
+ # immediately followed by an rt or rp element, or if there is
+ # no more content in the parent element.
+ # An rp element's end tag may be omitted if the rp element is
+ # immediately followed by an rt or rp element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('rt', 'rp')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname == 'colgroup':
+ # A colgroup element's end tag may be omitted if the colgroup
+ # element is not immediately followed by a space character or
+ # a comment.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we also look for an immediately following colgroup
+ # element. See is_optional_start.
+ return next["name"] != 'colgroup'
+ else:
+ return True
+ elif tagname in ('thead', 'tbody'):
+ # A thead element's end tag may be omitted if the thead element
+ # is immediately followed by a tbody or tfoot element.
+ # A tbody element's end tag may be omitted if the tbody element
+ # is immediately followed by a tbody or tfoot element, or if
+ # there is no more content in the parent element.
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] in ['tbody', 'tfoot']
+ elif tagname == 'tbody':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'tfoot':
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] == 'tbody'
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('td', 'th'):
+ # A td element's end tag may be omitted if the td element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ # A th element's end tag may be omitted if the th element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('td', 'th')
+ else:
+ return type == "EndTag" or type is None
+ return False
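A sketch of the effect on a token stream: both </li> end tags below are optional, the first because another li follows, the second because the stream ends:

    from html5lib.filters import optionaltags

    tokens = [{"type": "StartTag", "name": u"li", "data": []},
              {"type": "Characters", "data": u"one"},
              {"type": "EndTag", "name": u"li", "data": []},
              {"type": "StartTag", "name": u"li", "data": []},
              {"type": "Characters", "data": u"two"},
              {"type": "EndTag", "name": u"li", "data": []}]
    out = list(optionaltags.Filter(iter(tokens)))
    # out keeps the two <li> start tags and the text, but no end tags.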
8 lib/html5lib/filters/sanitizer.py
@@ -0,0 +1,8 @@
+import _base
+from html5lib.sanitizer import HTMLSanitizerMixin
+
+class Filter(_base.Filter, HTMLSanitizerMixin):
+ def __iter__(self):
+ for token in _base.Filter.__iter__(self):
+ token = self.sanitize_token(token)
+ if token: yield token
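The heavy lifting lives in HTMLSanitizerMixin.sanitize_token(); the filter just applies it to each token and drops any token it nulls out. The usual entry point is the serializer, which (assuming this snapshot's htmlserializer exposes its customary sanitize option) splices this filter in:

    import html5lib
    from html5lib import treewalkers
    from html5lib.serializer.htmlserializer import HTMLSerializer

    frag = html5lib.parseFragment(u'<b>ok</b><script>alert(1)</script>')
    walker = treewalkers.getTreeWalker("simpletree")
    print HTMLSerializer(sanitize=True).render(walker(frag))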
41 lib/html5lib/filters/whitespace.py
@@ -0,0 +1,41 @@
+try:
+ frozenset
+except NameError:
+ # Import from the sets module for python 2.3
+ from sets import ImmutableSet as frozenset
+
+import re
+
+import _base
+from html5lib.constants import rcdataElements, spaceCharacters
+spaceCharacters = u"".join(spaceCharacters)
+
+SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
+
+class Filter(_base.Filter):
+
+ spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
+
+ def __iter__(self):
+ preserve = 0
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type == "StartTag" \
+ and (preserve or token["name"] in self.spacePreserveElements):
+ preserve += 1
+
+ elif type == "EndTag" and preserve:
+ preserve -= 1
+
+ elif not preserve and type == "SpaceCharacters" and token["data"]:
+ # Test on token["data"] above so as not to introduce spaces where there were none
+ token["data"] = u" "
+
+ elif not preserve and type == "Characters":
+ token["data"] = collapse_spaces(token["data"])
+
+ yield token
+
+def collapse_spaces(text):
+ return SPACES_REGEX.sub(' ', text)
+
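A sketch of the collapsing behaviour: runs of whitespace shrink to a single space, except inside space-preserving elements:

    from html5lib.filters import whitespace

    tokens = [{"type": "Characters", "data": u"a   b\n\tc"},
              {"type": "StartTag", "name": u"pre", "data": []},
              {"type": "Characters", "data": u"d   e"},
              {"type": "EndTag", "name": u"pre", "data": []}]
    out = list(whitespace.Filter(iter(tokens)))
    # out[0]["data"] == u"a b c"; the run inside <pre> is untouched.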
2,708 lib/html5lib/html5parser.py
2,708 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
177 lib/html5lib/ihatexml.py
@@ -0,0 +1,177 @@
+import re
+
+baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
+
+ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
+
+combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
+
+digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
+
+extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
+
+letter = " | ".join([baseChar, ideographic])
+
+#Without the ":" allowed by XML's NameChar production
+name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
+ extender])
+nameFirst = " | ".join([letter, "_"])
+
+reChar = re.compile(r"#x([\dA-F]{4})")
+reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
+
+def charStringToList(chars):
+ charRanges = [item.strip() for item in chars.split(" | ")]
+ rv = []
+ for item in charRanges:
+ foundMatch = False
+ for regexp in (reChar, reCharRange):
+ match = regexp.match(item)
+ if match is not None:
+ rv.append([hexToInt(item) for item in match.groups()])
+ if len(rv[-1]) == 1:
+ rv[-1] = rv[-1]*2
+ foundMatch = True
+ break
+ if not foundMatch:
+ assert len(item) == 1
+
+ rv.append([ord(item)] * 2)
+ rv = normaliseCharList(rv)
+ return rv
+
+def normaliseCharList(charList):
+ charList = sorted(charList)
+ for item in charList:
+ assert item[1] >= item[0]
+ rv = []
+ i = 0
+ while i < len(charList):
+ j = 1
+ rv.append(charList[i])
+ while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
+ rv[-1][1] = charList[i+j][1]
+ j += 1
+ i += j
+ return rv
+
+#We don't really support characters above the BMP :(
+max_unicode = int("FFFF", 16)
+
+def missingRanges(charList):
+ rv = []
+ if charList[0][0] != 0:
+ rv.append([0, charList[0][0] - 1])
+ for i, item in enumerate(charList[:-1]):
+ rv.append([item[1]+1, charList[i+1][0] - 1])
+ if charList[-1][1] != max_unicode:
+ rv.append([charList[-1][1] + 1, max_unicode])
+ return rv
+
+def listToRegexpStr(charList):
+ rv = []
+ for item in charList:
+ if item[0] == item[1]:
+ rv.append(escapeRegexp(unichr(item[0])))
+ else:
+ rv.append(escapeRegexp(unichr(item[0])) + "-" +
+ escapeRegexp(unichr(item[1])))
+ return "[%s]"%"".join(rv)
+
+def hexToInt(hex_str):
+ return int(hex_str, 16)
+
+def escapeRegexp(string):
+ specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
+ "[", "]", "|", "(", ")", "-")
+ for char in specialCharacters:
+ string = string.replace(char, "\\" + char)
+
+ return string
+
+#output from the above
+nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+
+nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+
+class InfosetFilter(object):
+ replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
+ def __init__(self, replaceChars = None,
+ dropXmlnsLocalName = False,
+ dropXmlnsAttrNs = False,
+ preventDoubleDashComments = False,
+ preventDashAtCommentEnd = False,
+ replaceFormFeedCharacters = True):
+
+ self.dropXmlnsLocalName = dropXmlnsLocalName
+ self.dropXmlnsAttrNs = dropXmlnsAttrNs
+
+ self.preventDoubleDashComments = preventDoubleDashComments
+ self.preventDashAtCommentEnd = preventDashAtCommentEnd
+
+ self.replaceFormFeedCharacters = replaceFormFeedCharacters
+
+ self.replaceCache = {}
+
+ def coerceAttribute(self, name, namespace=None):
+ if self.dropXmlnsLocalName and name.startswith("xmlns:"):
+ #Need a datalosswarning here
+ return None
+ elif (self.dropXmlnsAttrNs and
+ namespace == "http://www.w3.org/2000/xmlns/"):
+ return None
+ else:
+ return self.toXmlName(name)
+
+ def coerceElement(self, name, namespace=None):
+ return self.toXmlName(name)
+
+ def coerceComment(self, data):
+ if self.preventDoubleDashComments:
+ while "--" in data:
+ data = data.replace("--", "- -")
+ return data
+
+ def coerceCharacters(self, data):
+ if self.replaceFormFeedCharacters:
+ data = data.replace("\x0C", " ")
+ #Other non-xml characters
+ return data
+
+ def toXmlName(self, name):
+ nameFirst = name[0]
+ nameRest = name[1:]
+ m = nonXmlNameFirstBMPRegexp.match(nameFirst)
+ if m:
+ nameFirstOutput = self.getReplacementCharacter(nameFirst)
+ else:
+ nameFirstOutput = nameFirst
+
+ nameRestOutput = nameRest
+ replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
+ for char in replaceChars:
+ replacement = self.getReplacementCharacter(char)
+ nameRestOutput = nameRestOutput.replace(char, replacement)
+ return nameFirstOutput + nameRestOutput
+
+ def getReplacementCharacter(self, char):
+ if char in self.replaceCache:
+ replacement = self.replaceCache[char]
+ else:
+ replacement = self.escapeChar(char)
+ return replacement
+
+ def fromXmlName(self, name):
+ for item in set(self.replacementRegexp.findall(name)):
+ name = name.replace(item, self.unescapeChar(item))
+ return name
+
+ def escapeChar(self, char):
+ replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
+ self.replaceCache[char] = replacement
+ return replacement
+
+ def unescapeChar(self, charcode):
+ return unichr(int(charcode[1:], 16))
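Round-tripping a name through the filter, for illustration ('^' is not an XML name character, so it is escaped to its UXXXXX form and restored on the way back):

    from html5lib.ihatexml import InfosetFilter

    f = InfosetFilter()
    coerced = f.toXmlName(u"foo^bar")
    print coerced                  # fooU0005Ebar
    print f.fromXmlName(coerced)   # foo^bar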
778 lib/html5lib/inputstream.py
@@ -0,0 +1,778 @@
+import codecs
+import re
+import types
+import sys
+
+from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
+from constants import encodings, ReparseException
+import utils
+
+#Non-unicode versions of constants for use in the pre-parser
+spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
+asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
+asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
+spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
+
+invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
+ 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
+ 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
+ 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
+ 0x10FFFE, 0x10FFFF])
+
+ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+
+# Cache for charsUntil()
+charsUntilRegEx = {}
+
+class BufferedStream:
+ """Buffering for streams that do not have buffering of their own
+
+ The buffer is implemented as a list of chunks on the assumption that
+ joining many strings will be slow since it is O(n**2)
+ """
+
+ def __init__(self, stream):
+ self.stream = stream
+ self.buffer = []
+ self.position = [-1,0] #chunk number, offset
+
+ def tell(self):
+ pos = 0
+ for chunk in self.buffer[:self.position[0]]:
+ pos += len(chunk)
+ pos += self.position[1]
+ return pos
+
+ def seek(self, pos):
+ assert pos < self._bufferedBytes()
+ offset = pos
+ i = 0
+ while len(self.buffer[i]) < offset:
+ offset -= len(self.buffer[i])
+ i += 1
+ self.position = [i, offset]
+
+ def read(self, bytes):
+ if not self.buffer:
+ return self._readStream(bytes)
+ elif (self.position[0] == len(self.buffer) and
+ self.position[1] == len(self.buffer[-1])):
+ return self._readStream(bytes)
+ else:
+ return self._readFromBuffer(bytes)
+
+ def _bufferedBytes(self):
+ return sum([len(item) for item in self.buffer])
+
+ def _readStream(self, bytes):
+ data = self.stream.read(bytes)
+ self.buffer.append(data)
+ self.position[0] += 1
+ self.position[1] = len(data)
+ return data
+
+ def _readFromBuffer(self, bytes):
+ remainingBytes = bytes
+ rv = []
+ bufferIndex = self.position[0]
+ bufferOffset = self.position[1]
+ while bufferIndex < len(self.buffer) and remainingBytes != 0:
+ assert remainingBytes > 0
+ bufferedData = self.buffer[bufferIndex]
+
+ if remainingBytes <= len(bufferedData) - bufferOffset:
+ bytesToRead = remainingBytes
+ self.position = [bufferIndex, bufferOffset + bytesToRead]
+ else:
+ bytesToRead = len(bufferedData) - bufferOffset
+ self.position = [bufferIndex, len(bufferedData)]
+ bufferIndex += 1
+ rv.append(bufferedData[bufferOffset:
+ bufferOffset + bytesToRead])
+ remainingBytes -= bytesToRead
+
+ bufferOffset = 0
+
+ if remainingBytes:
+ rv.append(self._readStream(remainingBytes))
+
+ return "".join(rv)
+
+
+
+class HTMLInputStream:
+ """Provides a unicode stream of characters to the HTMLTokenizer.
+
+ This class takes care of character encoding and removing or replacing
+ incorrect byte-sequences and also provides column and line tracking.
+
+ """
+
+ _defaultChunkSize = 10240
+
+ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+ """Initialises the HTMLInputStream.
+
+ HTMLInputStream(source, [encoding]) -> Normalized stream from source
+ for use by html5lib.
+
+ source can be either a file-object, local filename or a string.
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+
+ parseMeta - Look for a <meta> element containing encoding information
+
+ """
+
+ #Craziness: distinguish wide (UCS-4) from narrow (UCS-2) Python builds
+ if len(u"\U0010FFFF") == 1:
+ self.reportCharacterErrors = self.characterErrorsUCS4
+ self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
+ else:
+ self.reportCharacterErrors = self.characterErrorsUCS2
+ self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+
+ # List of where new lines occur
+ self.newLines = [0]
+
+ self.charEncoding = (codecName(encoding), "certain")
+
+ # Raw Stream - for unicode objects this will encode to utf-8 and set
+ # self.charEncoding as appropriate
+ self.rawStream = self.openStream(source)
+
+ # Encoding Information
+ #Number of bytes to use when looking for a meta element with
+ #encoding information
+ self.numBytesMeta = 512
+ #Number of bytes to use when detecting encoding with chardet
+ self.numBytesChardet = 100
+ #Encoding to use if no other information can be found
+ self.defaultEncoding = "windows-1252"
+
+ #Detect encoding iff no explicit "transport level" encoding is supplied
+ if (self.charEncoding[0] is None):
+ self.charEncoding = self.detectEncoding(parseMeta, chardet)
+
+
+ self.reset()
+
+ def reset(self):
+ self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
+ 'replace')
+
+ self.chunk = u""
+ self.chunkSize = 0
+ self.chunkOffset = 0
+ self.errors = []
+
+ # number of (complete) lines in previous chunks
+ self.prevNumLines = 0
+ # number of columns in the last line of the previous chunk
+ self.prevNumCols = 0
+
+ #Deal with CR LF and surrogates split over chunk boundaries
+ self._bufferedCharacter = None
+
+ def openStream(self, source):
+ """Produces a file object from source.
+
+ source can be either a file object, local filename or a string.
+
+ """
+ # Already a file object
+ if hasattr(source, 'read'):
+ stream = source
+ else:
+ # Otherwise treat source as a string and convert to a file object
+ if isinstance(source, unicode):
+ source = source.encode('utf-8')
+ self.charEncoding = ("utf-8", "certain")
+ import cStringIO
+ stream = cStringIO.StringIO(str(source))
+
+ if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
+ stream is sys.stdin):
+ stream = BufferedStream(stream)
+
+ return stream
+
+ def detectEncoding(self, parseMeta=True, chardet=True):
+ #First look for a BOM
+ #This will also read past the BOM if present
+ encoding = self.detectBOM()
+ confidence = "certain"
+ #If there is no BOM need to look for meta elements with encoding
+ #information
+ if encoding is None and parseMeta:
+ encoding = self.detectEncodingMeta()
+ confidence = "tentative"
+ #Guess with chardet, if available
+ if encoding is None and chardet:
+ confidence = "tentative"
+ try:
+ from chardet.universaldetector import UniversalDetector
+ buffers = []
+ detector = UniversalDetector()
+ while not detector.done:
+ buffer = self.rawStream.read(self.numBytesChardet)
+ if not buffer:
+ break
+ buffers.append(buffer)
+ detector.feed(buffer)
+ detector.close()
+ encoding = detector.result['encoding']
+ self.rawStream.seek(0)
+ except ImportError:
+ pass
+ # If all else fails use the default encoding
+ if encoding is None:
+ confidence="tentative"
+ encoding = self.defaultEncoding
+
+ #Substitute for equivalent encodings:
+ encodingSub = {"iso-8859-1":"windows-1252"}
+
+ if encoding.lower() in encodingSub:
+ encoding = encodingSub[encoding.lower()]
+
+ return encoding, confidence
+
+ def changeEncoding(self, newEncoding):
+ newEncoding = codecName(newEncoding)
+ if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
+ newEncoding = "utf-8"
+ if newEncoding is None:
+ return
+ elif newEncoding == self.charEncoding[0]:
+ self.charEncoding = (self.charEncoding[0], "certain")
+ else:
+ oldEncoding = self.charEncoding[0]
+ self.rawStream.seek(0)
+ self.charEncoding = (newEncoding, "certain")
+ self.reset()
+ raise ReparseException, "Encoding changed from %s to %s" % (oldEncoding, newEncoding)
+
+ def detectBOM(self):
+ """Attempts to detect at BOM at the start of the stream. If
+ an encoding can be determined from the BOM return the name of the
+ encoding otherwise return None"""
+ bomDict = {
+ codecs.BOM_UTF8: 'utf-8',
+ codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
+ codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
+ }
+
+ # Go to beginning of file and read in 4 bytes
+ string = self.rawStream.read(4)
+
+ # Try detecting the BOM using bytes from the string
+ encoding = bomDict.get(string[:3]) # UTF-8
+ seek = 3
+ if not encoding:
+ # Need to detect UTF-32 before UTF-16
+ encoding = bomDict.get(string) # UTF-32
+ seek = 4
+ if not encoding:
+ encoding = bomDict.get(string[:2]) # UTF-16
+ seek = 2
+
+ # Set the read position past the BOM if one was found, otherwise
+ # set it to the start of the stream
+ self.rawStream.seek(encoding and seek or 0)
+
+ return encoding
+
+ def detectEncodingMeta(self):
+ """Report the encoding declared by the meta element
+ """
+ buffer = self.rawStream.read(self.numBytesMeta)
+ parser = EncodingParser(buffer)
+ self.rawStream.seek(0)
+ encoding = parser.getEncoding()
+
+ if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
+ encoding = "utf-8"
+
+ return encoding
+
+ def _position(self, offset):
+ chunk = self.chunk
+ nLines = chunk.count(u'\n', 0, offset)
+ positionLine = self.prevNumLines + nLines
+ lastLinePos = chunk.rfind(u'\n', 0, offset)
+ if lastLinePos == -1:
+ positionColumn = self.prevNumCols + offset
+ else:
+ positionColumn = offset - (lastLinePos + 1)
+ return (positionLine, positionColumn)
+
+ def position(self):
+ """Returns (line, col) of the current position in the stream."""
+ line, col = self._position(self.chunkOffset)
+ return (line+1, col)
+
+ def char(self):
+ """ Read one character from the stream or queue if available. Return
+ EOF when EOF is reached.
+ """
+ # Read a new chunk from the input stream if necessary
+ if self.chunkOffset >= self.chunkSize:
+ if not self.readChunk():
+ return EOF
+
+ chunkOffset = self.chunkOffset
+ char = self.chunk[chunkOffset]
+ self.chunkOffset = chunkOffset + 1
+
+ return char
+
+ def readChunk(self, chunkSize=None):
+ if chunkSize is None:
+ chunkSize = self._defaultChunkSize
+
+ self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
+
+ self.chunk = u""
+ self.chunkSize = 0
+ self.chunkOffset = 0
+
+ data = self.dataStream.read(chunkSize)
+
+ #Deal with CR LF and surrogates broken across chunks
+ if self._bufferedCharacter:
+ data = self._bufferedCharacter + data
+ self._bufferedCharacter = None
+ elif not data:
+ # We have no more data, bye-bye stream
+ return False
+
+ if len(data) > 1:
+ lastv = ord(data[-1])
+ if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
+ self._bufferedCharacter = data[-1]
+ data = data[:-1]
+
+ self.reportCharacterErrors(data)
+
+ # Replace invalid characters
+ # Note U+0000 is dealt with in the tokenizer
+ data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
+
+ data = data.replace(u"\r\n", u"\n")
+ data = data.replace(u"\r", u"\n")
+
+ self.chunk = data
+ self.chunkSize = len(data)
+
+ return True
+
+ def characterErrorsUCS4(self, data):
+ for i in xrange(len(invalid_unicode_re.findall(data))):
+ self.errors.append("invalid-codepoint")
+
+ def characterErrorsUCS2(self, data):
+ #Someone picked the wrong compile option
+ #You lose
+ skip = False
+ for match in invalid_unicode_re.finditer(data):
+ if skip:
+ skip = False
+ continue
+ codepoint = ord(match.group())
+ pos = match.start()
+ #Pretty sure there should be endianness issues here
+ if utils.isSurrogatePair(data[pos:pos+2]):
+ #We have a surrogate pair!
+ char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
+ if char_val in non_bmp_invalid_codepoints:
+ self.errors.append("invalid-codepoint")
+ skip = True
+ elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
+ pos == len(data) - 1):
+ self.errors.append("invalid-codepoint")
+ else:
+ skip = False
+ self.errors.append("invalid-codepoint")
+
+ def charsUntil(self, characters, opposite = False):
+ """ Returns a string of characters from the stream up to but not
+ including any character in 'characters' or EOF. 'characters' must be
+ a container that supports the 'in' method and iteration over its
+ characters.
+ """
+
+ # Use a cache of regexps to find the required characters
+ try:
+ chars = charsUntilRegEx[(characters, opposite)]
+ except KeyError:
+ if __debug__:
+ for c in characters:
+ assert(ord(c) < 128)
+ regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
+ if not opposite:
+ regex = u"^%s" % regex
+ chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
+
+ rv = []
+
+ while True:
+ # Find the longest matching prefix
+ m = chars.match(self.chunk, self.chunkOffset)
+ if m is None:
+ # If nothing matched, and it wasn't because we ran out of chunk,
+ # then stop
+ if self.chunkOffset != self.chunkSize:
+ break
+ else:
+ end = m.end()
+ # If not the whole chunk matched, return everything
+ # up to the part that didn't match
+ if end != self.chunkSize:
+ rv.append(self.chunk[self.chunkOffset:end])
+ self.chunkOffset = end
+ break
+ # If the whole remainder of the chunk matched,
+ # use it all and read the next chunk
+ rv.append(self.chunk[self.chunkOffset:])
+ if not self.readChunk():
+ # Reached EOF
+ break
+
+ r = u"".join(rv)
+ return r
+
+ def unget(self, char):
+ # Only one character is allowed to be ungotten at once - it must
+ # be consumed again before any further call to unget
+ if char is not None:
+ if self.chunkOffset == 0:
+ # unget is called quite rarely, so it's a good idea to do
+ # more work here if it saves a bit of work in the frequently
+ # called char and charsUntil.
+ # So, just prepend the ungotten character onto the current
+ # chunk:
+ self.chunk = char + self.chunk
+ self.chunkSize += 1
+ else:
+ self.chunkOffset -= 1
+ assert self.chunk[self.chunkOffset] == char
+
+class EncodingBytes(str):
+ """String-like object with an associated position and various extra methods
+ If the position is ever greater than the string length then an exception is
+ raised"""
+ def __new__(cls, value):
+ return str.__new__(cls, value.lower())
+
+ def __init__(self, value):
+ self._position=-1
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ p = self._position = self._position + 1
+ if p >= len(self):
+ raise StopIteration
+ elif p < 0:
+ raise TypeError
+ return self[p]
+
+ def previous(self):
+ p = self._position
+ if p >= len(self):
+ raise StopIteration
+ elif p < 0:
+ raise TypeError
+ self._position = p = p - 1
+ return self[p]
+
+ def setPosition(self, position):
+ if self._position >= len(self):
+ raise StopIteration
+ self._position = position
+
+ def getPosition(self):
+ if self._position >= len(self):
+ raise StopIteration
+ if self._position >= 0:
+ return self._position
+ else:
+ return None
+
+ position = property(getPosition, setPosition)
+
+ def getCurrentByte(self):
+ return self[self.position]
+
+ currentByte = property(getCurrentByte)
+
+ def skip(self, chars=spaceCharactersBytes):
+ """Skip past a list of characters"""
+ p = self.position # use property for the error-checking
+ while p < len(self):
+ c = self[p]
+ if c not in chars:
+ self._position = p
+ return c
+ p += 1
+ self._position = p
+ return None
+
+ def skipUntil(self, chars):
+ p = self.position
+ while p < len(self):
+ c = self[p]
+ if c in chars:
+ self._position = p
+ return c
+ p += 1
+ self._position = p
+ return None
+
+ def matchBytes(self, bytes):
+ """Look for a sequence of bytes at the start of a string. If the bytes
+ are found return True and advance the position to the byte after the
+ match. Otherwise return False and leave the position alone"""
+ p = self.position
+ data = self[p:p+len(bytes)]
+ rv = data.startswith(bytes)
+ if rv:
+ self.position += len(bytes)
+ return rv
+
+ def jumpTo(self, bytes):
+ """Look for the next sequence of bytes matching a given sequence. If
+ a match is found advance the position to the last byte of the match"""
+ newPosition = self[self.position:].find(bytes)
+ if newPosition > -1:
+ # XXX: This is ugly, but I can't see a nicer way to fix this.
+ if self._position == -1:
+ self._position = 0
+ self._position += (newPosition + len(bytes)-1)
+ return True
+ else:
+ raise StopIteration
+
+class EncodingParser(object):
+ """Mini parser for detecting character encoding from meta elements"""
+
+ def __init__(self, data):
+ """string - the data to work on for encoding detection"""
+ self.data = EncodingBytes(data)
+ self.encoding = None
+
+ def getEncoding(self):
+ methodDispatch = (
+ ("<!--",self.handleComment),
+ ("<meta",self.handleMeta),
+ ("</",self.handlePossibleEndTag),
+ ("<!",self.handleOther),
+ ("<?",self.handleOther),
+ ("<",self.handlePossibleStartTag))
+ for byte in self.data:
+ keepParsing = True
+ for key, method in methodDispatch:
+ if self.data.matchBytes(key):
+ try:
+ keepParsing = method()
+ break
+ except StopIteration:
+ keepParsing=False
+ break
+ if not keepParsing:
+ break
+
+ return self.encoding
+
+ def handleComment(self):
+ """Skip over comments"""
+ return self.data.jumpTo("-->")
+
+ def handleMeta(self):
+ if self.data.currentByte not in spaceCharactersBytes:
+ #we have <meta not followed by a space; just keep going
+ return True
+ #We have a valid meta element we want to search for attributes
+ while True:
+ #Try to find the next attribute after the current position
+ attr = self.getAttribute()
+ if attr is None:
+ return True
+ else:
+ if attr[0] == "charset":
+ tentativeEncoding = attr[1]
+ codec = codecName(tentativeEncoding)
+ if codec is not None:
+ self.encoding = codec
+ return False
+ elif attr[0] == "content":
+ contentParser = ContentAttrParser(EncodingBytes(attr[1]))
+ tentativeEncoding = contentParser.parse()
+ codec = codecName(tentativeEncoding)
+ if codec is not None:
+ self.encoding = codec
+ return False
+
+ def handlePossibleStartTag(self):
+ return self.handlePossibleTag(False)
+
+ def handlePossibleEndTag(self):
+ self.data.next()
+ return self.handlePossibleTag(True)
+
+ def handlePossibleTag(self, endTag):
+ data = self.data
+ if data.currentByte not in asciiLettersBytes:
+ #If the next byte is not an ascii letter either ignore this
+ #fragment (possible start tag case) or treat it according to
+ #handleOther
+ if endTag:
+ data.previous()
+ self.handleOther()
+ return True
+
+ c = data.skipUntil(spacesAngleBrackets)
+ if c == "<":
+ #return to the first step in the overall "two step" algorithm
+ #reprocessing the < byte
+ data.previous()
+ else:
+ #Read all attributes
+ attr = self.getAttribute()
+ while attr is not None:
+ attr = self.getAttribute()
+ return True
+
+ def handleOther(self):
+ return self.data.jumpTo(">")
+
+ def getAttribute(self):
+ """Return a name,value pair for the next attribute in the stream,
+ if one is found, or None"""
+ data = self.data
+ # Step 1 (skip chars)
+ c = data.skip(spaceCharactersBytes | frozenset("/"))
+ # Step 2
+ if c in (">", None):
+ return None
+ # Step 3
+ attrName = []
+ attrValue = []
+ #Step 4 attribute name
+ while True:
+ if c == "=" and attrName:
+ break
+ elif c in spaceCharactersBytes:
+ #Step 6!
+ c = data.skip()
+ c = data.next()
+ break
+ elif c in ("/", ">"):
+ return "".join(attrName), ""
+ elif c in asciiUppercaseBytes:
+ attrName.append(c.lower())
+            elif c is None:
+ return None
+ else:
+ attrName.append(c)
+ #Step 5
+ c = data.next()
+ #Step 7
+ if c != "=":
+ data.previous()
+ return "".join(attrName), ""
+ #Step 8
+ data.next()
+ #Step 9
+ c = data.skip()
+ #Step 10
+ if c in ("'", '"'):
+ #10.1
+ quoteChar = c
+ while True:
+ #10.2
+ c = data.next()
+ #10.3
+ if c == quoteChar:
+ data.next()
+ return "".join(attrName), "".join(attrValue)
+ #10.4
+ elif c in asciiUppercaseBytes:
+ attrValue.append(c.lower())
+ #10.5
+ else:
+ attrValue.append(c)
+ elif c == ">":
+ return "".join(attrName), ""
+ elif c in asciiUppercaseBytes:
+ attrValue.append(c.lower())
+ elif c is None:
+ return None
+ else:
+ attrValue.append(c)
+ # Step 11
+ while True:
+ c = data.next()
+ if c in spacesAngleBrackets:
+ return "".join(attrName), "".join(attrValue)
+ elif c in asciiUppercaseBytes:
+ attrValue.append(c.lower())
+ elif c is None:
+ return None
+ else:
+ attrValue.append(c)
+
+
+class ContentAttrParser(object):
+ def __init__(self, data):
+ self.data = data
+ def parse(self):
+ try:
+            #Skip to the "charset" substring; jumpTo raises StopIteration
+            #(caught below) if it is not found
+ self.data.jumpTo("charset")
+ self.data.position += 1
+ self.data.skip()
+            if self.data.currentByte != "=":
+                #If there is no = sign, keep looking for attrs
+ return None
+ self.data.position += 1
+ self.data.skip()
+ #Look for an encoding between matching quote marks
+ if self.data.currentByte in ('"', "'"):
+ quoteMark = self.data.currentByte
+ self.data.position += 1
+ oldPosition = self.data.position
+ if self.data.jumpTo(quoteMark):
+ return self.data[oldPosition:self.data.position]
+ else:
+ return None
+ else:
+ #Unquoted value
+ oldPosition = self.data.position
+ try:
+ self.data.skipUntil(spaceCharactersBytes)
+ return self.data[oldPosition:self.data.position]
+ except StopIteration:
+ #Return the whole remaining value
+ return self.data[oldPosition:]
+ except StopIteration:
+ return None
+
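ContentAttrParser implements the "get an encoding from a content attribute" step of the prescan. Based on the code above, its behavior should look like:

    ContentAttrParser(EncodingBytes("text/html; charset=utf-8")).parse()    # "utf-8"
    ContentAttrParser(EncodingBytes('text/html; charset="utf-8"')).parse()  # "utf-8"
    ContentAttrParser(EncodingBytes("text/html")).parse()                   # None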
+
+def codecName(encoding):
+ """Return the python codec name corresponding to an encoding or None if the
+ string doesn't correspond to a valid encoding."""
+ if (encoding is not None and type(encoding) in types.StringTypes):
+ canonicalName = ascii_punctuation_re.sub("", encoding).lower()
+ return encodings.get(canonicalName, None)
+ else:
+ return None
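codecName is the common normalization funnel for every candidate encoding found above. Assuming ascii_punctuation_re strips ASCII punctuation and encodings maps canonicalized names to Python codec names (both are defined earlier in this file), a hypothetical session:

    codecName("UTF-8")        # stripped and lowercased to "utf8", then looked up
    codecName("bogus-enc")    # None - not in the encodings map
    codecName(None)           # None - non-string input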
258 lib/html5lib/sanitizer.py
@@ -0,0 +1,258 @@
+import re
+from xml.sax.saxutils import escape, unescape
+
+from tokenizer import HTMLTokenizer
+from constants import tokenTypes
+
+class HTMLSanitizerMixin(object):
+ """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
+
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
+ 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+ 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
+ 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+ 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+ 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+ 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+ 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
+
+ mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
+ 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
+ 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
+ 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+ 'munderover', 'none']
+
+ svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+ 'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
+ 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
+ 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
+
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
+ 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
+ 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
+ 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
+ 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
+ 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
+ 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
+ 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
+ 'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
+ 'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
+ 'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
+ 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
+ 'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
+ 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
+ 'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
+ 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
+ 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
+ 'width', 'wrap', 'xml:lang']
+
+ mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+ 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+ 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
+ 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
+ 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
+ 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
+ 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+ 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
+ 'xlink:type', 'xmlns', 'xmlns:xlink']
+
+ svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+ 'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
+ 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
+ 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
+ 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
+ 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
+ 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
+ 'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
+ 'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
+ 'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
+ 'opacity', 'orient', 'origin', 'overline-position',
+ 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
+ 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
+ 'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
+ 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
+ 'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
+ 'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
+ 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
+ 'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
+ 'transform', 'type', 'u1', 'u2', 'underline-position',
+ 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
+ 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
+ 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
+ 'y1', 'y2', 'zoomAndPan']
+
+ attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
+ 'xlink:href', 'xml:base']
+
+ svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
+ 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
+ 'mask', 'stroke']
+
+ svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
+ 'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
+ 'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
+ 'set', 'use']
+
+ acceptable_css_properties = ['azimuth', 'background-color',
+ 'border-bottom-color', 'border-collapse', 'border-color',
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+ 'white-space', 'width']
+
+ acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+ 'transparent', 'underline', 'white', 'yellow']
+
+ acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-opacity']
+
+ acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
+ 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
+ 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
+ 'ssh', 'sftp', 'rtsp', 'afs' ]
+
+ # subclasses may define their own versions of these constants
+ allowed_elements = acceptable_elements + mathml_elements + svg_elements
+ allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
+ allowed_css_properties = acceptable_css_properties
+ allowed_css_keywords = acceptable_css_keywords
+ allowed_svg_properties = acceptable_svg_properties
+ allowed_protocols = acceptable_protocols
+
+    # Sanitize the html, escaping all elements not in ALLOWED_ELEMENTS, and
+    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
+    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES
+    # and ALLOWED_CSS_KEYWORDS, is allowed through. Attributes in
+    # ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
+    # ALLOWED_PROTOCOLS are allowed.
+ #
+ # sanitize_html('<script> do_nasty_stuff() </script>')
+ # => &lt;script> do_nasty_stuff() &lt;/script>
+ # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+ # => <a>Click here for $100</a>
+ def sanitize_token(self, token):
+
+ # accommodate filters which use token_type differently
+ token_type = token["type"]
+ if token_type in tokenTypes.keys():
+ token_type = tokenTypes[token_type]
+
+ if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
+ tokenTypes["EmptyTag"]):
+ if token["name"] in self.allowed_elements:
+ if token.has_key("data"):
+ attrs = dict([(name,val) for name,val in
+ token["data"][::-1]
+ if name in self.allowed_attributes])
+ for attr in self.attr_val_is_uri:
+ if not attrs.has_key(attr):
+ continue
+ val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+ unescape(attrs[attr])).lower()
+ #remove replacement characters from unescaped characters
+ val_unescaped = val_unescaped.replace(u"\ufffd", "")
+ if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
+ (val_unescaped.split(':')[0] not in
+ self.allowed_protocols)):
+ del attrs[attr]
+ for attr in self.svg_attr_val_allows_ref:
+ if attr in attrs:
+ attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
+ if (token["name"] in self.svg_allow_local_href and
+ 'xlink:href' in attrs and re.search('^\s*[^#\s].*',
+ attrs['xlink:href'])):
+ del attrs['xlink:href']
+ if attrs.has_key('style'):
+ attrs['style'] = self.sanitize_css(attrs['style'])
+ token["data"] = [[name,val] for name,val in attrs.items()]
+ return token
+ else:
+ if token_type == tokenTypes["EndTag"]:
+ token["data"] = "</%s>" % token["name"]
+ elif token["data"]:
+ attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+ token["data"] = "<%s%s>" % (token["name"],attrs)
+ else:
+ token["data"] = "<%s>" % token["name"]
+ if token.get("selfClosing"):
+ token["data"]=token["data"][:-1] + "/>"
+
+ if token["type"] in tokenTypes.keys():
+ token["type"] = "Characters"
+ else:
+ token["type"] = tokenTypes["Characters"]
+
+ del token["name"]
+ return token
+ elif token_type == tokenTypes["Comment"]:
+ pass
+ else:
+ return token
+
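Note the effect of the else branch above: a disallowed element is not dropped but re-serialized and re-emitted as escaped character data, while disallowed attributes on allowed elements are silently removed. A sketch with hypothetical token dicts in the tokenizer's format:

    mixin = HTMLSanitizerMixin()

    mixin.sanitize_token({"type": tokenTypes["StartTag"],
                          "name": "script", "data": []})
    # -> {"type": tokenTypes["Characters"], "data": "<script>"}

    mixin.sanitize_token({"type": tokenTypes["StartTag"], "name": "a",
                          "data": [("href", "javascript:evil()")]})
    # -> href passes the attribute whitelist but fails the protocol check,
    #    so it is dropped: {"type": ..., "name": "a", "data": []}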
+ def sanitize_css(self, style):
+ # disallow urls
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
+ # gauntlet
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
+ if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
+
+ clean = []
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+ if not value: continue
+ if prop.lower() in self.allowed_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.split('-')[0].lower() in ['background','border','margin',
+ 'padding']:
+ for keyword in value.split():
+ if not keyword in self.acceptable_css_keywords and \
+ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
+ break
+ else:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.lower() in self.allowed_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
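sanitize_css can be exercised on its own, since the mixin carries no instance state; a sketch:

    mixin = HTMLSanitizerMixin()
    mixin.sanitize_css('color: red; background: url(javascript:evil)')
    # -> 'color: red;'  (the url(...) is stripped; the emptied property is dropped)
    mixin.sanitize_css('position: fixed')
    # -> ''  (position is not in allowed_css_properties)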
+class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+ lowercaseElementName=False, lowercaseAttrName=False):
+ #Change case matching defaults as we only output lowercase html anyway
+ #This solution doesn't seem ideal...
+ HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+ lowercaseElementName, lowercaseAttrName)
+
+ def __iter__(self):
+ for token in HTMLTokenizer.__iter__(self):
+ token = self.sanitize_token(token)
+ if token:
+ yield token
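The intended entry point is to pass this class as the tokenizer when constructing an html5lib parser; a sketch, assuming this tree is importable as the html5lib package:

    import html5lib
    from html5lib import sanitizer

    parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    parser.parseFragment('<script>do_nasty_stuff()</script><p onclick="x()">hi</p>')
    # the script element comes back escaped as text; onclick is not
    # whitelisted and is dropped from the p element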
17 lib/html5lib/serializer/__init__.py
@@ -0,0 +1,17 @@
+
+from html5lib import treewalkers
+
+from htmlserializer import HTMLSerializer
+from xhtmlserializer import XHTMLSerializer
+
+def serialize(input, tree="simpletree", format="html", encoding=None,
+ **serializer_opts):
+ # XXX: Should we cache this?
+ walker = treewalkers.getTreeWalker(tree)
+ if format == "html":
+ s = HTMLSerializer(**serializer_opts)
+ elif format == "xhtml":
+ s = XHTMLSerializer(**serializer_opts)
+ else:
+        raise ValueError, "format must be either html or xhtml"
+ return s.render(walker(input), encoding)
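A usage sketch for this convenience wrapper, assuming the default simpletree treebuilder so the parsed document can be handed straight in:

    import html5lib
    from html5lib import serializer

    doc = html5lib.parse("<p class=foo>Hello")
    print serializer.serialize(doc, tree="simpletree", format="html")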
312 lib/html5lib/serializer/htmlserializer.py
@@ -0,0 +1,312 @@
+try:
+ frozenset
+except NameError:
+ # Import from the sets module for python 2.3
+ from sets import ImmutableSet as frozenset
+
+import gettext
+_ = gettext.gettext
+
+from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
+from html5lib.constants import rcdataElements, entities, xmlEntities
+from html5lib import utils
+from xml.sax.saxutils import escape
+
+spaceCharacters = u"".join(spaceCharacters)
+
+try:
+ from codecs import register_error, xmlcharrefreplace_errors
+except ImportError:
+ unicode_encode_errors = "strict"
+else:
+ unicode_encode_errors = "htmlentityreplace"
+
+ from html5lib.constants import entities
+
+ encode_entity_map = {}
+ is_ucs4 = len(u"\U0010FFFF") == 1
+ for k, v in entities.items():
+ #skip multi-character entities
+ if ((is_ucs4 and len(v) > 1) or
+ (not is_ucs4 and len(v) > 2)):
+ continue
+ if v != "&":
+ if len(v) == 2:
+ v = utils.surrogatePairToCodepoint(v)
+ else:
+ try:
+ v = ord(v)
+ except:
+ print v
+ raise
+ if not v in encode_entity_map or k.islower():
+ # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
+ encode_entity_map[v] = k
+
+ def htmlentityreplace_errors(exc):
+ if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+ res = []
+ codepoints = []
+ skip = False
+ for i, c in enumerate(exc.object[exc.start:exc.end]):
+ if skip:
+ skip = False
+ continue
+ index = i + exc.start
+ if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
+ codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
+ skip = True
+ else:
+ codepoint = ord(c)
+ codepoints.append(codepoint)
+ for cp in codepoints:
+ e = encode_entity_map.get(cp)
+ if e:
+ res.append("&")
+ res.append(e)
+ if not e.endswith(";"):
+ res.append(";")
+ else:
+ res.append("&#x%s;"%(hex(cp)[2:]))
+ return (u"".join(res), exc.end)
+ else:
+ return xmlcharrefreplace_errors(exc)
+
+ register_error(unicode_encode_errors, htmlentityreplace_errors)
+
+ del register_error
+
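The block above registers an "htmlentityreplace" codec error handler that prefers named entities and falls back to numeric character references. Its intended effect, assuming the entities table contains the usual named references:

    u"caf\u00e9 \u2603".encode("ascii", unicode_encode_errors)
    # -> 'caf&eacute; &#x2603;'  (named entity where one exists,
    #    hex character reference otherwise)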
+
+class HTMLSerializer(object):
+
+ # attribute quoting options
+ quote_attr_values = False
+ quote_char = u'"'
+ use_best_quote_char = True
+
+ # tag syntax options
+ omit_optional_tags = True
+ minimize_boolean_attributes = True
+ use_trailing_solidus = False
+ space_before_trailing_solidus = True
+
+ # escaping options
+ escape_lt_in_attrs = False
+ escape_rcdata = False
+ resolve_entities = True
+
+ # miscellaneous options
+ inject_meta_charset = True
+ strip_whitespace = False
+ sanitize = False
+
+ options = ("quote_attr_values", "quote_char", "use_best_quote_char",
+ "minimize_boolean_attributes", "use_trailing_solidus",
+ "space_before_trailing_solidus", "omit_optional_tags",
+ "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
+ "escape_rcdata", "resolve_entities", "sanitize")
+
+ def __init__(self, **kwargs):
+ """Initialize HTMLSerializer.
+
+ Keyword options (default given first unless specified) include:
+
+ inject_meta_charset=True|False
+      Whether to insert a meta element declaring the character set of the
+      document.
+ quote_attr_values=True|False
+ Whether to quote attribute values that don't require quoting
+ per HTML5 parsing rules.
+ quote_char=u'"'|u"'"
+ Use given quote character for attribute quoting. Default is to
+ use double quote unless attribute value contains a double quote,
+ in which case single quotes are used instead.
+ escape_lt_in_attrs=False|True
+ Whether to escape < in attribute values.
+ escape_rcdata=False|True
+ Whether to escape characters that need to be escaped within normal