Permalink
Browse files

improve WebMention support by importing a library that does it; improve library import handling via pip and the lib dir
  • Loading branch information...
1 parent 8ef6879 commit 058897cbc9475e6d9db63ebf18d063f85fea6294 @npdoty committed Sep 30, 2014
Showing with 28,732 additions and 2,019 deletions.
  1. +0 −2,000 BeautifulSoup.py
  2. +406 −0 lib/bs4/__init__.py
  3. +321 −0 lib/bs4/builder/__init__.py
  4. +285 −0 lib/bs4/builder/_html5lib.py
  5. +258 −0 lib/bs4/builder/_htmlparser.py
  6. +233 −0 lib/bs4/builder/_lxml.py
  7. +829 −0 lib/bs4/dammit.py
  8. +204 −0 lib/bs4/diagnose.py
  9. +1,611 −0 lib/bs4/element.py
  10. +592 −0 lib/bs4/testing.py
  11. +1 −0 lib/bs4/tests/__init__.py
  12. +141 −0 lib/bs4/tests/test_builder_registry.py
  13. +36 −0 lib/bs4/tests/test_docs.py
  14. +85 −0 lib/bs4/tests/test_html5lib.py
  15. +19 −0 lib/bs4/tests/test_htmlparser.py
  16. +91 −0 lib/bs4/tests/test_lxml.py
  17. +434 −0 lib/bs4/tests/test_soup.py
  18. +1,829 −0 lib/bs4/tests/test_tree.py
  19. +77 −0 lib/requests/__init__.py
  20. +426 −0 lib/requests/adapters.py
  21. +124 −0 lib/requests/api.py
  22. +197 −0 lib/requests/auth.py
  23. +5,026 −0 lib/requests/cacert.pem
  24. +25 −0 lib/requests/certs.py
  25. +115 −0 lib/requests/compat.py
  26. +454 −0 lib/requests/cookies.py
  27. +91 −0 lib/requests/exceptions.py
  28. +45 −0 lib/requests/hooks.py
  29. +816 −0 lib/requests/models.py
  30. +3 −0 lib/requests/packages/__init__.py
  31. +32 −0 lib/requests/packages/chardet/__init__.py
  32. +925 −0 lib/requests/packages/chardet/big5freq.py
  33. +42 −0 lib/requests/packages/chardet/big5prober.py
  34. +46 −0 lib/requests/packages/chardet/chardetect.py
  35. +231 −0 lib/requests/packages/chardet/chardistribution.py
  36. +106 −0 lib/requests/packages/chardet/charsetgroupprober.py
  37. +62 −0 lib/requests/packages/chardet/charsetprober.py
  38. +61 −0 lib/requests/packages/chardet/codingstatemachine.py
  39. +34 −0 lib/requests/packages/chardet/compat.py
  40. +39 −0 lib/requests/packages/chardet/constants.py
  41. +44 −0 lib/requests/packages/chardet/cp949prober.py
  42. +86 −0 lib/requests/packages/chardet/escprober.py
  43. +242 −0 lib/requests/packages/chardet/escsm.py
  44. +90 −0 lib/requests/packages/chardet/eucjpprober.py
  45. +596 −0 lib/requests/packages/chardet/euckrfreq.py
  46. +42 −0 lib/requests/packages/chardet/euckrprober.py
  47. +428 −0 lib/requests/packages/chardet/euctwfreq.py
  48. +41 −0 lib/requests/packages/chardet/euctwprober.py
  49. +472 −0 lib/requests/packages/chardet/gb2312freq.py
  50. +41 −0 lib/requests/packages/chardet/gb2312prober.py
  51. +283 −0 lib/requests/packages/chardet/hebrewprober.py
  52. +569 −0 lib/requests/packages/chardet/jisfreq.py
  53. +219 −0 lib/requests/packages/chardet/jpcntx.py
  54. +229 −0 lib/requests/packages/chardet/langbulgarianmodel.py
  55. +329 −0 lib/requests/packages/chardet/langcyrillicmodel.py
  56. +225 −0 lib/requests/packages/chardet/langgreekmodel.py
  57. +201 −0 lib/requests/packages/chardet/langhebrewmodel.py
  58. +225 −0 lib/requests/packages/chardet/langhungarianmodel.py
  59. +200 −0 lib/requests/packages/chardet/langthaimodel.py
  60. +139 −0 lib/requests/packages/chardet/latin1prober.py
  61. +86 −0 lib/requests/packages/chardet/mbcharsetprober.py
  62. +54 −0 lib/requests/packages/chardet/mbcsgroupprober.py
  63. +575 −0 lib/requests/packages/chardet/mbcssm.py
  64. +120 −0 lib/requests/packages/chardet/sbcharsetprober.py
  65. +69 −0 lib/requests/packages/chardet/sbcsgroupprober.py
  66. +91 −0 lib/requests/packages/chardet/sjisprober.py
  67. +170 −0 lib/requests/packages/chardet/universaldetector.py
  68. +76 −0 lib/requests/packages/chardet/utf8prober.py
  69. +66 −0 lib/requests/packages/urllib3/__init__.py
  70. +199 −0 lib/requests/packages/urllib3/_collections.py
  71. +246 −0 lib/requests/packages/urllib3/connection.py
  72. +757 −0 lib/requests/packages/urllib3/connectionpool.py
  73. 0 lib/requests/packages/urllib3/contrib/__init__.py
  74. +114 −0 lib/requests/packages/urllib3/contrib/ntlmpool.py
  75. +284 −0 lib/requests/packages/urllib3/contrib/pyopenssl.py
  76. +156 −0 lib/requests/packages/urllib3/exceptions.py
  77. +177 −0 lib/requests/packages/urllib3/fields.py
  78. +93 −0 lib/requests/packages/urllib3/filepost.py
  79. +4 −0 lib/requests/packages/urllib3/packages/__init__.py
  80. +259 −0 lib/requests/packages/urllib3/packages/ordered_dict.py
  81. +385 −0 lib/requests/packages/urllib3/packages/six.py
  82. +13 −0 lib/requests/packages/urllib3/packages/ssl_match_hostname/__init__.py
  83. +105 −0 lib/requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py
  84. +265 −0 lib/requests/packages/urllib3/poolmanager.py
  85. +135 −0 lib/requests/packages/urllib3/request.py
  86. +333 −0 lib/requests/packages/urllib3/response.py
  87. +24 −0 lib/requests/packages/urllib3/util/__init__.py
  88. +97 −0 lib/requests/packages/urllib3/util/connection.py
  89. +71 −0 lib/requests/packages/urllib3/util/request.py
  90. +22 −0 lib/requests/packages/urllib3/util/response.py
  91. +279 −0 lib/requests/packages/urllib3/util/retry.py
  92. +132 −0 lib/requests/packages/urllib3/util/ssl_.py
  93. +240 −0 lib/requests/packages/urllib3/util/timeout.py
  94. +171 −0 lib/requests/packages/urllib3/util/url.py
  95. +659 −0 lib/requests/sessions.py
  96. +89 −0 lib/requests/status_codes.py
  97. +104 −0 lib/requests/structures.py
  98. +674 −0 lib/requests/utils.py
  99. +4 −0 lib/webmentiontools/__init__.py
  100. +95 −0 lib/webmentiontools/send.py
  101. +114 −0 lib/webmentiontools/urlinfo.py
  102. +54 −0 lib/webmentiontools/webmentionio.py
  103. +18 −19 models.py
View
Oops, something went wrong.
View
Oops, something went wrong.
@@ -0,0 +1,321 @@
+from collections import defaultdict
+import itertools
+import sys
+from bs4.element import (
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
+ whitespace_re
+ )
+
+__all__ = [
+ 'HTMLTreeBuilder',
+ 'SAXTreeBuilder',
+ 'TreeBuilder',
+ 'TreeBuilderRegistry',
+ ]
+
+# Some useful features for a TreeBuilder to have.
+FAST = 'fast'
+PERMISSIVE = 'permissive'
+STRICT = 'strict'
+XML = 'xml'
+HTML = 'html'
+HTML_5 = 'html5'
+
+
class TreeBuilderRegistry(object):
    """Keeps track of registered TreeBuilder classes, indexed by feature."""

    def __init__(self):
        # feature name -> builders advertising it, newest registration first.
        self.builders_for_feature = defaultdict(list)
        # All builders, newest registration first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the highest-priority builder offering all known *features*.

        With no features requested, the most recently registered builder
        wins.  Returns None when nothing has been registered, or when the
        requested features cannot be satisfied together.
        """
        if not self.builders:
            # Nothing has been registered at all.
            return None

        if not features:
            # No constraints: the newest registration wins.
            return self.builders[0]

        # Walk the requested features in order, keeping only builders that
        # advertise every feature any builder advertises.  A feature that no
        # registered builder offers is skipped, matching the original
        # lenient behaviour.
        ordered_candidates = None
        surviving = None
        for feature in features:
            offering = self.builders_for_feature.get(feature, [])
            if not offering:
                continue
            if ordered_candidates is None:
                ordered_candidates = offering
                surviving = set(offering)
            else:
                surviving = surviving & set(offering)

        if surviving is None:
            return None
        # Preserve the priority ordering of the first matching feature's list.
        for builder in ordered_candidates:
            if builder in surviving:
                return builder
        return None
+
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
# Module-level singleton shared by the whole bs4 package.
builder_registry = TreeBuilderRegistry()
+
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree.

    Subclasses wrap a specific parser and feed events into the
    BeautifulSoup instance stored on ``self.soup``.
    """

    # Features advertised to the TreeBuilderRegistry (e.g. 'fast', 'html').
    features = []

    is_xml = False
    # Tags whose whitespace-only content must be preserved verbatim.
    preserve_whitespace_tags = set()
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    # The interpreter's text type(s) for isinstance() checks:
    # ``basestring`` on Python 2 (covers str and unicode), ``str`` on
    # Python 3, where ``basestring`` no longer exists.  The previous code
    # referenced ``basestring`` directly, which raises NameError under
    # Python 3.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self):
        # Set by BeautifulSoup before parsing begins.
        self.soup = None

    def reset(self):
        """Hook for subclasses to reset parser state between documents."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run the markup through the underlying parser.

        Subclasses must implement this.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return a (markup, original encoding, declared encoding,
        has-undergone-character-replacement) tuple.

        The default implementation performs no conversion at all.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for installing encoding-substitution stand-ins on a tag.

        Returns True if a substitution was set up; the default does nothing.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in attrs.keys():
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    # Use the version-appropriate text type rather than
                    # the Python-2-only ``basestring``.
                    if isinstance(value, self._string_types):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
+
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        # A SAX builder is driven by parser callbacks, not by feed().
        raise NotImplementedError()

    def close(self):
        pass

    def startDocument(self):
        pass

    def endDocument(self):
        pass

    def startElement(self, name, attrs):
        # SAX attribute keys arrive as (namespace, localname) pairs;
        # keep just the local name for the soup.
        plain_attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        self.soup.handle_data(content)
+
+
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        # "td" was listed twice in the original dict; the duplicate key
        # (which Python silently discards) has been removed.
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
    }

    def set_up_substitutions(self, tag):
        """Replace encoding-bearing <meta> attribute values with stand-in
        objects that can take on any encoding.

        Returns True when an HTML5-style charset declaration was found.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # NOTE(review): meta_encoding deliberately stays None here, so
            # this method returns False for HTML 4-style declarations even
            # though a substitution was installed — confirm callers expect
            # this asymmetry.

        return (meta_encoding is not None)
+
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    builder_module = sys.modules['bs4.builder']
    for exported_name in module.__all__:
        candidate = getattr(module, exported_name)
        if not issubclass(candidate, TreeBuilder):
            continue
        # Re-export the builder from bs4.builder itself...
        setattr(builder_module, exported_name, candidate)
        builder_module.__all__.append(exported_name)
        # ...and register it while we're at it.
        builder_module.builder_registry.register(candidate)
+
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser refuses the given markup outright."""
    pass
+
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
# _htmlparser wraps the standard library's html parser, so it is
# registered unconditionally.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
Oops, something went wrong.

0 comments on commit 058897c

Please sign in to comment.