|
| 1 | +#!/usr/bin/env python |
| 2 | +""" |
| 3 | +Python 3.x HTMLParser extension with ElementTree support. |
| 4 | +""" |
| 5 | + |
| 6 | +from html.parser import HTMLParser |
| 7 | +from xml.etree import ElementTree |
| 8 | + |
| 9 | + |
class NaiveHTMLParser(HTMLParser):
    """
    Python 3.x HTMLParser extension with ElementTree support.

    Turns the start-tag, end-tag and text events emitted by
    ``html.parser.HTMLParser`` into an ``xml.etree.ElementTree`` element
    tree.  The first element opened while no other element is open becomes
    the root, which is returned by :meth:`feed` and
    :meth:`get_root_element`.

    Attributes:
        root: The root ``Element`` of the tree, or ``None`` before any tag
            has been seen.
        tree: Stack of currently open elements, outermost first.

    @see https://github.com/marmelo/python-htmlparser
    """

    # Elements that never take an end tag.  They are attached to the tree
    # but never pushed on the open-element stack; otherwise every following
    # sibling would be nested inside them.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        """Create a parser with an empty tree."""
        # Set before the base initialiser runs, as the original did.
        self.root = None
        self.tree = []
        super().__init__()

    def feed(self, data):
        """Parse *data* and return the root element seen so far (or None)."""
        super().feed(data)
        return self.root

    def handle_starttag(self, tag, attrs):
        """Attach a new element for *tag* under the innermost open element."""
        attrib = dict(self.__filter_attrs(attrs))
        if self.tree:
            element = ElementTree.SubElement(self.tree[-1], tag, attrib)
        else:
            # Nothing is open: this element starts a (new) tree.
            element = ElementTree.Element(tag, attrib)
            self.root = element
        if tag not in self._VOID_ELEMENTS:
            self.tree.append(element)

    def handle_endtag(self, tag):
        """Close the innermost open element named *tag*.

        Elements opened after it are closed implicitly.  End tags for void
        elements and end tags that match no open element are ignored
        instead of raising ``IndexError`` or closing an unrelated element.
        """
        if tag in self._VOID_ELEMENTS:
            return
        for depth in range(len(self.tree) - 1, -1, -1):
            if self.tree[depth].tag == tag:
                del self.tree[depth:]
                return

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag (``<tag/>``) as an empty element."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_data(self, data):
        """Append text to the innermost open element.

        Text that precedes any child goes to the element's ``text``; text
        that follows a child goes to that child's ``tail``.  Chunks are
        accumulated rather than overwritten.  Text outside of any open
        element is dropped.
        """
        if not self.tree:
            return
        current = self.tree[-1]
        if len(current):
            last_child = current[-1]
            last_child.tail = (last_child.tail or '') + data
        else:
            current.text = (current.text or '') + data

    def get_root_element(self):
        """Return the root element, or ``None`` if no tag was parsed."""
        return self.root

    def __filter_attrs(self, attrs):
        """Return the ``(name, value)`` pairs whose name and value are truthy.

        Attributes without a value (``None``) or with an empty value are
        dropped.
        """
        return [(name, value) for name, value in (attrs or ()) if name and value]
| 51 | + |
| 52 | + |
# example usage
if __name__ == "__main__":

    document = """
        <html>
            <head>
                <title>GitHub</title>
            </head>
            <body>
                <a href="https://github.com/marmelo">GitHub</a>
                <a href="https://github.com/marmelo/python-htmlparser">GitHub Project</a>
            </body>
        </html>
    """

    html_parser = NaiveHTMLParser()
    tree_root = html_parser.feed(document)
    html_parser.close()

    # tree_root is an xml.etree.ElementTree.Element, so the regular
    # ElementTree API (including its limited XPath support) is available.

    # print the page title
    title = tree_root.find('head/title')
    print(title.text)

    # print the target of every anchor in the document
    anchors = tree_root.findall('.//a')
    for anchor in anchors:
        print(anchor.get('href'))

    # for more information, see:
    # https://docs.python.org/3/library/xml.etree.elementtree.html
    # https://docs.python.org/3/library/xml.etree.elementtree.html#xpath-support
0 commit comments