Skip to content

Commit

Permalink
[htmlhead] Increase test coverage, invoke reset after parsing
Browse the repository at this point in the history
  • Loading branch information
lovett committed Oct 8, 2018
1 parent ccb9365 commit b7adea0
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 8 deletions.
17 changes: 13 additions & 4 deletions parsers/htmlhead.py
@@ -1,7 +1,6 @@
"""Extract the tags in the head section of an HTML document."""

from html.parser import HTMLParser
from html.entities import name2codepoint
from collections import deque


Expand All @@ -20,7 +19,18 @@ class Parser(HTMLParser):
def __init__(self):
"""Initialize the parser by delegating to the base-class constructor.

NOTE: per the html.parser documentation, the base constructor
implicitly calls reset() at instantiation time.
"""
super().__init__()

def reset(self):
    """Return the parser to its initial, pre-parse state.

    Runs the base-class reset first, then clears the ``in_head`` and
    ``finished`` flags and empties ``self.stack``.
    """
    super().reset()
    self.in_head = self.finished = False
    self.stack.clear()

def error(self, message):
    """Ignore parser errors.

    Deliberately a no-op: the override exists only to keep pylint quiet.
    """

def parse(self, markup):
Expand All @@ -29,6 +39,8 @@ def parse(self, markup):

self.feed(markup)

self.reset()

return self.result

def handle_starttag(self, tag, attrs):
Expand Down Expand Up @@ -68,6 +80,3 @@ def handle_data(self, data):
tag, attrs, text = self.stack.pop()
text = "{} {}".format(text, data.strip()).strip()
self.stack.append((tag, attrs, text))

def handle_entityref(self, name):
    """Append the character for a named entity reference to the result.

    Args:
        name: Entity name without the surrounding ``&`` and ``;``
            (for example ``"gt"``).

    Unknown entity names are appended verbatim as ``&name;`` instead of
    raising ``KeyError``, so malformed markup cannot abort parsing.

    NOTE: per the html.parser documentation, this handler is never called
    when the parser is created with ``convert_charrefs=True``.
    """
    codepoint = name2codepoint.get(name)
    if codepoint is None:
        # Not a known HTML 4 entity: keep the reference text as written.
        self.result.append("&{};".format(name))
    else:
        self.result.append(chr(codepoint))
50 changes: 46 additions & 4 deletions parsers/test_htmlhead.py
Expand Up @@ -10,20 +10,25 @@ class TestHtmlHeadParser(unittest.TestCase):
parser = None

def setUp(self):
    """Create a fresh parser instance before each test."""
    self.parser = parsers.htmlhead.Parser()

def tearDown(self):
    """Drop the parser reference after each test."""
    self.parser = None

def test_simple_parse(self):
"""A simplistic document is parsed successfully."""
"""A reasonably-structured document is parsed successfully."""

initial = """
<html><head><title id="test">Hello world</title></head></html>
<html>
<head>
<title id="test">Hello world</title>
<meta name="keyword" content="whatever" />
</head>
</html>
"""
final = self.parser.parse(initial)

Expand All @@ -32,6 +37,43 @@ def test_simple_parse(self):
self.assertEqual(final[0][1][0][1], "test")
self.assertEqual(final[0][2], "Hello world")
self.assertEqual(len(final[0]), 3)
self.assertEqual(len(final), 2)

def test_no_head(self):
    """Parsing markup that lacks a head section yields no tags."""
    markup = """
<html>this is weird</html>
"""
    parsed = self.parser.parse(markup)

    self.assertEqual(len(parsed), 0)

def test_entity(self):
    """Entity references inside tag text are decoded by the parser."""
    markup = """
<html><head><title>hello &gt; world</title></head></html>
"""
    parsed = self.parser.parse(markup)
    first_tag = parsed[0]

    self.assertEqual(first_tag[2], "hello > world")

def test_only_head(self):
    """Tags outside the head are ignored."""
    initial = """
<html>
<head>
<title>hello world</title>
</head>
<body>this is ignored</body>
</html>
"""
    final = self.parser.parse(initial)

    # Only the <title> inside <head> should be reported; <body> is skipped.
    self.assertEqual(len(final), 1)


if __name__ == "__main__":
Expand Down

0 comments on commit b7adea0

Please sign in to comment.