Skip to content

Commit

Permalink
bpo-41748: Handles unquoted attributes with commas (python#24072)
Browse files Browse the repository at this point in the history
* bpo-41748: Adds tests for unquoted attributes with comma

* bpo-41748: Handles unquoted attributes with comma

* bpo-41748: Addresses review comments

* bpo-41748: Addresses review comments

* Adds more test cases
* Simplifies the regex for handling spaces

* bpo-41748: Moves attributes tests under the right class

* bpo-41748: Addresses review about duplicate attributes

* bpo-41748: Adds NEWS.d entry for this patch
  • Loading branch information
karlcow committed Feb 1, 2021
1 parent 000cde5 commit 9eb11a1
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 37 deletions.
2 changes: 1 addition & 1 deletion Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
\s* # possibly followed by a space
)?(?:\s|/(?!>))*
)*
)?
Expand Down
92 changes: 56 additions & 36 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,42 +452,6 @@ def test_illegal_declarations(self):
self._run_check('<!spacer type="block" height="25">',
[('comment', 'spacer type="block" height="25"')])

def test_with_unquoted_attributes(self):
    # see #12008
    # Mixes unquoted and single-quoted attribute values; every value is
    # expected to be reported without any surrounding quotes.
    markup = (
        "<html><body bgcolor=d0ca90 text='181008'>"
        "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
        "<td align=left><font size=-1>"
        "- <a href=/rabota/><span class=en> software-and-i</span></a>"
        "- <a href='/1/'><span class=en> library</span></a></table>"
    )
    events = [
        ('starttag', 'html', []),
        ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
        ('starttag', 'table', [
            ('cellspacing', '0'),
            ('cellpadding', '1'),
            ('width', '100%'),
        ]),
        ('starttag', 'tr', []),
        ('starttag', 'td', [('align', 'left')]),
        ('starttag', 'font', [('size', '-1')]),
        # first link
        ('data', '- '),
        ('starttag', 'a', [('href', '/rabota/')]),
        ('starttag', 'span', [('class', 'en')]),
        ('data', ' software-and-i'),
        ('endtag', 'span'),
        ('endtag', 'a'),
        # second link
        ('data', '- '),
        ('starttag', 'a', [('href', '/1/')]),
        ('starttag', 'span', [('class', 'en')]),
        ('data', ' library'),
        ('endtag', 'span'),
        ('endtag', 'a'),
        ('endtag', 'table'),
    ]
    self._run_check(markup, events)

def test_comma_between_attributes(self):
    # A comma separating two quoted attributes is expected to show up as
    # its own attribute named ',' with no value.
    source = ('<form action="/xxx.php?a=1&amp;b=2&amp", '
              'method="post">')
    attrs = [
        ('action', '/xxx.php?a=1&b=2&'),
        (',', None),
        ('method', 'post'),
    ]
    self._run_check(source, [('starttag', 'form', attrs)])

def test_weird_chars_in_unquoted_attribute_values(self):
    # Punctuation such as '|', '&', '#' and parentheses is expected to be
    # kept verbatim inside an unquoted attribute value.
    source = '<form action=bogus|&#()value>'
    events = [('starttag', 'form', [('action', 'bogus|&#()value')])]
    self._run_check(source, events)

def test_invalid_end_tags(self):
# A collection of broken end tags. <br> is used as separator.
# see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
Expand Down Expand Up @@ -766,6 +730,62 @@ def test_end_tag_in_attribute_value(self):
[("href", "http://www.example.org/\">;")]),
("data", "spam"), ("endtag", "a")])

def test_with_unquoted_attributes(self):
    # see #12008
    # Mixes unquoted and single-quoted attribute values; every value is
    # expected to be reported without any surrounding quotes.
    markup = (
        "<html><body bgcolor=d0ca90 text='181008'>"
        "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
        "<td align=left><font size=-1>"
        "- <a href=/rabota/><span class=en> software-and-i</span></a>"
        "- <a href='/1/'><span class=en> library</span></a></table>"
    )
    events = [
        ('starttag', 'html', []),
        ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
        ('starttag', 'table', [
            ('cellspacing', '0'),
            ('cellpadding', '1'),
            ('width', '100%'),
        ]),
        ('starttag', 'tr', []),
        ('starttag', 'td', [('align', 'left')]),
        ('starttag', 'font', [('size', '-1')]),
        # first link
        ('data', '- '),
        ('starttag', 'a', [('href', '/rabota/')]),
        ('starttag', 'span', [('class', 'en')]),
        ('data', ' software-and-i'),
        ('endtag', 'span'),
        ('endtag', 'a'),
        # second link
        ('data', '- '),
        ('starttag', 'a', [('href', '/1/')]),
        ('starttag', 'span', [('class', 'en')]),
        ('data', ' library'),
        ('endtag', 'span'),
        ('endtag', 'a'),
        ('endtag', 'table'),
    ]
    self._run_check(markup, events)

def test_comma_between_attributes(self):
    # see bpo 41748
    # HTMLParser preserves duplicate attributes, leaving the task of
    # removing duplicate attributes to a conformant html tree builder
    #
    # Each case pairs one <div> start tag with the attribute list the
    # parser is expected to report for it, so the input and its
    # expectation cannot drift apart.  All tags are still fed to the
    # parser as a single document, in the order listed here.
    cases = [
        # between attrs (unquoted)
        ('<div class=bar,baz=asd>',
         [('class', 'bar,baz=asd')]),
        # between attrs (quoted)
        ('<div class="bar",baz="asd">',
         [('class', 'bar'), (',baz', 'asd')]),
        # after values (unquoted)
        ('<div class=bar, baz=asd,>',
         [('class', 'bar,'), ('baz', 'asd,')]),
        # after values (quoted)
        ('<div class="bar", baz="asd",>',
         [('class', 'bar'), (',', None), ('baz', 'asd'), (',', None)]),
        # one comma values (quoted)
        ('<div class="bar",>',
         [('class', 'bar'), (',', None)]),
        # before values (unquoted)
        ('<div class=,bar baz=,asd>',
         [('class', ',bar'), ('baz', ',asd')]),
        # before values (quoted)
        ('<div class=,"bar" baz=,"asd">',
         [('class', ',"bar"'), ('baz', ',"asd"')]),
        # before names
        ('<div ,class=bar ,baz=asd>',
         [(',class', 'bar'), (',baz', 'asd')]),
        # after names
        ('<div class,="bar" baz,="asd">',
         [('class,', 'bar'), ('baz,', 'asd')]),
    ]
    html = ''.join(tag for tag, _ in cases)
    expected = [('starttag', 'div', attrs) for _, attrs in cases]
    self._run_check(html, expected)

def test_weird_chars_in_unquoted_attribute_values(self):
self._run_check('<form action=bogus|&#()value>', [
('starttag', 'form',
[('action', 'bogus|&#()value')])])

# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix HTMLParser parsing rules for element attributes containing
commas with spaces. Patch by Karl Dubost.

0 comments on commit 9eb11a1

Please sign in to comment.