Skip to content

Commit

Permalink
Prevent combinations of <math/svg> and <style> to sneak JavaScript th…
Browse files Browse the repository at this point in the history
…rough the HTML cleaner.
  • Loading branch information
scoder committed Nov 26, 2020
1 parent c053dc1 commit a105ab8
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 11 deletions.
11 changes: 11 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@
lxml changelog
==============

4.6.2 (2020-11-26)
==================

Bugs fixed
----------

* A vulnerability (CVE-2020-27783) was discovered in the HTML Cleaner by Yaniv Nizry,
which allowed JavaScript to pass through. The cleaner now removes more sneaky
"style" content.


4.6.1 (2020-10-18)
==================

Expand Down
22 changes: 14 additions & 8 deletions src/lxml/html/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,15 @@

# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
r'expression\s*\(.*?\)', re.S|re.I)
_replace_css_javascript = re.compile(
r'expression\s*\(.*?\)', re.S|re.I).sub

# Do I have to worry about @\nimport?
_css_import_re = re.compile(
r'@\s*import', re.I)
_replace_css_import = re.compile(
r'@\s*import', re.I).sub

_looks_like_tag_content = re.compile(
r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=', re.ASCII).search

# All kinds of schemes besides just javascript: that can cause
# execution:
Expand Down Expand Up @@ -304,8 +307,8 @@ def __call__(self, doc):
if not self.inline_style:
for el in _find_styled_elements(doc):
old = el.get('style')
new = _css_javascript_re.sub('', old)
new = _css_import_re.sub('', new)
new = _replace_css_javascript('', old)
new = _replace_css_import('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
del el.attrib['style']
Expand All @@ -317,9 +320,9 @@ def __call__(self, doc):
el.drop_tree()
continue
old = el.text or ''
new = _css_javascript_re.sub('', old)
new = _replace_css_javascript('', old)
# The imported CSS can do anything; we just can't allow:
new = _css_import_re.sub('', old)
new = _replace_css_import('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
el.text = '/* deleted */'
Expand Down Expand Up @@ -539,6 +542,9 @@ def _has_sneaky_javascript(self, style):
if '</noscript' in style:
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
return True
if _looks_like_tag_content(style):
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
return True
return False

def clean_html(self, html):
Expand Down
10 changes: 10 additions & 0 deletions src/lxml/html/tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,16 @@ def test_sneaky_noscript_in_style(self):
b'<noscript><style>/* deleted */</style></noscript>',
lxml.html.tostring(clean_html(s)))

def test_sneaky_js_in_math_style(self):
# This gets parsed as <math> -> <style>"..."</style>
# thus passing any tag/script/whatever content through into the output.
html = '<math><style><img src=x onerror=alert(1)></style></math>'
s = lxml.html.fragment_fromstring(html)

self.assertEqual(
b'<math><style>/* deleted */</style></math>',
lxml.html.tostring(clean_html(s)))


def test_suite():
suite = unittest.TestSuite()
Expand Down
18 changes: 15 additions & 3 deletions src/lxml/html/tests/test_clean.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@
>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
<html>
<head>
<style>/* deleted */</style>
<style>
body {background-image: url()};
div {background-image: url()};
div {color: };
</style>
</head>
<body>
<!-- I am interpreted for EVIL! -->
Expand All @@ -126,7 +130,11 @@
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
<style>/* deleted */</style>
<style>
body {background-image: url()};
div {background-image: url()};
div {color: };
</style>
</head>
<body>
<a href="">a link</a>
Expand Down Expand Up @@ -190,7 +198,11 @@
<link rel="alternate" type="text/rss" src="evil-rss">
<link rel="alternate" type="text/rss" href="http://example.com">
<link rel="stylesheet" type="text/rss" href="http://example.com">
<style>/* deleted */</style>
<style>
body {background-image: url()};
div {background-image: url()};
div {color: };
</style>
</head>
<body>
<a href="">a link</a>
Expand Down

5 comments on commit a105ab8

@kirotawa
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing a backporting of this patch and would like some advice, if possible. For versions running with Python2, is removing the re.ASCII part in line 72 ok? Since python2 versions hasn't that flag.

thanks!

@kirotawa
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind, sounds py2 is default re.ASCII , and that flag is only required in py3.

@scoder
Copy link
Member Author

@scoder scoder commented on a105ab8 Dec 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See 4cb5736

@guitos
Copy link

@guitos guitos commented on a105ab8 Dec 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @scoder, was this part of the fix of CVE-2020-27783 or is this a new separate issue?

Asking this because CVE-2020-27783 was assigned to the issue fixed in 89e7aad which are the noscript and style vectors and this commit added math/svg as well.
Also, looks like the CVE-2020-27783 was fixed in 4.6.1 version and it is on 4.6.2 ChangeLog as well.

Could you please clarify? Thank you.

@scoder
Copy link
Member Author

@scoder scoder commented on a105ab8 Dec 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I consider both issues part of the same (kind of) vulnerability, and they were discovered together.
Thus, use 4.6.2.

Please sign in to comment.