Skip to content
Permalink
Browse files Browse the repository at this point in the history
Cleaner: Remove SVG image data URLs since they can embed script content.
Reported as GHSL-2021-1038
  • Loading branch information
scoder committed Nov 11, 2021
1 parent 12fa966 commit f233023
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 8 deletions.
23 changes: 15 additions & 8 deletions src/lxml/html/clean.py
Expand Up @@ -75,18 +75,25 @@

# All kinds of schemes besides just javascript: that can cause
# execution:
# Captures the media subtype (plus any parameters) of a base64-encoded
# "data:image/..." URL; yields an empty list for anything else.
_find_image_dataurls = re.compile(
    r'^data:image/(.+);base64,', re.I).findall
# Finds every occurrence of a scheme name that may trigger script execution.
_is_possibly_malicious_scheme = re.compile(
    r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).findall
# SVG images can contain script content
_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall

def _is_javascript_scheme(s):
    """Return True if the URL string *s* should be treated as unsafe.

    Base64 image data URLs are handled first: they are rejected when the
    declared image type mentions "xml" or "svg" (such images can embed
    script content) and accepted otherwise.  Any other string is unsafe
    if it contains one of the potentially script-executing schemes
    (``javascript:``, ``vbscript:``, ``data:``, ...).

    The result is always a ``bool``.
    """
    is_image_url = False
    for image_type in _find_image_dataurls(s):
        is_image_url = True
        if _is_unsafe_image_type(image_type):
            # e.g. "svg+xml" or "svg+xml-compressed"
            return True
    if is_image_url:
        # A base64 image data URL of a type not flagged above.
        return False
    return bool(_is_possibly_malicious_scheme(s))

# Bound ``sub`` method of a pattern matching runs of whitespace and of the
# control characters \x00-\x08, \x0B, \x0C and \x0E-\x19; call it as
# ``_substitute_whitespace(replacement, text)``.
# NOTE(review): the last range ends at \x19, not \x1F -- confirm whether
# leaving out \x1A-\x1F is intentional.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
Expand Down
45 changes: 45 additions & 0 deletions src/lxml/html/tests/test_clean.py
@@ -1,3 +1,5 @@
import base64
import gzip
import unittest
from lxml.tests.common_imports import make_doctest

Expand Down Expand Up @@ -143,6 +145,49 @@ def test_sneaky_import_in_style(self):
cleaned,
"%s -> %s" % (style_code, cleaned))

def test_svg_data_links(self):
    # SVG data URLs may carry script content, so cleaning must blank the
    # src attribute -- for the plain and the gzip-compressed variant alike.
    payload = b'<svg onload="alert(123)" />'
    plain_b64 = base64.b64encode(payload).decode('ASCII')
    gzipped_b64 = base64.b64encode(gzip.compress(payload)).decode('ASCII')
    for url in (
            "data:image/svg+xml;base64," + plain_b64,
            "data:image/svg+xml-compressed;base64," + gzipped_b64):
        fragment = lxml.html.fragment_fromstring('<img src="%s">' % url)
        cleaned = lxml.html.tostring(clean_html(fragment))
        self.assertEqual(
            b'<img src="">',
            cleaned,
            "%s -> %s" % (url, cleaned))

def test_image_data_links(self):
    # Base64 data URLs of non-SVG image types must pass through the
    # cleaner unchanged.
    payload_b64 = base64.b64encode(b'123').decode('ASCII')
    prefixes = (
        "data:image/jpeg;base64,",
        "data:image/apng;base64,",
        "data:image/png;base64,",
        "data:image/gif;base64,",
        "data:image/webp;base64,",
        "data:image/bmp;base64,",
        "data:image/tiff;base64,",
        "data:image/x-icon;base64,",
    )
    for prefix in prefixes:
        url = prefix + payload_b64
        markup = '<img src="%s">' % url
        fragment = lxml.html.fragment_fromstring(markup)
        cleaned = lxml.html.tostring(clean_html(fragment))
        self.assertEqual(
            markup.encode("UTF-8"),
            cleaned,
            "%s -> %s" % (url, cleaned))

def test_formaction_attribute_in_button_input(self):
# The formaction attribute overrides the form's action and should be
# treated as a malicious link attribute
Expand Down

0 comments on commit f233023

Please sign in to comment.