Skip to content
Permalink
Browse files Browse the repository at this point in the history
Fix XXE vulnerability in XMP metadata parsing
For details:
https://portswigger.net/web-security/xxe

Reported by: Eric Therond (eric.therond@sonarsource.com) of Sonarsource (https://www.sonarsource.com/)
  • Loading branch information
jbarlow83 committed Mar 30, 2021
1 parent 3a58318 commit 3f38f73
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 5 deletions.
30 changes: 30 additions & 0 deletions src/pikepdf/_xml.py
@@ -0,0 +1,30 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Copyright (C) 2021, James R. Barlow (https://github.com/jbarlow83/)


from typing import IO, Any, AnyStr, Union

from lxml.etree import XMLParser as _UnsafeXMLParser
from lxml.etree import parse as _parse


class _XMLParser(_UnsafeXMLParser):
    """lxml ``XMLParser`` subclass with XXE-safe settings forced on.

    All positional and keyword arguments are forwarded to the base parser,
    except that ``resolve_entities`` is always set to ``False`` and
    ``no_network`` is always set to ``True``, overriding anything the
    caller supplied for those two options.
    """

    def __init__(self, *args, **kwargs):
        # Prevent XXE attacks: overwrite, rather than default, these options
        # so a caller cannot re-enable entity resolution or network access.
        # https://rules.sonarsource.com/python/type/Vulnerability/RSPEC-2755
        kwargs.update(resolve_entities=False, no_network=True)
        super().__init__(*args, **kwargs)


def parse_xml(source: Union[AnyStr, IO[Any]], recover: bool = False):
    """Parse XML with lxml through the XXE-hardened ``_XMLParser``.

    Args:
        source: What to parse; handed unchanged to lxml's ``parse``.
        recover: Forwarded as the parser's ``recover`` option.

    Returns:
        Whatever lxml's ``parse`` returns for ``source``.
    """
    # remove_pis=False is passed explicitly so the parser is asked not to
    # strip processing instructions (presumably to keep the XMP
    # ``<?xpacket ...?>`` wrapper -- confirm against the metadata code).
    return _parse(
        source,
        parser=_XMLParser(remove_pis=False, recover=recover),
    )


__all__ = ['parse_xml']
10 changes: 5 additions & 5 deletions src/pikepdf/models/metadata.py
Expand Up @@ -26,10 +26,11 @@
from warnings import warn

from lxml import etree
from lxml.etree import QName, XMLParser, XMLSyntaxError, parse
from lxml.etree import QName, XMLSyntaxError

from .. import Name, Stream, String
from .. import __version__ as pikepdf_version
from .._xml import parse_xml

if sys.version_info < (3, 9): # pragma: no cover
from typing import Iterable, MutableMapping
Expand Down Expand Up @@ -413,14 +414,13 @@ def _load_from(self, data: bytes) -> None:
data = XMP_EMPTY # on some platforms lxml chokes on empty documents

def basic_parser(xml):
return parse(BytesIO(xml))
return parse_xml(BytesIO(xml))

def strip_illegal_bytes_parser(xml):
return parse(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))
return parse_xml(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))

def recovery_parser(xml):
parser = XMLParser(recover=True)
return parse(BytesIO(xml), parser)
return parse_xml(BytesIO(xml), recover=True)

def replace_with_empty_xmp(_xml=None):
log.warning("Error occurred parsing XMP, replacing with empty XMP.")
Expand Down
24 changes: 24 additions & 0 deletions tests/test_metadata.py
Expand Up @@ -729,3 +729,27 @@ def test_exception_undoes_edits(graph):
raise
m = graph.open_metadata()
assert m['dc:format'] != 'application/pdf-demo'


def test_xxe(trivial, outdir):
    """Regression test: an external entity in XMP metadata must not leak a file.

    Writes a "secret" file, installs a metadata stream whose DOCTYPE declares
    an external entity pointing at that file, then checks that the secret
    text does not appear in the string form of the opened metadata.
    """
    # Plant a file whose contents must never surface in the parsed metadata.
    secret = outdir / 'secret.txt'
    secret.write_text("This is a secret")
    # Replace the document's metadata with XMP that declares the entity
    # ``xxe`` as SYSTEM "file://<secret path>" and references it in <to>.
    trivial.Root.Metadata = Stream(
        trivial,
        b"""\
<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
<!DOCTYPE rdf:RDF [<!ENTITY xxe SYSTEM "file://%s">]>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='Image'>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<note>
<to>&xxe;</to>
<from>xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx</from>
</note>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end='w'?>
"""
        # bytes %-formatting: the filesystem-encoded path fills the %s above.
        % os.fsencode(secret),
    )
    # If the entity had been expanded, the secret text would be present here.
    # NOTE(review): the "file://%s" form assumes a POSIX-style absolute path;
    # confirm the test is still meaningful on Windows.
    with trivial.open_metadata() as m:
        assert 'This is a secret' not in str(m)

1 comment on commit 3f38f73

@abergmann
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CVE-2021-29421 was assigned to this commit.

Please sign in to comment.