Skip to content

Commit 3f38f73

Browse files
committed
Fix XXE vulnerability in XMP metadata parsing
For details: https://portswigger.net/web-security/xxe Reported by: Eric Therond eric.therond@sonarsource.com) of Sonarsource (https://www.sonarsource.com/)
1 parent 3a58318 commit 3f38f73

File tree

3 files changed

+59
-5
lines changed

3 files changed

+59
-5
lines changed

Diff for: src/pikepdf/_xml.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# This Source Code Form is subject to the terms of the Mozilla Public
2+
# License, v. 2.0. If a copy of the MPL was not distributed with this
3+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
4+
#
5+
# Copyright (C) 2021, James R. Barlow (https://github.com/jbarlow83/)
6+
7+
8+
from typing import IO, Any, AnyStr, Union
9+
10+
from lxml.etree import XMLParser as _UnsafeXMLParser
11+
from lxml.etree import parse as _parse
12+
13+
14+
class _XMLParser(_UnsafeXMLParser):
15+
def __init__(self, *args, **kwargs):
16+
# Prevent XXE attacks
17+
# https://rules.sonarsource.com/python/type/Vulnerability/RSPEC-2755
18+
kwargs['resolve_entities'] = False
19+
kwargs['no_network'] = True
20+
super().__init__(*args, **kwargs)
21+
22+
23+
def parse_xml(source: Union[AnyStr, IO[Any]], recover: bool = False):
24+
"""Wrapper around lxml's parse to provide protection against XXE attacks."""
25+
26+
parser = _XMLParser(recover=recover, remove_pis=False)
27+
return _parse(source, parser=parser)
28+
29+
30+
__all__ = ['parse_xml']

Diff for: src/pikepdf/models/metadata.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@
2626
from warnings import warn
2727

2828
from lxml import etree
29-
from lxml.etree import QName, XMLParser, XMLSyntaxError, parse
29+
from lxml.etree import QName, XMLSyntaxError
3030

3131
from .. import Name, Stream, String
3232
from .. import __version__ as pikepdf_version
33+
from .._xml import parse_xml
3334

3435
if sys.version_info < (3, 9): # pragma: no cover
3536
from typing import Iterable, MutableMapping
@@ -413,14 +414,13 @@ def _load_from(self, data: bytes) -> None:
413414
data = XMP_EMPTY # on some platforms lxml chokes on empty documents
414415

415416
def basic_parser(xml):
416-
return parse(BytesIO(xml))
417+
return parse_xml(BytesIO(xml))
417418

418419
def strip_illegal_bytes_parser(xml):
419-
return parse(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))
420+
return parse_xml(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))
420421

421422
def recovery_parser(xml):
422-
parser = XMLParser(recover=True)
423-
return parse(BytesIO(xml), parser)
423+
return parse_xml(BytesIO(xml), recover=True)
424424

425425
def replace_with_empty_xmp(_xml=None):
426426
log.warning("Error occurred parsing XMP, replacing with empty XMP.")

Diff for: tests/test_metadata.py

+24
Original file line numberDiff line numberDiff line change
@@ -729,3 +729,27 @@ def test_exception_undoes_edits(graph):
729729
raise
730730
m = graph.open_metadata()
731731
assert m['dc:format'] != 'application/pdf-demo'
732+
733+
734+
def test_xxe(trivial, outdir):
735+
secret = outdir / 'secret.txt'
736+
secret.write_text("This is a secret")
737+
trivial.Root.Metadata = Stream(
738+
trivial,
739+
b"""\
740+
<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
741+
<!DOCTYPE rdf:RDF [<!ENTITY xxe SYSTEM "file://%s">]>
742+
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='Image'>
743+
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
744+
<note>
745+
<to>&xxe;</to>
746+
<from>xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx</from>
747+
</note>
748+
</rdf:RDF>
749+
</x:xmpmeta>
750+
<?xpacket end='w'?>
751+
"""
752+
% os.fsencode(secret),
753+
)
754+
with trivial.open_metadata() as m:
755+
assert 'This is a secret' not in str(m)

0 commit comments

Comments
 (0)