Skip to content

Commit

Permalink
Fix handling of encoded NUL in XMP metadata (as produced by Ghostscript)
Browse files Browse the repository at this point in the history
  • Loading branch information
James R. Barlow committed Jan 4, 2019
1 parent 3cd085c commit 4c87508
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 12 deletions.
6 changes: 6 additions & 0 deletions debian/copyright
Expand Up @@ -47,6 +47,12 @@ License: public-domain
From US Congressional Records.
Comment: Converted from JPEG to PDF.

Files: tests/resources/enron.pdf
Copyright: EnronData.org
License: CC-BY-3.0
See: https://enrondata.readthedocs.io/en/latest/data/edo-enron-email-pst-dataset/
Comment: Processed by Ghostscript 9.26.

Files: tests/resources/graph*.pdf
Copyright: Public domain
License: public-domain
Expand Down
5 changes: 5 additions & 0 deletions docs/changelog.rst
Expand Up @@ -9,6 +9,11 @@ The pikepdf API (as provided by ``import pikepdf``) is quite stable and is in pr

Note that the C++ extension module ``pikepdf._qpdf`` is a private interface within pikepdf that applications should not use directly.

v1.0.1
======

* Fixed an exception when handling metadata that contains the invalid XML entity ``&#0;`` (an escaped NUL)

v1.0.0
======

Expand Down
78 changes: 78 additions & 0 deletions examples/find_links.py
@@ -0,0 +1,78 @@
# Copyright (c) 2019, James R. Barlow

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Use pikepdf to find links in a PDF"""

import argparse
import pikepdf
from pikepdf import Name

# Command-line interface: one positional argument naming the PDF to scan.
parser = argparse.ArgumentParser(description="Find URIs in a PDF")
parser.add_argument('input_file')


def check_action(action):
    """Yield the URI of *action* if it is a URI action.

    An action whose ``/Type`` entry is present but is not ``/Action`` is
    ignored. The ``/Type`` entry is optional in action dictionaries, so it
    is only checked when present.
    """
    # Test membership before attribute access so that a missing optional
    # key does not abort the whole scan.
    if Name.Type in action and action.Type != Name.Action:
        return
    if Name.S in action and action.S == Name.URI:
        yield str(bytes(action.URI), encoding='ascii')


def check_object_aa(obj):
    """Yield URIs from every action in the ``/AA`` entry of *obj*, if any."""
    if Name.AA not in obj:
        return
    for _trigger, action in obj.AA.items():
        yield from check_action(action)


def check_page_annots(pdf, page):
    """Yield URIs found in the annotations of *page*.

    For link annotations the ``/A`` action is checked; for every annotation
    the ``/AA`` entry is checked as well. *pdf* is accepted for signature
    symmetry with ``check_page`` and is not used.
    """
    if Name.Annots not in page:
        return
    for annot in page.Annots:
        # /Type is optional on annotation dictionaries; only reject the
        # entry when it is present and names something else.
        if Name.Type in annot and annot.Type != Name.Annot:
            continue
        if Name.Subtype in annot and annot.Subtype == Name.Link:
            if Name.A in annot:
                yield from check_action(annot.A)
        # NOTE(review): applied to every annotation, not only links —
        # confirm this nesting against the upstream example.
        yield from check_object_aa(annot)


def check_page(pdf, page):
    """Yield URIs found in the ``/AA`` entry of *page* itself.

    *pdf* is not used by this function.
    """
    yield from check_object_aa(page)


def gather_links(pdf):
    """Yield every URI found on each page of *pdf*.

    For each page, the page-level results come first, followed by the
    results from that page's annotations.
    """
    for pg in pdf.pages:
        for finder in (check_page, check_page_annots):
            yield from finder(pdf, pg)


def main():
    """Parse the command line and print each URI found in the input PDF."""
    options = parser.parse_args()
    document = pikepdf.open(options.input_file)
    for uri in gather_links(document):
        print(uri)


if __name__ == "__main__":
    main()
33 changes: 21 additions & 12 deletions src/pikepdf/models/metadata.py
Expand Up @@ -17,10 +17,10 @@
import sys

from lxml import etree
from lxml.etree import QName
from lxml.etree import QName, XMLSyntaxError
from defusedxml.lxml import parse

from .. import Stream, Name, String
from .. import Stream, Name, String, PdfError

XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
Expand Down Expand Up @@ -90,6 +90,9 @@ class AltList(list):
# Matches any character outside the set this module treats as legal XML text:
# tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
# The astral range must use the 8-digit \U escape; "\u10000" would be read
# as U+1000 followed by a literal "0".
re_xml_illegal_chars = re.compile(
    r"(?u)[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
)
# Byte-level counterpart used on raw XMP data before parsing: matches control
# bytes other than tab, LF and CR, and the literal character reference "&#0;"
# (an escaped NUL).
re_xml_illegal_bytes = re.compile(
    br"[^\x09\x0A\x0D\x20-\xFF]|&#0;"
)

# Repeat this to avoid circular from top package's pikepdf.__version__
try:
Expand Down Expand Up @@ -251,9 +254,6 @@ def __init__(self, pdf, pikepdf_mark=True, sync_docinfo=True):
self.sync_docinfo = sync_docinfo
self._updating = False

def _create_xmp(self):
    """Set ``self._xmp`` to the tree parsed from the ``XMP_EMPTY`` template."""
    empty_packet = BytesIO(XMP_EMPTY)
    self._xmp = parse(empty_packet)

def load_from_docinfo(self, docinfo, delete_missing=False):
"""Populate the XMP metadata object with DocumentInfo
Expand All @@ -279,14 +279,23 @@ def load_from_docinfo(self, docinfo, delete_missing=False):

def _load(self):
try:
data = BytesIO(self._pdf.Root.Metadata.get_stream_buffer())
data = self._pdf.Root.Metadata.read_bytes()
except AttributeError:
self._create_xmp()
else:
self._xmp = parse(data)
pis = self._xmp.xpath('/processing-instruction()')
for pi in pis:
etree.strip_tags(self._xmp, pi.tag)
data = XMP_EMPTY
self._load_from(data)

def _load_from(self, data):
    """Parse XMP packet bytes into ``self._xmp``.

    If the first parse fails with an XML syntax error, everything matched
    by ``re_xml_illegal_bytes`` is removed and the parse is retried once.
    After a successful parse, top-level processing instructions are
    stripped from the tree.

    Raises:
        PdfError: if the data still cannot be parsed after cleaning; the
            underlying ``XMLSyntaxError`` is chained as the cause.
    """
    try:
        self._xmp = parse(BytesIO(data))
    except XMLSyntaxError:
        # Retry once with illegal bytes / escaped NULs removed.
        data = re_xml_illegal_bytes.sub(b'', data)
        try:
            self._xmp = parse(BytesIO(data))
        except XMLSyntaxError as e:
            raise PdfError(
                "XMP metadata could not be parsed as XML, even after "
                "removing illegal byte sequences"
            ) from e
    pis = self._xmp.xpath('/processing-instruction()')
    for pi in pis:
        etree.strip_tags(self._xmp, pi.tag)

@ensure_loaded
def __enter__(self):
Expand Down
Binary file added tests/resources/enron.pdf
Binary file not shown.
11 changes: 11 additions & 0 deletions tests/test_metadata.py
Expand Up @@ -47,6 +47,12 @@ def trivial(resources):
return Pdf.open(resources / 'pal-1bit-trivial.pdf')


@pytest.fixture
def enron(resources):
    """Open the ``enron.pdf`` test resource (has nuls in docinfo, old PDF)."""
    pdf_path = resources / 'enron.pdf'
    return Pdf.open(pdf_path)


def test_lowlevel(sandwich):
meta = sandwich.open_metadata()
assert meta._qname('pdf:Producer') == '{http://ns.adobe.com/pdf/1.3/}Producer'
Expand Down Expand Up @@ -278,3 +284,8 @@ def test_remove_attribute_metadata(sandwich):

# Ensure the whole node was deleted
assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp))


def test_nuls(enron):
    """Loading metadata must not raise on the invalid XML sequence &#0;."""
    xmp = enron.open_metadata()
    xmp._load()  # File has invalid XML sequence &#0;

0 comments on commit 4c87508

Please sign in to comment.