Fix handling of encoded NUL in XMP metadata (as produced by Ghostscript)

pikepdf · Jan 4, 2019 · 4c87508 · 4c87508
1 parent 3cd085c
commit 4c87508
Show file tree

Hide file tree

Showing 6 changed files with 121 additions and 12 deletions.
diff --git a/debian/copyright b/debian/copyright
@@ -47,6 +47,12 @@ License: public-domain
  From US Congressional Records.
 Comment: Converted from JPEG to PDF.
 
+Files: tests/resources/enron.pdf
+Copyright: EnronData.org
+License: CC-BY-3.0
+ See: https://enrondata.readthedocs.io/en/latest/data/edo-enron-email-pst-dataset/
+Comment: Processed by Ghostscript 9.26.
+
 Files: tests/resources/graph*.pdf
 Copyright: Public domain
 License: public-domain

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -9,6 +9,11 @@ The pikepdf API (as provided by ``import pikepdf``) is quite stable and is in pr
 
 Note that the C++ extension module ``pikepdf._qpdf`` is a private interface within pikepdf that applications should not use directly.
 
+v1.0.1
+======
+
+* Fixed an exception raised when handling metadata that contains the invalid XML character reference ``&#0;`` (an escaped NUL)
+
 v1.0.0
 ======
 

diff --git a/examples/find_links.py b/examples/find_links.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2019, James R. Barlow
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Use pikepdf to find links in a PDF"""
+
+import argparse
+import pikepdf
+from pikepdf import Name
+
+parser = argparse.ArgumentParser(description="Find URIs in a PDF")
+parser.add_argument('input_file')
+
+
+def check_action(action):
+    if action.Type != Name.Action:
+        return
+    if action.S == Name.URI:
+        yield str(bytes(action.URI), encoding='ascii')
+
+
+def check_object_aa(obj):
+    if Name.AA in obj:
+        for name, action in obj.AA.items():
+            yield from check_action(action)
+
+
+def check_page_annots(pdf, page):
+    if Name.Annots not in page:
+        return
+    annots = page.Annots
+    for annot in annots:
+        if annot.Type != Name.Annot:
+            continue
+        if annot.Subtype == Name.Link:
+            link_annot = annot
+            if Name.A in link_annot:
+                action = link_annot.A
+                yield from check_action(action)
+        yield from check_object_aa(annot)
+
+
+def check_page(pdf, page):
+    yield from check_object_aa(page)
+
+
+def gather_links(pdf):
+    for page in pdf.pages:
+        yield from check_page(pdf, page)
+        yield from check_page_annots(pdf, page)
+
+
+def main():
+    args = parser.parse_args()
+    pdf = pikepdf.open(args.input_file)
+    links = gather_links(pdf)
+    for link in links:
+        print(link)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
@@ -17,10 +17,10 @@
 import sys
 
 from lxml import etree
-from lxml.etree import QName
+from lxml.etree import QName, XMLSyntaxError
 from defusedxml.lxml import parse
 
-from .. import Stream, Name, String
+from .. import Stream, Name, String, PdfError
 
 XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
 XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
@@ -90,6 +90,9 @@ class AltList(list):
 re_xml_illegal_chars = re.compile(
     r"(?u)[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]"
 )
+re_xml_illegal_bytes = re.compile(
+    br"[^\x09\x0A\x0D\x20-\xFF]|&#0;"
+)
 
 # Repeat this to avoid circular from top package's pikepdf.__version__
 try:
@@ -251,9 +254,6 @@ def __init__(self, pdf, pikepdf_mark=True, sync_docinfo=True):
         self.sync_docinfo = sync_docinfo
         self._updating = False
 
-    def _create_xmp(self):
-        self._xmp = parse(BytesIO(XMP_EMPTY))
-
     def load_from_docinfo(self, docinfo, delete_missing=False):
         """Populate the XMP metadata object with DocumentInfo
 
@@ -279,14 +279,23 @@ def load_from_docinfo(self, docinfo, delete_missing=False):
 
     def _load(self):
         try:
-            data = BytesIO(self._pdf.Root.Metadata.get_stream_buffer())
+            data = self._pdf.Root.Metadata.read_bytes()
         except AttributeError:
-            self._create_xmp()
-        else:
-            self._xmp = parse(data)
-            pis = self._xmp.xpath('/processing-instruction()')
-            for pi in pis:
-                etree.strip_tags(self._xmp, pi.tag)
+            data = XMP_EMPTY
+        self._load_from(data)
+
+    def _load_from(self, data):
+        try:
+            self._xmp = parse(BytesIO(data))
+        except XMLSyntaxError:
+            data = re_xml_illegal_bytes.sub(b'', data)
+            try:
+                self._xmp = parse(BytesIO(data))
+            except XMLSyntaxError as e:
+                raise PdfError() from e
+        pis = self._xmp.xpath('/processing-instruction()')
+        for pi in pis:
+            etree.strip_tags(self._xmp, pi.tag)
 
     @ensure_loaded
     def __enter__(self):

diff --git a/tests/resources/enron.pdf b/tests/resources/enron.pdf
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -47,6 +47,12 @@ def trivial(resources):
     return Pdf.open(resources / 'pal-1bit-trivial.pdf')
 
 
+@pytest.fixture
+def enron(resources):
+    # Has nuls in docinfo, old PDF
+    return Pdf.open(resources / 'enron.pdf')
+
+
 def test_lowlevel(sandwich):
     meta = sandwich.open_metadata()
     assert meta._qname('pdf:Producer') == '{http://ns.adobe.com/pdf/1.3/}Producer'
@@ -278,3 +284,8 @@ def test_remove_attribute_metadata(sandwich):
 
     # Ensure the whole node was deleted
     assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp))
+
+
+def test_nuls(enron):
+    meta = enron.open_metadata()
+    meta._load()  # File has invalid XML sequence &#0;