Merge branch 'master' of https://github.com/mstamy2/PyPDF2

py-pdf · Feb 25, 2014 · 84a8669 · 84a8669
2 parents 1252414 + f3c9dc5
commit 84a8669
Show file tree

Hide file tree

Showing 10 changed files with 217 additions and 78 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,11 +1,14 @@
-Version 1.20, 2014-01-??
+Version 1.20, 2014-01-27
 ------------------------
 
- - Many Python 3 support changes (with contributions from TWAC and cgammans)
+ - Official Python 3+ support (with contributions from TWAC and cgammans)
+   Support for Python versions 2.6 and 2.7 will be maintained
 
- - Updated FAQ; link included in README
+ - Command line concatenation (see pdfcat in sample code) (by Steve Witham)
 
- - Allow more (unnecessary) escape sequences
+ - New FAQ; link included in README
+
+ - Allow more (although unnecessary) escape sequences
 
  - Prevent exception when reading a null object in decoding parameters
 
@@ -19,11 +22,18 @@ Version 1.20, 2014-01-??
 
  - Additions to Sample Code and Sample PDFs
 
- - changes to allow 2up script to work (by Dylan McNamee)
+ - changes to allow 2up script to work (see sample code) (by Dylan McNamee)
 
  - changes to metadata encoding (by Chris Hiestand)
 
- - New methods for links: addLink() (by Enrico Lambertini) and ignoreLinks()
+ - New methods for links: addLink() (by Enrico Lambertini) and removeLinks()
+
+ - Bugfix to handle nested bookmarks correctly (by Jamie Lentin)
+
+ - New methods removeImages() and removeText() available for PdfFileWriter
+   (by Tien Haï)
+
+ - Exception handling for illegal characters in Name Objects
 
 
 Version 1.19, 2013-10-08

diff --git a/PyPDF2/_version.py b/PyPDF2/_version.py
@@ -1,2 +1,2 @@
-__version__ = '1.20b'
+__version__ = '1.20'
 
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -34,7 +34,7 @@
 __author__ = "Mathieu Fenniak"
 __author_email__ = "biziqe@mathieu.fenniak.net"
 
-from .utils import PdfReadError
+from .utils import PdfReadError, ord_, chr_
 from sys import version_info
 if version_info < ( 3, 0 ):
     from cStringIO import StringIO
@@ -118,7 +118,7 @@ def decode(data, decodeParms):
                 assert len(data) % rowlength == 0
                 prev_rowdata = (0,) * rowlength
                 for row in range(len(data) // rowlength):
-                    rowdata = [ord(x) for x in data[(row*rowlength):((row+1)*rowlength)]]
+                    rowdata = [ord_(x) for x in data[(row*rowlength):((row+1)*rowlength)]]
                     filterByte = rowdata[0]
                     if filterByte == 0:
                         pass

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -56,7 +56,7 @@ def readObject(stream, pdf):
         return readStringFromStream(stream)
     elif tok == b_('/'):
         # name object
-        return NameObject.readFromStream(stream)
+        return NameObject.readFromStream(stream, pdf)
     elif tok == b_('['):
         # array object
         return ArrayObject.readFromStream(stream, pdf)
@@ -85,7 +85,7 @@ def readObject(stream, pdf):
             return NumberObject.readFromStream(stream)
         peek = stream.read(20)
         stream.seek(-len(peek), 1) # reset to start
-        if re.match(b_(r"(\d+)\s(\d+)\sR[^a-zA-Z]"), peek) != None:
+        if re.match(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"), peek) != None:
             return IndirectObject.readFromStream(stream, pdf)
         else:
             return NumberObject.readFromStream(stream)
@@ -204,9 +204,11 @@ def readFromStream(stream, pdf):
                 # stream has truncated prematurely
                 raise PdfStreamError("Stream has ended unexpectedly")
             if tok.isspace():
+                if not generation:
+                    continue
                 break
             generation += tok
-        r = stream.read(1)
+        r = readNonWhitespace(stream)
         if r != b_("R"):
             raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
         return IndirectObject(int(idnum), int(generation), pdf)
@@ -218,7 +220,7 @@ def __new__(cls, value="0", context=None):
         try:
             return decimal.Decimal.__new__(cls, utils.str_(value), context)
         except:
-            return decimal.Decimal.__new__(cls, utils.str_(value))
+            return decimal.Decimal.__new__(cls, str(value))
     def __repr__(self):
         if self == self.to_integral():
             return str(self.quantize(decimal.Decimal(1)))
@@ -452,7 +454,7 @@ def __init__(self, data):
     def writeToStream(self, stream, encryption_key):
         stream.write(b_(self))
 
-    def readFromStream(stream):
+    def readFromStream(stream, pdf):
         debug = False
         if debug: print((stream.tell()))
         name = stream.read(1)
@@ -468,7 +470,17 @@ def readFromStream(stream):
                 break
             name += tok
         if debug: print(name)
-        return NameObject(name.decode('utf-8'))
+        try:
+            return NameObject(name.decode('utf-8'))
+        except UnicodeDecodeError as e:
+            # Name objects should represent irregular characters
+            # with a '#' followed by the symbol's hex number
+            if not pdf.strict:
+                warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
+                return NameObject(name)
+            else:
+                raise utils.PdfReadError("Illegal character in Name Object")
+
     readFromStream = staticmethod(readFromStream)
 
 
@@ -909,7 +921,7 @@ def getWidth(self):
         return self.getUpperRight_x() - self.getLowerLeft_x()
 
     def getHeight(self):
-        return self.getUpperRight_y() - self.getLowerLeft_x()
+        return self.getUpperRight_y() - self.getLowerLeft_y()
 
     lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
     lowerRight = property(getLowerRight, setLowerRight, None, None)

diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py
@@ -28,6 +28,7 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 from .generic import *
+from .utils import string_type
 from .pdf import PdfFileReader, PdfFileWriter
 from .pagerange import PageRange
 from sys import version_info
@@ -98,7 +99,7 @@ def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=T
         # it is a PdfFileReader, copy that reader's stream into a 
         # StringIO stream.
         # If fileobj is none of the above types, it is not modified
-        if type(fileobj) in (str, str):
+        if type(fileobj) == string_type:
             fileobj = file(fileobj, 'rb')
             my_file = True
         elif isinstance(fileobj, file):
@@ -417,7 +418,7 @@ def findBookmark(self, bookmark, root=None):
     			res = self.findBookmark(bookmark, b)
     			if res:
     				return [i] + res
-    		if b == bookmark or b['/Title'] == bookmark:
+    		elif b == bookmark or b['/Title'] == bookmark:
     			return [i]
 
     	return None

diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py
@@ -9,14 +9,17 @@
 
 import re
 
+# "Str" maintains compatibility with Python 2.x.
+# The next line is obfuscated like this so 2to3 won't change it.
+Str = getattr(__builtins__, "basestring", str)
+
 _INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
 PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
 # groups:         12     34     5 6     7 8
 
 
 class ParseError(Exception):
-    def __init__(self, message):
-        super(self, ParseError).__init__(repr(message))
+    pass
 
 
 PAGE_RANGE_HELP = """Remember, page indices start with zero.
@@ -68,7 +71,7 @@ def __init__(self, arg):
             self._slice = arg.to_slice()
             return
 
-        m = re.match(PAGE_RANGE_RE, arg)
+        m = isinstance(arg, Str) and re.match(PAGE_RANGE_RE, arg)
         if not m:
             raise ParseError(arg)
         elif m.group(2):
@@ -87,7 +90,7 @@ def valid(input):
         """ True if input is a valid initializer for a PageRange. """
         return isinstance(input, slice)  or \
                isinstance(input, PageRange) or \
-               (isinstance(input, basestring)
+               (isinstance(input, Str)
                 and bool(re.match(PAGE_RANGE_RE, input)))
 
     def to_slice(self):