Fix #299

pdfcpu · Feb 27, 2021 · 6901fad · 6901fad
1 parent f0dcd27
commit 6901fad
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 53 deletions.
diff --git a/pkg/pdfcpu/optimize.go b/pkg/pdfcpu/optimize.go
@@ -554,6 +554,8 @@ func parseResourcesDict(ctx *Context, pageDict Dict, pageNumber, pageObjNumber i
 // Iterate over all pages and optimize resources.
 func parsePagesDict(ctx *Context, pagesDict Dict, pageNumber int) (int, error) {
 
+	// TODO Integrate resource consolidation based on content stream requirements.
+
 	log.Optimize.Printf("parsePagesDict begin (next page=%d): %s\n", pageNumber+1, pagesDict)
 
 	// Get number of pages of this PDF file.

diff --git a/pkg/pdfcpu/parse.go b/pkg/pdfcpu/parse.go
@@ -47,9 +47,8 @@ var (
 )
 
 func positionToNextWhitespace(s string) (int, string) {
-
 	for i, c := range s {
-		if unicode.IsSpace(c) {
+		if unicode.IsSpace(c) || c == 0x00 {
 			return i, s[i:]
 		}
 	}
@@ -59,14 +58,13 @@ func positionToNextWhitespace(s string) (int, string) {
 // PositionToNextWhitespaceOrChar trims a string to next whitespace or one of given chars.
 // Returns the index of the position or -1 if no match.
 func positionToNextWhitespaceOrChar(s, chars string) (int, string) {
-
 	if len(chars) == 0 {
 		return positionToNextWhitespace(s)
 	}
 
 	for i, c := range s {
 		for _, m := range chars {
-			if c == m || unicode.IsSpace(c) {
+			if c == m || unicode.IsSpace(c) || c == 0x00 {
 				return i, s[i:]
 			}
 		}
@@ -76,11 +74,8 @@ func positionToNextWhitespaceOrChar(s, chars string) (int, string) {
 }
 
 func positionToNextEOL(s string) string {
-
-	chars := "\x0A\x0D"
-
 	for i, c := range s {
-		for _, m := range chars {
+		for _, m := range "\x0A\x0D" {
 			if c == m {
 				return s[i:]
 			}
@@ -91,14 +86,13 @@ func positionToNextEOL(s string) string {
 
 // trimLeftSpace trims leading whitespace and trailing comment.
 func trimLeftSpace(s string, relaxed bool) (outstr string, eol bool) {
-
 	log.Parse.Printf("TrimLeftSpace: begin %s\n", s)
 
-	whitespace := func(c rune) bool { return unicode.IsSpace(c) }
+	whitespace := func(c rune) bool { return unicode.IsSpace(c) || c == 0x00 }
 
 	whitespaceNoEol := func(r rune) bool {
 		switch r {
-		case '\t', '\v', '\f', ' ', 0x85, 0xA0:
+		case '\t', '\v', '\f', ' ', 0x85, 0xA0, 0x00:
 			return true
 		}
 		return false
@@ -174,7 +168,6 @@ func hexString(s string) (*string, bool) {
 // balancedParenthesesPrefix returns the index of the end position of the balanced parentheses prefix of s
 // or -1 if unbalanced. s has to start with '('
 func balancedParenthesesPrefix(s string) int {
-
 	var j int
 	escaped := false
 
@@ -213,26 +206,21 @@ func forwardParseBuf(buf string, pos int) string {
 	if pos < len(buf) {
 		return buf[pos:]
 	}
-
 	return ""
 }
 
 func delimiter(b byte) bool {
-
 	s := "<>[]()/"
-
 	for i := 0; i < len(s); i++ {
 		if b == s[i] {
 			return true
 		}
 	}
-
 	return false
 }
 
 // parseObjectAttributes parses object number and generation of the next object for given string buffer.
 func parseObjectAttributes(line *string) (objectNumber *int, generationNumber *int, err error) {
-
 	log.Parse.Printf("ParseObjectAttributes: buf=<%s>\n", *line)
 
 	if line == nil || len(*line) == 0 {
@@ -294,7 +282,6 @@ func parseObjectAttributes(line *string) (objectNumber *int, generationNumber *i
 }
 
 func parseArray(line *string) (*Array, error) {
-
 	if line == nil || len(*line) == 0 {
 		return nil, errNoArray
 	}
@@ -356,7 +343,6 @@ func parseArray(line *string) (*Array, error) {
 }
 
 func parseStringLiteral(line *string) (Object, error) {
-
 	// Balanced pairs of parenthesis are allowed.
 	// Empty literals are allowed.
 	// \ needs special treatment.
@@ -413,9 +399,7 @@ func parseStringLiteral(line *string) (Object, error) {
 }
 
 func parseHexLiteral(line *string) (Object, error) {
-
 	// hexliterals have no whitespace and can't be empty.
-
 	if line == nil || len(*line) == 0 {
 		return nil, errBufNotAvailable
 	}
@@ -448,7 +432,6 @@ func parseHexLiteral(line *string) (Object, error) {
 }
 
 func validateNameHexSequence(s string) error {
-
 	for i := 0; i < len(s); {
 		c := s[i]
 		if c != '#' {
@@ -471,14 +454,11 @@ func validateNameHexSequence(s string) error {
 
 		i += 3
 	}
-
 	return nil
 }
 
 func parseName(line *string) (*Name, error) {
-
 	// see 7.3.5
-
 	if line == nil || len(*line) == 0 {
 		return nil, errBufNotAvailable
 	}
@@ -577,7 +557,6 @@ func processDictKeys(line *string, relaxed bool) (Dict, error) {
 }
 
 func parseDict(line *string, relaxed bool) (Dict, error) {
-
 	if line == nil || len(*line) == 0 {
 		return nil, errNoDictionary
 	}
@@ -656,7 +635,6 @@ func startParseNumericOrIndRef(l string) (string, string, int) {
 }
 
 func parseNumericOrIndRef(line *string) (Object, error) {
-
 	if noBuf(line) {
 		return nil, errBufNotAvailable
 	}
@@ -757,7 +735,6 @@ func parseNumericOrIndRef(line *string) (Object, error) {
 }
 
 func parseHexLiteralOrDict(l *string) (val Object, err error) {
-
 	if len(*l) < 2 {
 		return nil, errBufNotAvailable
 	}
@@ -787,7 +764,6 @@ func parseHexLiteralOrDict(l *string) (val Object, err error) {
 }
 
 func parseBooleanOrNull(l string) (val Object, s string, ok bool) {
-
 	// null, absent object
 	if strings.HasPrefix(l, "null") {
 		log.Parse.Println("parseBoolean: value = null")
@@ -811,7 +787,6 @@ func parseBooleanOrNull(l string) (val Object, s string, ok bool) {
 
 // parseObject parses next Object from string buffer and returns the updated (left clipped) buffer.
 func parseObject(line *string) (Object, error) {
-
 	if noBuf(line) {
 		return nil, errBufNotAvailable
 	}
@@ -887,9 +862,7 @@ func parseObject(line *string) (Object, error) {
 
 // parseXRefStreamDict creates a XRefStreamDict out of a StreamDict.
 func parseXRefStreamDict(sd *StreamDict) (*XRefStreamDict, error) {
-
 	log.Parse.Println("ParseXRefStreamDict: begin")
-
 	if sd.Size() == nil {
 		return nil, errors.New("pdfcpu: ParseXRefStreamDict: \"Size\" not available")
 	}
@@ -984,7 +957,6 @@ func parseXRefStreamDict(sd *StreamDict) (*XRefStreamDict, error) {
 
 // objectStreamDict creates a ObjectStreamDict out of a StreamDict.
 func objectStreamDict(sd *StreamDict) (*ObjectStreamDict, error) {
-
 	if sd.First() == nil {
 		return nil, errObjStreamMissingFirst
 	}

diff --git a/pkg/pdfcpu/parseContent.go b/pkg/pdfcpu/parseContent.go
@@ -31,7 +31,7 @@ var (
 )
 
 func whitespaceOrEOL(c rune) bool {
-	return unicode.IsSpace(c) || c == 0x0A || c == 0x0D
+	return c == 0x00 || unicode.IsSpace(c) // IsSpace already reports LF (0x0A) and CR (0x0D) as space.
 }
 
 func skipDict(l *string) error {
@@ -322,7 +322,7 @@ func parseContent(s string) (PageResourceNames, error) {
 
 	for pos := 0; ; {
 		t, err := nextContentToken(&s, prn)
-		//log.Parse.Printf("t = <%s>\n", t)
+		log.Parse.Printf("t = <%s>\n", t)
 		if err != nil {
 			return nil, err
 		}

diff --git a/pkg/pdfcpu/read.go b/pkg/pdfcpu/read.go
@@ -1810,10 +1810,38 @@ func int64Object(ctx *Context, objectNumber int) (*int64, error) {
 
 }
 
+// readStreamContentBlindly reads from rd until the keyword "endstream" is seen and
+// returns the bytes preceding it, stripped of trailing EOL bytes (0x0A, 0x0D).
+// Weak heuristic for reading in stream data for cases where stream length is unknown:
+// ...data...{eol}endstream{eol}endobj
+func readStreamContentBlindly(rd io.Reader) (buf []byte, err error) {
+	keyword := []byte("endstream")
+	i := -1
+	// Keep calling growBufBy until buf contains the keyword; any error aborts the read.
+	for i < 0 {
+		buf, err = growBufBy(buf, defaultBufSize, rd)
+		if err != nil {
+			return nil, err
+		}
+		i = bytes.Index(buf, keyword)
+	}
+
+	// Cut off the keyword and everything after it, then any trailing eol's.
+	for i > 0 && (buf[i-1] == 0x0A || buf[i-1] == 0x0D) {
+		i--
+	}
+	return buf[:i], nil
+}
+
 // Reads and returns a file buffer with length = stream length using provided reader positioned at offset.
-func readContentStream(rd io.Reader, streamLength int) ([]byte, error) {
+func readStreamContent(rd io.Reader, streamLength int) ([]byte, error) {
 
-	log.Read.Printf("readContentStream: begin streamLength:%d\n", streamLength)
+	log.Read.Printf("readStreamContent: begin streamLength:%d\n", streamLength)
+
+	// If streamLength == 0 read until "endstream" then fix "Length"
+	if streamLength == 0 {
+		return readStreamContentBlindly(rd)
+	}
 
 	buf := make([]byte, streamLength)
 
@@ -1832,12 +1860,12 @@ func readContentStream(rd io.Reader, streamLength int) ([]byte, error) {
 			return buf[:eob], nil
 		}
 
-		log.Read.Printf("readContentStream: count=%d, buflen=%d(%X)\n", count, len(buf), len(buf))
+		log.Read.Printf("readStreamContent: count=%d, buflen=%d(%X)\n", count, len(buf), len(buf))
 		totalCount += count
 
 	}
 
-	log.Read.Printf("readContentStream: end\n")
+	log.Read.Printf("readStreamContent: end\n")
 
 	return buf, nil
 }
@@ -1880,14 +1908,14 @@ func loadEncodedStreamContent(ctx *Context, sd *StreamDict) ([]byte, error) {
 
 	// Buffer stream contents.
 	// Read content from disk.
-	rawContent, err := readContentStream(rd, int(*sd.StreamLength))
+	rawContent, err := readStreamContent(rd, int(*sd.StreamLength))
 	if err != nil {
 		return nil, err
 	}
 
 	// Sometimes the stream dict length is corrupt and needs to be fixed.
 	l := int64(len(rawContent))
-	if l < *sd.StreamLength {
+	if *sd.StreamLength == 0 || l < *sd.StreamLength {
 		sd.StreamLength = &l
 		sd.Dict["Length"] = Integer(l)
 	}
@@ -1937,7 +1965,7 @@ func saveDecodedStreamContent(ctx *Context, sd *StreamDict, objNr, genNr int, de
 		return nil
 	}
 
-	// Actual decoding of content stream.
+	// Actual decoding of stream data.
 	err = sd.Decode()
 	if err == filter.ErrUnsupportedFilter {
 		err = nil
@@ -2222,7 +2250,7 @@ func dereferenceObject(ctx *Context, objNr int) error {
 
 	o := entry.Object
 
-	// Already dereferenced stream dict.
+	// Already dereferenced object.
 	if o != nil {
 		logStream(entry.Object)
 		updateBinaryTotalSize(ctx, o)

diff --git a/pkg/pdfcpu/utf16.go b/pkg/pdfcpu/utf16.go
@@ -35,8 +35,6 @@ var ErrInvalidUTF16BE = errors.New("pdfcpu: invalid UTF-16BE detected")
 func IsStringUTF16BE(s string) bool {
 	s1 := fmt.Sprintf("%s", s)
 	ok := strings.HasPrefix(s1, "\376\377") // 0xFE 0xFF
-	//log.Debug.Printf("IsStringUTF16BE: <%s> returning %v\n", s1, ok)
-	//log.Debug.Printf("\n%s", hex.Dump([]byte(s1)))
 	return ok
 }
 
@@ -50,9 +48,6 @@ func IsUTF16BE(b []byte) bool {
 }
 
 func decodeUTF16String(b []byte) (string, error) {
-
-	//log.Debug.Printf("decodeUTF16String: begin %v\n", b)
-
 	// We only accept big endian byte order.
 	if !IsUTF16BE(b) {
 		log.Debug.Printf("decodeUTF16String: not UTF16BE: %v\n", b)
@@ -68,13 +63,10 @@ func decodeUTF16String(b []byte) (string, error) {
 	// Collect code points.
 	for i := 0; i < len(b); {
 
-		//log.Debug.Printf("i=%d\n", i)
-
 		val := (uint16(b[i]) << 8) + uint16(b[i+1])
 
 		if val <= 0xD7FF || val > 0xE000 && val <= 0xFFFF {
 			// Basic Multilingual Plane
-			//log.Debug.Println("decodeUTF16String: Basic Multilingual Plane detected")
 			u16 = append(u16, val)
 			i += 2
 			continue
@@ -91,7 +83,6 @@ func decodeUTF16String(b []byte) (string, error) {
 		}
 
 		// Supplementary Planes
-		//log.Debug.Println("decodeUTF16String: Supplementary Planes detected")
 		u16 = append(u16, val)
 		val = (uint16(b[i+2]) << 8) + uint16(b[i+3])
 		if val < 0xDC00 || val > 0xDFFF {
@@ -110,7 +101,6 @@ func decodeUTF16String(b []byte) (string, error) {
 		decb = append(decb, utf8Buf[:n]...)
 	}
 
-	//log.Debug.Printf("decodeUTF16String: end %s\n", hex.Dump(decb))
 	return string(decb), nil
 }