Skip to content

Commit

Permalink
Fix #299
Browse files Browse the repository at this point in the history
  • Loading branch information
hhrutter committed Feb 27, 2021
1 parent f0dcd27 commit 6901fad
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 53 deletions.
2 changes: 2 additions & 0 deletions pkg/pdfcpu/optimize.go
Expand Up @@ -554,6 +554,8 @@ func parseResourcesDict(ctx *Context, pageDict Dict, pageNumber, pageObjNumber i
// Iterate over all pages and optimize resources.
func parsePagesDict(ctx *Context, pagesDict Dict, pageNumber int) (int, error) {

// TODO Integrate resource consolidation based on content stream requirements.

log.Optimize.Printf("parsePagesDict begin (next page=%d): %s\n", pageNumber+1, pagesDict)

// Get number of pages of this PDF file.
Expand Down
38 changes: 5 additions & 33 deletions pkg/pdfcpu/parse.go
Expand Up @@ -47,9 +47,8 @@ var (
)

func positionToNextWhitespace(s string) (int, string) {

for i, c := range s {
if unicode.IsSpace(c) {
if unicode.IsSpace(c) || c == 0x00 {
return i, s[i:]
}
}
Expand All @@ -59,14 +58,13 @@ func positionToNextWhitespace(s string) (int, string) {
// positionToNextWhitespaceOrChar trims a string to the next whitespace or one of the given chars.
// Returns the index of the position or -1 if no match.
func positionToNextWhitespaceOrChar(s, chars string) (int, string) {

if len(chars) == 0 {
return positionToNextWhitespace(s)
}

for i, c := range s {
for _, m := range chars {
if c == m || unicode.IsSpace(c) {
if c == m || unicode.IsSpace(c) || c == 0x00 {
return i, s[i:]
}
}
Expand All @@ -76,11 +74,8 @@ func positionToNextWhitespaceOrChar(s, chars string) (int, string) {
}

func positionToNextEOL(s string) string {

chars := "\x0A\x0D"

for i, c := range s {
for _, m := range chars {
for _, m := range "\x0A\x0D" {
if c == m {
return s[i:]
}
Expand All @@ -91,14 +86,13 @@ func positionToNextEOL(s string) string {

// trimLeftSpace trims leading whitespace and trailing comment.
func trimLeftSpace(s string, relaxed bool) (outstr string, eol bool) {

log.Parse.Printf("TrimLeftSpace: begin %s\n", s)

whitespace := func(c rune) bool { return unicode.IsSpace(c) }
whitespace := func(c rune) bool { return unicode.IsSpace(c) || c == 0x00 }

whitespaceNoEol := func(r rune) bool {
switch r {
case '\t', '\v', '\f', ' ', 0x85, 0xA0:
case '\t', '\v', '\f', ' ', 0x85, 0xA0, 0x00:
return true
}
return false
Expand Down Expand Up @@ -174,7 +168,6 @@ func hexString(s string) (*string, bool) {
// balancedParenthesesPrefix returns the index of the end position of the balanced parentheses prefix of s
// or -1 if unbalanced. s has to start with '('
func balancedParenthesesPrefix(s string) int {

var j int
escaped := false

Expand Down Expand Up @@ -213,26 +206,21 @@ func forwardParseBuf(buf string, pos int) string {
if pos < len(buf) {
return buf[pos:]
}

return ""
}

// delimiter reports whether b is one of the delimiter bytes
// '<', '>', '[', ']', '(', ')' or '/'.
func delimiter(b byte) bool {
	switch b {
	case '<', '>', '[', ']', '(', ')', '/':
		return true
	default:
		return false
	}
}

// parseObjectAttributes parses object number and generation of the next object for given string buffer.
func parseObjectAttributes(line *string) (objectNumber *int, generationNumber *int, err error) {

log.Parse.Printf("ParseObjectAttributes: buf=<%s>\n", *line)

if line == nil || len(*line) == 0 {
Expand Down Expand Up @@ -294,7 +282,6 @@ func parseObjectAttributes(line *string) (objectNumber *int, generationNumber *i
}

func parseArray(line *string) (*Array, error) {

if line == nil || len(*line) == 0 {
return nil, errNoArray
}
Expand Down Expand Up @@ -356,7 +343,6 @@ func parseArray(line *string) (*Array, error) {
}

func parseStringLiteral(line *string) (Object, error) {

// Balanced pairs of parenthesis are allowed.
// Empty literals are allowed.
// \ needs special treatment.
Expand Down Expand Up @@ -413,9 +399,7 @@ func parseStringLiteral(line *string) (Object, error) {
}

func parseHexLiteral(line *string) (Object, error) {

// hexliterals have no whitespace and can't be empty.

if line == nil || len(*line) == 0 {
return nil, errBufNotAvailable
}
Expand Down Expand Up @@ -448,7 +432,6 @@ func parseHexLiteral(line *string) (Object, error) {
}

func validateNameHexSequence(s string) error {

for i := 0; i < len(s); {
c := s[i]
if c != '#' {
Expand All @@ -471,14 +454,11 @@ func validateNameHexSequence(s string) error {

i += 3
}

return nil
}

func parseName(line *string) (*Name, error) {

// see 7.3.5

if line == nil || len(*line) == 0 {
return nil, errBufNotAvailable
}
Expand Down Expand Up @@ -577,7 +557,6 @@ func processDictKeys(line *string, relaxed bool) (Dict, error) {
}

func parseDict(line *string, relaxed bool) (Dict, error) {

if line == nil || len(*line) == 0 {
return nil, errNoDictionary
}
Expand Down Expand Up @@ -656,7 +635,6 @@ func startParseNumericOrIndRef(l string) (string, string, int) {
}

func parseNumericOrIndRef(line *string) (Object, error) {

if noBuf(line) {
return nil, errBufNotAvailable
}
Expand Down Expand Up @@ -757,7 +735,6 @@ func parseNumericOrIndRef(line *string) (Object, error) {
}

func parseHexLiteralOrDict(l *string) (val Object, err error) {

if len(*l) < 2 {
return nil, errBufNotAvailable
}
Expand Down Expand Up @@ -787,7 +764,6 @@ func parseHexLiteralOrDict(l *string) (val Object, err error) {
}

func parseBooleanOrNull(l string) (val Object, s string, ok bool) {

// null, absent object
if strings.HasPrefix(l, "null") {
log.Parse.Println("parseBoolean: value = null")
Expand All @@ -811,7 +787,6 @@ func parseBooleanOrNull(l string) (val Object, s string, ok bool) {

// parseObject parses next Object from string buffer and returns the updated (left clipped) buffer.
func parseObject(line *string) (Object, error) {

if noBuf(line) {
return nil, errBufNotAvailable
}
Expand Down Expand Up @@ -887,9 +862,7 @@ func parseObject(line *string) (Object, error) {

// parseXRefStreamDict creates an XRefStreamDict out of a StreamDict.
func parseXRefStreamDict(sd *StreamDict) (*XRefStreamDict, error) {

log.Parse.Println("ParseXRefStreamDict: begin")

if sd.Size() == nil {
return nil, errors.New("pdfcpu: ParseXRefStreamDict: \"Size\" not available")
}
Expand Down Expand Up @@ -984,7 +957,6 @@ func parseXRefStreamDict(sd *StreamDict) (*XRefStreamDict, error) {

// objectStreamDict creates an ObjectStreamDict out of a StreamDict.
func objectStreamDict(sd *StreamDict) (*ObjectStreamDict, error) {

if sd.First() == nil {
return nil, errObjStreamMissingFirst
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/pdfcpu/parseContent.go
Expand Up @@ -31,7 +31,7 @@ var (
)

// whitespaceOrEOL reports whether c is treated as white space while parsing:
// any rune accepted by unicode.IsSpace (which already includes the end-of-line
// characters LF 0x0A and CR 0x0D) or the NUL character 0x00.
func whitespaceOrEOL(c rune) bool {
	// NUL is not covered by unicode.IsSpace, so it has to be checked explicitly.
	return unicode.IsSpace(c) || c == 0x00
}

func skipDict(l *string) error {
Expand Down Expand Up @@ -322,7 +322,7 @@ func parseContent(s string) (PageResourceNames, error) {

for pos := 0; ; {
t, err := nextContentToken(&s, prn)
//log.Parse.Printf("t = <%s>\n", t)
log.Parse.Printf("t = <%s>\n", t)
if err != nil {
return nil, err
}
Expand Down
44 changes: 36 additions & 8 deletions pkg/pdfcpu/read.go
Expand Up @@ -1810,10 +1810,38 @@ func int64Object(ctx *Context, objectNumber int) (*int64, error) {

}

// readStreamContentBlindly reads stream data from rd for cases where the
// stream length is unknown.
//
// Weak heuristic: the data is expected to be laid out as
//
//	...data...{eol}endstream{eol}endobj
//
// The buffer is grown in steps of defaultBufSize via growBufBy until it
// contains the keyword "endstream". Everything in front of the first
// occurrence of the keyword, minus all trailing LF (0x0A) and CR (0x0D)
// bytes, is returned. An error reported by growBufBy is returned as is,
// together with a nil buffer.
//
// NOTE(review): if rd never yields "endstream" the loop only ends once
// growBufBy reports an error - confirm that it does so at EOF.
func readStreamContentBlindly(rd io.Reader) (buf []byte, err error) {
	endstream := []byte("endstream")

	// Always read at least once, then keep growing until the keyword shows up.
	i := -1
	for i < 0 {
		if buf, err = growBufBy(buf, defaultBufSize, rd); err != nil {
			return nil, err
		}
		i = bytes.Index(buf, endstream)
	}

	// Drop the keyword and whatever follows it.
	buf = buf[:i]

	// Cut off trailing eol's: i ends up as the length of the remaining data.
	for i > 0 && (buf[i-1] == 0x0A || buf[i-1] == 0x0D) {
		i--
	}

	return buf[:i], nil
}

// Reads and returns a file buffer with length = stream length using provided reader positioned at offset.
func readContentStream(rd io.Reader, streamLength int) ([]byte, error) {
func readStreamContent(rd io.Reader, streamLength int) ([]byte, error) {

log.Read.Printf("readContentStream: begin streamLength:%d\n", streamLength)
log.Read.Printf("readStreamContent: begin streamLength:%d\n", streamLength)

// If streamLength == 0 read until "endstream" then fix "Length"
if streamLength == 0 {
return readStreamContentBlindly(rd)
}

buf := make([]byte, streamLength)

Expand All @@ -1832,12 +1860,12 @@ func readContentStream(rd io.Reader, streamLength int) ([]byte, error) {
return buf[:eob], nil
}

log.Read.Printf("readContentStream: count=%d, buflen=%d(%X)\n", count, len(buf), len(buf))
log.Read.Printf("readStreamContent: count=%d, buflen=%d(%X)\n", count, len(buf), len(buf))
totalCount += count

}

log.Read.Printf("readContentStream: end\n")
log.Read.Printf("readStreamContent: end\n")

return buf, nil
}
Expand Down Expand Up @@ -1880,14 +1908,14 @@ func loadEncodedStreamContent(ctx *Context, sd *StreamDict) ([]byte, error) {

// Buffer stream contents.
// Read content from disk.
rawContent, err := readContentStream(rd, int(*sd.StreamLength))
rawContent, err := readStreamContent(rd, int(*sd.StreamLength))
if err != nil {
return nil, err
}

// Sometimes the stream dict length is corrupt and needs to be fixed.
l := int64(len(rawContent))
if l < *sd.StreamLength {
if *sd.StreamLength == 0 || l < *sd.StreamLength {
sd.StreamLength = &l
sd.Dict["Length"] = Integer(l)
}
Expand Down Expand Up @@ -1937,7 +1965,7 @@ func saveDecodedStreamContent(ctx *Context, sd *StreamDict, objNr, genNr int, de
return nil
}

// Actual decoding of content stream.
// Actual decoding of stream data.
err = sd.Decode()
if err == filter.ErrUnsupportedFilter {
err = nil
Expand Down Expand Up @@ -2222,7 +2250,7 @@ func dereferenceObject(ctx *Context, objNr int) error {

o := entry.Object

// Already dereferenced stream dict.
// Already dereferenced object.
if o != nil {
logStream(entry.Object)
updateBinaryTotalSize(ctx, o)
Expand Down
10 changes: 0 additions & 10 deletions pkg/pdfcpu/utf16.go
Expand Up @@ -35,8 +35,6 @@ var ErrInvalidUTF16BE = errors.New("pdfcpu: invalid UTF-16BE detected")
func IsStringUTF16BE(s string) bool {
	// A UTF-16BE encoded string starts with the byte order mark 0xFE 0xFF.
	return strings.HasPrefix(fmt.Sprintf("%s", s), "\376\377")
}

Expand All @@ -50,9 +48,6 @@ func IsUTF16BE(b []byte) bool {
}

func decodeUTF16String(b []byte) (string, error) {

//log.Debug.Printf("decodeUTF16String: begin %v\n", b)

// We only accept big endian byte order.
if !IsUTF16BE(b) {
log.Debug.Printf("decodeUTF16String: not UTF16BE: %v\n", b)
Expand All @@ -68,13 +63,10 @@ func decodeUTF16String(b []byte) (string, error) {
// Collect code points.
for i := 0; i < len(b); {

//log.Debug.Printf("i=%d\n", i)

val := (uint16(b[i]) << 8) + uint16(b[i+1])

if val <= 0xD7FF || val > 0xE000 && val <= 0xFFFF {
// Basic Multilingual Plane
//log.Debug.Println("decodeUTF16String: Basic Multilingual Plane detected")
u16 = append(u16, val)
i += 2
continue
Expand All @@ -91,7 +83,6 @@ func decodeUTF16String(b []byte) (string, error) {
}

// Supplementary Planes
//log.Debug.Println("decodeUTF16String: Supplementary Planes detected")
u16 = append(u16, val)
val = (uint16(b[i+2]) << 8) + uint16(b[i+3])
if val < 0xDC00 || val > 0xDFFF {
Expand All @@ -110,7 +101,6 @@ func decodeUTF16String(b []byte) (string, error) {
decb = append(decb, utf8Buf[:n]...)
}

//log.Debug.Printf("decodeUTF16String: end %s\n", hex.Dump(decb))
return string(decb), nil
}

Expand Down

0 comments on commit 6901fad

Please sign in to comment.