From 99643282bb3e8e8c2da1a734f3740b3707aa74c3 Mon Sep 17 00:00:00 2001
From: Horst Rutter <hhrutter@gmail.com>
Date: Sun, 26 Nov 2023 16:21:01 +0100
Subject: [PATCH] Fix #727

---
 cmd/pdfcpu/process.go         |  13 +++--
 pkg/api/validate.go           |   8 +--
 pkg/cli/cli.go                |   4 +-
 pkg/pdfcpu/extract.go         |  28 ++--------
 pkg/pdfcpu/model/xreftable.go |  61 ++++++++++++--------
 pkg/pdfcpu/read.go            |  88 +----------------------------
 pkg/pdfcpu/scan/scan.go       | 102 ++++++++++++++++++++++++++++++++++
 7 files changed, 163 insertions(+), 141 deletions(-)
 create mode 100644 pkg/pdfcpu/scan/scan.go

diff --git a/cmd/pdfcpu/process.go b/cmd/pdfcpu/process.go
index 9010e9ea..db82c8e7 100644
--- a/cmd/pdfcpu/process.go
+++ b/cmd/pdfcpu/process.go
@@ -1833,18 +1833,21 @@ func processListImagesCommand(conf *model.Configuration) {
 }
 
 func processDumpCommand(conf *model.Configuration) {
-	s := "No dump for you!! One year!\n\n"
+	s := "No dump for you! - One year!\n\n"
 	if len(flag.Args()) != 3 {
 		fmt.Fprintln(os.Stderr, s)
 		os.Exit(1)
 	}
 
-	mode := flag.Arg(0)
-	hex := mode[0] == 'h' || mode[0] == 'H'
-
 	vals := []int{0, 0}
-	if hex {
+
+	mode := strings.ToLower(flag.Arg(0))
+
+	switch mode[0] {
+	case 'a':
 		vals[0] = 1
+	case 'h':
+		vals[0] = 2
 	}
 
 	objNr, err := strconv.Atoi(flag.Arg(1))
diff --git a/pkg/api/validate.go b/pkg/api/validate.go
index a3dd9aa3..d530deb6 100644
--- a/pkg/api/validate.go
+++ b/pkg/api/validate.go
@@ -131,7 +131,7 @@ func ValidateFiles(inFiles []string, conf *model.Configuration) error {
 }
 
 // DumpObject writes an object from rs to stdout.
-func DumpObject(rs io.ReadSeeker, objNr int, hex bool, conf *model.Configuration) error {
+func DumpObject(rs io.ReadSeeker, mode, objNr int, conf *model.Configuration) error {
 	if rs == nil {
 		return errors.New("pdfcpu: DumpObject: missing rs")
 	}
@@ -154,13 +154,13 @@ func DumpObject(rs io.ReadSeeker, objNr int, hex bool, conf *model.Configuration
 		return errors.Wrap(err, fmt.Sprintf("validation error (obj#:%d)%s", ctx.CurObj, s))
 	}
 
-	ctx.DumpStream(objNr, hex)
+	ctx.DumpObject(objNr, mode)
 
 	return err
 }
 
 // DumpObjectFile writes an object from rs to stdout.
-func DumpObjectFile(inFile string, objNr int, hex bool, conf *model.Configuration) error {
+func DumpObjectFile(inFile string, mode, objNr int, conf *model.Configuration) error {
 	if conf == nil {
 		conf = model.NewDefaultConfiguration()
 	}
@@ -172,5 +172,5 @@ func DumpObjectFile(inFile string, objNr int, hex bool, conf *model.Configuratio
 
 	defer f.Close()
 
-	return DumpObject(f, objNr, hex, conf)
+	return DumpObject(f, mode, objNr, conf)
 }
diff --git a/pkg/cli/cli.go b/pkg/cli/cli.go
index be815364..dd1ecba4 100644
--- a/pkg/cli/cli.go
+++ b/pkg/cli/cli.go
@@ -282,9 +282,9 @@ func ListImages(cmd *Command) ([]string, error) {
 
 // Dump known object to stdout.
 func Dump(cmd *Command) ([]string, error) {
-	hex := cmd.IntVals[0] == 1
+	mode := cmd.IntVals[0]
 	objNr := cmd.IntVals[1]
-	return nil, api.DumpObjectFile(*cmd.InFile, objNr, hex, cmd.Conf)
+	return nil, api.DumpObjectFile(*cmd.InFile, mode, objNr, cmd.Conf)
 }
 
 // Create renders page content corresponding to declarations found in inFileJSON and writes the result to outFile.
diff --git a/pkg/pdfcpu/extract.go b/pkg/pdfcpu/extract.go
index 6bcbe39d..1f5a2087 100644
--- a/pkg/pdfcpu/extract.go
+++ b/pkg/pdfcpu/extract.go
@@ -18,6 +18,7 @@ package pdfcpu
 
 import (
 	"bytes"
+	"fmt"
 	"io"
 	"strings"
 
@@ -321,27 +322,6 @@ func img(
 	resourceID, filters, lastFilter string,
 	objNr int) (*model.Image, error) {
 
-	// "ImageMask" is a flag indicating whether the image shall be treated as an image mask.
-	// We do not extract imageMasks with the exception of CCITTDecoded images.
-	if imgMask {
-		// bpc = 1
-		if lastFilter != filter.CCITTFax {
-			if log.InfoEnabled() {
-				log.Info.Printf("ExtractImage(%d): skip img with imageMask\n", objNr)
-			}
-			return nil, nil
-		}
-	}
-
-	// An image XObject defining an image mask to be applied to this image, or an array specifying a range of colours to be applied to it as a colour key mask.
-	// Ignore if image has a Mask defined.
-	if sm, _ := sd.Find("Mask"); sm != nil {
-		if log.InfoEnabled() {
-			log.Info.Printf("ExtractImage(%d): skip image, unsupported \"Mask\"\n", objNr)
-		}
-		return nil, nil
-	}
-
 	// CCITTDecoded images / (bit) masks don't have a ColorSpace attribute, but we render image files.
 	if lastFilter == filter.CCITTFax {
 		if _, err := ctx.DereferenceDictEntry(sd.Dict, "ColorSpace"); err != nil {
@@ -365,8 +345,12 @@ func img(
 		}
 
 	default:
+		msg := fmt.Sprintf("pdfcpu: ExtractImage(obj#%d): skipping img, filter %s unsupported", objNr, filters)
 		if log.DebugEnabled() {
-			log.Debug.Printf("ExtractImage(%d): skip img, filter %s unsupported\n", objNr, filters)
+			log.Debug.Println(msg)
+		}
+		if log.CLIEnabled() {
+			log.CLI.Println(msg)
 		}
 		return nil, nil
 	}
diff --git a/pkg/pdfcpu/model/xreftable.go b/pkg/pdfcpu/model/xreftable.go
index ffd96767..2e76a189 100644
--- a/pkg/pdfcpu/model/xreftable.go
+++ b/pkg/pdfcpu/model/xreftable.go
@@ -17,6 +17,7 @@
 package model
 
 import (
+	"bufio"
 	"bytes"
 	"encoding/hex"
 	"fmt"
@@ -29,6 +30,7 @@ import (
 
 	"github.com/pdfcpu/pdfcpu/pkg/filter"
 	"github.com/pdfcpu/pdfcpu/pkg/log"
+	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/scan"
 	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
 	"github.com/pkg/errors"
 )
@@ -1054,7 +1056,11 @@ func (xRefTable *XRefTable) sortedKeys() []int {
 	return keys
 }
 
-func (xRefTable *XRefTable) DumpStream(objNr int, hexOut bool) {
+func (xRefTable *XRefTable) DumpObject(objNr, mode int) {
+	// mode
+	//  0 .. silent / obj only
+	//  1 .. ascii
+	//  2 .. hex
 	entry := xRefTable.Table[objNr]
 	if entry == nil || entry.Free || entry.Compressed || entry.Object == nil {
 		fmt.Println(":(")
@@ -1088,32 +1094,41 @@ func (xRefTable *XRefTable) DumpStream(objNr int, hexOut bool) {
 		}
 	}
 
-	sd, ok := entry.Object.(types.StreamDict)
-	if ok {
+	if mode > 0 {
+		sd, ok := entry.Object.(types.StreamDict)
+		if ok {
 
-		err := sd.Decode()
-		if err == filter.ErrUnsupportedFilter {
-			str += "stream filter unsupported!"
-			fmt.Println(str)
-			return
-		}
-		if err != nil {
-			str += "decoding problem encountered!"
-			fmt.Println(str)
-			return
-		}
+			err := sd.Decode()
+			if err == filter.ErrUnsupportedFilter {
+				str += "stream filter unsupported!"
+				fmt.Println(str)
+				return
+			}
+			if err != nil {
+				str += "decoding problem encountered!"
+				fmt.Println(str)
+				return
+			}
 
-		s := "decoded stream content (length = %d)\n%s\n"
-		if hexOut {
-			str += fmt.Sprintf(s, len(sd.Content), hex.Dump(sd.Content))
-		} else {
-			str += fmt.Sprintf(s, len(sd.Content), sd.Content)
+			s := "decoded stream content (length = %d)\n%s\n"
+			s1 := ""
+			switch mode {
+			case 1:
+				sc := bufio.NewScanner(bytes.NewReader(sd.Content))
+				sc.Split(scan.LinesForSingleEol)
+				for sc.Scan() {
+					s1 += sc.Text() + "\n"
+				}
+				str += fmt.Sprintf(s, len(sd.Content), s1)
+			case 2:
+				str += fmt.Sprintf(s, len(sd.Content), hex.Dump(sd.Content))
+			}
 		}
-	}
 
-	osd, ok := entry.Object.(types.ObjectStreamDict)
-	if ok {
-		str += fmt.Sprintf("object stream count:%d size of objectarray:%d\n", osd.ObjCount, len(osd.ObjArray))
+		osd, ok := entry.Object.(types.ObjectStreamDict)
+		if ok {
+			str += fmt.Sprintf("object stream count:%d size of objectarray:%d\n", osd.ObjCount, len(osd.ObjArray))
+		}
 	}
 
 	fmt.Println(str)
diff --git a/pkg/pdfcpu/read.go b/pkg/pdfcpu/read.go
index b6faf2ed..63f6b16c 100644
--- a/pkg/pdfcpu/read.go
+++ b/pkg/pdfcpu/read.go
@@ -28,6 +28,7 @@ import (
 	"github.com/pdfcpu/pdfcpu/pkg/filter"
 	"github.com/pdfcpu/pdfcpu/pkg/log"
 	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
+	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/scan"
 	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
 	"github.com/pkg/errors"
 )
@@ -124,89 +125,6 @@ func fillBuffer(r io.Reader, buf []byte) (int, error) {
 	return n, err
 }
 
-// ScanLines is a split function for a Scanner that returns each line of
-// text, stripped of any trailing end-of-line marker. The returned line may
-// be empty. The end-of-line marker is one carriage return followed
-// by one newline or one carriage return or one newline.
-// The last non-empty line of input will be returned even if it has no newline.
-func scanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
-	if atEOF && len(data) == 0 {
-		return 0, nil, nil
-	}
-
-	indCR := bytes.IndexByte(data, '\r')
-	indLF := bytes.IndexByte(data, '\n')
-
-	switch {
-
-	case indCR >= 0 && indLF >= 0:
-		if indCR < indLF {
-			if indLF == indCR+1 {
-				// 0x0D0A
-				return indLF + 1, data[0:indCR], nil
-			}
-			// 0x0D ... 0x0A
-			return indCR + 1, data[0:indCR], nil
-		}
-		// 0x0A ... 0x0D
-		return indLF + 1, data[0:indLF], nil
-
-	case indCR >= 0:
-		// We have a full carriage return terminated line.
-		return indCR + 1, data[0:indCR], nil
-
-	case indLF >= 0:
-		// We have a full newline-terminated line.
-		return indLF + 1, data[0:indLF], nil
-
-	}
-
-	// If we're at EOF, we have a final, non-terminated line. Return it.
-	if atEOF {
-		return len(data), data, nil
-	}
-
-	// Request more data.
-	return 0, nil, nil
-}
-
-func scanLinesForSingleEol(data []byte, atEOF bool) (advance int, token []byte, err error) {
-	if atEOF && len(data) == 0 {
-		return 0, nil, nil
-	}
-
-	indCR := bytes.IndexByte(data, '\r')
-	indLF := bytes.IndexByte(data, '\n')
-
-	switch {
-
-	case indCR >= 0 && indLF >= 0:
-		if indCR < indLF {
-			// 0x0D ... 0x0A
-			return indCR + 2, data[0:indCR], nil
-		}
-		// 0x0A ... 0x0D
-		return indLF + 2, data[0:indLF], nil
-
-	case indCR >= 0:
-		// We have a full carriage return terminated line.
-		return indCR + 1, data[0:indCR], nil
-
-	case indLF >= 0:
-		// We have a full newline-terminated line.
-		return indLF + 1, data[0:indLF], nil
-
-	}
-
-	// If we're at EOF, we have a final, non-terminated line. Return it.
-	if atEOF {
-		return len(data), data, nil
-	}
-
-	// Request more data.
-	return 0, nil, nil
-}
-
 func newPositionedReader(rs io.ReadSeeker, offset *int64) (*bufio.Reader, error) {
 	if _, err := rs.Seek(*offset, io.SeekStart); err != nil {
 		return nil, err
@@ -1413,7 +1331,7 @@ func bypassXrefSection(ctx *model.Context, offExtra int64) error {
 	}
 
 	s := bufio.NewScanner(rd)
-	s.Split(scanLinesForSingleEol)
+	s.Split(scan.LinesForSingleEol)
 
 	bb := []byte{}
 	var (
@@ -1538,7 +1456,7 @@ func tryXRefSection(ctx *model.Context, rs io.ReadSeeker, offset *int64, offExtr
 	s := bufio.NewScanner(rd)
 	buf := make([]byte, 0, 4096)
 	s.Buffer(buf, 1024*1024)
-	s.Split(scanLines)
+	s.Split(scan.Lines)
 
 	line, err := scanLine(s)
 	if err != nil {
diff --git a/pkg/pdfcpu/scan/scan.go b/pkg/pdfcpu/scan/scan.go
new file mode 100644
index 00000000..24dc50d8
--- /dev/null
+++ b/pkg/pdfcpu/scan/scan.go
@@ -0,0 +1,102 @@
+/*
+Copyright 2023 The pdfcpu Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scan
+
+import "bytes"
+
+// Lines is a split function for a bufio.Scanner that returns each line of
+// text, stripped of its trailing end-of-line marker. The returned line may
+// be empty. An end-of-line marker is a carriage return immediately followed
+// by a newline (CRLF), a lone carriage return (CR) or a lone newline (LF).
+// The last non-empty line of input is returned even if it has no end-of-line marker.
+func Lines(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	if atEOF && len(data) == 0 {
+		return 0, nil, nil
+	}
+
+	indCR := bytes.IndexByte(data, '\r')
+	indLF := bytes.IndexByte(data, '\n')
+
+	switch {
+
+	case indCR >= 0 && indLF >= 0:
+		if indCR < indLF {
+			if indLF == indCR+1 {
+				// Adjacent CRLF (0x0D0A): consume both bytes as one marker.
+				return indLF + 1, data[0:indCR], nil
+			}
+			// CR occurs before a non-adjacent LF: the CR alone ends this line.
+			return indCR + 1, data[0:indCR], nil
+		}
+		// LF occurs before the first CR: the LF alone ends this line.
+		return indLF + 1, data[0:indLF], nil
+
+	case indCR >= 0:
+		// Only CR is present: we have a full carriage return terminated line.
+		return indCR + 1, data[0:indCR], nil
+
+	case indLF >= 0:
+		// Only LF is present: we have a full newline-terminated line.
+		return indLF + 1, data[0:indLF], nil
+
+	}
+
+	// No marker found. If we're at EOF, return the final, non-terminated line.
+	if atEOF {
+		return len(data), data, nil
+	}
+
+	// No marker found and not at EOF: request more data.
+	return 0, nil, nil
+}
+
+func LinesForSingleEol(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	if atEOF && len(data) == 0 {
+		return 0, nil, nil
+	}
+
+	indCR := bytes.IndexByte(data, '\r')
+	indLF := bytes.IndexByte(data, '\n')
+
+	switch {
+
+	case indCR >= 0 && indLF >= 0:
+		if indCR < indLF {
+			// CR comes first: skip CR plus the next byte, presumably the LF of a CRLF pair - TODO confirm.
+			return indCR + 2, data[0:indCR], nil
+		}
+		// LF comes first: skip LF plus the next byte, whatever it is - NOTE(review): confirm this is intended.
+		return indLF + 2, data[0:indLF], nil
+
+	case indCR >= 0:
+		// Only CR is present: we have a full carriage return terminated line.
+		return indCR + 1, data[0:indCR], nil
+
+	case indLF >= 0:
+		// Only LF is present: we have a full newline-terminated line.
+		return indLF + 1, data[0:indLF], nil
+
+	}
+
+	// No marker found. If we're at EOF, return the final, non-terminated line.
+	if atEOF {
+		return len(data), data, nil
+	}
+
+	// No marker found and not at EOF: request more data.
+	return 0, nil, nil
+}