From 99643282bb3e8e8c2da1a734f3740b3707aa74c3 Mon Sep 17 00:00:00 2001 From: Horst Rutter Date: Sun, 26 Nov 2023 16:21:01 +0100 Subject: [PATCH] Fix #727 --- cmd/pdfcpu/process.go | 13 +++-- pkg/api/validate.go | 8 +-- pkg/cli/cli.go | 4 +- pkg/pdfcpu/extract.go | 28 ++-------- pkg/pdfcpu/model/xreftable.go | 61 ++++++++++++-------- pkg/pdfcpu/read.go | 88 +---------------------------- pkg/pdfcpu/scan/scan.go | 102 ++++++++++++++++++++++++++++++++++ 7 files changed, 163 insertions(+), 141 deletions(-) create mode 100644 pkg/pdfcpu/scan/scan.go diff --git a/cmd/pdfcpu/process.go b/cmd/pdfcpu/process.go index 9010e9ea..db82c8e7 100644 --- a/cmd/pdfcpu/process.go +++ b/cmd/pdfcpu/process.go @@ -1833,18 +1833,21 @@ func processListImagesCommand(conf *model.Configuration) { } func processDumpCommand(conf *model.Configuration) { - s := "No dump for you!! One year!\n\n" + s := "No dump for you! - One year!\n\n" if len(flag.Args()) != 3 { fmt.Fprintln(os.Stderr, s) os.Exit(1) } - mode := flag.Arg(0) - hex := mode[0] == 'h' || mode[0] == 'H' - vals := []int{0, 0} - if hex { + + mode := strings.ToLower(flag.Arg(0)) + + switch mode[0] { + case 'a': vals[0] = 1 + case 'h': + vals[0] = 2 } objNr, err := strconv.Atoi(flag.Arg(1)) diff --git a/pkg/api/validate.go b/pkg/api/validate.go index a3dd9aa3..d530deb6 100644 --- a/pkg/api/validate.go +++ b/pkg/api/validate.go @@ -131,7 +131,7 @@ func ValidateFiles(inFiles []string, conf *model.Configuration) error { } // DumpObject writes an object from rs to stdout. -func DumpObject(rs io.ReadSeeker, objNr int, hex bool, conf *model.Configuration) error { +func DumpObject(rs io.ReadSeeker, mode, objNr int, conf *model.Configuration) error { if rs == nil { return errors.New("pdfcpu: DumpObject: missing rs") } @@ -154,13 +154,13 @@ func DumpObject(rs io.ReadSeeker, objNr int, hex bool, conf *model.Configuration return errors.Wrap(err, fmt.Sprintf("validation error (obj#:%d)%s", ctx.CurObj, s)) } - ctx.DumpStream(objNr, hex) + ctx.DumpObject(objNr, mode) return err } // DumpObjectFile writes an object from rs to stdout. -func DumpObjectFile(inFile string, objNr int, hex bool, conf *model.Configuration) error { +func DumpObjectFile(inFile string, mode, objNr int, conf *model.Configuration) error { if conf == nil { conf = model.NewDefaultConfiguration() } @@ -172,5 +172,5 @@ func DumpObjectFile(inFile string, objNr int, hex bool, conf *model.Configuratio defer f.Close() - return DumpObject(f, objNr, hex, conf) + return DumpObject(f, mode, objNr, conf) } diff --git a/pkg/cli/cli.go b/pkg/cli/cli.go index be815364..dd1ecba4 100644 --- a/pkg/cli/cli.go +++ b/pkg/cli/cli.go @@ -282,9 +282,9 @@ func ListImages(cmd *Command) ([]string, error) { // Dump known object to stdout. func Dump(cmd *Command) ([]string, error) { - hex := cmd.IntVals[0] == 1 + mode := cmd.IntVals[0] objNr := cmd.IntVals[1] - return nil, api.DumpObjectFile(*cmd.InFile, objNr, hex, cmd.Conf) + return nil, api.DumpObjectFile(*cmd.InFile, mode, objNr, cmd.Conf) } // Create renders page content corresponding to declarations found in inFileJSON and writes the result to outFile. diff --git a/pkg/pdfcpu/extract.go b/pkg/pdfcpu/extract.go index 6bcbe39d..1f5a2087 100644 --- a/pkg/pdfcpu/extract.go +++ b/pkg/pdfcpu/extract.go @@ -18,6 +18,7 @@ package pdfcpu import ( "bytes" + "fmt" "io" "strings" @@ -321,27 +322,6 @@ func img( resourceID, filters, lastFilter string, objNr int) (*model.Image, error) { - // "ImageMask" is a flag indicating whether the image shall be treated as an image mask. - // We do not extract imageMasks with the exception of CCITTDecoded images. - if imgMask { - // bpc = 1 - if lastFilter != filter.CCITTFax { - if log.InfoEnabled() { - log.Info.Printf("ExtractImage(%d): skip img with imageMask\n", objNr) - } - return nil, nil - } - } - - // An image XObject defining an image mask to be applied to this image, or an array specifying a range of colours to be applied to it as a colour key mask. - // Ignore if image has a Mask defined. - if sm, _ := sd.Find("Mask"); sm != nil { - if log.InfoEnabled() { - log.Info.Printf("ExtractImage(%d): skip image, unsupported \"Mask\"\n", objNr) - } - return nil, nil - } - // CCITTDecoded images / (bit) masks don't have a ColorSpace attribute, but we render image files. if lastFilter == filter.CCITTFax { if _, err := ctx.DereferenceDictEntry(sd.Dict, "ColorSpace"); err != nil { @@ -365,8 +345,12 @@ func img( } default: + msg := fmt.Sprintf("pdfcpu: ExtractImage(obj#%d): skipping img, filter %s unsupported", objNr, filters) if log.DebugEnabled() { - log.Debug.Printf("ExtractImage(%d): skip img, filter %s unsupported\n", objNr, filters) + log.Debug.Println(msg) + } + if log.CLIEnabled() { + log.CLI.Println(msg) } return nil, nil } diff --git a/pkg/pdfcpu/model/xreftable.go b/pkg/pdfcpu/model/xreftable.go index ffd96767..2e76a189 100644 --- a/pkg/pdfcpu/model/xreftable.go +++ b/pkg/pdfcpu/model/xreftable.go @@ -17,6 +17,7 @@ package model import ( + "bufio" "bytes" "encoding/hex" "fmt" @@ -29,6 +30,7 @@ import ( "github.com/pdfcpu/pdfcpu/pkg/filter" "github.com/pdfcpu/pdfcpu/pkg/log" + "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/scan" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types" "github.com/pkg/errors" ) @@ -1054,7 +1056,11 @@ func (xRefTable *XRefTable) sortedKeys() []int { return keys } -func (xRefTable *XRefTable) DumpStream(objNr int, hexOut bool) { +func (xRefTable *XRefTable) DumpObject(objNr, mode int) { + // mode + // 0 .. silent / obj only + // 1 .. ascii + // 2 .. hex entry := xRefTable.Table[objNr] if entry == nil || entry.Free || entry.Compressed || entry.Object == nil { fmt.Println(":(") @@ -1088,32 +1094,41 @@ func (xRefTable *XRefTable) DumpStream(objNr int, hexOut bool) { } } - sd, ok := entry.Object.(types.StreamDict) - if ok { + if mode > 0 { + sd, ok := entry.Object.(types.StreamDict) + if ok { - err := sd.Decode() - if err == filter.ErrUnsupportedFilter { - str += "stream filter unsupported!" - fmt.Println(str) - return - } - if err != nil { - str += "decoding problem encountered!" - fmt.Println(str) - return - } + err := sd.Decode() + if err == filter.ErrUnsupportedFilter { + str += "stream filter unsupported!" + fmt.Println(str) + return + } + if err != nil { + str += "decoding problem encountered!" + fmt.Println(str) + return + } - s := "decoded stream content (length = %d)\n%s\n" - if hexOut { - str += fmt.Sprintf(s, len(sd.Content), hex.Dump(sd.Content)) - } else { - str += fmt.Sprintf(s, len(sd.Content), sd.Content) + s := "decoded stream content (length = %d)\n%s\n" + s1 := "" + switch mode { + case 1: + sc := bufio.NewScanner(bytes.NewReader(sd.Content)) + sc.Split(scan.LinesForSingleEol) + for sc.Scan() { + s1 += sc.Text() + "\n" + } + str += fmt.Sprintf(s, len(sd.Content), s1) + case 2: + str += fmt.Sprintf(s, len(sd.Content), hex.Dump(sd.Content)) + } } - } - osd, ok := entry.Object.(types.ObjectStreamDict) - if ok { - str += fmt.Sprintf("object stream count:%d size of objectarray:%d\n", osd.ObjCount, len(osd.ObjArray)) + osd, ok := entry.Object.(types.ObjectStreamDict) + if ok { + str += fmt.Sprintf("object stream count:%d size of objectarray:%d\n", osd.ObjCount, len(osd.ObjArray)) + } } fmt.Println(str) diff --git a/pkg/pdfcpu/read.go b/pkg/pdfcpu/read.go index b6faf2ed..63f6b16c 100644 --- a/pkg/pdfcpu/read.go +++ b/pkg/pdfcpu/read.go @@ -28,6 +28,7 @@ import ( "github.com/pdfcpu/pdfcpu/pkg/filter" "github.com/pdfcpu/pdfcpu/pkg/log" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" + "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/scan" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types" "github.com/pkg/errors" ) @@ -124,89 +125,6 @@ func fillBuffer(r io.Reader, buf []byte) (int, error) { return n, err } -// ScanLines is a split function for a Scanner that returns each line of -// text, stripped of any trailing end-of-line marker. The returned line may -// be empty. The end-of-line marker is one carriage return followed -// by one newline or one carriage return or one newline. -// The last non-empty line of input will be returned even if it has no newline. -func scanLines(data []byte, atEOF bool) (advance int, token []byte, err error) { - if atEOF && len(data) == 0 { - return 0, nil, nil - } - - indCR := bytes.IndexByte(data, '\r') - indLF := bytes.IndexByte(data, '\n') - - switch { - - case indCR >= 0 && indLF >= 0: - if indCR < indLF { - if indLF == indCR+1 { - // 0x0D0A - return indLF + 1, data[0:indCR], nil - } - // 0x0D ... 0x0A - return indCR + 1, data[0:indCR], nil - } - // 0x0A ... 0x0D - return indLF + 1, data[0:indLF], nil - - case indCR >= 0: - // We have a full carriage return terminated line. - return indCR + 1, data[0:indCR], nil - - case indLF >= 0: - // We have a full newline-terminated line. - return indLF + 1, data[0:indLF], nil - - } - - // If we're at EOF, we have a final, non-terminated line. Return it. - if atEOF { - return len(data), data, nil - } - - // Request more data. - return 0, nil, nil -} - -func scanLinesForSingleEol(data []byte, atEOF bool) (advance int, token []byte, err error) { - if atEOF && len(data) == 0 { - return 0, nil, nil - } - - indCR := bytes.IndexByte(data, '\r') - indLF := bytes.IndexByte(data, '\n') - - switch { - - case indCR >= 0 && indLF >= 0: - if indCR < indLF { - // 0x0D ... 0x0A - return indCR + 2, data[0:indCR], nil - } - // 0x0A ... 0x0D - return indLF + 2, data[0:indLF], nil - - case indCR >= 0: - // We have a full carriage return terminated line. - return indCR + 1, data[0:indCR], nil - - case indLF >= 0: - // We have a full newline-terminated line. - return indLF + 1, data[0:indLF], nil - - } - - // If we're at EOF, we have a final, non-terminated line. Return it. - if atEOF { - return len(data), data, nil - } - - // Request more data. - return 0, nil, nil -} - func newPositionedReader(rs io.ReadSeeker, offset *int64) (*bufio.Reader, error) { if _, err := rs.Seek(*offset, io.SeekStart); err != nil { return nil, err @@ -1413,7 +1331,7 @@ func bypassXrefSection(ctx *model.Context, offExtra int64) error { } s := bufio.NewScanner(rd) - s.Split(scanLinesForSingleEol) + s.Split(scan.LinesForSingleEol) bb := []byte{} var ( @@ -1538,7 +1456,7 @@ func tryXRefSection(ctx *model.Context, rs io.ReadSeeker, offset *int64, offExtr s := bufio.NewScanner(rd) buf := make([]byte, 0, 4096) s.Buffer(buf, 1024*1024) - s.Split(scanLines) + s.Split(scan.Lines) line, err := scanLine(s) if err != nil { diff --git a/pkg/pdfcpu/scan/scan.go b/pkg/pdfcpu/scan/scan.go new file mode 100644 index 00000000..24dc50d8 --- /dev/null +++ b/pkg/pdfcpu/scan/scan.go @@ -0,0 +1,102 @@ +/* +Copyright 2023 The pdfcpu Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scan + +import "bytes" + +// Lines is a split function for a Scanner that returns each line of +// text, stripped of any trailing end-of-line marker. The returned line may +// be empty. The end-of-line marker is one carriage return followed +// by one newline or one carriage return or one newline. +// The last non-empty line of input will be returned even if it has no newline. +func Lines(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + indCR := bytes.IndexByte(data, '\r') + indLF := bytes.IndexByte(data, '\n') + + switch { + + case indCR >= 0 && indLF >= 0: + if indCR < indLF { + if indLF == indCR+1 { + // 0x0D0A + return indLF + 1, data[0:indCR], nil + } + // 0x0D ... 0x0A + return indCR + 1, data[0:indCR], nil + } + // 0x0A ... 0x0D + return indLF + 1, data[0:indLF], nil + + case indCR >= 0: + // We have a full carriage return terminated line. + return indCR + 1, data[0:indCR], nil + + case indLF >= 0: + // We have a full newline-terminated line. + return indLF + 1, data[0:indLF], nil + + } + + // If we're at EOF, we have a final, non-terminated line. Return it. + if atEOF { + return len(data), data, nil + } + + // Request more data. + return 0, nil, nil +} + +func LinesForSingleEol(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + indCR := bytes.IndexByte(data, '\r') + indLF := bytes.IndexByte(data, '\n') + + switch { + + case indCR >= 0 && indLF >= 0: + if indCR < indLF { + // 0x0D ... 0x0A + return indCR + 2, data[0:indCR], nil + } + // 0x0A ... 0x0D + return indLF + 2, data[0:indLF], nil + + case indCR >= 0: + // We have a full carriage return terminated line. + return indCR + 1, data[0:indCR], nil + + case indLF >= 0: + // We have a full newline-terminated line. + return indLF + 1, data[0:indLF], nil + + } + + // If we're at EOF, we have a final, non-terminated line. Return it. + if atEOF { + return len(data), data, nil + } + + // Request more data. + return 0, nil, nil +}