diff --git a/pkg/pdfcpu/model/parse.go b/pkg/pdfcpu/model/parse.go index 8aac6c85..e4651b92 100644 --- a/pkg/pdfcpu/model/parse.go +++ b/pkg/pdfcpu/model/parse.go @@ -527,78 +527,94 @@ func parseName(line *string) (*types.Name, error) { return &nameObj, nil } +// insertKey stores val under key in d unless the key is already present (the first value wins). +// It returns true once hex-coded keys (keys containing '#') are in play, so callers can pass the flag back in; +// until then a plain map lookup avoids the expensive "DecodeName". The returned error is currently always nil. +func insertKey(d types.Dict, key string, val types.Object, usesHexCodes bool) (bool, error) { + var duplicateKeyErr bool + + if !usesHexCodes { + if strings.IndexByte(key, '#') < 0 { + // Avoid expensive "DecodeName". + if _, found := d[key]; !found { + d[key] = val + } else { + duplicateKeyErr = true + } + } else { + // Insert reports true on success, so a duplicate is its negation. + duplicateKeyErr = !d.Insert(key, val) + usesHexCodes = true + } + } else { + duplicateKeyErr = !d.Insert(key, val) + } + + if duplicateKeyErr { + // for now we digest duplicate keys. + // TODO + // if !validationRelaxed { + // return false, errDictionaryDuplicateKey + // } + // if log.CLIEnabled() { + // log.CLI.Printf("ParseDict: digesting duplicate key\n") + // } + _ = duplicateKeyErr + } + + if log.ParseEnabled() { + log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, val) + } + + return usesHexCodes, nil +} + func processDictKeys(line *string, relaxed bool) (types.Dict, error) { l := *line var eol bool - var hasNames bool + var usesHexCodes bool d := types.NewDict() for !strings.HasPrefix(l, ">>") { - key, err := parseName(&l) + keyName, err := parseName(&l) if err != nil { return nil, err } if log.ParseEnabled() { - log.Parse.Printf("ParseDict: key = %s\n", key) + log.Parse.Printf("ParseDict: key = %s\n", keyName) } - // position to first non whitespace after key + // Position to first non whitespace after key. l, eol = trimLeftSpace(l, relaxed) if len(l) == 0 { if log.ParseEnabled() { log.Parse.Println("ParseDict: only whitespace after key") } - // only whitespace after key + // Only whitespace after key. return nil, errDictionaryNotTerminated } - // Fix for #252: - // For dicts with kv pairs terminated by eol we accept a missing value as an empty string. 
+ var val types.Object + if eol { - obj := types.StringLiteral("") - if log.ParseEnabled() { - log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, obj) - } - stringKey := string(*key) - if !hasNames { - // Avoid expensive "DecodeName" on existing keys in "Insert". - if _, found := d[stringKey]; found { - return nil, errDictionaryDuplicateKey - } - d[stringKey] = obj - hasNames = strings.IndexByte(stringKey, '#') >= 0 - } else { - if ok := d.Insert(stringKey, obj); !ok { - return nil, errDictionaryDuplicateKey - } + // #252: For dicts with kv pairs terminated by eol we accept a missing value as an empty string. + val = types.StringLiteral("") + } else { + if val, err = ParseObject(&l); err != nil { + return nil, err } - continue - } - - obj, err := ParseObject(&l) - if err != nil { - return nil, err } // Specifying the null object as the value of a dictionary entry (7.3.7, "Dictionary Objects") - // hall be equivalent to omitting the entry entirely. - if obj != nil { - stringKey := string(*key) - if !hasNames { - // Avoid expensive "DecodeName" on existing keys in "Insert". - if _, found := d[stringKey]; !found { - d[stringKey] = obj - hasNames = strings.IndexByte(stringKey, '#') >= 0 - } - } else { - d.Insert(stringKey, obj) + // shall be equivalent to omitting the entry entirely. + if val != nil { + detectedHexCodes, err := insertKey(d, string(*keyName), val, usesHexCodes) + if err != nil { + return nil, err } - if log.ParseEnabled() { - log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, obj) + if !usesHexCodes && detectedHexCodes { + usesHexCodes = true } - // if ok := d.Insert(string(*key), obj); !ok { - // return nil, errDictionaryDuplicateKey - // } } // We are positioned on the char behind the last parsed dict value. 
diff --git a/pkg/pdfcpu/model/parse_dict_test.go b/pkg/pdfcpu/model/parse_dict_test.go index 86d79edd..fc713014 100644 --- a/pkg/pdfcpu/model/parse_dict_test.go +++ b/pkg/pdfcpu/model/parse_dict_test.go @@ -148,8 +148,6 @@ func doTestParseDictWithComments(t *testing.T) { } func doTestLargeDicts(t *testing.T) { - // Make sure parsing large dictionaries is fast. Found a file in the wild - // that has two dictionaries with about 200.000 entries each. var sb strings.Builder sb.WriteString("<<") for i := 0; i < 50000; i++ { diff --git a/pkg/pdfcpu/read.go b/pkg/pdfcpu/read.go index 42ee526f..f0914d4e 100644 --- a/pkg/pdfcpu/read.go +++ b/pkg/pdfcpu/read.go @@ -1361,7 +1361,10 @@ func showRep() { // bypassXrefSection is a fix for digesting corrupt xref sections. // It populates the xRefTable by reading in all indirect objects line by line // and works on the assumption of a single xref section - meaning no incremental updates. -func bypassXrefSection(ctx *model.Context, offExtra int64) error { +func bypassXrefSection(ctx *model.Context, offExtra int64, wasErr error) error { + if log.ReadEnabled() { + log.Read.Printf("bypassXRefSection after %v\n", wasErr) + } var z int64 g := types.FreeHeadGeneration ctx.Table[0] = &model.XRefTableEntry{ @@ -1561,11 +1564,12 @@ func buildXRefTableStartingAt(ctx *model.Context, offset *int64) error { return err } if offset, err = parseXRefStream(ctx, rd, offset, offExtra); err != nil { - if log.ReadEnabled() { - log.Read.Printf("bypassXRefSection after %v\n", err) + if ctx.XRefTable.ValidationMode == model.ValidationRelaxed { + // Try fix for corrupt single xref section. + return bypassXrefSection(ctx, offExtra, err) } - // Try fix for corrupt single xref section. 
- return bypassXrefSection(ctx, offExtra) + // Strict validation: report the parse error instead of dropping it. + return err } } diff --git a/pkg/pdfcpu/types/dict.go b/pkg/pdfcpu/types/dict.go index 10a168a3..c444f0b0 100644 --- a/pkg/pdfcpu/types/dict.go +++ b/pkg/pdfcpu/types/dict.go @@ -52,13 +52,12 @@ func (d Dict) Clone() Object { } // Insert adds a new entry to this PDFDict. -func (d Dict) Insert(key string, value Object) (ok bool) { - _, found := d.Find(key) - if !found { - d[key] = value - ok = true +func (d Dict) Insert(k string, v Object) bool { + if _, found := d.Find(k); !found { + d[k] = v + return true } - return ok + return false } // InsertBool adds a new bool entry to this PDFDict. diff --git a/pkg/pdfcpu/validate/colorspace.go b/pkg/pdfcpu/validate/colorspace.go index d849b7d3..71c8c1ac 100644 --- a/pkg/pdfcpu/validate/colorspace.go +++ b/pkg/pdfcpu/validate/colorspace.go @@ -597,8 +597,10 @@ func validateColorSpaceArray(xRefTable *model.XRefTable, a types.Array, excludeP case model.DeviceNCS: err = validateDeviceNColorSpace(xRefTable, a, model.V13) - // Relaxed validation: - case model.DeviceRGBCS: + case model.DeviceGrayCS, model.DeviceRGBCS, model.DeviceCMYKCS: + if xRefTable.ValidationMode != model.ValidationRelaxed { + err = errors.Errorf("pdfcpu: validateColorSpaceArray: undefined color space: %s\n", name) + } default: err = errors.Errorf("pdfcpu: validateColorSpaceArray: undefined color space: %s\n", name)