Fix #775, #490

pdfcpu · Jan 27, 2024 · 043541b · 043541b
1 parent 04634d3
commit 043541b
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 60 deletions.
diff --git a/pkg/pdfcpu/model/parse.go b/pkg/pdfcpu/model/parse.go
@@ -527,78 +527,90 @@ func parseName(line *string) (*types.Name, error) {
 	return &nameObj, nil
 }
 
+// insertKey adds val to d under key unless an entry for key already exists.
+//
+// usesHexCodes tells whether a key containing '#' has already been seen for
+// this dict. As long as that is not the case and key itself contains no '#',
+// a plain map lookup replaces the more expensive lookup done by d.Insert.
+// The returned bool is usesHexCodes, switched to true once key contains '#'.
+//
+// Duplicate keys are digested for now (the existing entry is kept), so the
+// returned error is currently always nil.
+func insertKey(d types.Dict, key string, val types.Object, usesHexCodes bool) (bool, error) {
+	var duplicateKey bool
+
+	if !usesHexCodes && strings.IndexByte(key, '#') < 0 {
+		// Avoid expensive "DecodeName".
+		if _, found := d[key]; found {
+			duplicateKey = true
+		} else {
+			d[key] = val
+		}
+	} else {
+		// d.Insert returns true when the entry was added, so a duplicate
+		// is signalled by a false result.
+		duplicateKey = !d.Insert(key, val)
+		usesHexCodes = true
+	}
+
+	if duplicateKey {
+		// For now we digest duplicate keys.
+		// TODO
+		// if !validationRelaxed {
+		// 	return false, errDictionaryDuplicateKey
+		// }
+		// if log.CLIEnabled() {
+		// 	log.CLI.Printf("ParseDict: digesting duplicate key\n")
+		// }
+		_ = duplicateKey
+	}
+
+	if log.ParseEnabled() {
+		log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, val)
+	}
+
+	return usesHexCodes, nil
+}
+
 func processDictKeys(line *string, relaxed bool) (types.Dict, error) {
 	l := *line
 	var eol bool
-	var hasNames bool
+	var usesHexCodes bool
 	d := types.NewDict()
 	for !strings.HasPrefix(l, ">>") {
-		key, err := parseName(&l)
+		keyName, err := parseName(&l)
 		if err != nil {
 			return nil, err
 		}
 		if log.ParseEnabled() {
-			log.Parse.Printf("ParseDict: key = %s\n", key)
+			log.Parse.Printf("ParseDict: key = %s\n", keyName)
 		}
 
-		// position to first non whitespace after key
+		// Position to first non whitespace after key.
 		l, eol = trimLeftSpace(l, relaxed)
 
 		if len(l) == 0 {
 			if log.ParseEnabled() {
 				log.Parse.Println("ParseDict: only whitespace after key")
 			}
-			// only whitespace after key
+			// Only whitespace after key.
 			return nil, errDictionaryNotTerminated
 		}
 
-		// Fix for #252:
-		// For dicts with kv pairs terminated by eol we accept a missing value as an empty string.
+		var val types.Object
+
 		if eol {
-			obj := types.StringLiteral("")
-			if log.ParseEnabled() {
-				log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, obj)
-			}
-			stringKey := string(*key)
-			if !hasNames {
-				// Avoid expensive "DecodeName" on existing keys in "Insert".
-				if _, found := d[stringKey]; found {
-					return nil, errDictionaryDuplicateKey
-				}
-				d[stringKey] = obj
-				hasNames = strings.IndexByte(stringKey, '#') >= 0
-			} else {
-				if ok := d.Insert(stringKey, obj); !ok {
-					return nil, errDictionaryDuplicateKey
-				}
+			// #252: For dicts with kv pairs terminated by eol we accept a missing value as an empty string.
+			val = types.StringLiteral("")
+		} else {
+			if val, err = ParseObject(&l); err != nil {
+				return nil, err
 			}
-			continue
-		}
-
-		obj, err := ParseObject(&l)
-		if err != nil {
-			return nil, err
 		}
 
 		// Specifying the null object as the value of a dictionary entry (7.3.7, "Dictionary Objects")
-		// hall be equivalent to omitting the entry entirely.
-		if obj != nil {
-			stringKey := string(*key)
-			if !hasNames {
-				// Avoid expensive "DecodeName" on existing keys in "Insert".
-				if _, found := d[stringKey]; !found {
-					d[stringKey] = obj
-					hasNames = strings.IndexByte(stringKey, '#') >= 0
-				}
-			} else {
-				d.Insert(stringKey, obj)
+		// shall be equivalent to omitting the entry entirely.
+		if val != nil {
+			detectedHexCodes, err := insertKey(d, string(*keyName), val, usesHexCodes)
+			if err != nil {
+				return nil, err
 			}
-			if log.ParseEnabled() {
-				log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, obj)
+			if !usesHexCodes && detectedHexCodes {
+				usesHexCodes = true
 			}
-			// if ok := d.Insert(string(*key), obj); !ok {
-			// 	return nil, errDictionaryDuplicateKey
-			// }
 		}
 
 		// We are positioned on the char behind the last parsed dict value.

diff --git a/pkg/pdfcpu/model/parse_dict_test.go b/pkg/pdfcpu/model/parse_dict_test.go
@@ -148,8 +148,6 @@ func doTestParseDictWithComments(t *testing.T) {
 }
 
 func doTestLargeDicts(t *testing.T) {
-	// Make sure parsing large dictionaries is fast. Found a file in the wild
-	// that has two dictionaries with about 200.000 entries each.
 	var sb strings.Builder
 	sb.WriteString("<<")
 	for i := 0; i < 50000; i++ {

diff --git a/pkg/pdfcpu/read.go b/pkg/pdfcpu/read.go
@@ -1361,7 +1361,10 @@ func showRep() {
 // bypassXrefSection is a fix for digesting corrupt xref sections.
 // It populates the xRefTable by reading in all indirect objects line by line
 // and works on the assumption of a single xref section - meaning no incremental updates.
-func bypassXrefSection(ctx *model.Context, offExtra int64) error {
+func bypassXrefSection(ctx *model.Context, offExtra int64, wasErr error) error {
+	if log.ReadEnabled() {
+		log.Read.Printf("bypassXRefSection after %v\n", wasErr)
+	}
 	var z int64
 	g := types.FreeHeadGeneration
 	ctx.Table[0] = &model.XRefTableEntry{
@@ -1561,11 +1564,10 @@ func buildXRefTableStartingAt(ctx *model.Context, offset *int64) error {
 			return err
 		}
 		if offset, err = parseXRefStream(ctx, rd, offset, offExtra); err != nil {
-			if log.ReadEnabled() {
-				log.Read.Printf("bypassXRefSection after %v\n", err)
+			if ctx.XRefTable.ValidationMode == model.ValidationRelaxed {
+				// Try fix for corrupt single xref section.
+				return bypassXrefSection(ctx, offExtra, err)
 			}
-			// Try fix for corrupt single xref section.
-			return bypassXrefSection(ctx, offExtra)
 		}
 
 	}

diff --git a/pkg/pdfcpu/types/dict.go b/pkg/pdfcpu/types/dict.go
@@ -52,13 +52,12 @@ func (d Dict) Clone() Object {
 }
 
 // Insert adds v under key k unless d.Find already reports an entry for k.
 // It returns true if the entry was added and false if k is already present,
 // in which case the existing entry is left untouched.
-func (d Dict) Insert(key string, value Object) (ok bool) {
-	_, found := d.Find(key)
-	if !found {
-		d[key] = value
-		ok = true
+func (d Dict) Insert(k string, v Object) bool {
+	if _, found := d.Find(k); !found {
+		d[k] = v
+		return true
 	}
-	return ok
+	return false
 }
 
 // InsertBool adds a new bool entry to this PDFDict.

diff --git a/pkg/pdfcpu/validate/colorspace.go b/pkg/pdfcpu/validate/colorspace.go
@@ -597,8 +597,10 @@ func validateColorSpaceArray(xRefTable *model.XRefTable, a types.Array, excludeP
 	case model.DeviceNCS:
 		err = validateDeviceNColorSpace(xRefTable, a, model.V13)
 
-	// Relaxed validation:
-	case model.DeviceRGBCS:
+	case model.DeviceGrayCS, model.DeviceRGBCS, model.DeviceCMYKCS:
+		if xRefTable.ValidationMode != model.ValidationRelaxed {
+			err = errors.Errorf("pdfcpu: validateColorSpaceArray: undefined color space: %s\n", name)
+		}
 
 	default:
 		err = errors.Errorf("pdfcpu: validateColorSpaceArray: undefined color space: %s\n", name)