From 3d4cbdbdfac6a8cb89beea5f7373b5d483fede45 Mon Sep 17 00:00:00 2001
From: Joachim Bauch <bauch@struktur.de>
Date: Mon, 29 Jan 2024 15:21:33 +0100
Subject: [PATCH] Further improve parsing of dictionaries / names.

With this change, the names are decoded internally, so they be can compared
directly when adding entries to dictionaries. On writing, the names are
encoded if necessary.

Also removed some duplicate code for name encoding / decoding.
---
 pkg/pdfcpu/model/parse.go           |  68 ++++--------------
 pkg/pdfcpu/model/parse_dict_test.go |   1 +
 pkg/pdfcpu/model/parse_test.go      |  68 ++++++++++++++++++
 pkg/pdfcpu/types/dict.go            |  28 ++++----
 pkg/pdfcpu/types/dict_test.go       |  32 +++++++++
 pkg/pdfcpu/types/string.go          | 105 +++++++++++++++++++++++-----
 pkg/pdfcpu/types/string_test.go     |  24 +++++++
 pkg/pdfcpu/types/types.go           |  25 +------
 8 files changed, 242 insertions(+), 109 deletions(-)
 create mode 100644 pkg/pdfcpu/model/parse_test.go
 create mode 100644 pkg/pdfcpu/types/dict_test.go

diff --git a/pkg/pdfcpu/model/parse.go b/pkg/pdfcpu/model/parse.go
index 5e5df743..bb07b91e 100644
--- a/pkg/pdfcpu/model/parse.go
+++ b/pkg/pdfcpu/model/parse.go
@@ -18,7 +18,6 @@ package model
 
 import (
 	"context"
-	"encoding/hex"
 	"strconv"
 	"strings"
 	"unicode"
@@ -461,30 +460,13 @@ func parseHexLiteral(line *string) (types.Object, error) {
 	return types.HexLiteral(*hexStr), nil
 }
 
-func validateNameHexSequence(s string) error {
-	for i := 0; i < len(s); {
-		c := s[i]
-		if c != '#' {
-			i++
-			continue
-		}
-
-		// # detected, next 2 chars have to exist.
-		if len(s) < i+3 {
-			return errNameObjectCorrupt
-		}
-
-		s1 := s[i+1 : i+3]
-
-		// And they have to be hex characters.
-		_, err := hex.DecodeString(s1)
-		if err != nil {
-			return errNameObjectCorrupt
-		}
-
-		i += 3
+func decodeNameHexSequence(s string) (string, error) {
+	decoded, err := types.DecodeName(s)
+	if err != nil {
+		return "", errNameObjectCorrupt
 	}
-	return nil
+
+	return decoded, nil
 }
 
 func parseName(line *string) (*types.Name, error) {
@@ -518,8 +500,8 @@ func parseName(line *string) (*types.Name, error) {
 		l = l[:eok]
 	}
 
-	// Validate optional #xx sequences
-	err := validateNameHexSequence(l)
+	// Decode optional #xx sequences
+	l, err := decodeNameHexSequence(l)
 	if err != nil {
 		return nil, err
 	}
@@ -528,48 +510,30 @@ func parseName(line *string) (*types.Name, error) {
 	return &nameObj, nil
 }
 
-func insertKey(d types.Dict, key string, val types.Object, usesHexCodes bool) (bool, error) {
-	var duplicateKeyErr bool
-
-	if !usesHexCodes {
-		if strings.IndexByte(key, '#') < 0 {
-			// Avoid expensive "DecodeName".
-			if _, found := d[key]; !found {
-				d[key] = val
-			} else {
-				duplicateKeyErr = true
-			}
-		} else {
-			duplicateKeyErr = d.Insert(key, val)
-			usesHexCodes = true
-		}
+func insertKey(d types.Dict, key string, val types.Object) error {
+	if _, found := d[key]; !found {
+		d[key] = val
 	} else {
-		duplicateKeyErr = d.Insert(key, val)
-	}
-
-	if duplicateKeyErr {
 		// for now we digest duplicate keys.
 		// TODO
 		// if !validationRelaxed {
-		// 	return false, errDictionaryDuplicateKey
+		// 	return errDictionaryDuplicateKey
 		// }
 		// if log.CLIEnabled() {
 		// 	log.CLI.Printf("ParseDict: digesting duplicate key\n")
 		// }
-		_ = duplicateKeyErr
 	}
 
 	if log.ParseEnabled() {
 		log.Parse.Printf("ParseDict: dict[%s]=%v\n", key, val)
 	}
 
-	return usesHexCodes, nil
+	return nil
 }
 
 func processDictKeys(c context.Context, line *string, relaxed bool) (types.Dict, error) {
 	l := *line
 	var eol bool
-	var usesHexCodes bool
 	d := types.NewDict()
 
 	for !strings.HasPrefix(l, ">>") {
@@ -612,13 +576,9 @@ func processDictKeys(c context.Context, line *string, relaxed bool) (types.Dict,
 		// Specifying the null object as the value of a dictionary entry (7.3.7, "Dictionary Objects")
 		// shall be equivalent to omitting the entry entirely.
 		if val != nil {
-			detectedHexCodes, err := insertKey(d, string(*keyName), val, usesHexCodes)
-			if err != nil {
+			if err := insertKey(d, string(*keyName), val); err != nil {
 				return nil, err
 			}
-			if !usesHexCodes && detectedHexCodes {
-				usesHexCodes = true
-			}
 		}
 
 		// We are positioned on the char behind the last parsed dict value.
diff --git a/pkg/pdfcpu/model/parse_dict_test.go b/pkg/pdfcpu/model/parse_dict_test.go
index fc713014..b6004f76 100644
--- a/pkg/pdfcpu/model/parse_dict_test.go
+++ b/pkg/pdfcpu/model/parse_dict_test.go
@@ -150,6 +150,7 @@ func doTestParseDictWithComments(t *testing.T) {
 func doTestLargeDicts(t *testing.T) {
 	var sb strings.Builder
 	sb.WriteString("<<")
+	sb.WriteString("/Key#28#29 (Value)")
 	for i := 0; i < 50000; i++ {
 		sb.WriteString(fmt.Sprintf("/Key%d (Value)", i))
 	}
diff --git a/pkg/pdfcpu/model/parse_test.go b/pkg/pdfcpu/model/parse_test.go
new file mode 100644
index 00000000..ddd191ec
--- /dev/null
+++ b/pkg/pdfcpu/model/parse_test.go
@@ -0,0 +1,68 @@
+/*
+Copyright 2024 The pdfcpu Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package model
+
+import (
+	"testing"
+)
+
+func TestDecodeNameHexInvalid(t *testing.T) {
+	testcases := []string{
+		"#",
+		"#A",
+		"#a",
+		"#G0",
+		"#00",
+		"Fo\x00",
+	}
+	for _, tc := range testcases {
+		if decoded, err := decodeNameHexSequence(tc); err == nil {
+			t.Errorf("expected error decoding %s, got %s", tc, decoded)
+		}
+	}
+}
+
+func TestDecodeNameHexValid(t *testing.T) {
+	testcases := []struct {
+		Input    string
+		Expected string
+	}{
+		{"", ""},
+		{"Foo", "Foo"},
+		{"A#23", "A#"},
+		// Examples from "7.3.5 Name Objects"
+		{"Name1", "Name1"},
+		{"ASomewhatLongerName", "ASomewhatLongerName"},
+		{"A;Name_With-Various***Characters?", "A;Name_With-Various***Characters?"},
+		{"1.2", "1.2"},
+		{"$$", "$$"},
+		{"@pattern", "@pattern"},
+		{".notdef", ".notdef"},
+		{"Lime#20Green", "Lime Green"},
+		{"paired#28#29parentheses", "paired()parentheses"},
+		{"The_Key_of_F#23_Minor", "The_Key_of_F#_Minor"},
+		{"A#42", "AB"},
+	}
+	for _, tc := range testcases {
+		decoded, err := decodeNameHexSequence(tc.Input)
+		if err != nil {
+			t.Errorf("decoding %s failed: %s", tc.Input, err)
+		} else if decoded != tc.Expected {
+			t.Errorf("expected %s when decoding %s, got %s", tc.Expected, tc.Input, decoded)
+		}
+	}
+}
diff --git a/pkg/pdfcpu/types/dict.go b/pkg/pdfcpu/types/dict.go
index bb727e62..40734152 100644
--- a/pkg/pdfcpu/types/dict.go
+++ b/pkg/pdfcpu/types/dict.go
@@ -440,17 +440,16 @@ func (d Dict) indentedString(level int) string {
 	for _, k := range keys {
 
 		v := d[k]
-		key, _ := DecodeName(k)
 
 		if subdict, ok := v.(Dict); ok {
 			dictStr := subdict.indentedString(level + 1)
-			logstr = append(logstr, fmt.Sprintf("%s<%s, %s>\n", tabstr, key, dictStr))
+			logstr = append(logstr, fmt.Sprintf("%s<%s, %s>\n", tabstr, k, dictStr))
 			continue
 		}
 
 		if a, ok := v.(Array); ok {
 			arrStr := a.indentedString(level + 1)
-			logstr = append(logstr, fmt.Sprintf("%s<%s, %s>\n", tabstr, key, arrStr))
+			logstr = append(logstr, fmt.Sprintf("%s<%s, %s>\n", tabstr, k, arrStr))
 			continue
 		}
 
@@ -462,7 +461,7 @@ func (d Dict) indentedString(level int) string {
 			}
 		}
 
-		logstr = append(logstr, fmt.Sprintf("%s<%s, %v>\n", tabstr, key, val))
+		logstr = append(logstr, fmt.Sprintf("%s<%s, %v>\n", tabstr, k, val))
 
 	}
 
@@ -486,63 +485,64 @@ func (d Dict) PDFString() string {
 	for _, k := range keys {
 
 		v := d[k]
+		keyName := EncodeName(k)
 
 		if v == nil {
-			logstr = append(logstr, fmt.Sprintf("/%s null", k))
+			logstr = append(logstr, fmt.Sprintf("/%s null", keyName))
 			continue
 		}
 
 		d, ok := v.(Dict)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s%s", k, d.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s%s", keyName, d.PDFString()))
 			continue
 		}
 
 		a, ok := v.(Array)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s%s", k, a.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s%s", keyName, a.PDFString()))
 			continue
 		}
 
 		ir, ok := v.(IndirectRef)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s %s", k, ir.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s %s", keyName, ir.PDFString()))
 			continue
 		}
 
 		n, ok := v.(Name)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s%s", k, n.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s%s", keyName, n.PDFString()))
 			continue
 		}
 
 		i, ok := v.(Integer)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s %s", k, i.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s %s", keyName, i.PDFString()))
 			continue
 		}
 
 		f, ok := v.(Float)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s %s", k, f.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s %s", keyName, f.PDFString()))
 			continue
 		}
 
 		b, ok := v.(Boolean)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s %s", k, b.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s %s", keyName, b.PDFString()))
 			continue
 		}
 
 		sl, ok := v.(StringLiteral)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s%s", k, sl.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s%s", keyName, sl.PDFString()))
 			continue
 		}
 
 		hl, ok := v.(HexLiteral)
 		if ok {
-			logstr = append(logstr, fmt.Sprintf("/%s%s", k, hl.PDFString()))
+			logstr = append(logstr, fmt.Sprintf("/%s%s", keyName, hl.PDFString()))
 			continue
 		}
 
diff --git a/pkg/pdfcpu/types/dict_test.go b/pkg/pdfcpu/types/dict_test.go
new file mode 100644
index 00000000..e28816b4
--- /dev/null
+++ b/pkg/pdfcpu/types/dict_test.go
@@ -0,0 +1,32 @@
+/*
+Copyright 2024 The pdfcpu Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package types
+
+import (
+	"testing"
+)
+
+func TestEncodeDict(t *testing.T) {
+	dict := Dict{
+		"A()": Integer(1),
+	}
+	expected := `<</A#28#29 1>>`
+	s := dict.PDFString()
+	if s != expected {
+		t.Errorf("expected %s for %+v, got %s", expected, dict, s)
+	}
+}
diff --git a/pkg/pdfcpu/types/string.go b/pkg/pdfcpu/types/string.go
index 439078d5..1fddf8d1 100644
--- a/pkg/pdfcpu/types/string.go
+++ b/pkg/pdfcpu/types/string.go
@@ -19,7 +19,6 @@ package types
 import (
 	"bytes"
 	"encoding/hex"
-	"fmt"
 	"strconv"
 	"strings"
 	"unicode/utf8"
@@ -226,33 +225,103 @@ func Reverse(s string) string {
 	return string(outRunes)
 }
 
+// needsHexSequence checks if a given character must be hex-encoded.
+// See "7.3.5 Name Objects" for details.
+func needsHexSequence(ch byte) bool {
+	switch ch {
+	// Delimiter characters (see "7.2.2 Character Set")
+	case '(':
+		fallthrough
+	case ')':
+		fallthrough
+	case '<':
+		fallthrough
+	case '>':
+		fallthrough
+	case '[':
+		fallthrough
+	case ']':
+		fallthrough
+	case '{':
+		fallthrough
+	case '}':
+		fallthrough
+	case '/':
+		fallthrough
+	case '%':
+		return true
+	case '#':
+		return true
+	default:
+		return ch < '!' || ch > '~'
+	}
+}
+
 // EncodeName applies name encoding according to PDF spec.
 func EncodeName(s string) string {
-	bb := []byte{}
+	replaced := false
+	var sb strings.Builder // will be used only if replacements are necessary
 	for i := 0; i < len(s); i++ {
-		bb = append(bb, []byte(fmt.Sprintf("#%x", s[i]))...)
+		ch := s[i]
+		// TODO: This should handle the invalid character 0x00.
+		if needsHexSequence(ch) {
+			if !replaced {
+				sb.WriteString(s[:i])
+			}
+			sb.WriteByte('#')
+			sb.WriteString(hex.EncodeToString([]byte{ch}))
+			replaced = true
+		} else if replaced {
+			sb.WriteByte(ch)
+		}
 	}
-	return string(bb)
+	if !replaced {
+		return s
+	}
+	return sb.String()
 }
 
 // DecodeName applies name decoding according to PDF spec.
 func DecodeName(s string) (string, error) {
-	if len(s) == 0 || strings.IndexByte(s, '#') < 0 {
-		return s, nil
-	}
-	var bb []byte
-	for i := 0; i < len(s); {
-		if s[i] == '#' {
-			hb, err := hex.DecodeString(s[i+1 : i+3])
-			if err != nil {
-				return "", err
+	replaced := false
+	var sb strings.Builder // will be used only if replacements are necessary
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if c == 0 {
+			return "", errors.New("a name may not contain a null byte")
+		} else if c != '#' {
+			if replaced {
+				sb.WriteByte(c)
 			}
-			bb = append(bb, hb...)
-			i += 3
 			continue
 		}
-		bb = append(bb, s[i])
-		i++
+
+		// # detected, next 2 chars have to exist.
+		if len(s) < i+3 {
+			return "", errors.New("not enough characters after #")
+		}
+
+		s1 := s[i+1 : i+3]
+
+		// And they have to be hex characters.
+		decoded, err := hex.DecodeString(s1)
+		if err != nil {
+			return "", err
+		}
+
+		if decoded[0] == 0 {
+			return "", errors.New("a name may not contain a null byte")
+		}
+
+		if !replaced {
+			sb.WriteString(s[:i])
+			replaced = true
+		}
+		sb.Write(decoded)
+		i += 2
+	}
+	if !replaced {
+		return s, nil
 	}
-	return string(bb), nil
+	return sb.String(), nil
 }
diff --git a/pkg/pdfcpu/types/string_test.go b/pkg/pdfcpu/types/string_test.go
index 952635fa..2fe19c32 100644
--- a/pkg/pdfcpu/types/string_test.go
+++ b/pkg/pdfcpu/types/string_test.go
@@ -174,3 +174,27 @@ func TestDecodeName(t *testing.T) {
 		})
 	}
 }
+
+func TestEncodeName(t *testing.T) {
+	testcases := []struct {
+		Input    string
+		Expected string
+	}{
+		{"Foo", "Foo"},
+		{"A#", "A#23"},
+		{"F#o", "F#23o"},
+		{"A;Name_With-Various***Characters?", "A;Name_With-Various***Characters?"},
+		{"1.2", "1.2"},
+		{"$$", "$$"},
+		{"@pattern", "@pattern"},
+		{".notdef", ".notdef"},
+		{"Lime Green", "Lime#20Green"},
+		{"paired()parentheses", "paired#28#29parentheses"},
+		{"The_Key_of_F#_Minor", "The_Key_of_F#23_Minor"},
+	}
+	for _, tc := range testcases {
+		if encoded := EncodeName(tc.Input); encoded != tc.Expected {
+			t.Errorf("expected %s for %s, got %s", tc.Expected, tc.Input, encoded)
+		}
+	}
+}
diff --git a/pkg/pdfcpu/types/types.go b/pkg/pdfcpu/types/types.go
index 96a6d1a3..961a9945 100644
--- a/pkg/pdfcpu/types/types.go
+++ b/pkg/pdfcpu/types/types.go
@@ -17,7 +17,6 @@ limitations under the License.
 package types
 
 import (
-	"bytes"
 	"encoding/hex"
 	"fmt"
 	"strconv"
@@ -388,34 +387,14 @@ func (nameObject Name) String() string {
 func (nameObject Name) PDFString() string {
 	s := " "
 	if len(nameObject) > 0 {
-		s = string(nameObject)
+		s = EncodeName(string(nameObject))
 	}
 	return fmt.Sprintf("/%s", s)
 }
 
 // Value returns a string value for this PDF object.
 func (nameObject Name) Value() string {
-
-	s := string(nameObject)
-	var b bytes.Buffer
-
-	for i := 0; i < len(s); {
-		c := s[i]
-		if c != '#' {
-			b.WriteByte(c)
-			i++
-			continue
-		}
-
-		// # detected, next 2 chars have to exist.
-		// This gets checked during parsing.
-		s1 := s[i+1 : i+3]
-		b1, _ := hex.DecodeString(s1)
-		b.WriteByte(b1[0])
-		i += 3
-	}
-
-	return b.String()
+	return string(nameObject)
 }
 
 ///////////////////////////////////////////////////////////////////////////////////