// model/optimize/clean_fonts.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package optimize

import (
	"bytes"
	"errors"

	"github.com/unidoc/unitype"

	"github.com/oliverpool/unipdf/v3/common"
	"github.com/oliverpool/unipdf/v3/core"
	"github.com/oliverpool/unipdf/v3/extractor"
	"github.com/oliverpool/unipdf/v3/internal/textencoding"
	"github.com/oliverpool/unipdf/v3/model"
)

// CleanFonts cleans up embedded fonts, reducing font sizes.
//
// Its Optimize method inspects every stream object, and any stream whose
// decoded data starts with a TrueType signature is parsed, optimized and
// re-written (kept only when the result is not larger than the original).
type CleanFonts struct {
	// Subset enables glyph subsetting. When true, the font programs of fonts
	// referenced by text in page and annotation content streams are subsetted
	// first; font streams handled that way are skipped by the cleaning pass.
	// When false, only the cleaning pass runs.
	Subset bool
}

// optimizeFontsWithSubsetting subsets the embedded FontFile2 font programs of
// the fonts that are used by text in `objects`.
//
// It works in three steps:
//  1. Walk every page, extract the text marks from the page content stream and
//     from the annotation appearance streams, and collect the fonts they use.
//  2. Group those fonts by the FontFile2 stream they share.
//  3. For each font program, gather the runes / glyph indices registered with
//     the fonts' encoders and hand them to subsetFontStream.
//
// The returned set holds every font program stream for which subsetFontStream
// returned without error. Any extraction or subsetting error aborts the whole
// operation and is returned with a nil set.
func optimizeFontsWithSubsetting(objects []core.PdfObject) (processed map[*core.PdfObjectStream]struct{}, err error) {
	processed = map[*core.PdfObjectStream]struct{}{}

	// Step 1: fonts referenced by text marks, keyed by font pointer.
	usedFonts := map[*model.PdfFont]struct{}{}

	for _, page := range getObjectStructure(objects).pages {
		pageDict, isDict := core.GetDict(page.PdfObject)
		if !isDict {
			continue
		}
		// Pages without a Resources dictionary of their own are skipped.
		resDict, hasRes := core.GetDict(pageDict.Get("Resources"))
		if !hasRes {
			continue
		}
		// Best effort: a failure to read the page contents is ignored.
		pageContents, _ := getPageContents(pageDict.Get("Contents"))
		pageResources, err := model.NewPdfPageResourcesFromDict(resDict)
		if err != nil {
			return nil, err
		}

		// The page content stream first, followed by any annotation appearance streams.
		streams := append(
			[]content{{content: pageContents, resources: pageResources}},
			getAnnotationContents(pageDict.Get("Annots"))...,
		)

		for _, cs := range streams {
			ex, err := extractor.NewFromContents(cs.content, cs.resources)
			if err != nil {
				return nil, err
			}

			pageText, _, _, err := ex.ExtractPageText()
			if err != nil {
				return nil, err
			}

			for _, mark := range pageText.Marks().Elements() {
				if mark.Font != nil {
					usedFonts[mark.Font] = struct{}{}
				}
			}
		}
	}

	// Step 2: map of font program stream to fonts. Multiple fonts can use the same font program.
	fontsByProgram := map[*core.PdfObjectStream][]*model.PdfFont{}
	for font := range usedFonts {
		desc := font.FontDescriptor()
		if desc == nil || desc.FontFile2 == nil {
			continue
		}
		if program, isStream := core.GetStream(desc.FontFile2); isStream {
			fontsByProgram[program] = append(fontsByProgram[program], font)
		}
	}

	// Step 3: subset each font program with everything its fonts have registered.
	for program, fonts := range fontsByProgram {
		var usedRunes []rune
		var usedGlyphs []unitype.GlyphIndex

		for _, font := range fonts {
			switch enc := font.Encoder().(type) {
			case *textencoding.IdentityEncoder:
				// TODO: This terminology is wrong as those are not runes, just charcodes cast as runes.
				//   Identity encoder maps via 2-byte encoding directly from 2byte charcode to glyph index.
				for _, r := range enc.RegisteredRunes() {
					usedGlyphs = append(usedGlyphs, unitype.GlyphIndex(r))
				}
			case *textencoding.TrueTypeFontEncoder:
				usedRunes = append(usedRunes, enc.RegisteredRunes()...)
			case textencoding.SimpleEncoder:
				for _, code := range enc.Charcodes() {
					r, found := enc.CharcodeToRune(code)
					if !found {
						common.Log.Debug("Charcode<->rune not found: %d", code)
						continue
					}
					usedRunes = append(usedRunes, r)
				}
			}
		}

		if err := subsetFontStream(program, usedRunes, usedGlyphs); err != nil {
			common.Log.Debug("ERROR subsetting font stream: %v", err)
			return nil, err
		}
		processed[program] = struct{}{}
	}
	return processed, nil
}

// subsetFontStream subsets the font program in `stream`, keeping the glyphs
// selected by `runes` and by the glyph `indices`.
//
// The stream is decoded and parsed, `runes` are translated to glyph indices via
// the parsed font, and the font is subsetted to the union of both index sets.
// If the re-written font is not larger than the original decoded data, `stream`
// is overwritten in place with a Flate-encoded stream holding the new font
// program and its Length1 entry is set to the uncompressed size. If it is
// larger, the stream is left untouched and nil is returned.
//
// An error is returned when `stream` is missing or when decoding, parsing,
// subsetting, writing or re-encoding fails. The `indices` slice is not modified.
func subsetFontStream(stream *core.PdfObjectStream, runes []rune, indices []unitype.GlyphIndex) error {
	stream, ok := core.GetStream(stream)
	// A nil *PdfObjectStream wrapped in an interface is a non-nil interface value,
	// so `ok` alone may not catch a missing stream; check the pointer as well.
	if !ok || stream == nil {
		common.Log.Debug("Embedded font object not found -- ABORT subsetting")
		return errors.New("fontfile2 not found")
	}
	decoded, err := core.DecodeStream(stream)
	if err != nil {
		common.Log.Debug("Decode error: %v", err)
		return err
	}

	fnt, err := unitype.Parse(bytes.NewReader(decoded))
	if err != nil {
		// Report the size of the data that was actually parsed, plus the cause.
		common.Log.Debug("Error parsing %d byte font: %v", len(decoded), err)
		return err
	}

	// Copy first so that appending never writes into the caller's backing array.
	allIndices := append([]unitype.GlyphIndex(nil), indices...)
	if len(runes) > 0 {
		allIndices = append(allIndices, fnt.LookupRunes(runes)...)
	}

	fnt, err = fnt.SubsetKeepIndices(allIndices)
	if err != nil {
		common.Log.Debug("ERROR subsetting font: %v", err)
		return err
	}

	var buf bytes.Buffer
	err = fnt.Write(&buf)
	if err != nil {
		common.Log.Debug("ERROR Writing font: %v", err)
		return err
	}
	if buf.Len() > len(decoded) {
		// Subsetting did not pay off; keep the original font program.
		common.Log.Debug("Re-written font is larger than original - skip")
		return nil
	}

	newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
	if err != nil {
		common.Log.Debug("ERROR Writing font: %v", err)
		return err
	}
	// Overwrite in place so that every existing reference to the stream sees the new font.
	*stream = *newstream
	// Length1 is set to the size of the uncompressed font program.
	stream.Set("Length1", core.MakeInteger(int64(buf.Len())))

	return nil
}

// Optimize optimizes PDF objects to decrease PDF size.
//
// When c.Subset is set, fonts used by page and annotation text are subsetted
// first; a failure in that step aborts the optimization and is returned.
//
// Afterwards every remaining stream object is decoded and, if its data starts
// with a TrueType signature ("\x00\x01\x00\x00" or "true"), the font is parsed,
// optimized and re-written. The stream is replaced in place by a Flate-encoded
// stream with an updated Length1 entry, but only when the re-written font is
// not larger than the original. Problems with an individual stream are logged
// at debug level and that stream is left unchanged.
//
// The returned slice is the input `objects` slice; streams are modified in place.
func (c *CleanFonts) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	var processed map[*core.PdfObjectStream]struct{}
	if c.Subset {
		processed, err = optimizeFontsWithSubsetting(objects)
		if err != nil {
			return nil, err
		}
	}

	// Clean font streams by loading and rewriting with minimal needed tables.
	for _, obj := range objects {
		stream, isStreamObj := core.GetStream(obj)
		if !isStreamObj {
			continue
		}
		// Lookup on a nil map is safe and simply reports "not present".
		if _, has := processed[stream]; has {
			// Skip - has been processed.
			continue
		}

		encoder, err := core.NewEncoderFromStream(stream)
		if err != nil {
			common.Log.Debug("ERROR getting encoder: %v - ignoring", err)
			continue
		}

		decoded, err := encoder.DecodeStream(stream)
		if err != nil {
			common.Log.Debug("Decoding error : %v - ignoring", err)
			continue
		}
		if len(decoded) < 4 {
			continue
		}

		// The first four bytes identify the font flavour.
		version := string(decoded[:4])
		if version == "OTTO" {
			// Fonts based on PostScript outlines not supported yet.
			// See https://docs.microsoft.com/en-us/typography/opentype/spec/otff
			continue
		}
		if version != "\x00\x01\x00\x00" && version != "true" {
			// Not a TrueType font program.
			continue
		}

		fnt, err := unitype.Parse(bytes.NewReader(decoded))
		if err != nil {
			common.Log.Debug("ERROR Parsing font: %v - ignoring", err)
			continue
		}
		if err = fnt.Optimize(); err != nil {
			common.Log.Debug("ERROR Optimizing font: %v - ignoring", err)
			continue
		}

		var buf bytes.Buffer
		if err = fnt.Write(&buf); err != nil {
			common.Log.Debug("ERROR Writing font: %v - ignoring", err)
			continue
		}
		if buf.Len() > len(decoded) {
			common.Log.Debug("Re-written font is larger than original - skip")
			continue
		}

		newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
		if err != nil {
			common.Log.Debug("ERROR Encoding font stream: %v - ignoring", err)
			continue
		}
		// Overwrite in place so that existing references to the stream stay valid.
		*stream = *newstream
		// Length1 is set to the size of the uncompressed font program.
		stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
	}
	return objects, nil
}

// content describes page or form contents: the text of a content stream
// (`content`) together with the resources (`resources`) that are passed to the
// text extractor alongside it. It is used both for page content streams and
// for the appearance streams of annotations.
type content struct {
	content   string
	resources *model.PdfPageResources
}

// getAnnotationContents collects, on a best-effort basis, the normal appearance
// content streams of the annotations listed in `annotsObj` (a page's Annots entry).
//
// For each annotation dictionary the AP/N entry is examined: if it is a stream
// it is used directly; if it is a dictionary of appearance states, the stream
// named by the annotation's AS entry is used. The stream is loaded as a form
// XObject and its content stream and resources are recorded.
//
// Annotations that are malformed or lack an appearance are logged at debug
// level and skipped. Returns nil when `annotsObj` is nil or not an array.
func getAnnotationContents(annotsObj core.PdfObject) []content {
	if annotsObj == nil {
		return nil
	}
	annots, isArray := core.GetArray(annotsObj)
	if !isArray {
		common.Log.Debug("Annots not an array")
		return nil
	}

	var collected []content
	for _, entry := range annots.Elements() {
		annot, isDict := core.GetDict(entry)
		if !isDict {
			// Ignore any non dict elements.
			common.Log.Debug("Ignoring non-dict element in Annots")
			continue
		}

		// Appearance dictionary.
		appearance, hasAP := core.GetDict(annot.Get("AP"))
		if !hasAP {
			common.Log.Debug("No AP entry - skipping")
			continue
		}

		// Normal appearance: either a single stream or a dictionary of states.
		normal := core.TraceToDirectObject(appearance.Get("N"))
		if normal == nil {
			common.Log.Debug("No N entry - skipping")
			continue
		}

		var form *core.PdfObjectStream
		if states, isStates := normal.(*core.PdfObjectDictionary); isStates {
			// The AS entry selects which appearance state stream applies.
			state, hasState := core.GetName(annot.Get("AS"))
			if !hasState {
				common.Log.Debug("No AS entry - skipping")
				continue
			}
			selected, found := core.GetStream(states.Get(*state))
			if !found {
				common.Log.Debug("Form not found - skipping")
				continue
			}
			form = selected
		} else if single, isSingle := normal.(*core.PdfObjectStream); isSingle {
			form = single
		}
		if form == nil {
			common.Log.Debug("Form not found (nil) - skipping")
			continue
		}

		xobj, err := model.NewXObjectFormFromStream(form)
		if err != nil {
			common.Log.Debug("Error loading form: %v - ignoring", err)
			continue
		}

		data, err := xobj.GetContentStream()
		if err != nil {
			common.Log.Debug("Error decoding contents: %v", err)
			continue
		}

		collected = append(collected, content{
			content:   string(data),
			resources: xobj.Resources,
		})
	}

	return collected
}