Skip to content

Commit

Permalink
redesigned extraction api
Browse files Browse the repository at this point in the history
  • Loading branch information
hhrutter committed Mar 31, 2018
1 parent 27ddb49 commit 22180f1
Show file tree
Hide file tree
Showing 13 changed files with 546 additions and 494 deletions.
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ Required build version: go1.8 and up


## Usage

pdfcpu validate [-verbose] [-mode strict|relaxed] [-upw userpw] [-opw ownerpw] inFile
pdfcpu optimize [-verbose] [-stats csvFile] [-upw userpw] [-opw ownerpw] inFile [outFile]
pdfcpu split [-verbose] [-upw userpw] [-opw ownerpw] inFile outDir
Expand All @@ -74,9 +74,20 @@ Required build version: go1.8 and up

## Status

Version: 0.1.8
Version: 0.1.9

* Redesigned extraction API with focus on returning the extracted data rather than writing it somewhere.
* It is up to the API consumer how to process the extracted data.

```
func ImageData(ctx *types.PDFContext, objNr int) (*types.ImageObject, error)
func FontData(ctx *types.PDFContext, objNr int) (*types.FontObject, error)
func ContentData(ctx *types.PDFContext, objNr int) (data []byte, err error)
```




* Introduces the interface PDFObject.


## Contributing
Expand Down
258 changes: 254 additions & 4 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package pdfcpu

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strconv"
Expand Down Expand Up @@ -438,7 +440,7 @@ func setPageSelection(pageCount int, pageSelection []string) (selectedPages type
func pagesForPageSelection(pageCount int, pageSelection []string) (selectedPages types.IntSet, err error) {

if pageSelection == nil || len(pageSelection) == 0 {
log.Info.Println("pagesForPageSelection: invalid pageSelection")
log.Info.Println("pagesForPageSelection: empty pageSelection")
return nil, nil
}

Expand Down Expand Up @@ -549,6 +551,79 @@ func Merge(filesIn []string, fileOut string, config *types.Configuration) error
return nil
}

// ensureSelectedPages defaults an empty page selection to "all pages".
//
// If *selectedPages already holds at least one entry it is left untouched.
// Otherwise it is replaced by a set that marks every page number from 1 to
// ctx.PageCount as selected. A nil selectedPages pointer is a no-op because
// there is nowhere to store the result.
func ensureSelectedPages(ctx *types.PDFContext, selectedPages *types.IntSet) {

	if selectedPages == nil {
		// Without this guard the assignment below would dereference a nil pointer.
		return
	}

	if len(*selectedPages) > 0 {
		return
	}

	m := types.IntSet{}
	for i := 1; i <= ctx.PageCount; i++ {
		m[i] = true
	}

	*selectedPages = m
}

// imageObjNrs collects the object numbers that are flagged true in
// ctx.Optimize.PageImages[page-1], i.e. the image objects recorded for the given page.
// The result is always a non-nil slice, possibly empty.
func imageObjNrs(ctx *types.PDFContext, page int) []int {

	objNrs := []int{}

	for objNr, inUse := range ctx.Optimize.PageImages[page-1] {
		if !inUse {
			continue
		}
		objNrs = append(objNrs, objNr)
	}

	return objNrs
}

// doExtractImages writes the image objects of the selected pages as files into ctx.Write.DirName.
//
// An empty selectedPages set selects all pages. Every image object is extracted at most once,
// even if several selected pages reference it. File names have the form
// <resourceNames>_<page>_<objNr>.<extension>, where <page> is the lowest selected page
// that references the object.
//
// Returns the first error reported by extract.ImageData or by writing a file.
func doExtractImages(ctx *types.PDFContext, selectedPages types.IntSet) error {

	ensureSelectedPages(ctx, &selectedPages)

	visited := types.IntSet{}

	// Walk the pages in ascending order instead of ranging over the set:
	// map iteration order is unspecified, and because shared objects are only
	// written for the first page that reaches them, ranging over the map made
	// the page number embedded in the file name vary between runs.
	for p := 1; p <= ctx.PageCount; p++ {

		if !selectedPages[p] {
			continue
		}

		log.Info.Printf("writing images for page %d\n", p)

		for _, objNr := range imageObjNrs(ctx, p) {

			if visited[objNr] {
				continue
			}

			visited[objNr] = true

			img, err := extract.ImageData(ctx, objNr)
			if err != nil {
				return err
			}

			// A nil image object means there is nothing to write for this object number.
			if img == nil {
				continue
			}

			// filepath.Join yields an OS-correct path and copes with an empty or slash-terminated DirName.
			fileName := filepath.Join(ctx.Write.DirName, fmt.Sprintf("%s_%d_%d.%s", img.ResourceNamesString(), p, objNr, img.Extension))
			fmt.Printf("writing %s\n", fileName)

			// Extracted images are plain data files; do not mark them executable.
			if err = ioutil.WriteFile(fileName, img.Data(), 0644); err != nil {
				return err
			}

		}

	}

	return nil
}

// ExtractImages dumps embedded image resources from fileIn into dirOut for selected pages.
func ExtractImages(fileIn, dirOut string, pageSelection []string, config *types.Configuration) error {

Expand All @@ -569,7 +644,7 @@ func ExtractImages(fileIn, dirOut string, pageSelection []string, config *types.
}

ctx.Write.DirName = dirOut
err = extract.Images(ctx, pages)
err = doExtractImages(ctx, pages)
if err != nil {
return err
}
Expand All @@ -588,6 +663,64 @@ func ExtractImages(fileIn, dirOut string, pageSelection []string, config *types.
return nil
}

// fontObjNrs collects the object numbers that are flagged true in
// ctx.Optimize.PageFonts[page-1], i.e. the font objects recorded for the given page.
// The result is always a non-nil slice, possibly empty.
func fontObjNrs(ctx *types.PDFContext, page int) []int {

	objNrs := []int{}

	for objNr, inUse := range ctx.Optimize.PageFonts[page-1] {
		if !inUse {
			continue
		}
		objNrs = append(objNrs, objNr)
	}

	return objNrs
}

// doExtractFonts writes the font objects of the selected pages as files into ctx.Write.DirName.
//
// An empty selectedPages set selects all pages. Every font object is extracted at most once,
// even if several selected pages reference it. File names have the form
// <resourceNames>_<page>_<objNr>.<extension>, where <page> is the lowest selected page
// that references the object.
//
// Returns the first error reported by extract.FontData or by writing a file.
func doExtractFonts(ctx *types.PDFContext, selectedPages types.IntSet) error {

	ensureSelectedPages(ctx, &selectedPages)

	visited := types.IntSet{}

	// Walk the pages in ascending order instead of ranging over the set:
	// map iteration order is unspecified, and because shared objects are only
	// written for the first page that reaches them, ranging over the map made
	// the page number embedded in the file name vary between runs.
	for p := 1; p <= ctx.PageCount; p++ {

		if !selectedPages[p] {
			continue
		}

		log.Info.Printf("writing fonts for page %d\n", p)

		for _, objNr := range fontObjNrs(ctx, p) {

			if visited[objNr] {
				continue
			}

			visited[objNr] = true

			fo, err := extract.FontData(ctx, objNr)
			if err != nil {
				return err
			}

			// A nil font object means there is nothing to write for this object number.
			if fo == nil {
				continue
			}

			// filepath.Join yields an OS-correct path and copes with an empty or slash-terminated DirName.
			fileName := filepath.Join(ctx.Write.DirName, fmt.Sprintf("%s_%d_%d.%s", fo.ResourceNamesString(), p, objNr, fo.Extension))

			// Extracted font files are plain data files; do not mark them executable.
			if err = ioutil.WriteFile(fileName, fo.Data, 0644); err != nil {
				return err
			}

		}

	}

	return nil
}

// ExtractFonts dumps embedded font files from fileIn into dirOut for selected pages.
func ExtractFonts(fileIn, dirOut string, pageSelection []string, config *types.Configuration) error {

Expand All @@ -608,7 +741,7 @@ func ExtractFonts(fileIn, dirOut string, pageSelection []string, config *types.C
}

ctx.Write.DirName = dirOut
err = extract.Fonts(ctx, pages)
err = doExtractFonts(ctx, pages)
if err != nil {
return err
}
Expand Down Expand Up @@ -667,6 +800,123 @@ func ExtractPages(fileIn, dirOut string, pageSelection []string, config *types.C
return nil
}

// contentObjNrs returns the object numbers of the content streams of the given page.
//
// The page dict's "Contents" entry may resolve to a single stream dict or to an array
// of indirect references to stream dicts. A missing or nil "Contents" entry yields (nil, nil).
// An array element that is not an indirect reference is reported as an error.
func contentObjNrs(ctx *types.PDFContext, page int) ([]int, error) {

	pageDict, err := ctx.PageDict(page)
	if err != nil {
		return nil, err
	}

	contents, found := pageDict.Find("Contents")
	if !found || contents == nil {
		return nil, nil
	}

	// Capture the object number before dereferencing; it stays 0 if
	// "Contents" is not an indirect reference.
	var singleObjNr int
	if ref, isRef := contents.(types.PDFIndirectRef); isRef {
		singleObjNr = ref.ObjectNumber.Value()
	}

	contents, err = ctx.Dereference(contents)
	if err != nil {
		return nil, err
	}

	if contents == nil {
		return nil, nil
	}

	result := []int{}

	switch c := contents.(type) {

	case types.PDFStreamDict:
		// Single content stream.
		result = append(result, singleObjNr)

	case types.PDFArray:
		// Array of content streams: every element has to be an indirect reference.
		for _, entry := range c {

			ref, isRef := entry.(types.PDFIndirectRef)
			if !isRef {
				return nil, errors.Errorf("missing indref for page tree dict content no page %d", page)
			}

			streamDict, err := ctx.DereferenceStreamDict(entry)
			if err != nil {
				return nil, err
			}

			// Skip references that do not resolve to a stream dict.
			if streamDict != nil {
				result = append(result, ref.ObjectNumber.Value())
			}

		}

	}

	return result, nil
}

// doExtractContent writes the content streams of the selected pages as files into ctx.Write.DirName.
//
// An empty selectedPages set selects all pages. Every content stream object is extracted at most
// once, even if several selected pages reference it. File names have the form
// <page>_<objNr>.txt, where <page> is the lowest selected page that references the object.
//
// Returns the first error reported by contentObjNrs, extract.ContentData or by writing a file.
func doExtractContent(ctx *types.PDFContext, selectedPages types.IntSet) error {

	ensureSelectedPages(ctx, &selectedPages)

	visited := types.IntSet{}

	// Walk the pages in ascending order instead of ranging over the set:
	// map iteration order is unspecified, and because shared objects are only
	// written for the first page that reaches them, ranging over the map made
	// the page number embedded in the file name vary between runs.
	for p := 1; p <= ctx.PageCount; p++ {

		if !selectedPages[p] {
			continue
		}

		log.Info.Printf("writing content for page %d\n", p)

		objNrs, err := contentObjNrs(ctx, p)
		if err != nil {
			return err
		}

		// Ranging over a nil slice is a no-op, so pages without content need no special case.
		for _, objNr := range objNrs {

			if visited[objNr] {
				continue
			}

			visited[objNr] = true

			b, err := extract.ContentData(ctx, objNr)
			if err != nil {
				return err
			}

			// Nil data means there is nothing to write for this object number.
			if b == nil {
				continue
			}

			// filepath.Join yields an OS-correct path and copes with an empty or slash-terminated DirName.
			fileName := filepath.Join(ctx.Write.DirName, fmt.Sprintf("%d_%d.txt", p, objNr))

			// Extracted content is plain text; do not mark the files executable.
			if err = ioutil.WriteFile(fileName, b, 0644); err != nil {
				return err
			}

		}

	}

	return nil
}

// ExtractContent dumps "PDF source" files from fileIn into dirOut for selected pages.
func ExtractContent(fileIn, dirOut string, pageSelection []string, config *types.Configuration) error {

Expand All @@ -687,7 +937,7 @@ func ExtractContent(fileIn, dirOut string, pageSelection []string, config *types
}

ctx.Write.DirName = dirOut
err = extract.Content(ctx, pages)
err = doExtractContent(ctx, pages)
if err != nil {
return err
}
Expand Down
7 changes: 4 additions & 3 deletions cmd/pdfcpu/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,10 @@ func parseFlagsAndGetCommand() (command string) {
}

// Parse commandline flags.
flag.CommandLine.Parse(os.Args[i:])
err := flag.CommandLine.Parse(os.Args[i:])
if err != nil {
os.Exit(1)
}

return
}
Expand Down Expand Up @@ -285,7 +288,6 @@ func prepareExtractCommand(config *types.Configuration) *pdfcpu.Command {

dirnameOut := flag.Arg(1)

var err error
pages, err := pdfcpu.ParsePageSelection(pageSelection)
if err != nil {
log.Fatalf("extract: problem with flag pageSelection: %v", err)
Expand Down Expand Up @@ -318,7 +320,6 @@ func prepareTrimCommand(config *types.Configuration) *pdfcpu.Command {
os.Exit(1)
}

var err error
pages, err := pdfcpu.ParsePageSelection(pageSelection)
if err != nil {
log.Fatalf("trim: problem with flag pageSelection: %v", err)
Expand Down
Loading

0 comments on commit 22180f1

Please sign in to comment.