Skip to content

Commit

Permalink
Add Split by page number command
Browse files Browse the repository at this point in the history
  • Loading branch information
hhrutter committed Oct 23, 2023
1 parent 821def5 commit 1807371
Show file tree
Hide file tree
Showing 9 changed files with 200 additions and 24 deletions.
38 changes: 34 additions & 4 deletions cmd/pdfcpu/process.go
Expand Up @@ -218,12 +218,37 @@ func processOptimizeCommand(conf *model.Configuration) {
process(cli.OptimizeCommand(inFile, outFile, conf))
}

// processSplitVyPageNumberCommand handles the "page" split mode: it reads the
// page numbers that follow inFile and outDir on the command line and runs the
// split-by-page-number command with them.
//
// Every page number argument has to be an integer >= 2. Otherwise a message is
// written to stderr and the process exits with status 1; the same happens if
// no page number was given at all. Repeated page numbers are collapsed and the
// numbers are passed on in ascending order.
//
// NOTE: "Vy" in the name looks like a typo for "By". The name is left alone
// because it is the name the split command handler calls.
func processSplitVyPageNumberCommand(inFile, outDir string, conf *model.Configuration) {
	args := flag.Args()
	if len(args) == 2 {
		fmt.Fprintln(os.Stderr, "split: missing page numbers")
		os.Exit(1)
	}

	// Collect into a set first so that repeated page numbers count only once.
	unique := types.IntSet{}
	for idx := 2; idx < len(args); idx++ {
		nr, err := strconv.Atoi(args[idx])
		if err == nil && nr >= 2 {
			unique[nr] = true
			continue
		}
		fmt.Fprintln(os.Stderr, "split: pageNr is a numeric value >= 2")
		os.Exit(1)
	}

	// Map iteration order is random, so sort after flattening the set.
	pageNrs := make([]int, len(unique))
	pos := 0
	for nr := range unique {
		pageNrs[pos] = nr
		pos++
	}
	sort.Ints(pageNrs)

	process(cli.SplitByPageNrCommand(inFile, outDir, pageNrs, conf))
}

func processSplitCommand(conf *model.Configuration) {
if mode == "" {
mode = "span"
}
mode = extractModeCompletion(mode, []string{"span", "bookmark"})
if mode == "" || len(flag.Args()) < 2 || len(flag.Args()) > 3 || selectedPages != "" {
mode = extractModeCompletion(mode, []string{"span", "bookmark", "page"})
if mode == "" || len(flag.Args()) < 2 || selectedPages != "" {
fmt.Fprintf(os.Stderr, "%s\n\n", usageSplit)
os.Exit(1)
}
Expand All @@ -233,6 +258,13 @@ func processSplitCommand(conf *model.Configuration) {
ensurePDFExtension(inFile)
}

outDir := flag.Arg(1)

if mode == "page" {
processSplitVyPageNumberCommand(inFile, outDir, conf)
return
}

span := 0

if mode == "span" {
Expand All @@ -247,8 +279,6 @@ func processSplitCommand(conf *model.Configuration) {
}
}

outDir := flag.Arg(1)

process(cli.SplitCommand(inFile, outDir, span, conf))
}

Expand Down
35 changes: 31 additions & 4 deletions cmd/pdfcpu/usage.go
Expand Up @@ -109,22 +109,49 @@ relaxed ... (default) like strict but doesn't complain about common seen spec vi
inFile ... input PDF file
outFile ... output PDF file`

usageSplit = "usage: pdfcpu split [-m(ode) span|bookmark] inFile outDir [span]" + generalFlags
usageLongSplit = `Generate a set of PDFs for the input file in outDir according to given span value or along bookmarks.
usageSplit = "usage: pdfcpu split [-m(ode) span|bookmark|page] inFile outDir [span|pageNr...]" + generalFlags
usageLongSplit = `Generate a set of PDFs for the input file in outDir according to given span value or along bookmarks or page numbers.
mode ... split mode (defaults to span)
inFile ... input PDF file
outDir ... output directory
span ... split span in pages (default: 1) for mode "span"
pageNr ... split before a specific page number for mode "page"
The split modes are:
span ... Split into PDF files with span pages each (default).
span itself defaults to 1 resulting in single page PDF files.
bookmark ... Split into PDF files representing sections defined by existing bookmarks.
span will be ignored.
Assumption: inFile contains an outline dictionary.`
Assumption: inFile contains an outline dictionary.
page ... Split before specific page numbers.
Eg. pdfcpu split test.pdf . (= pdfcpu split -m span test.pdf . 1)
generates:
test_1.pdf
test_2.pdf
etc.
pdfcpu split test.pdf . 2 (= pdfcpu split -m span test.pdf . 2)
generates:
test_1-2.pdf
test_3-4.pdf
etc.
pdfcpu split -m bookmark test.pdf .
generates:
test_bm1Title_1-4.pdf
test_bm2Title_5-7.pdf
etc.
pdfcpu split -m page test.pdf . 2 4 10
generates:
test_1.pdf
test_2-3.pdf
test_4-9.pdf
test_10-20.pdf`

usageMerge = "usage: pdfcpu merge [-m(ode) create|append] [-s(ort) -b(ookmarks)] outFile inFile..." + generalFlags
usageLongMerge = `Concatenate a sequence of PDFs/inFiles into outFile.
Expand Down
91 changes: 78 additions & 13 deletions pkg/api/split.go
Expand Up @@ -151,6 +151,31 @@ func pageSpans(ctx *model.Context, span int) ([]*PageSpan, error) {
return pss, nil
}

// writePageSpans cuts ctx into consecutive chunks of span pages each and
// writes every chunk to its own file.
//
// The output path of a chunk is produced by splitOutPath from outDir, fileName
// and the chunk's first and last page number. If the page count is not a
// multiple of span, the last chunk holds the remaining pages.
//
// An error is returned if span < 1, since a span of 0 would divide by zero and
// a negative span yields a meaningless page range. Otherwise the first error
// reported by writePageSpan is returned.
func writePageSpans(ctx *model.Context, span int, outDir, fileName string) error {
	if span < 1 {
		return errors.New("pdfcpu: split - span must be >= 1")
	}

	forBookmark := false

	for from := 1; from <= ctx.PageCount; {
		// A possible last file has less than span pages.
		thru := ctx.PageCount
		// Compare against the remaining page count instead of computing
		// from+span-1 up front, so that a huge span cannot overflow.
		if span-1 < ctx.PageCount-from {
			thru = from + span - 1
		}

		path := splitOutPath(outDir, fileName, forBookmark, from, thru)
		if err := writePageSpan(ctx, from, thru, path); err != nil {
			return err
		}

		from = thru + 1
	}

	return nil
}

func writePageSpansSplitAlongBookmarks(ctx *model.Context, outDir string) error {
forBookmark := true

Expand All @@ -174,29 +199,34 @@ func writePageSpansSplitAlongBookmarks(ctx *model.Context, outDir string) error
return nil
}

func writePageSpans(ctx *model.Context, span int, outDir, fileName string) error {
func writePageSpansSplitAlongPages(ctx *model.Context, pageNrs []int, outDir, fileName string) error {
// pageNrs is a sorted sequence of page numbers.
forBookmark := false
from, thru := 1, 0

for i := 0; i < ctx.PageCount/span; i++ {
start := i * span
from, thru := start+1, start+span
path := splitOutPath(outDir, fileName, forBookmark, from, thru)
if err := writePageSpan(ctx, from, thru, path); err != nil {
return err
}
if len(pageNrs) < 1 {
return errors.New("pdfcpu: split along pageNrs - missing pageNrs")
}

// A possible last file has less than span pages.
if ctx.PageCount%span > 0 {
start := (ctx.PageCount / span) * span
from, thru := start+1, ctx.PageCount
if pageNrs[0] > ctx.PageCount {
return errors.New("pdfcpu: split along pageNrs - invalid page number sequence.")
}

for i := 0; i < len(pageNrs); i++ {
thru = pageNrs[i] - 1
if thru >= ctx.PageCount {
break
}
path := splitOutPath(outDir, fileName, forBookmark, from, thru)
if err := writePageSpan(ctx, from, thru, path); err != nil {
return err
}
from = thru + 1
}

return nil
thru = ctx.PageCount
path := splitOutPath(outDir, fileName, forBookmark, from, thru)
return writePageSpan(ctx, from, thru, path)
}

// SplitRaw returns page spans for the PDF stream read from rs obeying given split span.
Expand Down Expand Up @@ -262,3 +292,38 @@ func SplitFile(inFile, outDir string, span int, conf *model.Configuration) error

return Split(f, outDir, filepath.Base(inFile), span, conf)
}

// SplitByPageNr generates a sequence of PDF files in outDir for rs splitting along pageNrs.
//
// rs is read into a PDF context using conf; that context is then handed to
// writePageSpansSplitAlongPages together with pageNrs, outDir and fileName.
// An error is returned if rs is nil, if the context cannot be created, or if
// writing the page spans fails.
func SplitByPageNr(rs io.ReadSeeker, outDir, fileName string, pageNrs []int, conf *model.Configuration) error {
	if rs == nil {
		return errors.New("pdfcpu: SplitByPageNr: missing rs")
	}

	pdfCtx, ctxErr := context(rs, conf)
	if ctxErr != nil {
		return ctxErr
	}

	return writePageSpansSplitAlongPages(pdfCtx, pageNrs, outDir, fileName)
}

// SplitByPageNrFile generates a sequence of PDF files in outDir for inFile splitting it along pageNrs.
//
// inFile is opened for reading and passed to SplitByPageNr, using the base
// name of inFile as fileName. If CLI logging is enabled, a progress line is
// logged before splitting starts.
//
// inFile is always closed before returning. If splitting succeeds but closing
// fails, the close error is returned; a splitting error takes precedence over
// a close error.
func SplitByPageNrFile(inFile, outDir string, pageNrs []int, conf *model.Configuration) (err error) {
	f, openErr := os.Open(inFile)
	if openErr != nil {
		return openErr
	}
	if log.CLIEnabled() {
		log.CLI.Printf("splitting %s to %s/...\n", inFile, outDir)
	}

	// err is a named result so that this deferred function can see the outcome
	// of the split and surface a close error to the caller.
	defer func() {
		if closeErr := f.Close(); err == nil {
			err = closeErr
		}
	}()

	return SplitByPageNr(f, outDir, filepath.Base(inFile), pageNrs, conf)
}
19 changes: 17 additions & 2 deletions pkg/api/test/split_test.go
Expand Up @@ -48,8 +48,8 @@ func TestSplitSpan2(t *testing.T) {
}
}

func TestSplit0ByBookmark(t *testing.T) {
msg := "TestSplit0ByBookmark"
func TestSplitByBookmark(t *testing.T) {
msg := "TestSplitByBookmark"
fileName := "5116.DCT_Filter.pdf"
inFile := filepath.Join(inDir, fileName)

Expand All @@ -60,6 +60,21 @@ func TestSplit0ByBookmark(t *testing.T) {
}
}

// TestSplitByPageNr splits a sample PDF along page numbers via the API and
// fails if the split reports an error.
func TestSplitByPageNr(t *testing.T) {
	msg := "TestSplitByPageNr"
	inFile := filepath.Join(inDir, "5116.DCT_Filter.pdf")

	// Splitting before pages 2, 10 and 50 is meant to generate these sections:
	//   1
	//   2-9
	//   10-49
	//   50-last page
	pageNrs := []int{2, 10, 50}

	err := api.SplitByPageNrFile(inFile, outDir, pageNrs, nil)
	if err != nil {
		t.Fatalf("%s: %v\n", msg, err)
	}
}

func TestSplitLowLevel(t *testing.T) {
msg := "TestSplitLowLevel"
inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf")
Expand Down
5 changes: 5 additions & 0 deletions pkg/cli/cli.go
Expand Up @@ -72,6 +72,11 @@ func Split(cmd *Command) ([]string, error) {
return nil, api.SplitFile(*cmd.InFile, *cmd.OutDir, cmd.IntVal, cmd.Conf)
}

// SplitByPageNr splits cmd.InFile along the page numbers in cmd.IntVals and
// writes the result files to cmd.OutDir.
// The returned string slice is always nil; only the error carries information.
func SplitByPageNr(cmd *Command) ([]string, error) {
	err := api.SplitByPageNrFile(*cmd.InFile, *cmd.OutDir, cmd.IntVals, cmd.Conf)
	return nil, err
}

// Trim inFile and write result to outFile.
func Trim(cmd *Command) ([]string, error) {
return nil, api.TrimFile(*cmd.InFile, *cmd.OutFile, cmd.PageSelection, cmd.Conf)
Expand Down
17 changes: 16 additions & 1 deletion pkg/cli/cmd.go
Expand Up @@ -61,6 +61,7 @@ var cmdMap = map[model.CommandMode]func(cmd *Command) ([]string, error){
model.VALIDATE: Validate,
model.OPTIMIZE: Optimize,
model.SPLIT: Split,
model.SPLITBYPAGENR: SplitByPageNr,
model.MERGECREATE: MergeCreate,
model.MERGEAPPEND: MergeAppend,
model.EXTRACTIMAGES: ExtractImages,
Expand Down Expand Up @@ -161,7 +162,7 @@ func OptimizeCommand(inFile, outFile string, conf *model.Configuration) *Command
Conf: conf}
}

// SplitCommand creates a new command to split a file into single page files.
// SplitCommand creates a new command to split a file according to span or along bookmarks.
func SplitCommand(inFile, dirNameOut string, span int, conf *model.Configuration) *Command {
if conf == nil {
conf = model.NewDefaultConfiguration()
Expand All @@ -175,6 +176,20 @@ func SplitCommand(inFile, dirNameOut string, span int, conf *model.Configuration
Conf: conf}
}

// SplitByPageNrCommand creates a new command to split a file into files along given pages.
//
// A nil conf is replaced by the default configuration. The configuration's Cmd
// field is set to model.SPLITBYPAGENR, so a conf passed in by the caller is
// modified.
func SplitByPageNrCommand(inFile, dirNameOut string, pageNrs []int, conf *model.Configuration) *Command {
	if conf == nil {
		conf = model.NewDefaultConfiguration()
	}
	conf.Cmd = model.SPLITBYPAGENR

	cmd := Command{
		Mode:    model.SPLITBYPAGENR,
		InFile:  &inFile,
		OutDir:  &dirNameOut,
		IntVals: pageNrs,
		Conf:    conf,
	}
	return &cmd
}

// MergeCreateCommand creates a new command to merge files.
// Outfile will be created. An existing outFile will be overwritten.
func MergeCreateCommand(inFiles []string, outFile string, conf *model.Configuration) *Command {
Expand Down
17 changes: 17 additions & 0 deletions pkg/cli/test/split_test.go
Expand Up @@ -59,10 +59,27 @@ func TestSplitByBookmarkCommand(t *testing.T) {
msg := "TestSplitByBookmarkCommand"
fileName := "5116.DCT_Filter.pdf"
inFile := filepath.Join(inDir, fileName)

span := 0 // This means we are going to split by bookmarks.

cmd := cli.SplitCommand(inFile, outDir, span, conf)
if _, err := cli.Process(cmd); err != nil {
t.Fatalf("%s %s: %v\n", msg, inFile, err)
}
}

// TestSplitByPageNrCommand builds a split-by-page-number command for a sample
// PDF, processes it and fails if processing reports an error.
func TestSplitByPageNrCommand(t *testing.T) {
	msg := "TestSplitByPageNrCommand"
	inFile := filepath.Join(inDir, "5116.DCT_Filter.pdf")

	// Splitting before pages 2, 10 and 50 is meant to generate these sections:
	//   1
	//   2-9
	//   10-49
	//   50-last page
	pageNrs := []int{2, 10, 50}

	_, err := cli.Process(cli.SplitByPageNrCommand(inFile, outDir, pageNrs, conf))
	if err != nil {
		t.Fatalf("%s %s: %v\n", msg, inFile, err)
	}
}
1 change: 1 addition & 0 deletions pkg/pdfcpu/crypto.go
Expand Up @@ -53,6 +53,7 @@ var (
model.LISTINFO: {0, 0},
model.OPTIMIZE: {0, 0},
model.SPLIT: {1, 0},
model.SPLITBYPAGENR: {1, 0},
model.MERGECREATE: {0, 0},
model.MERGEAPPEND: {0, 0},
model.EXTRACTIMAGES: {1, 0},
Expand Down
1 change: 1 addition & 0 deletions pkg/pdfcpu/model/configuration.go
Expand Up @@ -63,6 +63,7 @@ const (
LISTINFO
OPTIMIZE
SPLIT
SPLITBYPAGENR
MERGECREATE
MERGEAPPEND
EXTRACTIMAGES
Expand Down

0 comments on commit 1807371

Please sign in to comment.