Skip to content

Commit

Permalink
Merge pull request restic#3731 from metalsp0rk/feature/min-packsize-flag
Browse files Browse the repository at this point in the history
Feature: min packsize flag
  • Loading branch information
MichaelEischer authored Aug 7, 2022
2 parents 8fa64a8 + 7266f07 commit 2930a10
Show file tree
Hide file tree
Showing 17 changed files with 191 additions and 34 deletions.
12 changes: 12 additions & 0 deletions changelog/unreleased/issue-2291
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Enhancement: Allow pack size customization

Restic now uses a target pack size of 16 MiB by default. It can be customized
using the `--pack-size size` option. Supported pack sizes range between 4 and
128 MiB.

It is possible to migrate an existing repository to _larger_ pack files using
`prune --repack-small`. This will rewrite every pack file which is
significantly smaller than the target size.

https://github.com/restic/restic/issues/2291
https://github.com/restic/restic/pull/3731
8 changes: 7 additions & 1 deletion cmd/restic/cmd_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,13 @@ func runInit(opts InitOptions, gopts GlobalOptions, args []string) error {
return errors.Fatalf("create repository at %s failed: %v\n", location.StripPassword(gopts.Repo), err)
}

s := repository.New(be, repository.Options{Compression: gopts.Compression})
s, err := repository.New(be, repository.Options{
Compression: gopts.Compression,
PackSize: gopts.PackSize * 1024 * 1024,
})
if err != nil {
return err
}

err = s.Init(gopts.ctx, version, gopts.password, chunkerPolynomial)
if err != nil {
Expand Down
34 changes: 30 additions & 4 deletions cmd/restic/cmd_prune.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type PruneOptions struct {
MaxRepackBytes uint64

RepackCachableOnly bool
RepackSmall bool
RepackUncompressed bool
}

Expand All @@ -70,6 +71,7 @@ func addPruneOptions(c *cobra.Command) {
f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')")
f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)")
f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable")
f.BoolVar(&pruneOptions.RepackSmall, "repack-small", false, "repack pack files below 80%% of target pack size")
f.BoolVar(&pruneOptions.RepackUncompressed, "repack-uncompressed", false, "repack all uncompressed data")
}

Expand Down Expand Up @@ -422,7 +424,14 @@ func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOption
repackPacks := restic.NewIDSet()

var repackCandidates []packInfoWithID
var repackSmallCandidates []packInfoWithID
repoVersion := repo.Config().Version
// only repack very small files by default
targetPackSize := repo.PackSize() / 25
if opts.RepackSmall {
// consider files with at least 80% of the target size as large enough
targetPackSize = repo.PackSize() / 5 * 4
}

// loop over all packs and decide what to do
bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed")
Expand Down Expand Up @@ -477,8 +486,12 @@ func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOption
stats.packs.keep++

case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
// All blobs in pack are used and not mixed => keep pack!
stats.packs.keep++
if packSize >= int64(targetPackSize) {
// All blobs in pack are used and not mixed => keep pack!
stats.packs.keep++
} else {
repackSmallCandidates = append(repackSmallCandidates, packInfoWithID{ID: id, packInfo: p})
}

default:
// all other packs are candidates for repacking
Expand Down Expand Up @@ -521,11 +534,19 @@ func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOption
}
}

if len(repackSmallCandidates) < 10 {
// too few small files to be worth the trouble, this also prevents endlessly repacking
// if there is just a single pack file below the target size
stats.packs.keep += uint(len(repackSmallCandidates))
} else {
repackCandidates = append(repackCandidates, repackSmallCandidates...)
}

// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
// This is equivalent to sorting by unused / total space.
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
// Morover packs containing trees are sorted to the beginning
// Moreover packs containing trees and too small packs are sorted to the beginning
sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo
Expand All @@ -534,6 +555,10 @@ func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOption
return true
case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
return false
case pi.unusedSize+pi.usedSize < uint64(targetPackSize) && pj.unusedSize+pj.usedSize >= uint64(targetPackSize):
return true
case pj.unusedSize+pj.usedSize < uint64(targetPackSize) && pi.unusedSize+pi.usedSize >= uint64(targetPackSize):
return false
}
return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
})
Expand All @@ -552,6 +577,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOption
for _, p := range repackCandidates {
reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter)
reachedRepackSize := stats.size.repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes
packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize)

switch {
case reachedRepackSize:
Expand All @@ -561,7 +587,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, gopts GlobalOption
// repacking non-data packs / uncompressed-trees is only limited by repackSize
repack(p.ID, p.packInfo)

case reachedUnusedSizeAfter:
case reachedUnusedSizeAfter && packIsLargeEnough:
// for all other packs stop repacking if tolerated unused size is reached.
stats.packs.keep++

Expand Down
14 changes: 13 additions & 1 deletion cmd/restic/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"syscall"
"time"
Expand Down Expand Up @@ -62,6 +63,7 @@ type GlobalOptions struct {
NoCache bool
CleanupCache bool
Compression repository.CompressionMode
PackSize uint

backend.TransportOptions
limiter.Limits
Expand Down Expand Up @@ -102,6 +104,9 @@ func init() {
return nil
})

// parse target pack size from env, on error the default value will be used
targetPackSize, _ := strconv.ParseUint(os.Getenv("RESTIC_PACK_SIZE"), 10, 32)

f := cmdRoot.PersistentFlags()
f.StringVarP(&globalOptions.Repo, "repo", "r", os.Getenv("RESTIC_REPOSITORY"), "`repository` to backup to or restore from (default: $RESTIC_REPOSITORY)")
f.StringVarP(&globalOptions.RepositoryFile, "repository-file", "", os.Getenv("RESTIC_REPOSITORY_FILE"), "`file` to read the repository location from (default: $RESTIC_REPOSITORY_FILE)")
Expand All @@ -121,6 +126,7 @@ func init() {
f.Var(&globalOptions.Compression, "compression", "compression mode (only available for repository format version 2), one of (auto|off|max)")
f.IntVar(&globalOptions.Limits.UploadKb, "limit-upload", 0, "limits uploads to a maximum rate in KiB/s. (default: unlimited)")
f.IntVar(&globalOptions.Limits.DownloadKb, "limit-download", 0, "limits downloads to a maximum rate in KiB/s. (default: unlimited)")
f.UintVar(&globalOptions.PackSize, "pack-size", uint(targetPackSize), "set target pack size in MiB, created pack files may be larger (default: $RESTIC_PACK_SIZE)")
f.StringSliceVarP(&globalOptions.Options, "option", "o", []string{}, "set extended option (`key=value`, can be specified multiple times)")
// Use our "generate" command instead of the cobra provided "completion" command
cmdRoot.CompletionOptions.DisableDefaultCmd = true
Expand Down Expand Up @@ -440,7 +446,13 @@ func OpenRepository(opts GlobalOptions) (*repository.Repository, error) {
}
}

s := repository.New(be, repository.Options{Compression: opts.Compression})
s, err := repository.New(be, repository.Options{
Compression: opts.Compression,
PackSize: opts.PackSize * 1024 * 1024,
})
if err != nil {
return nil, err
}

passwordTriesLeft := 1
if stdinIsTerminal() && opts.password == "" {
Expand Down
5 changes: 5 additions & 0 deletions cmd/restic/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1611,6 +1611,11 @@ func testPruneVariants(t *testing.T, unsafeNoSpaceRecovery bool) {
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("Small", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "unlimited", RepackSmall: true}
checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
}

func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) {
Expand Down
1 change: 1 addition & 0 deletions doc/040_backup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,7 @@ environment variables. The following lists these environment variables:
RESTIC_CACHE_DIR Location of the cache directory
RESTIC_COMPRESSION Compression mode (only available for repository format version 2)
RESTIC_PROGRESS_FPS Frames per second by which the progress bar is updated
RESTIC_PACK_SIZE Target size for pack files
TMPDIR Location for temporary files
Expand Down
24 changes: 24 additions & 0 deletions doc/047_tuning_backup_parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- for subsections
^ for subsubsections
" for paragraphs
########################
Tuning Backup Parameters
########################
Expand Down Expand Up @@ -48,3 +49,26 @@ which will compress very fast), ``max`` (which will trade backup speed and CPU u
slightly better compression), or ``off`` (which disables compression). Each setting is
only applied for the single run of restic. The option can also be set via the environment
variable ``RESTIC_COMPRESSION``.


Pack Size
=========

In certain instances, such as very large repositories (in the TiB range) or very fast
upload connections, it is desirable to use larger pack sizes to reduce the number of
files in the repository and improve upload performance. Notable examples are OpenStack
Swift and some Google Drive Team accounts, where there are hard limits on the total
number of files. Larger pack sizes can also improve the backup speed for a repository
stored on a local HDD. This can be achieved by either using the ``--pack-size`` option
or defining the ``$RESTIC_PACK_SIZE`` environment variable. Restic currently defaults
to a 16 MiB pack size.

The side effect of increasing the pack size is requiring more disk space for temporary pack
files created before uploading. The space must be available in the system default temp
directory, unless overridden by setting the ``$TMPDIR`` environment variable. In addition,
depending on the backend, the memory usage can also increase by a similar amount. Restic
requires temporary space according to the pack size, multiplied by the number
of backend connections plus one. For example, if the backend uses 5 connections (the default
for most backends), with a target pack size of 64 MiB, you'll need a *minimum* of 384 MiB
of space in the temp directory. A bit of tuning may be required to strike a balance between
resource usage at the backup client and the number of pack files in the repository.
2 changes: 2 additions & 0 deletions doc/manual_rest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Usage help is available:
--key-hint key key ID of key to try decrypting first (default: $RESTIC_KEY_HINT)
--limit-download int limits downloads to a maximum rate in KiB/s. (default: unlimited)
--limit-upload int limits uploads to a maximum rate in KiB/s. (default: unlimited)
--pack-size uint set target pack size in MiB, created pack files may be larger (default: $RESTIC_PACK_SIZE)
--no-cache do not use a local cache
--no-lock do not lock the repository, this allows some operations on read-only repositories
-o, --option key=value set extended option (key=value, can be specified multiple times)
Expand Down Expand Up @@ -128,6 +129,7 @@ command:
--key-hint key key ID of key to try decrypting first (default: $RESTIC_KEY_HINT)
--limit-download int limits downloads to a maximum rate in KiB/s. (default: unlimited)
--limit-upload int limits uploads to a maximum rate in KiB/s. (default: unlimited)
--pack-size uint set target pack size in MiB, created pack files may be larger (default: $RESTIC_PACK_SIZE)
--no-cache do not use a local cache
--no-lock do not lock the repository, this allows some operations on read-only repositories
-o, --option key=value set extended option (key=value, can be specified multiple times)
Expand Down
2 changes: 2 additions & 0 deletions internal/backend/s3/s3.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,8 @@ func (be *Backend) Save(ctx context.Context, h restic.Handle, rd restic.RewindRe
opts.ContentType = "application/octet-stream"
// the only option with the high-level api is to let the library handle the checksum computation
opts.SendContentMd5 = true
// only use multipart uploads for very large files
opts.PartSize = 200 * 1024 * 1024

debug.Log("PutObject(%v, %v, %v)", be.cfg.Bucket, objName, rd.Length())
info, err := be.client.PutObject(ctx, be.cfg.Bucket, objName, ioutil.NopCloser(rd), int64(rd.Length()), opts)
Expand Down
3 changes: 2 additions & 1 deletion internal/checker/checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,8 @@ func TestCheckerModifiedData(t *testing.T) {
t.Logf("archived as %v", sn.ID().Str())

beError := &errorBackend{Backend: repo.Backend()}
checkRepo := repository.New(beError, repository.Options{})
checkRepo, err := repository.New(beError, repository.Options{})
test.OK(t, err)
test.OK(t, checkRepo.SearchKey(context.TODO(), test.TestPassword, 5, ""))

chkr := checker.New(checkRepo, false)
Expand Down
7 changes: 7 additions & 0 deletions internal/pack/pack.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ func (p *Packer) Count() int {
return len(p.blobs)
}

// HeaderFull reports whether the pack header has no room left for another
// entry: it returns true when the header size computed for one more blob than
// is currently recorded would exceed MaxHeaderSize.
func (p *Packer) HeaderFull() bool {
	p.m.Lock()
	defer p.m.Unlock()

	// Size of the header if one additional entry were appended.
	sizeWithNextEntry := headerSize + uint(len(p.blobs)+1)*entrySize
	return sizeWithNextEntry > MaxHeaderSize
}

// Blobs returns the slice of blobs that have been written.
func (p *Packer) Blobs() []restic.Blob {
p.m.Lock()
Expand Down
20 changes: 10 additions & 10 deletions internal/repository/packer_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@ type packerManager struct {
key *crypto.Key
queueFn func(ctx context.Context, t restic.BlobType, p *Packer) error

pm sync.Mutex
packer *Packer
pm sync.Mutex
packer *Packer
packSize uint
}

const minPackSize = 4 * 1024 * 1024

// newPackerManager returns a new packer manager which writes temporary files
// to a temporary directory
func newPackerManager(key *crypto.Key, tpe restic.BlobType, queueFn func(ctx context.Context, t restic.BlobType, p *Packer) error) *packerManager {
func newPackerManager(key *crypto.Key, tpe restic.BlobType, packSize uint, queueFn func(ctx context.Context, t restic.BlobType, p *Packer) error) *packerManager {
return &packerManager{
tpe: tpe,
key: key,
queueFn: queueFn,
tpe: tpe,
key: key,
queueFn: queueFn,
packSize: packSize,
}
}

Expand Down Expand Up @@ -87,8 +87,8 @@ func (r *packerManager) SaveBlob(ctx context.Context, t restic.BlobType, id rest
return 0, err
}

// if the pack is not full enough, put back to the list
if packer.Size() < minPackSize {
// if neither the pack nor its header is full yet, put the packer back to the list
if packer.Size() < r.packSize && !packer.HeaderFull() {
debug.Log("pack is not full enough (%d bytes)", packer.Size())
return size, nil
}
Expand Down
6 changes: 3 additions & 3 deletions internal/repository/packer_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func min(a, b int) int {
}

func fillPacks(t testing.TB, rnd *rand.Rand, pm *packerManager, buf []byte) (bytes int) {
for i := 0; i < 100; i++ {
for i := 0; i < 102; i++ {
l := rnd.Intn(maxBlobSize)
id := randomID(rnd)
buf = buf[:l]
Expand Down Expand Up @@ -70,7 +70,7 @@ func testPackerManager(t testing.TB) int64 {
rnd := rand.New(rand.NewSource(randomSeed))

savedBytes := int(0)
pm := newPackerManager(crypto.NewRandomKey(), restic.DataBlob, func(ctx context.Context, tp restic.BlobType, p *Packer) error {
pm := newPackerManager(crypto.NewRandomKey(), restic.DataBlob, DefaultPackSize, func(ctx context.Context, tp restic.BlobType, p *Packer) error {
err := p.Finalize()
if err != nil {
return err
Expand Down Expand Up @@ -104,7 +104,7 @@ func BenchmarkPackerManager(t *testing.B) {

for i := 0; i < t.N; i++ {
rnd.Seed(randomSeed)
pm := newPackerManager(crypto.NewRandomKey(), restic.DataBlob, func(ctx context.Context, t restic.BlobType, p *Packer) error {
pm := newPackerManager(crypto.NewRandomKey(), restic.DataBlob, DefaultPackSize, func(ctx context.Context, t restic.BlobType, p *Packer) error {
return nil
})
fillPacks(t, rnd, pm, blobBuf)
Expand Down
Loading

0 comments on commit 2930a10

Please sign in to comment.