Skip to content

Commit

Permalink
Merge pull request #2341 from kolyshkin/test-cpt-lazy
Browse files Browse the repository at this point in the history
runc checkpoint: fix --status-fd to accept fd
  • Loading branch information
Mrunal Patel committed May 18, 2020
2 parents 67fac52 + ca1d135 commit 825e91a
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 64 deletions.
2 changes: 1 addition & 1 deletion checkpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ checkpointed.`,
cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"},
cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"},
cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"},
cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
cli.IntFlag{Name: "status-fd", Value: -1, Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"},
cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"},
cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"},
Expand Down
56 changes: 26 additions & 30 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -882,26 +882,6 @@ func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
return nil
}

func waitForCriuLazyServer(r *os.File, status string) error {

data := make([]byte, 1)
_, err := r.Read(data)
if err != nil {
return err
}
fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend)
if err != nil {
return err
}
_, err = fd.Write(data)
fd.Close()
if err != nil {
return err
}

return nil
}

func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
// CRIU will evaluate a configuration starting with release 3.11.
// Settings in the configuration file will overwrite RPC settings.
Expand Down Expand Up @@ -1069,27 +1049,34 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} else {
t = criurpc.CriuReqType_DUMP
}
req := &criurpc.CriuReq{
Type: &t,
Opts: &rpcOpts,
}

if criuOpts.LazyPages {
// lazy migration requested; check if criu supports it
feat := criurpc.CriuFeatures{
LazyPages: proto.Bool(true),
}

if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
return err
}

statusRead, statusWrite, err := os.Pipe()
if err != nil {
return err
if fd := criuOpts.StatusFd; fd != -1 {
// check that the FD is valid
flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
if err != nil {
return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
}
// and writable
if flags&unix.O_WRONLY == 0 {
return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
}

rpcOpts.StatusFd = proto.Int32(int32(fd))
}
rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd()))
go waitForCriuLazyServer(statusRead, criuOpts.StatusFd)
}

req := &criurpc.CriuReq{
Type: &t,
Opts: &rpcOpts,
}

// no need to dump all this in pre-dump
Expand Down Expand Up @@ -1573,6 +1560,15 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
oob := make([]byte, 4096)
for true {
n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
if req.Opts != nil && req.Opts.StatusFd != nil {
// Close status_fd as soon as we got something back from criu,
// assuming it has consumed (reopened) it by this time.
// Otherwise it will might be left open forever and whoever
// is waiting on it will wait forever.
fd := int(*req.Opts.StatusFd)
_ = unix.Close(fd)
req.Opts.StatusFd = nil
}
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion libcontainer/criu_opts_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ type CriuOpts struct {
EmptyNs uint32 // don't c/r properties for namespace from this mask
AutoDedup bool // auto deduplication for incremental dumps
LazyPages bool // restore memory pages lazily using userfaultfd
StatusFd string // fd for feedback when lazy server is ready
StatusFd int // fd for feedback when lazy server is ready
}
2 changes: 1 addition & 1 deletion restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,6 @@ func criuOptions(context *cli.Context) *libcontainer.CriuOpts {
PreDump: context.Bool("pre-dump"),
AutoDedup: context.Bool("auto-dedup"),
LazyPages: context.Bool("lazy-pages"),
StatusFd: context.String("status-fd"),
StatusFd: context.Int("status-fd"),
}
}
58 changes: 27 additions & 31 deletions tests/integration/checkpoint.bats
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,6 @@ function simple_cr() {
# This should not be necessary: https://github.com/checkpoint-restore/criu/issues/575
sed -i 's;"readonly": true;"readonly": false;' config.json

# For lazy migration we need to know when CRIU is ready to serve
# the memory pages via TCP.
lazy_pipe=`mktemp -u /tmp/lazy-pipe-XXXXXX`
mkfifo $lazy_pipe

# TCP port for lazy migration
port=27277

Expand All @@ -153,34 +148,31 @@ function simple_cr() {
# checkpoint the running container
mkdir image-dir
mkdir work-dir
# Double fork taken from helpers.bats
# We need to start 'runc checkpoint --lazy-pages' in the background,
# so we double fork in the shell.
(runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_pipe} --work-path ./work-dir --image-path ./image-dir test_busybox & ) &
# Sleeping here. This is ugly, but not sure how else to handle it.
# The return code of the in the background running runc is needed, if
# there is some basic error. If the lazy migration is ready can
# be handled by $lazy_pipe. Which probably will always be ready
# after sleeping two seconds.
sleep 2
# Check if inventory.img was written
[ -e image-dir/inventory.img ]
# If the inventory.img exists criu checkpointed some things, let's see
# if there were other errors in the log file.
run grep -B 5 Error ./work-dir/dump.log -q
[ "$status" -eq 1 ]

# This will block until CRIU is ready to serve memory pages
cat $lazy_pipe
[ "$status" -eq 1 ]
# For lazy migration we need to know when CRIU is ready to serve
# the memory pages via TCP.
exec 72<> <(:)
exec 70</proc/self/fd/72 71>/proc/self/fd/72
exec 72>&-

__runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd 71 --work-path ./work-dir --image-path ./image-dir test_busybox &
cpt_pid=$!

# wait for lazy page server to be ready
out=$(timeout 2 dd if=/proc/self/fd/70 bs=1 count=1 2>/dev/null | od)
exec 71>&-
out=$(echo $out) # rm newlines
# show errors if there are any before we fail
grep -B5 Error ./work-dir/dump.log || true
# expecting \0 which od prints as
[ "$out" = "0000000 000000 0000001" ]

unlink $lazy_pipe
# Check if inventory.img was written
[ -e image-dir/inventory.img ]

# Double fork taken from helpers.bats
# We need to start 'criu lazy-pages' in the background,
# so we double fork in the shell.
# Start CRIU in lazy-daemon mode
$(${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &) &
${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &
lp_pid=$!

# Restore lazily from checkpoint.
# The restored container needs a different name as the checkpointed
Expand All @@ -190,8 +182,6 @@ function simple_cr() {
# continue to run if the migration failed at some point.
__runc --criu "$CRIU" restore -d --work-path ./image-dir --image-path ./image-dir --lazy-pages test_busybox_restore <&60 >&51 2>&51
[ $? -eq 0 ]
run grep -B 5 Error ./work-dir/dump.log -q
[ "$status" -eq 1 ]

# busybox should be back up and running
testcontainer test_busybox_restore running
Expand All @@ -200,6 +190,12 @@ function simple_cr() {
[ "$status" -eq 0 ]
[[ ${output} == "ok" ]]

wait $cpt_pid
[ $? -eq 0 ]

wait $lp_pid
[ $? -eq 0 ]

check_pipes
}

Expand Down

0 comments on commit 825e91a

Please sign in to comment.