From 6916425464d6c00593d4b06907db4b31163f9ad1 Mon Sep 17 00:00:00 2001 From: lfbzhm Date: Wed, 7 Feb 2024 15:49:03 +0000 Subject: [PATCH 1/4] bump go to 1.22.x Signed-off-by: lifubang --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e9fe1d8ffba..b68f3a7d080 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,7 +24,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, ubuntu-22.04, actuated-arm64-6cpu-8gb] - go-version: [1.20.x, 1.21.x] + go-version: [1.20.x, 1.21.x, 1.22.x] rootless: ["rootless", ""] race: ["-race", ""] criu: ["", "criu-dev"] From a2d4311a3f3779708dcb6e7c2a72a961fa6c0d37 Mon Sep 17 00:00:00 2001 From: lifubang Date: Thu, 4 Apr 2024 00:23:47 +0800 Subject: [PATCH 2/4] use fork to replace clone(CLONE_PARENT) As the description in #4233, there is a bug in glibc, pthread_self() will return wrong info after we do `clone(CLONE_PARENT)` in libct/nsenter, it will cause runc can't work in `go 1.22.*`. So we use fork(2) to replace clone(2) in libct/nsenter, but there is a double-fork in nsenter, so we need to use `PR_SET_CHILD_SUBREAPER` to let runc can reap grand child process in libct/nsenter. Signed-off-by: lifubang --- contrib/completions/bash/runc | 2 - exec.go | 21 +++++------ libcontainer/container_linux.go | 6 +++ libcontainer/nsenter/nsexec.c | 66 +++++++-------------------------- man/runc-restore.8.md | 3 -- man/runc-run.8.md | 3 -- restore.go | 2 +- run.go | 2 +- signals.go | 9 +---- utils_linux.go | 55 ++++++++++++++------------- 10 files changed, 60 insertions(+), 109 deletions(-) diff --git a/contrib/completions/bash/runc b/contrib/completions/bash/runc index 353c8ffdbbd..6365943b269 100644 --- a/contrib/completions/bash/runc +++ b/contrib/completions/bash/runc @@ -458,7 +458,6 @@ _runc_run() { --help --detatch -d - --no-subreaper --no-pivot --no-new-keyring " @@ -623,7 +622,6 @@ _runc_restore() { --file-locks --detach -d - --no-subreaper --no-pivot --auto-dedup --lazy-pages diff --git a/exec.go b/exec.go index 675f12fb905..71a5a506af1 100644 --- a/exec.go +++ b/exec.go @@ -181,17 +181,16 @@ func execProcess(context *cli.Context) (int, error) { } r := &runner{ - enableSubreaper: false, - shouldDestroy: false, - container: container, - consoleSocket: context.String("console-socket"), - pidfdSocket: context.String("pidfd-socket"), - detach: context.Bool("detach"), - pidFile: context.String("pid-file"), - action: CT_ACT_RUN, - init: false, - preserveFDs: context.Int("preserve-fds"), - subCgroupPaths: cgPaths, + shouldDestroy: false, + container: container, + consoleSocket: context.String("console-socket"), + pidfdSocket: context.String("pidfd-socket"), + detach: context.Bool("detach"), + pidFile: context.String("pid-file"), + action: CT_ACT_RUN, + init: false, + preserveFDs: context.Int("preserve-fds"), + subCgroupPaths: cgPaths, } return r.run(p) } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 13be71ccb89..7587c3ad276 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -341,6 +341,12 @@ func (c *Container) start(process *Process) (retErr error) { if err := utils.CloseExecFrom(3); err != nil { return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) } + + // Set us as the subreaper to let the grandchild process reparent to us. + if err := system.SetSubreaper(1); err != nil { + return fmt.Errorf("unable to set subreaper: %w", err) + } + if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index c771ac7e116..1cef72832a8 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -52,20 +52,6 @@ enum sync_t { /* Stores the current stage of nsexec. */ int current_stage = STAGE_SETUP; -/* Assume the stack grows down, so arguments should be above it. */ -struct clone_t { - /* - * Reserve some space for clone() to locate arguments - * and retcode in this place - */ - char stack[4096] __attribute__((aligned(16))); - char stack_ptr[0]; - - /* There's two children. This is used to execute the different code. */ - jmp_buf *env; - int jmpval; -}; - struct nlconfig_t { char *data; @@ -303,23 +289,15 @@ static void update_oom_score_adj(char *data, size_t len) bail("failed to update /proc/self/oom_score_adj"); } -/* A dummy function that just jumps to the given jumpval. */ -static int child_func(void *arg) __attribute__((noinline)); -static int child_func(void *arg) +static int fork_and_run(jmp_buf *env, int jmpval) { - struct clone_t *ca = (struct clone_t *)arg; - longjmp(*ca->env, ca->jmpval); -} - -static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); -static int clone_parent(jmp_buf *env, int jmpval) -{ - struct clone_t ca = { - .env = env, - .jmpval = jmpval, - }; - - return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); + pid_t pid = fork(); + if (pid < 0) + bail("failed to fork"); + if (pid > 0) + return pid; + /* Jump back to the big switch in nsexec... */ + longjmp(*env, jmpval); } /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ @@ -644,7 +622,7 @@ void nsexec(void) * * Unfortunately, it's not as simple as that. We have to fork to enter the * PID namespace (the PID namespace only applies to children). Since we'll - * have to double-fork, this clone_parent() call won't be able to get the + * have to double-fork, this fork() call won't be able to get the * PID of the _actual_ init process (without doing more synchronisation than * I can deal with at the moment). So we'll just get the parent to send it * for us, the only job of this process is to update @@ -655,15 +633,6 @@ void nsexec(void) * will be in that namespace (and it will not be able to give us a PID value * that makes sense without resorting to sending things with cmsg). * - * This also deals with an older issue caused by dumping cloneflags into - * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so - * we have to unshare(2) before clone(2) in order to do this. This was fixed - * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was - * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're - * aware, the last mainline kernel which had this bug was Linux 3.12. - * However, we cannot comment on which kernels the broken patch was - * backported to. - * * -- Aleksa "what has my life come to?" Sarai */ @@ -687,7 +656,7 @@ void nsexec(void) /* Start the process of getting a container. */ write_log(DEBUG, "spawn stage-1"); - stage1_pid = clone_parent(&env, STAGE_CHILD); + stage1_pid = fork_and_run(&env, STAGE_CHILD); if (stage1_pid < 0) bail("unable to spawn stage-1"); @@ -755,9 +724,7 @@ void nsexec(void) /* * Send both the stage-1 and stage-2 pids back to runc. * runc needs the stage-2 to continue process management, - * but because stage-1 was spawned with CLONE_PARENT we - * cannot reap it within stage-0 and thus we need to ask - * runc to reap the zombie for us. + * and ask runc to reap the zombie for us. */ write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", stage1_pid, stage2_pid); @@ -892,9 +859,8 @@ void nsexec(void) } /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal stage-0 to do the mapping for - * us. + * We don't have the privileges to do any mapping here. + * So signal stage-0 to do the mapping for us. */ write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; @@ -925,10 +891,6 @@ void nsexec(void) * ordering might break in the future (especially with rootless * containers). But for now, it's not possible to split this into * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. */ try_unshare(config.cloneflags, "remaining namespaces"); @@ -955,7 +917,7 @@ void nsexec(void) * to actually enter the new PID namespace. */ write_log(DEBUG, "spawn stage-2"); - stage2_pid = clone_parent(&env, STAGE_INIT); + stage2_pid = fork_and_run(&env, STAGE_INIT); if (stage2_pid < 0) bail("unable to spawn stage-2"); diff --git a/man/runc-restore.8.md b/man/runc-restore.8.md index eab50db9717..7a19d0e8b62 100644 --- a/man/runc-restore.8.md +++ b/man/runc-restore.8.md @@ -55,9 +55,6 @@ image files directory. **--pid-file** _path_ : Specify the file to write the initial container process' PID to. -**--no-subreaper** -: Disable the use of the subreaper used to reap reparented processes. - **--no-pivot** : Do not use pivot root to jail process inside rootfs. This should not be used except in exceptional circumstances, and may be unsafe from the security diff --git a/man/runc-run.8.md b/man/runc-run.8.md index 4959469718b..2fec84a636c 100644 --- a/man/runc-run.8.md +++ b/man/runc-run.8.md @@ -26,9 +26,6 @@ referencing the master end of the console's pseudoterminal. See **--pid-file** _path_ : Specify the file to write the initial container process' PID to. -**--no-subreaper** -: Disable the use of the subreaper used to reap reparented processes. - **--no-pivot** : Do not use pivot root to jail process inside rootfs. This should not be used except in exceptional circumstances, and may be unsafe from the security diff --git a/restore.go b/restore.go index d65afcfc788..3b592121715 100644 --- a/restore.go +++ b/restore.go @@ -70,7 +70,7 @@ using the runc checkpoint command.`, }, cli.BoolFlag{ Name: "no-subreaper", - Usage: "disable the use of the subreaper used to reap reparented processes", + Usage: "(ignored) disable the use of the subreaper used to reap reparented processes. This flag has been ignored by runc, and will be removed in 1.3.0", }, cli.BoolFlag{ Name: "no-pivot", diff --git a/run.go b/run.go index b03b8129bd9..ccdcc8167b4 100644 --- a/run.go +++ b/run.go @@ -54,7 +54,7 @@ command(s) that get executed on start, edit the args parameter of the spec. See }, cli.BoolFlag{ Name: "no-subreaper", - Usage: "disable the use of the subreaper used to reap reparented processes", + Usage: "(ignored) disable the use of the subreaper used to reap reparented processes. This flag has been ignored by runc, and will be removed in 1.3.0", }, cli.BoolFlag{ Name: "no-pivot", diff --git a/signals.go b/signals.go index e0bc7c61cab..68be26eb05f 100644 --- a/signals.go +++ b/signals.go @@ -5,7 +5,6 @@ import ( "os/signal" "github.com/opencontainers/runc/libcontainer" - "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" "github.com/sirupsen/logrus" @@ -18,13 +17,7 @@ const signalBufferSize = 2048 // while still forwarding all other signals to the process. // If notifySocket is present, use it to read systemd notifications from the container and // forward them to notifySocketHost. -func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler { - if enableSubreaper { - // set us as the subreaper before registering the signal handler for the container - if err := system.SetSubreaper(1); err != nil { - logrus.Warn(err) - } - } +func newSignalHandler(notifySocket *notifySocket) *signalHandler { // ensure that we have a large buffer size so that we do not miss any signals // in case we are not processing them fast enough. s := make(chan os.Signal, signalBufferSize) diff --git a/utils_linux.go b/utils_linux.go index a59301c1874..6bf2f9a3050 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -190,20 +190,19 @@ func createContainer(context *cli.Context, id string, spec *specs.Spec) (*libcon } type runner struct { - init bool - enableSubreaper bool - shouldDestroy bool - detach bool - listenFDs []*os.File - preserveFDs int - pidFile string - consoleSocket string - pidfdSocket string - container *libcontainer.Container - action CtAct - notifySocket *notifySocket - criuOpts *libcontainer.CriuOpts - subCgroupPaths map[string]string + init bool + shouldDestroy bool + detach bool + listenFDs []*os.File + preserveFDs int + pidFile string + consoleSocket string + pidfdSocket string + container *libcontainer.Container + action CtAct + notifySocket *notifySocket + criuOpts *libcontainer.CriuOpts + subCgroupPaths map[string]string } func (r *runner) run(config *specs.Process) (int, error) { @@ -247,10 +246,11 @@ func (r *runner) run(config *specs.Process) (int, error) { return -1, err } detach := r.detach || (r.action == CT_ACT_CREATE) + // Setting up IO is a two stage process. We need to modify process to deal // with detaching containers, and then we get a tty after the container has // started. - handler := newSignalHandler(r.enableSubreaper, r.notifySocket) + handler := newSignalHandler(r.notifySocket) tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket) if err != nil { return -1, err @@ -396,19 +396,18 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C } r := &runner{ - enableSubreaper: !context.Bool("no-subreaper"), - shouldDestroy: !context.Bool("keep"), - container: container, - listenFDs: listenFDs, - notifySocket: notifySocket, - consoleSocket: context.String("console-socket"), - pidfdSocket: context.String("pidfd-socket"), - detach: context.Bool("detach"), - pidFile: context.String("pid-file"), - preserveFDs: context.Int("preserve-fds"), - action: action, - criuOpts: criuOpts, - init: true, + shouldDestroy: !context.Bool("keep"), + container: container, + listenFDs: listenFDs, + notifySocket: notifySocket, + consoleSocket: context.String("console-socket"), + pidfdSocket: context.String("pidfd-socket"), + detach: context.Bool("detach"), + pidFile: context.String("pid-file"), + preserveFDs: context.Int("preserve-fds"), + action: action, + criuOpts: criuOpts, + init: true, } return r.run(spec.Process) } From 2366a6b522bc41dd03fa77538d8af68203c0ad29 Mon Sep 17 00:00:00 2001 From: lifubang Date: Thu, 4 Apr 2024 00:28:04 +0800 Subject: [PATCH 3/4] Revert "ci/cross-i386: pin Go to 1.21.x" This reverts commit ac31da6b8049885142a460fd62dadb6d89884cfe. Signed-off-by: lifubang --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b68f3a7d080..933176c581b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -219,7 +219,7 @@ jobs: - name: install go uses: actions/setup-go@v5 with: - go-version: 1.21.x # TODO: switch to 1.x (latest stable) once Go 1.22 vs glibc issue is fixed. + go-version: 1.x # Latest stable - name: unit test env: From 1e60e59189873828298c221d1a9496dc8722954a Mon Sep 17 00:00:00 2001 From: lifubang Date: Thu, 4 Apr 2024 00:31:09 +0800 Subject: [PATCH 4/4] Revert "[hotfix] nsenter: refuse to build with Go 1.22 on glibc" This reverts commit e377e16846b8997a4ef52d7044cf0efcecac5e2e. Signed-off-by: lifubang --- libcontainer/nsenter/nsenter_go122.go | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 libcontainer/nsenter/nsenter_go122.go diff --git a/libcontainer/nsenter/nsenter_go122.go b/libcontainer/nsenter/nsenter_go122.go deleted file mode 100644 index 2b9ece0ad29..00000000000 --- a/libcontainer/nsenter/nsenter_go122.go +++ /dev/null @@ -1,15 +0,0 @@ -//go:build go1.22 - -package nsenter - -/* -// We know for sure that glibc has issues with pthread_self() when called from -// Go after nsenter has run. This is likely a more general problem with how we -// ignore the rules in signal-safety(7), and so it's possible musl will also -// have issues, but as this is just a hotfix let's only block glibc builds. -#include -#ifdef __GLIBC__ -# error "runc does not currently work properly with Go >=1.22. See ." -#endif -*/ -import "C"