Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

init: delay seccomp application as late as possible #1569

Merged
merged 2 commits into from Sep 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 12 additions & 1 deletion libcontainer/setns_init_linux.go
Expand Up @@ -47,7 +47,10 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
if l.config.Config.Seccomp != nil {
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
Expand All @@ -61,5 +64,13 @@ func (l *linuxSetnsInit) Init() error {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
40 changes: 24 additions & 16 deletions libcontainer/standard_init_linux.go
Expand Up @@ -30,28 +30,28 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32

if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// with user ns we need 'other' search permissions
// With user ns we need 'other' search permissions.
newperms = 0x8
} else {
// without user ns we need 'UID' search permissions
// Without user ns we need 'UID' search permissions.
newperms = 0x80000
}

// create a unique per session container name that we can
// join in setns; however, other containers can also join it
// Create a unique per session container name that we can join in setns;
// However, other containers can also join it.
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}

func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()

// do not inherit the parent's session keyring
// Do not inherit the parent's session keyring.
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// make session keyring searcheable
// Make session keyring searcheable.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
Expand Down Expand Up @@ -150,39 +150,47 @@ func (l *linuxStandardInit) Init() error {
if err := pdeath.Restore(); err != nil {
return err
}
// compare the parent from the initial start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else.
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died
// and we were reparented to something else so we should just kill ourself
// and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
// check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// close the pipe to signal that we have completed our init.
// Close the pipe to signal that we have completed our init.
l.pipe.Close()
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
return newSystemErrorWithCause(err, "open exec fifo")
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
unix.Close(l.fifoFd)
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
// close the statedir fd before exec because the kernel resets dumpable in the wrong order
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
unix.Close(l.fifoFd)
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
Expand Down