Permalink
Fetching contributors…
Cannot retrieve contributors at this time
543 lines (466 sloc) 16.3 KB
package main
import (
"crypto/sha256"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"strings"
"github.com/lxc/lxd/shared"
"github.com/lxc/lxd/shared/logger"
log "github.com/lxc/lxd/shared/log15"
)
const (
APPARMOR_CMD_LOAD = "r"
APPARMOR_CMD_UNLOAD = "R"
APPARMOR_CMD_PARSE = "Q"
)
var aaPath = shared.VarPath("security", "apparmor")
const AA_PROFILE_BASE = `
### Base profile
capability,
dbus,
file,
network,
umount,
# Allow us to receive signals from anywhere.
signal (receive),
# Allow us to send signals to ourselves
signal peer=@{profile_name},
# Allow other processes to read our /proc entries, futexes, perf tracing and
# kcmp for now (they will need 'read' in the first place). Administrators can
# override with:
# deny ptrace (readby) ...
ptrace (readby),
# Allow other processes to trace us by default (they will need 'trace' in
# the first place). Administrators can override with:
# deny ptrace (tracedby) ...
ptrace (tracedby),
# Allow us to ptrace ourselves
ptrace peer=@{profile_name},
# ignore DENIED message on / remount
deny mount options=(ro, remount) -> /,
deny mount options=(ro, remount, silent) -> /,
# allow tmpfs mounts everywhere
mount fstype=tmpfs,
# allow hugetlbfs mounts everywhere
mount fstype=hugetlbfs,
# allow mqueue mounts everywhere
mount fstype=mqueue,
# allow fuse mounts everywhere
mount fstype=fuse,
mount fstype=fuse.*,
# deny access under /proc/bus to avoid e.g. messing with pci devices directly
deny @{PROC}/bus/** wklx,
# deny writes in /proc/sys/fs but allow binfmt_misc to be mounted
mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/,
deny @{PROC}/sys/fs/** wklx,
# allow efivars to be mounted, writing to it will be blocked though
mount fstype=efivarfs -> /sys/firmware/efi/efivars/,
# block some other dangerous paths
deny @{PROC}/kcore rwklx,
deny @{PROC}/sysrq-trigger rwklx,
# deny writes in /sys except for /sys/fs/cgroup, also allow
# fusectl, securityfs and debugfs to be mounted there (read-only)
mount fstype=fusectl -> /sys/fs/fuse/connections/,
mount fstype=securityfs -> /sys/kernel/security/,
mount fstype=debugfs -> /sys/kernel/debug/,
deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/,
mount fstype=proc -> /proc/,
mount fstype=sysfs -> /sys/,
mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,
deny /sys/firmware/efi/efivars/** rwklx,
# note, /sys/kernel/security/** handled below
mount options=(move) /sys/fs/cgroup/cgmanager/ -> /sys/fs/cgroup/cgmanager.lower/,
mount options=(ro, nosuid, nodev, noexec, remount, strictatime) -> /sys/fs/cgroup/,
# deny reads from debugfs
deny /sys/kernel/debug/{,**} rwklx,
# allow paths to be made slave, shared, private or unbindable
# FIXME: This currently doesn't work due to the apparmor parser treating those as allowing all mounts.
# mount options=(rw,make-slave) -> **,
# mount options=(rw,make-rslave) -> **,
# mount options=(rw,make-shared) -> **,
# mount options=(rw,make-rshared) -> **,
# mount options=(rw,make-private) -> **,
# mount options=(rw,make-rprivate) -> **,
# mount options=(rw,make-unbindable) -> **,
# mount options=(rw,make-runbindable) -> **,
# allow bind-mounts of anything except /proc, /sys and /dev
mount options=(rw,bind) /[^spd]*{,/**},
mount options=(rw,bind) /d[^e]*{,/**},
mount options=(rw,bind) /de[^v]*{,/**},
mount options=(rw,bind) /dev/.[^l]*{,/**},
mount options=(rw,bind) /dev/.l[^x]*{,/**},
mount options=(rw,bind) /dev/.lx[^c]*{,/**},
mount options=(rw,bind) /dev/.lxc?*{,/**},
mount options=(rw,bind) /dev/[^.]*{,/**},
mount options=(rw,bind) /dev?*{,/**},
mount options=(rw,bind) /p[^r]*{,/**},
mount options=(rw,bind) /pr[^o]*{,/**},
mount options=(rw,bind) /pro[^c]*{,/**},
mount options=(rw,bind) /proc?*{,/**},
mount options=(rw,bind) /s[^y]*{,/**},
mount options=(rw,bind) /sy[^s]*{,/**},
mount options=(rw,bind) /sys?*{,/**},
# allow moving mounts except for /proc, /sys and /dev
mount options=(rw,move) /[^spd]*{,/**},
mount options=(rw,move) /d[^e]*{,/**},
mount options=(rw,move) /de[^v]*{,/**},
mount options=(rw,move) /dev/.[^l]*{,/**},
mount options=(rw,move) /dev/.l[^x]*{,/**},
mount options=(rw,move) /dev/.lx[^c]*{,/**},
mount options=(rw,move) /dev/.lxc?*{,/**},
mount options=(rw,move) /dev/[^.]*{,/**},
mount options=(rw,move) /dev?*{,/**},
mount options=(rw,move) /p[^r]*{,/**},
mount options=(rw,move) /pr[^o]*{,/**},
mount options=(rw,move) /pro[^c]*{,/**},
mount options=(rw,move) /proc?*{,/**},
mount options=(rw,move) /s[^y]*{,/**},
mount options=(rw,move) /sy[^s]*{,/**},
mount options=(rw,move) /sys?*{,/**},
# generated by: lxc-generate-aa-rules.py container-rules.base
deny /proc/sys/[^kn]*{,/**} wklx,
deny /proc/sys/k[^e]*{,/**} wklx,
deny /proc/sys/ke[^r]*{,/**} wklx,
deny /proc/sys/ker[^n]*{,/**} wklx,
deny /proc/sys/kern[^e]*{,/**} wklx,
deny /proc/sys/kerne[^l]*{,/**} wklx,
deny /proc/sys/kernel/[^smhd]*{,/**} wklx,
deny /proc/sys/kernel/d[^o]*{,/**} wklx,
deny /proc/sys/kernel/do[^m]*{,/**} wklx,
deny /proc/sys/kernel/dom[^a]*{,/**} wklx,
deny /proc/sys/kernel/doma[^i]*{,/**} wklx,
deny /proc/sys/kernel/domai[^n]*{,/**} wklx,
deny /proc/sys/kernel/domain[^n]*{,/**} wklx,
deny /proc/sys/kernel/domainn[^a]*{,/**} wklx,
deny /proc/sys/kernel/domainna[^m]*{,/**} wklx,
deny /proc/sys/kernel/domainnam[^e]*{,/**} wklx,
deny /proc/sys/kernel/domainname?*{,/**} wklx,
deny /proc/sys/kernel/h[^o]*{,/**} wklx,
deny /proc/sys/kernel/ho[^s]*{,/**} wklx,
deny /proc/sys/kernel/hos[^t]*{,/**} wklx,
deny /proc/sys/kernel/host[^n]*{,/**} wklx,
deny /proc/sys/kernel/hostn[^a]*{,/**} wklx,
deny /proc/sys/kernel/hostna[^m]*{,/**} wklx,
deny /proc/sys/kernel/hostnam[^e]*{,/**} wklx,
deny /proc/sys/kernel/hostname?*{,/**} wklx,
deny /proc/sys/kernel/m[^s]*{,/**} wklx,
deny /proc/sys/kernel/ms[^g]*{,/**} wklx,
deny /proc/sys/kernel/msg*/** wklx,
deny /proc/sys/kernel/s[^he]*{,/**} wklx,
deny /proc/sys/kernel/se[^m]*{,/**} wklx,
deny /proc/sys/kernel/sem*/** wklx,
deny /proc/sys/kernel/sh[^m]*{,/**} wklx,
deny /proc/sys/kernel/shm*/** wklx,
deny /proc/sys/kernel?*{,/**} wklx,
deny /proc/sys/n[^e]*{,/**} wklx,
deny /proc/sys/ne[^t]*{,/**} wklx,
deny /proc/sys/net?*{,/**} wklx,
deny /sys/[^fdck]*{,/**} wklx,
deny /sys/c[^l]*{,/**} wklx,
deny /sys/cl[^a]*{,/**} wklx,
deny /sys/cla[^s]*{,/**} wklx,
deny /sys/clas[^s]*{,/**} wklx,
deny /sys/class/[^n]*{,/**} wklx,
deny /sys/class/n[^e]*{,/**} wklx,
deny /sys/class/ne[^t]*{,/**} wklx,
deny /sys/class/net?*{,/**} wklx,
deny /sys/class?*{,/**} wklx,
deny /sys/d[^e]*{,/**} wklx,
deny /sys/de[^v]*{,/**} wklx,
deny /sys/dev[^i]*{,/**} wklx,
deny /sys/devi[^c]*{,/**} wklx,
deny /sys/devic[^e]*{,/**} wklx,
deny /sys/device[^s]*{,/**} wklx,
deny /sys/devices/[^v]*{,/**} wklx,
deny /sys/devices/v[^i]*{,/**} wklx,
deny /sys/devices/vi[^r]*{,/**} wklx,
deny /sys/devices/vir[^t]*{,/**} wklx,
deny /sys/devices/virt[^u]*{,/**} wklx,
deny /sys/devices/virtu[^a]*{,/**} wklx,
deny /sys/devices/virtua[^l]*{,/**} wklx,
deny /sys/devices/virtual/[^n]*{,/**} wklx,
deny /sys/devices/virtual/n[^e]*{,/**} wklx,
deny /sys/devices/virtual/ne[^t]*{,/**} wklx,
deny /sys/devices/virtual/net?*{,/**} wklx,
deny /sys/devices/virtual?*{,/**} wklx,
deny /sys/devices?*{,/**} wklx,
deny /sys/f[^s]*{,/**} wklx,
deny /sys/fs/[^c]*{,/**} wklx,
deny /sys/fs/c[^g]*{,/**} wklx,
deny /sys/fs/cg[^r]*{,/**} wklx,
deny /sys/fs/cgr[^o]*{,/**} wklx,
deny /sys/fs/cgro[^u]*{,/**} wklx,
deny /sys/fs/cgrou[^p]*{,/**} wklx,
deny /sys/fs/cgroup?*{,/**} wklx,
deny /sys/fs?*{,/**} wklx,
`
const AA_PROFILE_NESTING = `
pivot_root,
ptrace,
signal,
deny /dev/.lxd/proc/** rw,
deny /dev/.lxd/sys/** rw,
mount /var/lib/lxd/shmounts/ -> /var/lib/lxd/shmounts/,
mount none -> /var/lib/lxd/shmounts/,
mount fstype=proc -> /usr/lib/*/lxc/**,
mount fstype=sysfs -> /usr/lib/*/lxc/**,
mount options=(rw,bind),
mount options=(rw,rbind),
mount options=(rw,make-rshared),
# there doesn't seem to be a way to ask for:
# mount options=(ro,nosuid,nodev,noexec,remount,bind),
# as we always get mount to $cdir/proc/sys with those flags denied
# So allow all mounts until that is straightened out:
mount,
mount options=bind /var/lib/lxd/shmounts/** -> /var/lib/lxd/**,
`
const AA_PROFILE_UNPRIVILEGED = `
pivot_root,
mount options=(rw,make-slave) -> **,
mount options=(rw,make-rslave) -> **,
mount options=(rw,make-shared) -> **,
mount options=(rw,make-rshared) -> **,
mount options=(rw,make-private) -> **,
mount options=(rw,make-rprivate) -> **,
mount options=(rw,make-unbindable) -> **,
mount options=(rw,make-runbindable) -> **,
mount options=(rw,bind),
mount options=(rw,rbind),
`
func mkApparmorName(name string) string {
if len(name)+7 >= 253 {
hash := sha256.New()
io.WriteString(hash, name)
return fmt.Sprintf("%x", hash.Sum(nil))
}
return name
}
func AANamespace(c container) string {
/* / is not allowed in apparmor namespace names; let's also trim the
* leading / so it doesn't look like "-var-lib-lxd"
*/
lxddir := strings.Replace(strings.Trim(shared.VarPath(""), "/"), "/", "-", -1)
lxddir = mkApparmorName(lxddir)
return fmt.Sprintf("lxd-%s_<%s>", c.Name(), lxddir)
}
func AAProfileFull(c container) string {
lxddir := shared.VarPath("")
lxddir = mkApparmorName(lxddir)
return fmt.Sprintf("lxd-%s_<%s>", c.Name(), lxddir)
}
func AAProfileShort(c container) string {
return fmt.Sprintf("lxd-%s", c.Name())
}
// getProfileContent generates the apparmor profile template from the given
// container. This includes the stock lxc includes as well as stuff from
// raw.apparmor.
func getAAProfileContent(c container) string {
profile := strings.TrimLeft(AA_PROFILE_BASE, "\n")
// Apply new features
if aaParserSupports("unix") {
profile += `
### Feature: unix
# Allow receive via unix sockets from anywhere
unix (receive),
# Allow all unix in the container
unix peer=(label=@{profile_name}),
`
}
// Apply cgns bits
if shared.PathExists("/proc/self/ns/cgroup") {
profile += "\n ### Feature: cgroup namespace\n"
profile += " mount fstype=cgroup -> /sys/fs/cgroup/**,\n"
}
state := c.DaemonState()
if state.OS.AppArmorStacking && !state.OS.AppArmorStacked {
profile += "\n ### Feature: apparmor stacking\n"
profile += ` ### Configuration: apparmor profile loading (in namespace)
deny /sys/k[^e]*{,/**} wklx,
deny /sys/ke[^r]*{,/**} wklx,
deny /sys/ker[^n]*{,/**} wklx,
deny /sys/kern[^e]*{,/**} wklx,
deny /sys/kerne[^l]*{,/**} wklx,
deny /sys/kernel/[^s]*{,/**} wklx,
deny /sys/kernel/s[^e]*{,/**} wklx,
deny /sys/kernel/se[^c]*{,/**} wklx,
deny /sys/kernel/sec[^u]*{,/**} wklx,
deny /sys/kernel/secu[^r]*{,/**} wklx,
deny /sys/kernel/secur[^i]*{,/**} wklx,
deny /sys/kernel/securi[^t]*{,/**} wklx,
deny /sys/kernel/securit[^y]*{,/**} wklx,
deny /sys/kernel/security/[^a]*{,/**} wklx,
deny /sys/kernel/security/a[^p]*{,/**} wklx,
deny /sys/kernel/security/ap[^p]*{,/**} wklx,
deny /sys/kernel/security/app[^a]*{,/**} wklx,
deny /sys/kernel/security/appa[^r]*{,/**} wklx,
deny /sys/kernel/security/appar[^m]*{,/**} wklx,
deny /sys/kernel/security/apparm[^o]*{,/**} wklx,
deny /sys/kernel/security/apparmo[^r]*{,/**} wklx,
deny /sys/kernel/security/apparmor?*{,/**} wklx,
deny /sys/kernel/security?*{,/**} wklx,
deny /sys/kernel?*{,/**} wklx,
`
profile += fmt.Sprintf(" change_profile -> \":%s:*\",\n", AANamespace(c))
profile += fmt.Sprintf(" change_profile -> \":%s://*\",\n", AANamespace(c))
} else {
profile += "\n ### Feature: apparmor stacking (not present)\n"
profile += " deny /sys/k*{,/**} rwklx,\n"
}
if c.IsNesting() {
// Apply nesting bits
profile += "\n ### Configuration: nesting\n"
profile += strings.TrimLeft(AA_PROFILE_NESTING, "\n")
if !state.OS.AppArmorStacking || state.OS.AppArmorStacked {
profile += fmt.Sprintf(" change_profile -> \"%s\",\n", AAProfileFull(c))
}
}
if !c.IsPrivileged() || state.OS.RunningInUserNS {
// Apply unprivileged bits
profile += "\n ### Configuration: unprivileged containers\n"
profile += strings.TrimLeft(AA_PROFILE_UNPRIVILEGED, "\n")
}
// Append raw.apparmor
rawApparmor, ok := c.ExpandedConfig()["raw.apparmor"]
if ok {
profile += "\n ### Configuration: raw.apparmor\n"
for _, line := range strings.Split(strings.Trim(rawApparmor, "\n"), "\n") {
profile += fmt.Sprintf(" %s\n", line)
}
}
return fmt.Sprintf(`#include <tunables/global>
profile "%s" flags=(attach_disconnected,mediate_deleted) {
%s
}
`, AAProfileFull(c), strings.Trim(profile, "\n"))
}
func runApparmor(command string, c container) error {
state := c.DaemonState()
if !state.OS.AppArmorAvailable {
return nil
}
output, err := shared.RunCommand("apparmor_parser", []string{
fmt.Sprintf("-%sWL", command),
path.Join(aaPath, "cache"),
path.Join(aaPath, "profiles", AAProfileShort(c)),
}...)
if err != nil {
logger.Error("Running apparmor",
log.Ctx{"action": command, "output": output, "err": err})
}
return err
}
func mkApparmorNamespace(c container, namespace string) error {
state := c.DaemonState()
if !state.OS.AppArmorStacking || state.OS.AppArmorStacked {
return nil
}
p := path.Join("/sys/kernel/security/apparmor/policy/namespaces", namespace)
if err := os.Mkdir(p, 0755); !os.IsExist(err) {
return err
}
return nil
}
// Ensure that the container's policy is loaded into the kernel so the
// container can boot.
func AALoadProfile(c container) error {
state := c.DaemonState()
if !state.OS.AppArmorAdmin {
return nil
}
if err := mkApparmorNamespace(c, AANamespace(c)); err != nil {
return err
}
/* In order to avoid forcing a profile parse (potentially slow) on
* every container start, let's use apparmor's binary policy cache,
* which checks mtime of the files to figure out if the policy needs to
* be regenerated.
*
* Since it uses mtimes, we shouldn't just always write out our local
* apparmor template; instead we should check to see whether the
* template is the same as ours. If it isn't we should write our
* version out so that the new changes are reflected and we definitely
* force a recompile.
*/
profile := path.Join(aaPath, "profiles", AAProfileShort(c))
content, err := ioutil.ReadFile(profile)
if err != nil && !os.IsNotExist(err) {
return err
}
updated := getAAProfileContent(c)
if string(content) != string(updated) {
if err := os.MkdirAll(path.Join(aaPath, "cache"), 0700); err != nil {
return err
}
if err := os.MkdirAll(path.Join(aaPath, "profiles"), 0700); err != nil {
return err
}
if err := ioutil.WriteFile(profile, []byte(updated), 0600); err != nil {
return err
}
}
return runApparmor(APPARMOR_CMD_LOAD, c)
}
// Ensure that the container's policy namespace is unloaded to free kernel
// memory. This does not delete the policy from disk or cache.
func AADestroy(c container) error {
state := c.DaemonState()
if !state.OS.AppArmorAdmin {
return nil
}
if state.OS.AppArmorStacking && !state.OS.AppArmorStacked {
p := path.Join("/sys/kernel/security/apparmor/policy/namespaces", AANamespace(c))
if err := os.Remove(p); err != nil {
logger.Error("error removing apparmor namespace", log.Ctx{"err": err, "ns": p})
}
}
return runApparmor(APPARMOR_CMD_UNLOAD, c)
}
// Parse the profile without loading it into the kernel.
func AAParseProfile(c container) error {
state := c.DaemonState()
if !state.OS.AppArmorAvailable {
return nil
}
return runApparmor(APPARMOR_CMD_PARSE, c)
}
// Delete the policy from cache/disk.
func AADeleteProfile(c container) {
state := c.DaemonState()
if !state.OS.AppArmorAdmin {
return
}
/* It's ok if these deletes fail: if the container was never started,
* we'll have never written a profile or cached it.
*/
os.Remove(path.Join(aaPath, "cache", AAProfileShort(c)))
os.Remove(path.Join(aaPath, "profiles", AAProfileShort(c)))
}
func aaParserSupports(feature string) bool {
out, err := shared.RunCommand("apparmor_parser", "--version")
if err != nil {
return false
}
major := 0
minor := 0
micro := 0
_, err = fmt.Sscanf(strings.Split(out, "\n")[0], "AppArmor parser version %d.%d.%d", &major, &minor, &micro)
if err != nil {
return false
}
switch feature {
case "unix":
if major < 2 {
return false
}
if major == 2 && minor < 10 {
return false
}
if major == 2 && minor == 10 && micro < 95 {
return false
}
}
return true
}