Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| package main | |
| import ( | |
| "crypto/sha256" | |
| "fmt" | |
| "io" | |
| "io/ioutil" | |
| "os" | |
| "path" | |
| "strings" | |
| "github.com/lxc/lxd/shared" | |
| "github.com/lxc/lxd/shared/logger" | |
| log "github.com/lxc/lxd/shared/log15" | |
| ) | |
// Action letters understood by runApparmor, which expands each one into a
// "-<letter>WL" option on the apparmor_parser command line.

// APPARMOR_CMD_LOAD is the action used by AALoadProfile.
const APPARMOR_CMD_LOAD = "r"

// APPARMOR_CMD_UNLOAD is the action used by AADestroy.
const APPARMOR_CMD_UNLOAD = "R"

// APPARMOR_CMD_PARSE is the action used by AAParseProfile.
const APPARMOR_CMD_PARSE = "Q"
// aaPath is the directory, resolved through shared.VarPath("security",
// "apparmor"), under which this file keeps its "cache" and "profiles"
// subdirectories.
| var aaPath = shared.VarPath("security", "apparmor") | |
// AA_PROFILE_BASE holds the AppArmor rules that getAAProfileContent places at
// the start of every generated container profile, before any feature- or
// configuration-specific rules are appended.
| const AA_PROFILE_BASE = ` | |
| ### Base profile | |
| capability, | |
| dbus, | |
| file, | |
| network, | |
| umount, | |
| # Allow us to receive signals from anywhere. | |
| signal (receive), | |
| # Allow us to send signals to ourselves | |
| signal peer=@{profile_name}, | |
| # Allow other processes to read our /proc entries, futexes, perf tracing and | |
| # kcmp for now (they will need 'read' in the first place). Administrators can | |
| # override with: | |
| # deny ptrace (readby) ... | |
| ptrace (readby), | |
| # Allow other processes to trace us by default (they will need 'trace' in | |
| # the first place). Administrators can override with: | |
| # deny ptrace (tracedby) ... | |
| ptrace (tracedby), | |
| # Allow us to ptrace ourselves | |
| ptrace peer=@{profile_name}, | |
| # ignore DENIED message on / remount | |
| deny mount options=(ro, remount) -> /, | |
| deny mount options=(ro, remount, silent) -> /, | |
| # allow tmpfs mounts everywhere | |
| mount fstype=tmpfs, | |
| # allow hugetlbfs mounts everywhere | |
| mount fstype=hugetlbfs, | |
| # allow mqueue mounts everywhere | |
| mount fstype=mqueue, | |
| # allow fuse mounts everywhere | |
| mount fstype=fuse, | |
| mount fstype=fuse.*, | |
| # deny access under /proc/bus to avoid e.g. messing with pci devices directly | |
| deny @{PROC}/bus/** wklx, | |
| # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted | |
| mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/, | |
| deny @{PROC}/sys/fs/** wklx, | |
| # allow efivars to be mounted, writing to it will be blocked though | |
| mount fstype=efivarfs -> /sys/firmware/efi/efivars/, | |
| # block some other dangerous paths | |
| deny @{PROC}/kcore rwklx, | |
| deny @{PROC}/sysrq-trigger rwklx, | |
| # deny writes in /sys except for /sys/fs/cgroup, also allow | |
| # fusectl, securityfs and debugfs to be mounted there (read-only) | |
| mount fstype=fusectl -> /sys/fs/fuse/connections/, | |
| mount fstype=securityfs -> /sys/kernel/security/, | |
| mount fstype=debugfs -> /sys/kernel/debug/, | |
| deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/, | |
| mount fstype=proc -> /proc/, | |
| mount fstype=sysfs -> /sys/, | |
| mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/, | |
| deny /sys/firmware/efi/efivars/** rwklx, | |
| # note, /sys/kernel/security/** handled below | |
| mount options=(move) /sys/fs/cgroup/cgmanager/ -> /sys/fs/cgroup/cgmanager.lower/, | |
| mount options=(ro, nosuid, nodev, noexec, remount, strictatime) -> /sys/fs/cgroup/, | |
| # deny reads from debugfs | |
| deny /sys/kernel/debug/{,**} rwklx, | |
| # allow paths to be made slave, shared, private or unbindable | |
| # FIXME: This currently doesn't work due to the apparmor parser treating those as allowing all mounts. | |
| # mount options=(rw,make-slave) -> **, | |
| # mount options=(rw,make-rslave) -> **, | |
| # mount options=(rw,make-shared) -> **, | |
| # mount options=(rw,make-rshared) -> **, | |
| # mount options=(rw,make-private) -> **, | |
| # mount options=(rw,make-rprivate) -> **, | |
| # mount options=(rw,make-unbindable) -> **, | |
| # mount options=(rw,make-runbindable) -> **, | |
| # allow bind-mounts of anything except /proc, /sys and /dev | |
| mount options=(rw,bind) /[^spd]*{,/**}, | |
| mount options=(rw,bind) /d[^e]*{,/**}, | |
| mount options=(rw,bind) /de[^v]*{,/**}, | |
| mount options=(rw,bind) /dev/.[^l]*{,/**}, | |
| mount options=(rw,bind) /dev/.l[^x]*{,/**}, | |
| mount options=(rw,bind) /dev/.lx[^c]*{,/**}, | |
| mount options=(rw,bind) /dev/.lxc?*{,/**}, | |
| mount options=(rw,bind) /dev/[^.]*{,/**}, | |
| mount options=(rw,bind) /dev?*{,/**}, | |
| mount options=(rw,bind) /p[^r]*{,/**}, | |
| mount options=(rw,bind) /pr[^o]*{,/**}, | |
| mount options=(rw,bind) /pro[^c]*{,/**}, | |
| mount options=(rw,bind) /proc?*{,/**}, | |
| mount options=(rw,bind) /s[^y]*{,/**}, | |
| mount options=(rw,bind) /sy[^s]*{,/**}, | |
| mount options=(rw,bind) /sys?*{,/**}, | |
| # allow moving mounts except for /proc, /sys and /dev | |
| mount options=(rw,move) /[^spd]*{,/**}, | |
| mount options=(rw,move) /d[^e]*{,/**}, | |
| mount options=(rw,move) /de[^v]*{,/**}, | |
| mount options=(rw,move) /dev/.[^l]*{,/**}, | |
| mount options=(rw,move) /dev/.l[^x]*{,/**}, | |
| mount options=(rw,move) /dev/.lx[^c]*{,/**}, | |
| mount options=(rw,move) /dev/.lxc?*{,/**}, | |
| mount options=(rw,move) /dev/[^.]*{,/**}, | |
| mount options=(rw,move) /dev?*{,/**}, | |
| mount options=(rw,move) /p[^r]*{,/**}, | |
| mount options=(rw,move) /pr[^o]*{,/**}, | |
| mount options=(rw,move) /pro[^c]*{,/**}, | |
| mount options=(rw,move) /proc?*{,/**}, | |
| mount options=(rw,move) /s[^y]*{,/**}, | |
| mount options=(rw,move) /sy[^s]*{,/**}, | |
| mount options=(rw,move) /sys?*{,/**}, | |
| # generated by: lxc-generate-aa-rules.py container-rules.base | |
| deny /proc/sys/[^kn]*{,/**} wklx, | |
| deny /proc/sys/k[^e]*{,/**} wklx, | |
| deny /proc/sys/ke[^r]*{,/**} wklx, | |
| deny /proc/sys/ker[^n]*{,/**} wklx, | |
| deny /proc/sys/kern[^e]*{,/**} wklx, | |
| deny /proc/sys/kerne[^l]*{,/**} wklx, | |
| deny /proc/sys/kernel/[^smhd]*{,/**} wklx, | |
| deny /proc/sys/kernel/d[^o]*{,/**} wklx, | |
| deny /proc/sys/kernel/do[^m]*{,/**} wklx, | |
| deny /proc/sys/kernel/dom[^a]*{,/**} wklx, | |
| deny /proc/sys/kernel/doma[^i]*{,/**} wklx, | |
| deny /proc/sys/kernel/domai[^n]*{,/**} wklx, | |
| deny /proc/sys/kernel/domain[^n]*{,/**} wklx, | |
| deny /proc/sys/kernel/domainn[^a]*{,/**} wklx, | |
| deny /proc/sys/kernel/domainna[^m]*{,/**} wklx, | |
| deny /proc/sys/kernel/domainnam[^e]*{,/**} wklx, | |
| deny /proc/sys/kernel/domainname?*{,/**} wklx, | |
| deny /proc/sys/kernel/h[^o]*{,/**} wklx, | |
| deny /proc/sys/kernel/ho[^s]*{,/**} wklx, | |
| deny /proc/sys/kernel/hos[^t]*{,/**} wklx, | |
| deny /proc/sys/kernel/host[^n]*{,/**} wklx, | |
| deny /proc/sys/kernel/hostn[^a]*{,/**} wklx, | |
| deny /proc/sys/kernel/hostna[^m]*{,/**} wklx, | |
| deny /proc/sys/kernel/hostnam[^e]*{,/**} wklx, | |
| deny /proc/sys/kernel/hostname?*{,/**} wklx, | |
| deny /proc/sys/kernel/m[^s]*{,/**} wklx, | |
| deny /proc/sys/kernel/ms[^g]*{,/**} wklx, | |
| deny /proc/sys/kernel/msg*/** wklx, | |
| deny /proc/sys/kernel/s[^he]*{,/**} wklx, | |
| deny /proc/sys/kernel/se[^m]*{,/**} wklx, | |
| deny /proc/sys/kernel/sem*/** wklx, | |
| deny /proc/sys/kernel/sh[^m]*{,/**} wklx, | |
| deny /proc/sys/kernel/shm*/** wklx, | |
| deny /proc/sys/kernel?*{,/**} wklx, | |
| deny /proc/sys/n[^e]*{,/**} wklx, | |
| deny /proc/sys/ne[^t]*{,/**} wklx, | |
| deny /proc/sys/net?*{,/**} wklx, | |
| deny /sys/[^fdck]*{,/**} wklx, | |
| deny /sys/c[^l]*{,/**} wklx, | |
| deny /sys/cl[^a]*{,/**} wklx, | |
| deny /sys/cla[^s]*{,/**} wklx, | |
| deny /sys/clas[^s]*{,/**} wklx, | |
| deny /sys/class/[^n]*{,/**} wklx, | |
| deny /sys/class/n[^e]*{,/**} wklx, | |
| deny /sys/class/ne[^t]*{,/**} wklx, | |
| deny /sys/class/net?*{,/**} wklx, | |
| deny /sys/class?*{,/**} wklx, | |
| deny /sys/d[^e]*{,/**} wklx, | |
| deny /sys/de[^v]*{,/**} wklx, | |
| deny /sys/dev[^i]*{,/**} wklx, | |
| deny /sys/devi[^c]*{,/**} wklx, | |
| deny /sys/devic[^e]*{,/**} wklx, | |
| deny /sys/device[^s]*{,/**} wklx, | |
| deny /sys/devices/[^v]*{,/**} wklx, | |
| deny /sys/devices/v[^i]*{,/**} wklx, | |
| deny /sys/devices/vi[^r]*{,/**} wklx, | |
| deny /sys/devices/vir[^t]*{,/**} wklx, | |
| deny /sys/devices/virt[^u]*{,/**} wklx, | |
| deny /sys/devices/virtu[^a]*{,/**} wklx, | |
| deny /sys/devices/virtua[^l]*{,/**} wklx, | |
| deny /sys/devices/virtual/[^n]*{,/**} wklx, | |
| deny /sys/devices/virtual/n[^e]*{,/**} wklx, | |
| deny /sys/devices/virtual/ne[^t]*{,/**} wklx, | |
| deny /sys/devices/virtual/net?*{,/**} wklx, | |
| deny /sys/devices/virtual?*{,/**} wklx, | |
| deny /sys/devices?*{,/**} wklx, | |
| deny /sys/f[^s]*{,/**} wklx, | |
| deny /sys/fs/[^c]*{,/**} wklx, | |
| deny /sys/fs/c[^g]*{,/**} wklx, | |
| deny /sys/fs/cg[^r]*{,/**} wklx, | |
| deny /sys/fs/cgr[^o]*{,/**} wklx, | |
| deny /sys/fs/cgro[^u]*{,/**} wklx, | |
| deny /sys/fs/cgrou[^p]*{,/**} wklx, | |
| deny /sys/fs/cgroup?*{,/**} wklx, | |
| deny /sys/fs?*{,/**} wklx, | |
| ` | |
// AA_PROFILE_NESTING holds the extra AppArmor rules that getAAProfileContent
// appends to a container's profile when c.IsNesting() reports true.
| const AA_PROFILE_NESTING = ` | |
| pivot_root, | |
| ptrace, | |
| signal, | |
| deny /dev/.lxd/proc/** rw, | |
| deny /dev/.lxd/sys/** rw, | |
| mount /var/lib/lxd/shmounts/ -> /var/lib/lxd/shmounts/, | |
| mount none -> /var/lib/lxd/shmounts/, | |
| mount fstype=proc -> /usr/lib/*/lxc/**, | |
| mount fstype=sysfs -> /usr/lib/*/lxc/**, | |
| mount options=(rw,bind), | |
| mount options=(rw,rbind), | |
| mount options=(rw,make-rshared), | |
| # there doesn't seem to be a way to ask for: | |
| # mount options=(ro,nosuid,nodev,noexec,remount,bind), | |
| # as we always get mount to $cdir/proc/sys with those flags denied | |
| # So allow all mounts until that is straightened out: | |
| mount, | |
| mount options=bind /var/lib/lxd/shmounts/** -> /var/lib/lxd/**, | |
| ` | |
// AA_PROFILE_UNPRIVILEGED holds the extra AppArmor rules that
// getAAProfileContent appends when the container is not privileged or when
// state.OS.RunningInUserNS is set.
| const AA_PROFILE_UNPRIVILEGED = ` | |
| pivot_root, | |
| mount options=(rw,make-slave) -> **, | |
| mount options=(rw,make-rslave) -> **, | |
| mount options=(rw,make-shared) -> **, | |
| mount options=(rw,make-rshared) -> **, | |
| mount options=(rw,make-private) -> **, | |
| mount options=(rw,make-rprivate) -> **, | |
| mount options=(rw,make-unbindable) -> **, | |
| mount options=(rw,make-runbindable) -> **, | |
| mount options=(rw,bind), | |
| mount options=(rw,rbind), | |
| ` | |
// mkApparmorName returns name unchanged while len(name)+7 stays below 253.
// Once that padded length reaches 253 it returns the lowercase hex encoding
// of the SHA-256 digest of name instead.
//
// NOTE(review): the 7 extra bytes presumably account for the "lxd-" prefix
// and the "_<>" decoration added by the callers -- confirm.
func mkApparmorName(name string) string {
	if len(name)+7 < 253 {
		return name
	}

	digest := sha256.Sum256([]byte(name))
	return fmt.Sprintf("%x", digest[:])
}
| func AANamespace(c container) string { | |
| /* / is not allowed in apparmor namespace names; let's also trim the | |
| * leading / so it doesn't look like "-var-lib-lxd" | |
| */ | |
| lxddir := strings.Replace(strings.Trim(shared.VarPath(""), "/"), "/", "-", -1) | |
| lxddir = mkApparmorName(lxddir) | |
| return fmt.Sprintf("lxd-%s_<%s>", c.Name(), lxddir) | |
| } | |
| func AAProfileFull(c container) string { | |
| lxddir := shared.VarPath("") | |
| lxddir = mkApparmorName(lxddir) | |
| return fmt.Sprintf("lxd-%s_<%s>", c.Name(), lxddir) | |
| } | |
| func AAProfileShort(c container) string { | |
| return fmt.Sprintf("lxd-%s", c.Name()) | |
| } | |
| // getAAProfileContent builds the complete apparmor profile text for the given | |
| // container: AA_PROFILE_BASE plus feature- and configuration-dependent rules, | |
| // followed by the container's raw.apparmor lines, wrapped in a profile block. | |
| func getAAProfileContent(c container) string { | |
| profile := strings.TrimLeft(AA_PROFILE_BASE, "\n") | |
| // Apply new features | |
| if aaParserSupports("unix") { | |
| profile += ` | |
| ### Feature: unix | |
| # Allow receive via unix sockets from anywhere | |
| unix (receive), | |
| # Allow all unix in the container | |
| unix peer=(label=@{profile_name}), | |
| ` | |
| } | |
| // Apply cgns bits | |
| if shared.PathExists("/proc/self/ns/cgroup") { | |
| profile += "\n ### Feature: cgroup namespace\n" | |
| profile += " mount fstype=cgroup -> /sys/fs/cgroup/**,\n" | |
| } | |
| state := c.DaemonState() | |
| if state.OS.AppArmorStacking && !state.OS.AppArmorStacked { | |
| profile += "\n ### Feature: apparmor stacking\n" | |
| profile += ` ### Configuration: apparmor profile loading (in namespace) | |
| deny /sys/k[^e]*{,/**} wklx, | |
| deny /sys/ke[^r]*{,/**} wklx, | |
| deny /sys/ker[^n]*{,/**} wklx, | |
| deny /sys/kern[^e]*{,/**} wklx, | |
| deny /sys/kerne[^l]*{,/**} wklx, | |
| deny /sys/kernel/[^s]*{,/**} wklx, | |
| deny /sys/kernel/s[^e]*{,/**} wklx, | |
| deny /sys/kernel/se[^c]*{,/**} wklx, | |
| deny /sys/kernel/sec[^u]*{,/**} wklx, | |
| deny /sys/kernel/secu[^r]*{,/**} wklx, | |
| deny /sys/kernel/secur[^i]*{,/**} wklx, | |
| deny /sys/kernel/securi[^t]*{,/**} wklx, | |
| deny /sys/kernel/securit[^y]*{,/**} wklx, | |
| deny /sys/kernel/security/[^a]*{,/**} wklx, | |
| deny /sys/kernel/security/a[^p]*{,/**} wklx, | |
| deny /sys/kernel/security/ap[^p]*{,/**} wklx, | |
| deny /sys/kernel/security/app[^a]*{,/**} wklx, | |
| deny /sys/kernel/security/appa[^r]*{,/**} wklx, | |
| deny /sys/kernel/security/appar[^m]*{,/**} wklx, | |
| deny /sys/kernel/security/apparm[^o]*{,/**} wklx, | |
| deny /sys/kernel/security/apparmo[^r]*{,/**} wklx, | |
| deny /sys/kernel/security/apparmor?*{,/**} wklx, | |
| deny /sys/kernel/security?*{,/**} wklx, | |
| deny /sys/kernel?*{,/**} wklx, | |
| ` | |
| profile += fmt.Sprintf(" change_profile -> \":%s:*\",\n", AANamespace(c)) | |
| profile += fmt.Sprintf(" change_profile -> \":%s://*\",\n", AANamespace(c)) | |
| } else { | |
| profile += "\n ### Feature: apparmor stacking (not present)\n" | |
| profile += " deny /sys/k*{,/**} rwklx,\n" | |
| } | |
| if c.IsNesting() { | |
| // Apply nesting bits | |
| profile += "\n ### Configuration: nesting\n" | |
| profile += strings.TrimLeft(AA_PROFILE_NESTING, "\n") | |
| if !state.OS.AppArmorStacking || state.OS.AppArmorStacked { | |
| profile += fmt.Sprintf(" change_profile -> \"%s\",\n", AAProfileFull(c)) | |
| } | |
| } | |
| if !c.IsPrivileged() || state.OS.RunningInUserNS { | |
| // Apply unprivileged bits | |
| profile += "\n ### Configuration: unprivileged containers\n" | |
| profile += strings.TrimLeft(AA_PROFILE_UNPRIVILEGED, "\n") | |
| } | |
| // Append raw.apparmor | |
| rawApparmor, ok := c.ExpandedConfig()["raw.apparmor"] | |
| if ok { | |
| profile += "\n ### Configuration: raw.apparmor\n" | |
| for _, line := range strings.Split(strings.Trim(rawApparmor, "\n"), "\n") { | |
| profile += fmt.Sprintf(" %s\n", line) | |
| } | |
| } | |
| return fmt.Sprintf(`#include <tunables/global> | |
| profile "%s" flags=(attach_disconnected,mediate_deleted) { | |
| %s | |
| } | |
| `, AAProfileFull(c), strings.Trim(profile, "\n")) | |
| } | |
| func runApparmor(command string, c container) error { | |
| state := c.DaemonState() | |
| if !state.OS.AppArmorAvailable { | |
| return nil | |
| } | |
| output, err := shared.RunCommand("apparmor_parser", []string{ | |
| fmt.Sprintf("-%sWL", command), | |
| path.Join(aaPath, "cache"), | |
| path.Join(aaPath, "profiles", AAProfileShort(c)), | |
| }...) | |
| if err != nil { | |
| logger.Error("Running apparmor", | |
| log.Ctx{"action": command, "output": output, "err": err}) | |
| } | |
| return err | |
| } | |
| func mkApparmorNamespace(c container, namespace string) error { | |
| state := c.DaemonState() | |
| if !state.OS.AppArmorStacking || state.OS.AppArmorStacked { | |
| return nil | |
| } | |
| p := path.Join("/sys/kernel/security/apparmor/policy/namespaces", namespace) | |
| if err := os.Mkdir(p, 0755); !os.IsExist(err) { | |
| return err | |
| } | |
| return nil | |
| } | |
| // Ensure that the container's policy is loaded into the kernel so the | |
| // container can boot. | |
| func AALoadProfile(c container) error { | |
| state := c.DaemonState() | |
| if !state.OS.AppArmorAdmin { | |
| return nil | |
| } | |
| if err := mkApparmorNamespace(c, AANamespace(c)); err != nil { | |
| return err | |
| } | |
| /* In order to avoid forcing a profile parse (potentially slow) on | |
| * every container start, let's use apparmor's binary policy cache, | |
| * which checks mtime of the files to figure out if the policy needs to | |
| * be regenerated. | |
| * | |
| * Since it uses mtimes, we shouldn't just always write out our local | |
| * apparmor template; instead we should check to see whether the | |
| * template is the same as ours. If it isn't we should write our | |
| * version out so that the new changes are reflected and we definitely | |
| * force a recompile. | |
| */ | |
| profile := path.Join(aaPath, "profiles", AAProfileShort(c)) | |
| content, err := ioutil.ReadFile(profile) | |
| if err != nil && !os.IsNotExist(err) { | |
| return err | |
| } | |
| updated := getAAProfileContent(c) | |
| if string(content) != string(updated) { | |
| if err := os.MkdirAll(path.Join(aaPath, "cache"), 0700); err != nil { | |
| return err | |
| } | |
| if err := os.MkdirAll(path.Join(aaPath, "profiles"), 0700); err != nil { | |
| return err | |
| } | |
| if err := ioutil.WriteFile(profile, []byte(updated), 0600); err != nil { | |
| return err | |
| } | |
| } | |
| return runApparmor(APPARMOR_CMD_LOAD, c) | |
| } | |
| // Ensure that the container's policy namespace is unloaded to free kernel | |
| // memory. This does not delete the policy from disk or cache. | |
| func AADestroy(c container) error { | |
| state := c.DaemonState() | |
| if !state.OS.AppArmorAdmin { | |
| return nil | |
| } | |
| if state.OS.AppArmorStacking && !state.OS.AppArmorStacked { | |
| p := path.Join("/sys/kernel/security/apparmor/policy/namespaces", AANamespace(c)) | |
| if err := os.Remove(p); err != nil { | |
| logger.Error("error removing apparmor namespace", log.Ctx{"err": err, "ns": p}) | |
| } | |
| } | |
| return runApparmor(APPARMOR_CMD_UNLOAD, c) | |
| } | |
| // Parse the profile without loading it into the kernel. | |
| func AAParseProfile(c container) error { | |
| state := c.DaemonState() | |
| if !state.OS.AppArmorAvailable { | |
| return nil | |
| } | |
| return runApparmor(APPARMOR_CMD_PARSE, c) | |
| } | |
| // Delete the policy from cache/disk. | |
| func AADeleteProfile(c container) { | |
| state := c.DaemonState() | |
| if !state.OS.AppArmorAdmin { | |
| return | |
| } | |
| /* It's ok if these deletes fail: if the container was never started, | |
| * we'll have never written a profile or cached it. | |
| */ | |
| os.Remove(path.Join(aaPath, "cache", AAProfileShort(c))) | |
| os.Remove(path.Join(aaPath, "profiles", AAProfileShort(c))) | |
| } | |
| func aaParserSupports(feature string) bool { | |
| out, err := shared.RunCommand("apparmor_parser", "--version") | |
| if err != nil { | |
| return false | |
| } | |
| major := 0 | |
| minor := 0 | |
| micro := 0 | |
| _, err = fmt.Sscanf(strings.Split(out, "\n")[0], "AppArmor parser version %d.%d.%d", &major, &minor, µ) | |
| if err != nil { | |
| return false | |
| } | |
| switch feature { | |
| case "unix": | |
| if major < 2 { | |
| return false | |
| } | |
| if major == 2 && minor < 10 { | |
| return false | |
| } | |
| if major == 2 && minor == 10 && micro < 95 { | |
| return false | |
| } | |
| } | |
| return true | |
| } |