Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LXD 4.10 breaks kubernetes support due to nf_conntrack_tcp_timeout_established #3627

Closed
ihanick opened this issue Jan 22, 2021 · 5 comments · Fixed by #3628
Closed

LXD 4.10 breaks kubernetes support due to nf_conntrack_tcp_timeout_established #3627

ihanick opened this issue Jan 22, 2021 · 5 comments · Fixed by #3628

Comments

@ihanick
Copy link

ihanick commented Jan 22, 2021

Required information

  • Distribution: ubuntu focal (Vagrant ubuntu/focal64)
  • Distribution version: snap LXD 4.10
  • The output of "lxc info" or if that fails:
# lxc info
config: {}
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- candid_config_key
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- rbac
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
- network_dns_search
- container_nic_routed_limits
- instance_nic_bridged_vlan
- network_state_bond_bridge
- usedby_consistency
- custom_block_volumes
- clustering_failure_domains
- resources_gpu_mdev
- console_vga_type
- projects_limits_disk
- network_type_macvlan
- network_type_sriov
- container_syscall_intercept_bpf_devices
- network_type_ovn
- projects_networks
- projects_networks_restricted_uplinks
- custom_volume_backup
- backup_override_name
- storage_rsync_compression
- network_type_physical
- network_ovn_external_subnets
- network_ovn_nat
- network_ovn_external_routes_remove
- tpm_device_type
- storage_zfs_clone_copy_rebase
- gpu_mdev
- resources_pci_iommu
- resources_network_usb
- resources_disk_address
- network_physical_ovn_ingress_mode
- network_ovn_dhcp
- network_physical_routes_anycast
- projects_limits_instances
- network_state_vlan
- instance_nic_bridged_port_isolation
- instance_bulk_state_change
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
  addresses: []
  architectures:
  - x86_64
  - i686
  certificate: |
    -----BEGIN CERTIFICATE-----
    MIICFTCCAZugAwIBAgIQc/8Yzq+wFeHe3PMLmrBCazAKBggqhkjOPQQDAzA6MRww
    GgYDVQQKExNsaW51eGNvbnRhaW5lcnMub3JnMRowGAYDVQQDDBFyb290QHVidW50
    dS1mb2NhbDAeFw0yMTAxMjIwMjI1NTJaFw0zMTAxMjAwMjI1NTJaMDoxHDAaBgNV
    BAoTE2xpbnV4Y29udGFpbmVycy5vcmcxGjAYBgNVBAMMEXJvb3RAdWJ1bnR1LWZv
    Y2FsMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEhWc0+9vwteON5yCJF4H2kthp/te/
    ItfBuU0dL0qgg2NXt8jEHWteJZvKNi/yT4Z4JuMilGqPjdnJRyCD8RW250s8IHnk
    gVg7zfSQz5YkG8DOq4VG7tjFxeLHby3AAtKTo2YwZDAOBgNVHQ8BAf8EBAMCBaAw
    EwYDVR0lBAwwCgYIKwYBBQUHAwEwDAYDVR0TAQH/BAIwADAvBgNVHREEKDAmggx1
    YnVudHUtZm9jYWyHBH8AAAGHEAAAAAAAAAAAAAAAAAAAAAEwCgYIKoZIzj0EAwMD
    aAAwZQIxAKCgC9yfKDjgqk6AKkq7y/AojiQGNynaRlsE7fvP7xFV1TjhuOZzArMS
    /TB4F8yF1QIwLtq6orJ4PyNx2zTZEdFuRS764y5Kn3MAT5MJ52eA8Xworjj1wgXs
    d3JkZCWw0qhx
    -----END CERTIFICATE-----
  certificate_fingerprint: 4d8f2e6f6eab1576797e1e7bbbd49c82a2b146ead0ea18458546bfb198f764e3
  driver: qemu | lxc
  driver_version: 5.2.0 | 4.0.6
  firewall: nftables
  kernel: Linux
  kernel_architecture: x86_64
  kernel_features:
    netnsid_getifaddrs: "true"
    seccomp_listener: "true"
    seccomp_listener_continue: "true"
    shiftfs: "false"
    uevent_injection: "true"
    unpriv_fscaps: "true"
  kernel_version: 5.4.0-29-generic
  lxc_features:
    cgroup2: "true"
    devpts_fd: "true"
    mount_injection_file: "true"
    network_gateway_device_route: "true"
    network_ipvlan: "true"
    network_l2proxy: "true"
    network_phys_macvlan_mtu: "true"
    network_veth_router: "true"
    pidfd: "true"
    seccomp_allow_deny_syntax: "true"
    seccomp_notify: "true"
    seccomp_proxy_send_notify_fd: "true"
  os_name: Ubuntu
  os_version: "20.04"
  project: default
  server: lxd
  server_clustered: false
  server_name: ubuntu-focal
  server_pid: 6617
  server_version: "4.10"
  storage: dir
  storage_version: "1"

Issue description

After a snap refresh from 4.9 to 4.10, I'm not able to run K3s. This Kubernetes distribution (and others) requires a specific sysctl value: net.netfilter.nf_conntrack_tcp_timeout_established=86400. In 4.9 it was possible to change the value; in 4.10 the value in procfs became read-only, and LXD does not respect the host's value either.

Steps to reproduce

  1. Load nf_conntrack module: modprobe nf_conntrack
  2. Change nf_conntrack_tcp_timeout_established to one day: sysctl net.netfilter.nf_conntrack_tcp_timeout_established=86400
  3. Create a container: lxc launch images:centos/7/amd64 c7 -c security.nesting=true -c security.privileged=true
  4. Attach to container's shell: lxc exec c7 bash
  5. Sysctl returns the wrong value, 432000 instead of the host's 86400: sysctl net.netfilter.nf_conntrack_tcp_timeout_established
  6. Attempting to change the value (unlike in 4.9) fails with "sysctl: setting key "net.netfilter.nf_conntrack_tcp_timeout_established": Read-only file system": sysctl net.netfilter.nf_conntrack_tcp_timeout_established=86400
@stgraber
Copy link
Member

Hmm, LXD doesn't have any control over that. This is purely up to the kernel.

@ihanick
Copy link
Author

ihanick commented Jan 22, 2021

This is purely up to the kernel.

The issue does not happen with snap install lxd --channel=4.9/stable
Also, the same problem can be reproduced on different kernels, e.g. on Fedora 33: 5.10.8-200.fc33.x86_64

@stgraber stgraber transferred this issue from canonical/lxd Jan 22, 2021
@stgraber
Copy link
Member

This is a liblxc regression in the latest stable release.
@brauner can you sort this one out ASAP and we'll cherry-pick a fix into the snap?

Basically /proc/sys is mounted read-only (expected) but /proc/sys/net is no longer being kept read-write.
I believe we merged a contribution recently which was supposed to introduce this behavior but ONLY if cap_net_admin is dropped.

In the stable-4.0 branch, that would be those two commits:

@ihanick
Copy link
Author

ihanick commented Jan 22, 2021

was supposed to introduce this behavior but ONLY if cap_net_admin is dropped

It checks only for lxc.cap.keep, but not lxc.cap.drop.

I've tried to specify lxc.cap.keep on the command line, but it's impossible to set lxc.cap.keep if lxc.cap.drop is specified.

Btw, because the container is privileged, it already has cap_net_admin

root@ubuntu-focal:~# lxc launch images:centos/7/amd64 c7 -c security.nesting=true -c security.privileged=true
Creating c7
Starting c7                               
root@ubuntu-focal:~# lxc exec c7 bash
[root@c7 ~]# cap
capsh      captoinfo  
[root@c7 ~]# capsh --print
Current: = cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,35,36,37+ep
Bounding set =cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,35,36,37
Securebits: 00/0x0/1'b0
 secure-noroot: no (unlocked)
 secure-no-suid-fixup: no (unlocked)
 secure-keep-caps: no (unlocked)
uid=0(root)
gid=0(root)
groups=

brauner pushed a commit to brauner/lxc that referenced this issue Jan 22, 2021
Fixes: lxc#3627
Cc: stable-4.0
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
@brauner
Copy link
Member

brauner commented Jan 22, 2021

Should be fixed by #3628.

brauner pushed a commit to brauner/lxc that referenced this issue Jan 22, 2021
Fixes: lxc#3627
Cc: stable-4.0
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
brauner pushed a commit that referenced this issue Jan 22, 2021
Fixes: #3627
Cc: stable-4.0
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Development

Successfully merging a pull request may close this issue.

3 participants