From 53e3bf3cd91437df958b77572c5cd769418e3fc8 Mon Sep 17 00:00:00 2001 From: trown Date: Fri, 3 May 2019 15:42:43 -0400 Subject: [PATCH] openstack: Remove the Service VM The experimental OpenStack backend used to create an extra server running DNS and load balancer services that the cluster needed. OpenStack does not always come with DNSaaS or LBaaS so we had to provide the functionality the OpenShift cluster depends on (e.g. the etcd SRV records, the api-int records & load balancing, etc.). This approach is undesirable for two reasons: first, it adds an extra node that the other IPI platforms do not need. Second, this node is a single point of failure. The Baremetal platform has faced the same issues and they have solved them with a few virtual IP addresses managed by keepalived in combination with a coredns static pod running on every node using the mDNS protocol to update records as new nodes are added or removed and a similar static pod haproxy to load balance the control plane internally. The VIPs are defined here in the installer and they use the PlatformStatus field to be passed to the necessary machine-config-operator fields: https://github.com/openshift/api/pull/374 The Bare Metal IPI Networking Infrastructure document is broadly applicable here as well: https://github.com/openshift/installer/blob/master/docs/design/baremetal/networking-infrastructure.md Notable differences in OpenStack: * We only use the API and DNS VIPs right now * Instead of Baremetal's Ingress VIP (which is attached to the OpenShift routers) our haproxy static pods balance the 80 & 443 ports to the worker nodes * We do not run coredns on the bootstrap node. Instead, bootstrap itself uses one of the masters for DNS. These differences are not fundamental to OpenStack and we will be looking at aligning more closely with the Baremetal provider in the future. There is also a great opportunity to share some of the configuration files and scripts here. 
This change needs several other pull requests: Keepalived plus the coredns & haproxy static pods in the MCO: openshift/machine-config-operator/pull/740 Passing the API and DNS VIPs through the installer: https://github.com/openshift/installer/pull/1998 Vendoring the OpenStack PlatformStatus changes in the MCO: https://github.com/openshift/machine-config-operator/pull/978 Allowing to use PlatformStatus in the MCO templates: https://github.com/openshift/machine-config-operator/pull/943 Co-authored-by: Emilio Garcia Co-authored-by: John Trowbridge Co-authored-by: Martin Andre Co-authored-by: Tomas Sedovic Massive thanks to the Bare Metal and oVirt people! --- .../files/usr/local/bin/bootkube.sh.template | 2 + .../files/etc/keepalived/keepalived.conf.tmpl | 25 ++ .../openstack/files/usr/local/bin/fletcher8 | 10 + .../files/usr/local/bin/get_vip_subnet_cidr | 24 ++ .../usr/local/bin/keepalived.sh.template | 48 +++ .../systemd/units/keepalived.service | 18 + data/data/openstack/bootstrap/main.tf | 62 +++- data/data/openstack/bootstrap/variables.tf | 5 +- data/data/openstack/main.tf | 59 ++-- data/data/openstack/masters/main.tf | 4 +- data/data/openstack/masters/variables.tf | 8 - data/data/openstack/service/main.tf | 320 ------------------ data/data/openstack/service/variables.tf | 57 ---- data/data/openstack/topology/outputs.tf | 16 - .../openstack/topology/private-network.tf | 65 ++-- data/data/openstack/topology/sg-lb.tf | 85 ----- data/data/openstack/topology/sg-master.tf | 35 +- data/data/openstack/topology/variables.tf | 5 + data/data/openstack/variables-openstack.tf | 2 +- pkg/asset/ignition/bootstrap/bootstrap.go | 2 + pkg/asset/ignition/machine/node.go | 5 +- pkg/asset/manifests/infrastructure.go | 4 + 22 files changed, 277 insertions(+), 584 deletions(-) create mode 100644 data/data/bootstrap/openstack/files/etc/keepalived/keepalived.conf.tmpl create mode 100755 data/data/bootstrap/openstack/files/usr/local/bin/fletcher8 create mode 100755 
data/data/bootstrap/openstack/files/usr/local/bin/get_vip_subnet_cidr create mode 100755 data/data/bootstrap/openstack/files/usr/local/bin/keepalived.sh.template create mode 100644 data/data/bootstrap/openstack/systemd/units/keepalived.service delete mode 100644 data/data/openstack/service/main.tf delete mode 100644 data/data/openstack/service/variables.tf delete mode 100644 data/data/openstack/topology/sg-lb.tf diff --git a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template index f78c925ba30..b07f1cffcdd 100755 --- a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template @@ -19,6 +19,8 @@ fi MACHINE_CONFIG_OPERATOR_IMAGE=$(podman run --quiet --rm ${release} image machine-config-operator) MACHINE_CONFIG_OSCONTENT=$(podman run --quiet --rm ${release} image machine-os-content) MACHINE_CONFIG_ETCD_IMAGE=$(podman run --quiet --rm ${release} image etcd) +# FIXME(shadower): without this, the etcd containers later on keep failing with our custom MCO. Investigate what's goin on. 
+podman pull --quiet $MACHINE_CONFIG_ETCD_IMAGE MACHINE_CONFIG_KUBE_CLIENT_AGENT_IMAGE=$(podman run --quiet --rm ${release} image kube-client-agent) MACHINE_CONFIG_INFRA_IMAGE=$(podman run --quiet --rm ${release} image pod) diff --git a/data/data/bootstrap/openstack/files/etc/keepalived/keepalived.conf.tmpl b/data/data/bootstrap/openstack/files/etc/keepalived/keepalived.conf.tmpl new file mode 100644 index 00000000000..4f188baf74c --- /dev/null +++ b/data/data/bootstrap/openstack/files/etc/keepalived/keepalived.conf.tmpl @@ -0,0 +1,25 @@ +vrrp_script chk_ocp { + # NOTE(mandre) the fake kube-api server doesn't responds to the + # https://0:6443/readyz URL, we need to find another check + script "ss -tnl | grep 6443" + interval 1 + weight 50 +} + +vrrp_instance ${CLUSTER_NAME}_API { + state BACKUP + interface ${INTERFACE} + virtual_router_id ${API_VRID} + priority 50 + advert_int 1 + authentication { + auth_type PASS + auth_pass ${CLUSTER_NAME}_api_vip + } + virtual_ipaddress { + ${API_VIP}/${NET_MASK} + } + track_script { + chk_ocp + } +} diff --git a/data/data/bootstrap/openstack/files/usr/local/bin/fletcher8 b/data/data/bootstrap/openstack/files/usr/local/bin/fletcher8 new file mode 100755 index 00000000000..901544f3675 --- /dev/null +++ b/data/data/bootstrap/openstack/files/usr/local/bin/fletcher8 @@ -0,0 +1,10 @@ +#!/usr/libexec/platform-python +import sys + +data = map(ord, sys.argv[1]) +ckA = ckB = 0 + +for b in data: + ckA = (ckA + b) & 0xf + ckB = (ckB + ckA) & 0xf +print((ckB << 4) | ckA ) diff --git a/data/data/bootstrap/openstack/files/usr/local/bin/get_vip_subnet_cidr b/data/data/bootstrap/openstack/files/usr/local/bin/get_vip_subnet_cidr new file mode 100755 index 00000000000..4868cca81ac --- /dev/null +++ b/data/data/bootstrap/openstack/files/usr/local/bin/get_vip_subnet_cidr @@ -0,0 +1,24 @@ +#!/usr/libexec/platform-python +import sys +import socket +import struct + +vip = sys.argv[1] +iface_cidrs = sys.argv[2].split() +vip_int = struct.unpack("!I", 
socket.inet_aton(vip))[0] + +for iface_cidr in iface_cidrs: + ip, prefix = iface_cidr.split('/') + ip_int = struct.unpack("!I", socket.inet_aton(ip))[0] + prefix_int = int(prefix) + mask = int('1' * prefix_int + '0' * (32 - prefix_int), 2) + subnet_ip_int_min = ip_int & mask + subnet_ip = socket.inet_ntoa(struct.pack("!I", subnet_ip_int_min)) + subnet_ip_int_max = subnet_ip_int_min | int('1' * (32 - prefix_int), 2) + subnet_ip_max = socket.inet_ntoa(struct.pack("!I", subnet_ip_int_max)) + sys.stderr.write('Is %s between %s and %s\n' % (vip, subnet_ip, subnet_ip_max)) + if subnet_ip_int_min < vip_int < subnet_ip_int_max: + subnet_ip = socket.inet_ntoa(struct.pack("!I", subnet_ip_int_min)) + print('%s/%s' % (subnet_ip, prefix)) + sys.exit(0) +sys.exit(1) diff --git a/data/data/bootstrap/openstack/files/usr/local/bin/keepalived.sh.template b/data/data/bootstrap/openstack/files/usr/local/bin/keepalived.sh.template new file mode 100755 index 00000000000..55addacdce2 --- /dev/null +++ b/data/data/bootstrap/openstack/files/usr/local/bin/keepalived.sh.template @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -e + +mkdir --parents /etc/keepalived + +# TODO(shadower): switch to the keepalived image from the release: +# https://github.com/openshift/installer/pull/2025/files#diff-ce82c1d8a44f7dfc41dfc024085ccfeeR24 +KEEPALIVED_IMAGE=quay.io/celebdor/keepalived:latest +if ! podman inspect "$KEEPALIVED_IMAGE" &>/dev/null; then + echo "Pulling release image..." + podman pull "$KEEPALIVED_IMAGE" +fi + +# TODO(shadower): at least some of these can be passed into this +# template rather than discovered at runtime: +API_DNS="$(sudo awk -F[/:] '/apiServerURL/ {print $5}' /opt/openshift/manifests/cluster-infrastructure-02-config.yml)" +CLUSTER_NAME="$(awk -F. 
'{print $2}' <<< "$API_DNS")" +API_VIP="{{ .InstallConfig.Platform.OpenStack.APIVIP }}" +IFACE_CIDRS="$(ip addr show | grep -v "scope host" | grep -Po 'inet \K[\d.]+/[\d.]+' | xargs)" +SUBNET_CIDR="$(/usr/local/bin/get_vip_subnet_cidr "$API_VIP" "$IFACE_CIDRS")" +NET_MASK="$(echo "$SUBNET_CIDR" | cut -d "/" -f 2)" +INTERFACE="$(ip -o addr show to "$SUBNET_CIDR" | head -n 1 | awk '{print $2}')" +CLUSTER_DOMAIN="${API_DNS#*.}" + +# Virtual Router IDs. They must be different and 8 bit in length +API_VRID=$(/usr/local/bin/fletcher8 "$CLUSTER_NAME-api") +DNS_VRID=$(/usr/local/bin/fletcher8 "$CLUSTER_NAME-dns") + +export API_VIP +export CLUSTER_NAME +export INTERFACE +export API_VRID +export NET_MASK +envsubst < /etc/keepalived/keepalived.conf.tmpl | sudo tee /etc/keepalived/keepalived.conf + +MATCHES="$(sudo podman ps -a --format "{{`{{.Names}}`}}" | awk '/keepalived$/ {print $0}')" +if [[ -z "$MATCHES" ]]; then + # TODO(bnemec): Figure out how to run with less perms + podman create \ + --name keepalived \ + --volume /etc/keepalived:/etc/keepalived:z \ + --network=host \ + --privileged \ + --cap-add=ALL \ + "${KEEPALIVED_IMAGE}" \ + /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf \ + --dont-fork -D -l -P +fi diff --git a/data/data/bootstrap/openstack/systemd/units/keepalived.service b/data/data/bootstrap/openstack/systemd/units/keepalived.service new file mode 100644 index 00000000000..ea9ee29d419 --- /dev/null +++ b/data/data/bootstrap/openstack/systemd/units/keepalived.service @@ -0,0 +1,18 @@ +[Unit] +Description=Manage node VIPs with keepalived +Wants=network-online.target +After=network-online.target + +[Service] +WorkingDirectory=/etc/keepalived +ExecStartPre=/usr/local/bin/keepalived.sh +ExecStart=/usr/bin/podman start -a keepalived +ExecStop=/usr/bin/podman stop -t 10 keepalived +ConditionPathExists=!/etc/pivot/image-pullspec + +Restart=on-failure +RestartSec=5 +TimeoutStartSec=600 + +[Install] +WantedBy=multi-user.target diff --git 
a/data/data/openstack/bootstrap/main.tf b/data/data/openstack/bootstrap/main.tf index 7c456c2a1fe..59c1dfcdf66 100644 --- a/data/data/openstack/bootstrap/main.tf +++ b/data/data/openstack/bootstrap/main.tf @@ -18,40 +18,73 @@ data "ignition_config" "redirect" { files = [ data.ignition_file.hostname.id, - data.ignition_file.bootstrap_ifcfg.id, + data.ignition_file.dns_conf.id, + data.ignition_file.dhcp_conf.id, + data.ignition_file.hosts.id, ] } -data "ignition_file" "bootstrap_ifcfg" { +data "ignition_file" "dhcp_conf" { filesystem = "root" - mode = "420" // 0644 - path = "/etc/sysconfig/network-scripts/ifcfg-eth0" + mode = "420" + path = "/etc/NetworkManager/conf.d/dhcp-client.conf" content { content = < /etc/haproxy/haproxy.cfg.new << EOF -listen ${var.cluster_id}-api-masters - bind 0.0.0.0:6443 - bind 0.0.0.0:22623 - mode tcp - balance roundrobin - server bootstrap-22623 ${var.bootstrap_ip} check port 22623 - server bootstrap-6443 ${var.bootstrap_ip} check port 6443 - ${replace(join("\n ", formatlist("server master-%s %s check port 6443", var.master_port_names, var.master_ips)), "master-port-", "")} -EOF - update_cfg_and_restart - exit 0 -fi - -for master in $MASTERS; -do - MASTER_LINES="$MASTER_LINES - server $master $master check port 6443" -done - -for worker in $WORKERS; -do - WORKER_LINES="$WORKER_LINES - server $worker $worker check port 443" -done - -cat > /etc/haproxy/haproxy.cfg.new << EOF -listen ${var.cluster_id}-api-masters - bind 0.0.0.0:6443 - bind 0.0.0.0:22623 - mode tcp - balance roundrobin$MASTER_LINES - -listen ${var.cluster_id}-api-workers - bind 0.0.0.0:80 - bind 0.0.0.0:443 - mode tcp - balance roundrobin$WORKER_LINES -EOF - -update_cfg_and_restart -TFEOF - -} -} - -data "ignition_file" "corefile" { - filesystem = "root" - mode = "420" // 0644 - path = "/etc/coredns/Corefile" - - content { - content = <