From 2d88a9b2bae865b7319907fea994bde1aa787b01 Mon Sep 17 00:00:00 2001 From: Alexander Constantinescu Date: Tue, 15 Jun 2021 14:58:49 +0200 Subject: [PATCH 1/2] Simplyify SDN healthcheck and exit directly on disconnect Signed-off-by: Alexander Constantinescu --- pkg/network/node/healthcheck.go | 103 ---------------------------- pkg/network/node/healthcheck_ovs.go | 36 ++++++++++ pkg/network/node/ovs/ovs.go | 2 +- pkg/network/node/sdn_controller.go | 7 +- 4 files changed, 38 insertions(+), 110 deletions(-) delete mode 100644 pkg/network/node/healthcheck.go create mode 100644 pkg/network/node/healthcheck_ovs.go diff --git a/pkg/network/node/healthcheck.go b/pkg/network/node/healthcheck.go deleted file mode 100644 index 0a30e6c3f0..0000000000 --- a/pkg/network/node/healthcheck.go +++ /dev/null @@ -1,103 +0,0 @@ -package node - -import ( - "fmt" - "time" - - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - utilwait "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/klog" - - "github.com/openshift/sdn/pkg/network/node/ovs/ovsclient" -) - -const ( - ovsDialTimeout = 5 * time.Second - ovsHealthcheckInterval = 30 * time.Second - ovsRecoveryTimeout = 10 * time.Second - ovsDialDefaultNetwork = "unix" - ovsDialDefaultAddress = "/var/run/openvswitch/db.sock" -) - -// dialAndPing connects to OVS once and pings the server. It returns -// the dial error (if any) or the ping error (if any), or neither. -func dialAndPing(network, addr string) (error, error) { - c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) - if err != nil { - return err, nil - } - defer c.Close() - if err := c.Ping(); err != nil { - return nil, err - } - return nil, nil -} - -// waitForOVS polls until the OVS server responds to a connection and an 'echo' -// command. -func waitForOVS(network, addr string) error { - return utilwait.PollImmediate(time.Second, time.Minute, func() (bool, error) { - dialErr, pingErr := dialAndPing(network, addr) - if dialErr != nil { - klog.V(2).Infof("waiting for OVS to start: %v", dialErr) - return false, nil - } else if pingErr != nil { - klog.V(2).Infof("waiting for OVS to start, ping failed: %v", pingErr) - return false, nil - } - return true, nil - }) -} - -// runOVSHealthCheck runs two background loops - one that waits for disconnection -// from the OVS server and then checks healthFn, and one that periodically checks -// healthFn. If healthFn returns false in either of these two cases while the OVS -// server is responsive the node process will terminate. -func runOVSHealthCheck(network, addr string, healthFn func() error) { - // this loop holds an open socket connection to OVS until it times out, then - // checks for health - go utilwait.Until(func() { - c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) - if err != nil { - utilruntime.HandleError(fmt.Errorf("SDN healthcheck unable to connect to OVS server: %v", err)) - return - } - defer c.Close() - - _ = c.WaitForDisconnect() - - // Poll OVS in a tight loop waiting for reconnect - err = utilwait.PollImmediate(100*time.Millisecond, ovsRecoveryTimeout, func() (bool, error) { - if dialErr, pingErr := dialAndPing(network, addr); dialErr != nil || pingErr != nil { - return false, nil - } - if err := healthFn(); err != nil { - return false, fmt.Errorf("OVS reinitialization required: %v", err) - } - return true, nil - }) - if err != nil { - // If OVS restarts and our health check fails, we exit - // TODO: make openshift-sdn able to reconcile without a restart - klog.Fatalf("SDN healthcheck detected OVS server change, restarting: %v", err) - } - klog.V(2).Infof("SDN healthcheck reconnected to OVS server") - }, ovsDialTimeout, utilwait.NeverStop) - - // this loop periodically verifies we can still connect to the OVS server and - // is an upper bound on the time we wait before detecting a failed OVS configuartion - go utilwait.Until(func() { - dialErr, pingErr := dialAndPing(network, addr) - if dialErr != nil { - klog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", dialErr) - return - } else if pingErr != nil { - klog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", pingErr) - return - } - if err := healthFn(); err != nil { - klog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err) - } - klog.V(4).Infof("SDN healthcheck succeeded") - }, ovsHealthcheckInterval, utilwait.NeverStop) -} diff --git a/pkg/network/node/healthcheck_ovs.go b/pkg/network/node/healthcheck_ovs.go new file mode 100644 index 0000000000..4c7b6ede33 --- /dev/null +++ b/pkg/network/node/healthcheck_ovs.go @@ -0,0 +1,36 @@ +package node + +import ( + "fmt" + "os" + "time" + + "github.com/openshift/sdn/pkg/network/node/ovs/ovsclient" + "k8s.io/klog" +) + +const ( + ovsDialDefaultNetwork = "unix" + ovsDialDefaultAddress = "/var/run/openvswitch/db.sock" +) + +func healthCheckOVS() error { + klog.Infof("Starting OVS health check") + c, err := ovsclient.DialTimeout(ovsDialDefaultNetwork, ovsDialDefaultAddress, time.Minute) + if err != nil { + return fmt.Errorf("Error connecting to OVS: %v", err) + } + if err := c.Ping(); err != nil { + if cErr := c.Close(); cErr != nil { + return fmt.Errorf("Error pinging OVS, err: %v, and closing the connection, err: %v", err, cErr) + } + return fmt.Errorf("Error pinging OVS, err: %v", err) + } + go func() { + klog.Infof("Starting SDN-OVS disconnection go-routine") + c.WaitForDisconnect() + klog.Errorf("Detected OVS server change, restarting") + os.Exit(1) + }() + return nil +} diff --git a/pkg/network/node/ovs/ovs.go b/pkg/network/node/ovs/ovs.go index aa8014a250..6ba1ed575b 100644 --- a/pkg/network/node/ovs/ovs.go +++ b/pkg/network/node/ovs/ovs.go @@ -155,7 +155,7 @@ func (ovsif *ovsExec) execWithStdin(cmd string, stdinArgs []string, args ...stri output, err := kcmd.CombinedOutput() if err != nil { - klog.V(2).Infof("Error executing %s: %s", cmd, string(output)) + klog.Errorf("Error executing cmd: %s with args: %v, output: \n%s", cmd, args, string(output)) return "", err } diff --git a/pkg/network/node/sdn_controller.go b/pkg/network/node/sdn_controller.go index f14ae68fdb..1fc9e263cf 100644 --- a/pkg/network/node/sdn_controller.go +++ b/pkg/network/node/sdn_controller.go @@ -123,7 +123,7 @@ func (plugin *OsdnNode) SetupSDN() (bool, map[string]podNetworkInfo, error) { plugin.localGatewayCIDR = fmt.Sprintf("%s/%d", localSubnetGateway, localSubnetMaskLength) - if err := waitForOVS(ovsDialDefaultNetwork, ovsDialDefaultAddress); err != nil { + if err := healthCheckOVS(); err != nil { return false, nil, err } @@ -151,11 +151,6 @@ func (plugin *OsdnNode) FinishSetupSDN() error { if err != nil { return err } - - // TODO: make it possible to safely reestablish node configuration after restart - // If OVS goes down and fails the health check, restart the entire process - runOVSHealthCheck(ovsDialDefaultNetwork, ovsDialDefaultAddress, plugin.alreadySetUp) - return nil } From 9a4ade11ea56d4e86bcf020ae6ef52c0bdfe07d6 Mon Sep 17 00:00:00 2001 From: Alexander Constantinescu Date: Tue, 15 Jun 2021 15:00:04 +0200 Subject: [PATCH 2/2] Be less lenient on errors when exec'ing OVS commands Signed-off-by: Alexander Constantinescu --- pkg/network/node/ovs/ovs.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/network/node/ovs/ovs.go b/pkg/network/node/ovs/ovs.go index 6ba1ed575b..23e0ab305b 100644 --- a/pkg/network/node/ovs/ovs.go +++ b/pkg/network/node/ovs/ovs.go @@ -106,10 +106,11 @@ const ( OVS_VSCTL = "ovs-vsctl" ) +// ~0.05 seconds in total var ovsBackoff utilwait.Backoff = utilwait.Backoff{ - Duration: 500 * time.Millisecond, + Duration: 10 * time.Millisecond, Factor: 1.25, - Steps: 10, + Steps: 4, } // ovsExec implements ovs.Interface via calls to ovs-ofctl and ovs-vsctl