Skip to content

Commit

Permalink
Bug 1958390: improve SDN's OVS healthcheck and logging
Browse files Browse the repository at this point in the history
This commit addresses three main issues:

- Improve logging in `ovs.go` by adding the args to the log and
  indicating clearly which command is being executed. In case certain
  exec's fail, it will greatly help narrow down what command failed to
  execute
- Run a stricter retry for OVS commands. In case we fail exec'ing OVS
  commands, fail much more quickly.
- Run the OVS healthcheck loop as early as possible, which means: right
  after we are sure that OVS is running.

The main reason for doing this is: solving the < OCP 4.6 problem
which was: SDN talking to a N-1 OVS and thinking everything is fine on
startup. If we aren't strict when checking connection errors to OVS
we might miss the upgrade of OVS from N-1 to N and continue thinking
everything is fine, while in fact OVS has been upgraded and had its
DB wiped.

Signed-off-by: Alexander Constantinescu <aconstan@redhat.com>
  • Loading branch information
alexanderConstantinescu committed May 26, 2021
1 parent 3ac5486 commit c5cfcbf
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 39 deletions.
19 changes: 10 additions & 9 deletions pkg/network/node/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
const (
ovsDialTimeout = 5 * time.Second
ovsHealthcheckInterval = 30 * time.Second
ovsRecoveryTimeout = 10 * time.Second
ovsRecoveryTimeout = 5 * time.Second
ovsDialDefaultNetwork = "unix"
ovsDialDefaultAddress = "/var/run/openvswitch/db.sock"
)
Expand All @@ -40,10 +40,10 @@ func waitForOVS(network, addr string) error {
return utilwait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
dialErr, pingErr := dialAndPing(network, addr)
if dialErr != nil {
klog.V(2).Infof("waiting for OVS to start: %v", dialErr)
klog.Warningf("waiting for OVS to start: %v", dialErr)
return false, nil
} else if pingErr != nil {
klog.V(2).Infof("waiting for OVS to start, ping failed: %v", pingErr)
klog.Warningf("waiting for OVS to start, ping failed: %v", pingErr)
return false, nil
}
return true, nil
Expand All @@ -57,6 +57,7 @@ func waitForOVS(network, addr string) error {
func runOVSHealthCheck(network, addr string, healthFn func() error) {
// this loop holds an open socket connection to OVS until it times out, then
// checks for health
klog.Infof("Starting SDN healthcheck go-routines")
go utilwait.Until(func() {
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
if err != nil {
Expand All @@ -80,25 +81,25 @@ func runOVSHealthCheck(network, addr string, healthFn func() error) {
if err != nil {
// If OVS restarts and our health check fails, we exit
// TODO: make openshift-sdn able to reconcile without a restart
klog.Warningf("SDN healthcheck detected OVS server change, restarting: %v", err)
klog.Errorf("SDN healthcheck detected OVS server change, restarting: %v", err)
os.Exit(1)
}
klog.V(2).Infof("SDN healthcheck reconnected to OVS server")
klog.Infof("SDN healthcheck reconnected to OVS server")
}, ovsDialTimeout, utilwait.NeverStop)

// this loop periodically verifies we can still connect to the OVS server and
// is an upper bound on the time we wait before detecting a failed OVS configuartion
// is an upper bound on the time we wait before detecting a failed OVS configuration
go utilwait.Until(func() {
dialErr, pingErr := dialAndPing(network, addr)
if dialErr != nil {
klog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", dialErr)
klog.Warningf("SDN healthcheck unable to reconnect to OVS server: %v", dialErr)
return
} else if pingErr != nil {
klog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", pingErr)
klog.Warningf("SDN healthcheck unable to ping OVS server: %v", pingErr)
return
}
if err := healthFn(); err != nil {
klog.Warningf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err)
klog.Errorf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err)
os.Exit(1)
}
klog.V(4).Infof("SDN healthcheck succeeded")
Expand Down
2 changes: 1 addition & 1 deletion pkg/network/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ func (node *OsdnNode) Start() error {
}
}

if err := node.FinishSetupSDN(); err != nil {
if err := node.oc.FinishSetupOVS(); err != nil {
return fmt.Errorf("could not complete SDN setup: %v", err)
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/network/node/ovs/fake_ovs.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ func (fake *ovsFake) Get(table, record, column string) (string, error) {
return "", err
}
if column == "options:dst_port" {
if _, exists := fake.ports[record]; !exists {
return "", fmt.Errorf("unknown OVS port")
}
return fmt.Sprintf("\"%s\"", fake.ports[record].dst_port), nil
}
return "", nil
Expand Down
6 changes: 3 additions & 3 deletions pkg/network/node/ovs/ovs.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ const (
)

var ovsBackoff utilwait.Backoff = utilwait.Backoff{
Duration: 500 * time.Millisecond,
Duration: 100 * time.Millisecond,
Factor: 1.25,
Steps: 10,
Steps: 4,
}

// ovsExec implements ovs.Interface via calls to ovs-ofctl and ovs-vsctl
Expand Down Expand Up @@ -162,7 +162,7 @@ func (ovsif *ovsExec) execWithStdin(cmd string, stdinArgs []string, args ...stri

output, err := kcmd.CombinedOutput()
if err != nil {
klog.V(2).Infof("Error executing %s: %s", cmd, string(output))
klog.Errorf("Error executing cmd: %s with args: %v, output: %s", cmd, args, string(output))
return "", err
}

Expand Down
26 changes: 18 additions & 8 deletions pkg/network/node/ovscontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,22 +52,32 @@ func (oc *ovsController) getVersionNote() string {
return fmt.Sprintf("%02X.%02X", oc.pluginId, ruleVersion)
}

func (oc *ovsController) AlreadySetUp(vxlanPort uint32) bool {
func (oc *ovsController) AlreadySetUp(vxlanPort uint32, withSDNSetupVerification bool) bool {
flows, err := oc.ovs.DumpFlows("table=%d", ruleVersionTable)
if err != nil || len(flows) != 1 {
if err != nil {
return false
}

port, err := oc.ovs.Get("Interface", Vxlan0, "options:dst_port")
// the call to ovs.Get() returns the port number surrounded by double quotes
// so add them to the structs value for purposes of comparison
if err != nil || fmt.Sprintf("\"%d\"", vxlanPort) != port {
if err != nil {
return false
}
if parsed, err := ovs.ParseFlow(ovs.ParseForDump, flows[0]); err == nil {
return parsed.NoteHasPrefix(oc.getVersionNote())

if withSDNSetupVerification {
if len(flows) != 1 {
return false
}
// the call to ovs.Get() returns the port number surrounded by double quotes
// so add them to the structs value for purposes of comparison
if fmt.Sprintf("\"%d\"", vxlanPort) != port {
return false
}
if parsed, err := ovs.ParseFlow(ovs.ParseForDump, flows[0]); err == nil {
return parsed.NoteHasPrefix(oc.getVersionNote())
}
return false
}
return false
return true
}

func (oc *ovsController) SetupOVS(clusterNetworkCIDR []string, serviceNetworkCIDR, localSubnetCIDR, localSubnetGateway string, mtu uint32, vxlanPort uint32) error {
Expand Down
58 changes: 56 additions & 2 deletions pkg/network/node/ovscontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,7 @@ func TestOVSEgressNetworkPolicy(t *testing.T) {
}
}

func TestAlreadySetUp(t *testing.T) {
func TestAlreadySetUpWithSDNSetupVerification(t *testing.T) {
testcases := []struct {
flow string
success bool
Expand Down Expand Up @@ -818,7 +818,61 @@ func TestAlreadySetUp(t *testing.T) {
if err := otx.Commit(); err != nil {
t.Fatalf("(%d) unexpected error from AddFlow: %v", i, err)
}
if success := oc.AlreadySetUp(4789); success != tc.success {
if success := oc.AlreadySetUp(4789, true); success != tc.success {
t.Fatalf("(%d) unexpected setup value %v (expected %v)", i, success, tc.success)
}
}
}

func TestAlreadySetUpWithoutSDNSetupVerification(t *testing.T) {
var oc *ovsController
testcases := []struct {
setup func()
success bool
}{
{
// Good setup with existing bridge and port
func() {
ovsif := ovs.NewFake(Br0)
if err := ovsif.AddBridge("fail_mode=secure", "protocols=OpenFlow13"); err != nil {
t.Fatalf("unexpected error from AddBridge: %v", err)
}
oc = NewOVSController(ovsif, 0, true, "172.17.0.4")
/* In order to test AlreadySetUp the vxlan port has to be added, we are not testing AddPort here */
_, err := ovsif.AddPort("vxlan0", 1, "type=vxlan", `options:remote_ip="flow"`, `options:key="flow"`, fmt.Sprintf("options:dst_port=%d", 4789))
if err != nil {
t.Fatalf("unexpected error from AddPort: %v", err)
}
},
true,
},
{
// Bad setup, invalid bridge set up
func() {
ovsif := ovs.NewFake(Br0)
ovsif.AddBridge("fail_mode=secure", "protocols=OpenFlow13", "-=blahhh")
oc = NewOVSController(ovsif, 0, true, "172.17.0.4")
},
false,
},
{
// Bad setup invalid port set up
func() {
ovsif := ovs.NewFake(Br0)
if err := ovsif.AddBridge("fail_mode=secure", "protocols=OpenFlow13"); err != nil {
t.Fatalf("unexpected error from AddBridge: %v", err)
}
oc = NewOVSController(ovsif, 0, true, "172.17.0.4")
/* In order to test AlreadySetUp the vxlan port has to be added, we are not testing AddPort here */
ovsif.AddPort("vxlan0", 1, "type=vxlan", "-=blahhh", `options:remote_ip="flow"`, `options:key="flow"`, fmt.Sprintf("options:dst_port=%d", 4789))
},
false,
},
}

for i, tc := range testcases {
tc.setup()
if success := oc.AlreadySetUp(4789, false); success != tc.success {
t.Fatalf("(%d) unexpected setup value %v (expected %v)", i, success, tc.success)
}
}
Expand Down
21 changes: 5 additions & 16 deletions pkg/network/node/sdn_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
"github.com/vishvananda/netlink"
)

func (plugin *OsdnNode) alreadySetUp() error {
func (plugin *OsdnNode) alreadySetUp(withSDNSetupVerification bool) error {
var found bool

l, err := netlink.LinkByName(Tun0)
Expand Down Expand Up @@ -59,11 +59,9 @@ func (plugin *OsdnNode) alreadySetUp() error {
return errors.New("cluster CIDR not found")
}
}

if !plugin.oc.AlreadySetUp(plugin.networkInfo.VXLANPort) {
if !plugin.oc.AlreadySetUp(plugin.networkInfo.VXLANPort, withSDNSetupVerification) {
return errors.New("plugin is not setup")
}

return nil
}

Expand Down Expand Up @@ -133,7 +131,7 @@ func (plugin *OsdnNode) SetupSDN() (bool, map[string]podNetworkInfo, error) {
klog.Warningf("[SDN setup] Could not get details of existing pods: %v", err)
}

if err := plugin.alreadySetUp(); err == nil {
if err := plugin.alreadySetUp(true); err == nil {
klog.Infof("[SDN setup] SDN is already set up")
} else {
klog.Infof("[SDN setup] full SDN setup required (%v)", err)
Expand All @@ -143,20 +141,11 @@ func (plugin *OsdnNode) SetupSDN() (bool, map[string]podNetworkInfo, error) {
changed = true
}

return changed, existingPods, nil
}

func (plugin *OsdnNode) FinishSetupSDN() error {
err := plugin.oc.FinishSetupOVS()
if err != nil {
return err
}

// TODO: make it possible to safely reestablish node configuration after restart
// If OVS goes down and fails the health check, restart the entire process
runOVSHealthCheck(ovsDialDefaultNetwork, ovsDialDefaultAddress, plugin.alreadySetUp)
runOVSHealthCheck(ovsDialDefaultNetwork, ovsDialDefaultAddress, func() error { return plugin.alreadySetUp(false) })

return nil
return changed, existingPods, nil
}

func (plugin *OsdnNode) setup(localSubnetCIDR, localSubnetGateway string) error {
Expand Down

0 comments on commit c5cfcbf

Please sign in to comment.