From 0c7f999ff3b302f185bcff3a0a47e6c3d1b80770 Mon Sep 17 00:00:00 2001 From: Surya Seetharaman Date: Mon, 6 Jun 2022 22:26:16 +0200 Subject: [PATCH] egressIP: node retrieval failure is not respected, causes panic Fixes panic seen during checkEgressNodesReachability E0606 11:57:34.063567 1 egressip.go:2096] Node: huirwang-0606c-m5w24-worker-2-dkjmc reachability changed, but could not retrieve node from cache, err: node "huirwang-0606c-m5w24-worker-2-dkjmc" not found W0606 11:57:34.063658 1 egressip.go:1657] Unable to remove GARP configuration on external logical switch port for egress node: huirwang-0606c-m5w24-worker-2-dkjmc, err: object not found panic: runtime error: invalid memory address or nil pointer dereference [signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0x176626d] goroutine 487 [running]: github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn.(*Controller).checkEgressNodesReachability(0xc000171c00) /home/surya/go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovn/egressip.go:2099 +0x3ed created by github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn.(*Controller).initClusterEgressPolicies /home/surya/go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovn/egressip.go:1772 +0xdb The problem here is that the node retrieval fails since the node has already been deleted, instead of respecting that failure we just log the error saying node object not found and actually continue to use that object further in the code which causes this panic. Signed-off-by: Surya Seetharaman (cherry picked from commit 36176e5fcc4fb36ce175a02770f84c4556c6ef76) (cherry picked from commit f436d25441dfbbf6dbd08c4d9535032426be923f) Conflicts: go-controller/pkg/ovn/egressip.go go-controller/pkg/ovn/obj_retry.go Since ovn-org/ovn-kubernetes#2965 (EIP retry) is missing in 4.10 and below (cherry picked from commit 97212e9c063f461edf93982e2bb96b12d046dfd9) Conflicts: go-controller/pkg/ovn/egressip.go Conflict again on 4.9 because libovsdb changes are missing. (cherry picked from commit 204ca5173081fcb5047df3d5197d570b5223cd44) --- go-controller/pkg/ovn/egressip.go | 36 ++++++++++++++----------------- go-controller/pkg/ovn/ovn.go | 12 +++++------ 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/go-controller/pkg/ovn/egressip.go b/go-controller/pkg/ovn/egressip.go index dafda79393..5874daf5c7 100644 --- a/go-controller/pkg/ovn/egressip.go +++ b/go-controller/pkg/ovn/egressip.go @@ -678,22 +678,22 @@ func (oc *Controller) setNodeEgressReachable(nodeName string, isReachable bool) } } -func (oc *Controller) addEgressNode(egressNode *kapi.Node) error { - klog.V(5).Infof("Egress node: %s about to be initialized", egressNode.Name) +func (oc *Controller) addEgressNode(nodeName string) error { + klog.V(5).Infof("Egress node: %s about to be initialized", nodeName) if stdout, stderr, err := util.RunOVNNbctl( "set", "logical_switch_port", - types.EXTSwitchToGWRouterPrefix+types.GWRouterPrefix+egressNode.Name, + types.EXTSwitchToGWRouterPrefix+types.GWRouterPrefix+nodeName, "options:nat-addresses=router", "options:exclude-lb-vips-from-garp=true", ); err != nil { klog.Errorf("Unable to configure GARP on external logical switch port for egress node: %s, "+ - "this will result in packet drops during egress IP re-assignment, stdout: %s, stderr: %s, err: %v", egressNode.Name, stdout, stderr, err) + "this will result in packet drops during egress IP re-assignment, stdout: %s, stderr: %s, err: %v", nodeName, stdout, stderr, err) } oc.eIPC.assignmentRetryMutex.Lock() defer oc.eIPC.assignmentRetryMutex.Unlock() for eIPName := range oc.eIPC.assignmentRetry { - klog.V(5).Infof("Re-assignment for EgressIP: %s attempted by new node: %s", eIPName, egressNode.Name) + klog.V(5).Infof("Re-assignment for EgressIP: %s attempted by new node: %s", eIPName, nodeName) eIP, err := oc.kube.GetEgressIP(eIPName) if errors.IsNotFound(err) { klog.Errorf("Re-assignment for EgressIP: EgressIP: %s not found in the api-server, err: %v", eIPName, err) @@ -716,17 +716,17 @@ func (oc *Controller) addEgressNode(egressNode *kapi.Node) error { return nil } -func (oc *Controller) deleteEgressNode(egressNode *kapi.Node) error { - klog.V(5).Infof("Egress node: %s about to be removed", egressNode.Name) +func (oc *Controller) deleteEgressNode(nodeName string) error { + klog.V(5).Infof("Egress node: %s about to be removed", nodeName) if stdout, stderr, err := util.RunOVNNbctl( "remove", "logical_switch_port", - types.EXTSwitchToGWRouterPrefix+types.GWRouterPrefix+egressNode.Name, + types.EXTSwitchToGWRouterPrefix+types.GWRouterPrefix+nodeName, "options", "nat-addresses=router", "exclude-lb-vips-from-garp=true", ); err != nil { - klog.Errorf("Unable to remove GARP configuration on external logical switch port for egress node: %s, stdout: %s, stderr: %s, err: %v", egressNode.Name, stdout, stderr, err) + klog.Errorf("Unable to remove GARP configuration on external logical switch port for egress node: %s, stdout: %s, stderr: %s, err: %v", nodeName, stdout, stderr, err) } oc.eIPC.assignmentRetryMutex.Lock() @@ -738,7 +738,7 @@ func (oc *Controller) deleteEgressNode(egressNode *kapi.Node) error { for _, eIP := range egressIPs.Items { needsReassignment := false for _, status := range eIP.Status.Items { - if status.Node == egressNode.Name { + if status.Node == nodeName { needsReassignment = true break } @@ -1180,19 +1180,15 @@ func (oc *Controller) checkEgressNodesReachability() { } oc.eIPC.allocatorMutex.Unlock() for nodeName, shouldDelete := range reAddOrDelete { - node, err := oc.kube.GetNode(nodeName) - if err != nil { - klog.Errorf("Node: %s reachability changed, but could not retrieve node from API server, err: %v", node.Name, err) - } if shouldDelete { - klog.Warningf("Node: %s is detected as unreachable, deleting it from egress assignment", node.Name) - if err := oc.deleteEgressNode(node); err != nil { - klog.Errorf("Node: %s is detected as unreachable, but could not re-assign egress IPs, err: %v", node.Name, err) + klog.Warningf("Node: %s is detected as unreachable, deleting it from egress assignment", nodeName) + if err := oc.deleteEgressNode(nodeName); err != nil { + klog.Errorf("Node: %s is detected as unreachable, but could not re-assign egress IPs, err: %v", nodeName, err) } } else { - klog.Infof("Node: %s is detected as reachable and ready again, adding it to egress assignment", node.Name) - if err := oc.addEgressNode(node); err != nil { - klog.Errorf("Node: %s is detected as reachable and ready again, but could not re-assign egress IPs, err: %v", node.Name, err) + klog.Infof("Node: %s is detected as reachable and ready again, adding it to egress assignment", nodeName) + if err := oc.addEgressNode(nodeName); err != nil { + klog.Errorf("Node: %s is detected as reachable and ready again, but could not re-assign egress IPs, err: %v", nodeName, err) } } } diff --git a/go-controller/pkg/ovn/ovn.go b/go-controller/pkg/ovn/ovn.go index d08a6546ff..24f5051744 100644 --- a/go-controller/pkg/ovn/ovn.go +++ b/go-controller/pkg/ovn/ovn.go @@ -770,7 +770,7 @@ func (oc *Controller) WatchEgressNodes() { oc.setNodeEgressReachable(node.Name, true) } if hasEgressLabel && isReachable && isReady { - if err := oc.addEgressNode(node); err != nil { + if err := oc.addEgressNode(node.Name); err != nil { klog.Error(err) } } @@ -791,7 +791,7 @@ func (oc *Controller) WatchEgressNodes() { if oldHadEgressLabel && !newHasEgressLabel { klog.Infof("Node: %s has been un-labelled, deleting it from egress assignment", newNode.Name) oc.setNodeEgressAssignable(oldNode.Name, false) - if err := oc.deleteEgressNode(oldNode); err != nil { + if err := oc.deleteEgressNode(oldNode.Name); err != nil { klog.Error(err) } return @@ -805,7 +805,7 @@ func (oc *Controller) WatchEgressNodes() { klog.Infof("Node: %s has been labelled, adding it for egress assignment", newNode.Name) oc.setNodeEgressAssignable(newNode.Name, true) if isNewReady && isNewReachable { - if err := oc.addEgressNode(newNode); err != nil { + if err := oc.addEgressNode(newNode.Name); err != nil { klog.Error(err) } } else { @@ -818,12 +818,12 @@ func (oc *Controller) WatchEgressNodes() { } if !isNewReady { klog.Warningf("Node: %s is not ready, deleting it from egress assignment", newNode.Name) - if err := oc.deleteEgressNode(newNode); err != nil { + if err := oc.deleteEgressNode(newNode.Name); err != nil { klog.Error(err) } } else if isNewReady && isNewReachable { klog.Infof("Node: %s is ready and reachable, adding it for egress assignment", newNode.Name) - if err := oc.addEgressNode(newNode); err != nil { + if err := oc.addEgressNode(newNode.Name); err != nil { klog.Error(err) } } @@ -835,7 +835,7 @@ func (oc *Controller) WatchEgressNodes() { } nodeLabels := node.GetLabels() if _, hasEgressLabel := nodeLabels[nodeEgressLabel]; hasEgressLabel { - if err := oc.deleteEgressNode(node); err != nil { + if err := oc.deleteEgressNode(node.Name); err != nil { klog.Error(err) } }