Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 2094039: egressIP: node retrieval failure is not respected, causes panic #1130

Merged
merged 1 commit into from Jun 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 15 additions & 19 deletions go-controller/pkg/ovn/egressip.go
Expand Up @@ -1587,17 +1587,17 @@ func (oc *Controller) setNodeEgressReachable(nodeName string, isReachable bool)
}
}

func (oc *Controller) addEgressNode(egressNode *kapi.Node) error {
func (oc *Controller) addEgressNode(nodeName string) error {
var errors []error
klog.V(5).Infof("Egress node: %s about to be initialized", egressNode.Name)
klog.V(5).Infof("Egress node: %s about to be initialized", nodeName)
// This option will program OVN to start sending GARPs for all external IPS
// that the logical switch port has been configured to use. This is
// necessary for egress IP because if an egress IP is moved between two
// nodes, the nodes need to actively update the ARP cache of all neighbors
// as to notify them the change. If this is not the case: packets will
// continue to be routed to the old node which hosted the egress IP before
// it was moved, and the connections will fail.
portName := types.EXTSwitchToGWRouterPrefix + types.GWRouterPrefix + egressNode.Name
portName := types.EXTSwitchToGWRouterPrefix + types.GWRouterPrefix + nodeName
lsp := nbdb.LogicalSwitchPort{
Name: portName,
// Setting nat-addresses to router will send out GARPs for all externalIPs and LB VIPs
Expand All @@ -1608,7 +1608,7 @@ func (oc *Controller) addEgressNode(egressNode *kapi.Node) error {
err := libovsdbops.UpdateLogicalSwitchPortSetOptions(oc.nbClient, &lsp)
if err != nil {
errors = append(errors, fmt.Errorf("unable to configure GARP on external logical switch port for egress node: %s, "+
"this will result in packet drops during egress IP re-assignment, err: %v", egressNode.Name, err))
"this will result in packet drops during egress IP re-assignment, err: %v", nodeName, err))
}

// If a node has been labelled for egress IP we need to check if there are any
Expand Down Expand Up @@ -1639,20 +1639,20 @@ func (oc *Controller) addEgressNode(egressNode *kapi.Node) error {
return nil
}

func (oc *Controller) deleteEgressNode(egressNode *kapi.Node) error {
func (oc *Controller) deleteEgressNode(nodeName string) error {
var errors []error
klog.V(5).Infof("Egress node: %s about to be removed", egressNode.Name)
klog.V(5).Infof("Egress node: %s about to be removed", nodeName)
// This will remove the option described in addEgressNode from the logical
// switch port, since this node will not be used for egress IP assignments
// from now on.
portName := types.EXTSwitchToGWRouterPrefix + types.GWRouterPrefix + egressNode.Name
portName := types.EXTSwitchToGWRouterPrefix + types.GWRouterPrefix + nodeName
lsp := nbdb.LogicalSwitchPort{
Name: portName,
Options: map[string]string{"nat-addresses": "", "exclude-lb-vips-from-garp": ""},
}
err := libovsdbops.UpdateLogicalSwitchPortSetOptions(oc.nbClient, &lsp)
if err != nil {
errors = append(errors, fmt.Errorf("unable to remove GARP configuration on external logical switch port for egress node: %s, err: %v", egressNode.Name, err))
errors = append(errors, fmt.Errorf("unable to remove GARP configuration on external logical switch port for egress node: %s, err: %v", nodeName, err))
}

// Since the node has been labelled as "not usable" for egress IP
Expand All @@ -1664,7 +1664,7 @@ func (oc *Controller) deleteEgressNode(egressNode *kapi.Node) error {
}
for _, egressIP := range egressIPs.Items {
for _, status := range egressIP.Status.Items {
if status.Node == egressNode.Name {
if status.Node == nodeName {
// Send a "synthetic update" on all egress IPs which have an
// assignment to this node. The reconciliation loop for
// WatchEgressIP will see that the current assignment status to
Expand Down Expand Up @@ -2078,19 +2078,15 @@ func (oc *Controller) checkEgressNodesReachability() {
}
oc.eIPC.allocator.Unlock()
for nodeName, shouldDelete := range reAddOrDelete {
node, err := oc.watchFactory.GetNode(nodeName)
if err != nil {
klog.Errorf("Node: %s reachability changed, but could not retrieve node from cache, err: %v", nodeName, err)
}
if shouldDelete {
klog.Warningf("Node: %s is detected as unreachable, deleting it from egress assignment", node.Name)
if err := oc.deleteEgressNode(node); err != nil {
klog.Errorf("Node: %s is detected as unreachable, but could not re-assign egress IPs, err: %v", node.Name, err)
klog.Warningf("Node: %s is detected as unreachable, deleting it from egress assignment", nodeName)
if err := oc.deleteEgressNode(nodeName); err != nil {
klog.Errorf("Node: %s is detected as unreachable, but could not re-assign egress IPs, err: %v", nodeName, err)
}
} else {
klog.Infof("Node: %s is detected as reachable and ready again, adding it to egress assignment", node.Name)
if err := oc.addEgressNode(node); err != nil {
klog.Errorf("Node: %s is detected as reachable and ready again, but could not re-assign egress IPs, err: %v", node.Name, err)
klog.Infof("Node: %s is detected as reachable and ready again, adding it to egress assignment", nodeName)
if err := oc.addEgressNode(nodeName); err != nil {
klog.Errorf("Node: %s is detected as reachable and ready again, but could not re-assign egress IPs, err: %v", nodeName, err)
}
}
}
Expand Down
15 changes: 8 additions & 7 deletions go-controller/pkg/ovn/obj_retry.go
Expand Up @@ -2,14 +2,15 @@ package ovn

import (
"fmt"
ocpcloudnetworkapi "github.com/openshift/api/cloudnetwork/v1"
"math/rand"
"net"
"reflect"
"strings"
"sync"
"time"

ocpcloudnetworkapi "github.com/openshift/api/cloudnetwork/v1"

kapi "k8s.io/api/core/v1"
knet "k8s.io/api/networking/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -755,7 +756,7 @@ func (oc *Controller) addResource(objectsToRetry *retryObjs, obj interface{}, fr
oc.setNodeEgressReachable(node.Name, true)
}
if hasEgressLabel && isReachable && isReady {
if err := oc.addEgressNode(node); err != nil {
if err := oc.addEgressNode(node.Name); err != nil {
return err
}
}
Expand Down Expand Up @@ -863,7 +864,7 @@ func (oc *Controller) updateResource(objectsToRetry *retryObjs, oldObj, newObj i
if oldHadEgressLabel && !newHasEgressLabel {
klog.Infof("Node: %s has been un-labeled, deleting it from egress assignment", newNode.Name)
oc.setNodeEgressAssignable(oldNode.Name, false)
return oc.deleteEgressNode(oldNode)
return oc.deleteEgressNode(oldNode.Name)
}
isOldReady := oc.isEgressNodeReady(oldNode)
isNewReady := oc.isEgressNodeReady(newNode)
Expand All @@ -874,7 +875,7 @@ func (oc *Controller) updateResource(objectsToRetry *retryObjs, oldObj, newObj i
klog.Infof("Node: %s has been labeled, adding it for egress assignment", newNode.Name)
oc.setNodeEgressAssignable(newNode.Name, true)
if isNewReady && isNewReachable {
if err := oc.addEgressNode(newNode); err != nil {
if err := oc.addEgressNode(newNode.Name); err != nil {
return err
}
} else {
Expand All @@ -887,12 +888,12 @@ func (oc *Controller) updateResource(objectsToRetry *retryObjs, oldObj, newObj i
}
if !isNewReady {
klog.Warningf("Node: %s is not ready, deleting it from egress assignment", newNode.Name)
if err := oc.deleteEgressNode(newNode); err != nil {
if err := oc.deleteEgressNode(newNode.Name); err != nil {
return err
}
} else if isNewReady && isNewReachable {
klog.Infof("Node: %s is ready and reachable, adding it for egress assignment", newNode.Name)
if err := oc.addEgressNode(newNode); err != nil {
if err := oc.addEgressNode(newNode.Name); err != nil {
return err
}
}
Expand Down Expand Up @@ -1031,7 +1032,7 @@ func (oc *Controller) deleteResource(objectsToRetry *retryObjs, obj, cachedObj i
nodeEgressLabel := util.GetNodeEgressLabel()
nodeLabels := node.GetLabels()
if _, hasEgressLabel := nodeLabels[nodeEgressLabel]; hasEgressLabel {
if err := oc.deleteEgressNode(node); err != nil {
if err := oc.deleteEgressNode(node.Name); err != nil {
return err
}
}
Expand Down