-
Notifications
You must be signed in to change notification settings - Fork 332
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Always allow egressip-node-updates #4102
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1579,91 +1579,6 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { | |
err := app.Run([]string{app.Name}) | ||
gomega.Expect(err).NotTo(gomega.HaveOccurred()) | ||
}) | ||
|
||
ginkgo.It("egress node update should not mark the node as reachable if there was no label/readiness change", func() { | ||
// When an egress node becomes reachable during a node update event and there is no changes to node labels/readiness | ||
// unassigned egress IP should be eventually added by the periodic reachability check. | ||
// Test steps: | ||
// - disable periodic check from running in background, so it can be called directly from the test | ||
// - assign egress IP to an available node | ||
// - make the node unreachable and verify that the egress IP was unassigned | ||
// - make the node reachable and update a node | ||
// - verify that the egress IP was assigned by calling the periodic reachability check | ||
app.Action = func(ctx *cli.Context) error { | ||
config.OVNKubernetesFeature.EnableInterconnect = true // no impact on global eIPC functions | ||
egressIP := "192.168.126.101" | ||
nodeIPv4 := "192.168.126.51/24" | ||
node := v1.Node{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: node1Name, | ||
Annotations: map[string]string{ | ||
"k8s.ovn.org/node-primary-ifaddr": fmt.Sprintf("{\"ipv4\": \"%s\"}", nodeIPv4), | ||
"k8s.ovn.org/node-subnets": fmt.Sprintf("{\"default\":[\"%s\"]}", v4NodeSubnet), | ||
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", nodeIPv4), | ||
}, | ||
Labels: map[string]string{ | ||
"k8s.ovn.org/egress-assignable": "", | ||
}, | ||
}, | ||
Status: v1.NodeStatus{ | ||
Conditions: []v1.NodeCondition{ | ||
{ | ||
Type: v1.NodeReady, | ||
Status: v1.ConditionTrue, | ||
}, | ||
}, | ||
}, | ||
} | ||
eIP1 := egressipv1.EgressIP{ | ||
ObjectMeta: newEgressIPMeta(egressIPName), | ||
Spec: egressipv1.EgressIPSpec{ | ||
EgressIPs: []string{egressIP}, | ||
}, | ||
} | ||
fakeClusterManagerOVN.start( | ||
&egressipv1.EgressIPList{ | ||
Items: []egressipv1.EgressIP{eIP1}, | ||
}, | ||
&v1.NodeList{ | ||
Items: []v1.Node{node}, | ||
}, | ||
) | ||
|
||
// Virtually disable background reachability check by using a huge interval | ||
fakeClusterManagerOVN.eIPC.reachabilityCheckInterval = time.Hour | ||
|
||
_, err := fakeClusterManagerOVN.eIPC.WatchEgressNodes() | ||
gomega.Expect(err).NotTo(gomega.HaveOccurred()) | ||
|
||
gomega.Eventually(getEgressIPStatusLen(eIP1.Name)).Should(gomega.Equal(1)) | ||
egressIPs, _ := getEgressIPStatus(eIP1.Name) | ||
gomega.Expect(egressIPs[0]).To(gomega.Equal(egressIP)) | ||
hcClient := fakeClusterManagerOVN.eIPC.allocator.cache[node.Name].healthClient.(*fakeEgressIPHealthClient) | ||
hcClient.FakeProbeFailure = true | ||
// explicitly call check reachability, periodic checker is not active | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume even if we delete this test case, there is another test case that checks that the reachability functionality works correctly, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, good point; let me check this |
||
checkEgressNodesReachabilityIterate(fakeClusterManagerOVN.eIPC) | ||
gomega.Eventually(getEgressIPStatusLen(eIP1.Name)).Should(gomega.Equal(0)) | ||
|
||
hcClient.FakeProbeFailure = false | ||
node.Annotations["test"] = "dummy" | ||
_, err = fakeClusterManagerOVN.fakeClient.KubeClient.CoreV1().Nodes().Update(context.TODO(), &node, metav1.UpdateOptions{}) | ||
gomega.Expect(err).NotTo(gomega.HaveOccurred()) | ||
gomega.Eventually(hcClient.IsConnected()).Should(gomega.Equal(true)) | ||
// the node should not be marked as reachable in the update handler as it is not getting added | ||
gomega.Consistently(func() bool { return fakeClusterManagerOVN.eIPC.allocator.cache[node.Name].isReachable }).Should(gomega.Equal(false)) | ||
|
||
// egress IP should get assigned on the next checkEgressNodesReachabilityIterate call | ||
// explicitly call check reachability, periodic checker is not active | ||
checkEgressNodesReachabilityIterate(fakeClusterManagerOVN.eIPC) | ||
// this will trigger an immediate add retry for the node which we need to simulate for this test | ||
fakeClusterManagerOVN.eIPC.retryEgressNodes.RequestRetryObjs() | ||
gomega.Eventually(getEgressIPStatusLen(eIP1.Name)).Should(gomega.Equal(1)) | ||
|
||
return nil | ||
} | ||
err := app.Run([]string{app.Name}) | ||
gomega.Expect(err).NotTo(gomega.HaveOccurred()) | ||
}) | ||
}) | ||
|
||
ginkgo.Context("IPv6 assignment", func() { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -115,7 +115,6 @@ func (h *egressIPClusterControllerEventHandler) UpdateResource(oldObj, newObj in | |
klog.Infof("Node: %s has been un-labeled, deleting it from egress assignment", newNode.Name) | ||
return h.eIPC.deleteEgressNode(oldNode.Name) | ||
} | ||
isOldReady := h.eIPC.isEgressNodeReady(oldNode) | ||
isNewReady := h.eIPC.isEgressNodeReady(newNode) | ||
isNewReachable := h.eIPC.isEgressNodeReachable(newNode) | ||
isHostCIDRsAltered := util.NodeHostCIDRsAnnotationChanged(oldNode, newNode) | ||
|
@@ -133,16 +132,12 @@ func (h *egressIPClusterControllerEventHandler) UpdateResource(oldObj, newObj in | |
} | ||
return nil | ||
} | ||
if isOldReady == isNewReady && !isHostCIDRsAltered { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So, walking through the consequences of not skipping potentially duplicate events:
This seems like a pretty big performance issue if there were a lot of egress IPs. We would constantly be churning through all egress IPs on every node update. Or am I missing something? Would a better approach be to expose the reAddorDelete map that the reachability check uses, and then we can have the state if the node wasReachable. Then we can change this if statement to be: if isOldReady == isNewReady && !isHostCIDRsAltered && wasReachable == isReachable ? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I had the same thought to have a previous state of Reachability, but the problem here is that with the kubelet updates we get, both the oldNode and newNode objects will have a healthy reachability state, so it still doesn't take action :( and doesn't help here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
So this is a legit scale question, one I had too. But the thing to notice inside the add/delete node func is that we call the reconcileEIP only when… The delete seems to be doing something else; let me double-check. |
||
return nil | ||
} | ||
if !isNewReady { | ||
klog.Warningf("Node: %s is not ready, deleting it from egress assignment", newNode.Name) | ||
if err := h.eIPC.deleteEgressNode(newNode.Name); err != nil { | ||
return err | ||
} | ||
} else if isNewReady && isNewReachable { | ||
klog.Infof("Node: %s is ready and reachable, adding it for egress assignment", newNode.Name) | ||
h.eIPC.setNodeEgressReachable(newNode.Name, isNewReachable) | ||
if err := h.eIPC.addEgressNode(newNode.Name); err != nil { | ||
return err | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So the original code that bypasses adding/updating a node if the ready states are equal: 1f101eb#diff-796adb9d8a9b5b5636dc81e7350a00ef143e6e5bcb52ec30dc350c866cd9208cR784
This pre-dates the reachability check. So this test case was added later. This test case seems flawed to me in the first place. We should use any event to trigger us to check and ensure that the node is added, whether or not that is a node update or a reachability check success.