Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Panic in network policy deletion #3034

Merged
merged 2 commits into from
Jun 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion go-controller/pkg/libovsdbops/acl.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func DeleteACLs(nbClient libovsdbclient.Client, acls ...*nbdb.ACL) error {
Model: acl,
ModelPredicate: func(item *nbdb.ACL) bool { return isEquivalentACL(item, acl) },
ErrNotFound: false,
BulkOp: false,
BulkOp: true,
tssurya marked this conversation as resolved.
Show resolved Hide resolved
}
opModels = append(opModels, opModel)
}
Expand Down
58 changes: 57 additions & 1 deletion go-controller/pkg/ovn/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,36 @@ func (oc *Controller) createDefaultDenyPGAndACLs(namespace, policy string, nsInf
return nil
}

// deleteDefaultDenyPGAndACLs deletes the default port groups and acls for a ns/policy
// must be called with a write lock on nsInfo
func (oc *Controller) deleteDefaultDenyPGAndACLs(namespace, policy string, nsInfo *namespaceInfo) error {
aclLogging := nsInfo.aclLogging.Deny
var aclsToBeDeleted []*nbdb.ACL

ingressPGName := defaultDenyPortGroup(namespace, ingressDefaultDenySuffix)
ingressDenyACL, ingressAllowACL := buildDenyACLs(namespace, policy, ingressPGName, aclLogging, knet.PolicyTypeIngress)
aclsToBeDeleted = append(aclsToBeDeleted, ingressDenyACL, ingressAllowACL)
egressPGName := defaultDenyPortGroup(namespace, egressDefaultDenySuffix)
egressDenyACL, egressAllowACL := buildDenyACLs(namespace, policy, egressPGName, aclLogging, knet.PolicyTypeEgress)
aclsToBeDeleted = append(aclsToBeDeleted, egressDenyACL, egressAllowACL)

err := libovsdbops.DeletePortGroups(oc.nbClient, ingressPGName, egressPGName)
if err != nil {
return err
}

nsInfo.portGroupEgressDenyName = ""
nsInfo.portGroupIngressDenyName = ""

// Manually remove the default ACLs instead of relying on ovsdb garbage collection to do so
err = libovsdbops.DeleteACLs(oc.nbClient, aclsToBeDeleted...)
if err != nil {
return err
}

return nil
}

func (oc *Controller) updateACLLoggingForPolicy(np *networkPolicy, logLevel string) error {
np.Lock()
defer np.Unlock()
Expand Down Expand Up @@ -1179,6 +1209,27 @@ func (oc *Controller) addNetworkPolicy(policy *knet.NetworkPolicy) error {
return fmt.Errorf("failed to create default port groups and acls for policy: %s/%s, error: %v",
policy.Namespace, policy.Name, err)
}
defer func() {
tssurya marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
nsInfo, nsUnlock, errDelete := oc.ensureNamespaceLocked(policy.Namespace, false, nil)
tssurya marked this conversation as resolved.
Show resolved Hide resolved
// rollback failed, best effort cleanup; won't add to retry mechanism since item doesn't exist in cache yet.
if errDelete != nil {
klog.Warningf("Rollback of default port groups and acls for policy: %s/%s failed, Unable to ensure namespace for network policy: error %v", policy.Namespace, policy.Name, errDelete)
return
}
if len(nsInfo.networkPolicies) == 0 {
// try rolling-back since creation of default acls/pgs failed
errDelete = oc.deleteDefaultDenyPGAndACLs(policy.Namespace, policy.Name, nsInfo)
nsUnlock()
if errDelete != nil {
// rollback failed, best effort cleanup; won't add to retry mechanism since item doesn't exist in cache yet.
klog.Warningf("Rollback of default port groups and acls for policy: %s/%s failed: error %v", policy.Namespace, policy.Name, errDelete)
}
} else {
nsUnlock()
}
}
}()
}
aclLogDeny := nsInfo.aclLogging.Deny
aclLogAllow := nsInfo.aclLogging.Allow
Expand Down Expand Up @@ -1242,8 +1293,9 @@ func (oc *Controller) deleteNetworkPolicy(policy *knet.NetworkPolicy, np *networ
if nsInfo == nil {
// if we didn't get nsInfo and np is nil, we cannot proceed
if np == nil {
return fmt.Errorf("failed to get namespace lock when deleting policy %s in namespace %s",
klog.Warningf("Failed to get namespace lock when deleting policy %s in namespace %s",
policy.Name, policy.Namespace)
return nil
}

if err := oc.destroyNetworkPolicy(np, false); err != nil {
Expand All @@ -1263,6 +1315,10 @@ func (oc *Controller) deleteNetworkPolicy(policy *knet.NetworkPolicy, np *networ
expectedLastPolicyNum = 1
np = foundNp
}
if np == nil {
klog.Warningf("Unable to delete network policy: %s/%s since its not found in cache", policy.Namespace, policy.Name)
tssurya marked this conversation as resolved.
Show resolved Hide resolved
return nil
}
isLastPolicyInNamespace := len(nsInfo.networkPolicies) == expectedLastPolicyNum
if err := oc.destroyNetworkPolicy(np, isLastPolicyInNamespace); err != nil {
return fmt.Errorf("failed to destroy network policy: %s/%s", policy.Namespace, policy.Name)
Expand Down
129 changes: 129 additions & 0 deletions go-controller/pkg/ovn/policy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1609,6 +1609,135 @@ var _ = ginkgo.Describe("OVN NetworkPolicy Operations", func() {
gomega.Expect(err).NotTo(gomega.HaveOccurred())
})

ginkgo.It("deleting a network policy that failed half-way through creation succeeds", func() {
app.Action = func(ctx *cli.Context) error {
namespace1 := *newNamespace(namespaceName1)
nPodTest := newTPod(
"node1",
"10.128.1.0/24",
"10.128.1.2",
"10.128.1.1",
"myPod",
"10.128.1.3",
"0a:58:0a:80:01:03",
namespace1.Name,
)
nPod := newPod(nPodTest.namespace, nPodTest.podName, nPodTest.nodeName, nPodTest.podIP)

const (
labelName string = "pod-name"
labelVal string = "server"
portNum int32 = 81
)
nPod.Labels[labelName] = labelVal

tcpProtocol := v1.Protocol(v1.ProtocolTCP)
networkPolicy := newNetworkPolicy("networkpolicy1", namespace1.Name,
metav1.LabelSelector{
MatchLabels: map[string]string{
labelName: labelVal,
},
},
[]knet.NetworkPolicyIngressRule{{
Ports: []knet.NetworkPolicyPort{{
Port: &intstr.IntOrString{IntVal: portNum},
Protocol: &tcpProtocol,
}},
}},
[]knet.NetworkPolicyEgressRule{{
Ports: []knet.NetworkPolicyPort{{
Port: &intstr.IntOrString{IntVal: portNum},
Protocol: &tcpProtocol,
}},
}},
)

egressOptions := map[string]string{
"apply-after-lb": "true",
}
pgHash := hashedPortGroup(networkPolicy.Namespace)
leftOverACLFromUpgrade1 := libovsdbops.BuildACL(
networkPolicy.Namespace+"_ARPallowPolicy",
nbdb.ACLDirectionFromLport,
types.DefaultAllowPriority,
"inport == @"+pgHash+"_"+egressDenyPG+" && arp",
nbdb.ACLActionAllow,
types.OvnACLLoggingMeter,
nbdb.ACLSeverityInfo,
false,
map[string]string{
defaultDenyPolicyTypeACLExtIdKey: string(knet.PolicyTypeEgress),
},
egressOptions,
)
leftOverACLFromUpgrade1.UUID = *leftOverACLFromUpgrade1.Name + "-ingressAllowACL-UUID1"

leftOverACLFromUpgrade2 := libovsdbops.BuildACL(
networkPolicy.Namespace+"_ARPallowPolicy",
nbdb.ACLDirectionFromLport,
types.DefaultAllowPriority,
"inport == @"+pgHash+"_"+egressDenyPG+" && (arp || nd)",
nbdb.ACLActionAllow,
types.OvnACLLoggingMeter,
nbdb.ACLSeverityInfo,
false,
map[string]string{
defaultDenyPolicyTypeACLExtIdKey: string(knet.PolicyTypeEgress),
},
egressOptions,
)
leftOverACLFromUpgrade2.UUID = *leftOverACLFromUpgrade2.Name + "-ingressAllowACL-UUID2"

initialDB1 := libovsdb.TestSetup{
NBData: []libovsdb.TestData{
&nbdb.LogicalSwitch{
Name: "node1",
},
leftOverACLFromUpgrade1,
leftOverACLFromUpgrade2,
},
}

fakeOvn.startWithDBSetup(initialDB1,
&v1.NamespaceList{
Items: []v1.Namespace{namespace1},
},
&v1.PodList{
Items: []v1.Pod{*nPod},
},
)
nPodTest.populateLogicalSwitchCache(fakeOvn, getLogicalSwitchUUID(fakeOvn.controller.nbClient, "node1"))
err := fakeOvn.controller.WatchNamespaces()
gomega.Expect(err).NotTo(gomega.HaveOccurred())
fakeOvn.controller.WatchPods()
fakeOvn.controller.WatchNetworkPolicy()

ginkgo.By("Creating a network policy that applies to a pod and ensuring creation fails")

err = fakeOvn.controller.addNetworkPolicy(networkPolicy)
gomega.Expect(err).To(gomega.HaveOccurred())
gomega.Expect(err.Error()).To(gomega.ContainSubstring("failed to create default port groups and acls for policy: namespace1/networkpolicy1, error: unexpectedly found multiple results for provided predicate"))

//ensure the default PGs and ACLs were removed via rollback from add failure
expectedData := []libovsdb.TestData{}
expectedData = append(expectedData, getExpectedDataPodsAndSwitches([]testPod{nPodTest}, []string{"node1"})...)
// note stale leftovers from previous upgrades won't be cleanedup
expectedData = append(expectedData, leftOverACLFromUpgrade1, leftOverACLFromUpgrade2)
gomega.Eventually(fakeOvn.nbClient).Should(libovsdb.HaveData(expectedData...))

ginkgo.By("Deleting the network policy that failed to create and ensuring we don't panic")
err = fakeOvn.controller.deleteNetworkPolicy(networkPolicy, nil)
// I0623 policy.go:1285] Deleting network policy networkpolicy1 in namespace namespace1, np is nil: true
// W0623 policy.go:1315] Unable to delete network policy: namespace1/networkpolicy1 since its not found in cache
gomega.Expect(err).NotTo(gomega.HaveOccurred())

return nil
}

err := app.Run([]string{app.Name})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
})

ginkgo.It("reconciles a deleted namespace referenced by a networkpolicy with a local running pod", func() {
app.Action = func(ctx *cli.Context) error {
namespace1 := *newNamespace(namespaceName1)
Expand Down