Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCPBUGS-3885: Fix post timeout reconcile #388

Merged
merged 2 commits into from Nov 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
137 changes: 66 additions & 71 deletions controllers/clustergroupupgrade_controller.go
Expand Up @@ -150,15 +150,40 @@ func (r *ClusterGroupUpgradeReconciler) Reconcile(ctx context.Context, req ctrl.
return
}

// This may be empty for the first reconcile but that's fine since it will get overwritten by the start of the cgu
clusters := r.getClustersListFromRemediationPlan(clusterGroupUpgrade)
suceededCondition := meta.FindStatusCondition(clusterGroupUpgrade.Status.Conditions, string(utils.ConditionTypes.Succeeded))
progressingCondition := meta.FindStatusCondition(clusterGroupUpgrade.Status.Conditions, string(utils.ConditionTypes.Progressing))

var allManagedPoliciesExist bool
var managedPoliciesInfo policiesInfo

if progressingCondition == nil || (progressingCondition.Status == metav1.ConditionFalse && progressingCondition.Reason != string(utils.ConditionReasons.Completed)) {
if suceededCondition != nil {
if clusterGroupUpgrade.Status.Status.CompletedAt.IsZero() {
if suceededCondition.Status == metav1.ConditionTrue {
// Upgrade has successfully finished
r.Log.Info("Upgrade is completed")
// Take actions after upgrade is completed
if err = r.takeActionsAfterCompletion(ctx, clusterGroupUpgrade); err != nil {
return
}
} else {
// Upgrade has failed
// On failure we don't want to complete actions other than to delete the resources
err = r.deleteResources(ctx, clusterGroupUpgrade)
if err != nil {
return
}
r.Recorder.Event(clusterGroupUpgrade, corev1.EventTypeWarning, suceededCondition.Reason, suceededCondition.Message)
r.Log.Info("CGU has failed")
r.addClustersStatusOnTimeout(clusterGroupUpgrade)
}
// Set completion time only after post actions are executed with no errors
clusterGroupUpgrade.Status.Status.CompletedAt = metav1.Now()
clusterGroupUpgrade.Status.Status.CurrentBatch = 0
clusterGroupUpgrade.Status.Status.CurrentBatchStartedAt = metav1.Time{}
clusterGroupUpgrade.Status.Status.CurrentBatchRemediationProgress = nil
}
} else if progressingCondition == nil || progressingCondition.Status == metav1.ConditionFalse {

var allManagedPoliciesExist bool
var managedPoliciesInfo policiesInfo
var clusters []string
var reconcile bool
clusters, reconcile, err = r.validateCR(ctx, clusterGroupUpgrade)
if err != nil {
Expand Down Expand Up @@ -251,77 +276,47 @@ func (r *ClusterGroupUpgradeReconciler) Reconcile(ctx context.Context, req ctrl.
err = r.updateStatus(ctx, clusterGroupUpgrade)
return
}
}

err = r.reconcilePrecaching(ctx, clusterGroupUpgrade, clusters, managedPoliciesInfo.presentPolicies)
if err != nil {
r.Log.Error(err, "reconcilePrecaching error")
return
}
if clusterGroupUpgrade.Status.Precaching != nil {
for _, v := range clusterGroupUpgrade.Status.Precaching.Status {
//nolint
if v == PrecacheStatePreparingToStart || v == PrecacheStateStarting || v == PrecacheStateActive {
err = r.updateStatus(ctx, clusterGroupUpgrade)
nextReconcile = requeueWithShortInterval()
return
}
err = r.reconcilePrecaching(ctx, clusterGroupUpgrade, clusters, managedPoliciesInfo.presentPolicies)
if err != nil {
r.Log.Error(err, "reconcilePrecaching error")
return
}
}

// Update the clusters list based on the precaching results
clusters = r.filterFailedPrecachingClusters(clusterGroupUpgrade, clusters)

// Check if there were any issues with the precaching
if len(clusters) == 0 && len(clusterGroupUpgrade.Status.RemediationPlan) != 0 {
// We expected to remediate some clusters but currently have none
// There should already be a condition present describing the issue; we just need to set succeeded and requeue once
utils.SetStatusCondition(
&clusterGroupUpgrade.Status.Conditions,
utils.ConditionTypes.Progressing,
utils.ConditionReasons.Completed,
metav1.ConditionFalse,
"No clusters available for remediation (Precaching failed)",
)
utils.SetStatusCondition(
&clusterGroupUpgrade.Status.Conditions,
utils.ConditionTypes.Succeeded,
utils.ConditionReasons.Failed,
metav1.ConditionFalse,
"No clusters available for remediation (Precaching failed)",
)
// Requeue is not required since the succeeded condition will be checked right after this
r.updateStatus(ctx, clusterGroupUpgrade)
}

suceededCondition := meta.FindStatusCondition(clusterGroupUpgrade.Status.Conditions, string(utils.ConditionTypes.Succeeded))

if suceededCondition != nil {
if suceededCondition.Status == metav1.ConditionTrue {
// Upgrade has successfully finished
if clusterGroupUpgrade.Status.Status.CompletedAt.IsZero() {
r.Log.Info("Upgrade is completed")
// Take actions after upgrade is completed
clusterGroupUpgrade.Status.Status.CurrentBatch = 0
clusterGroupUpgrade.Status.Status.CurrentBatchStartedAt = metav1.Time{}
if err = r.takeActionsAfterCompletion(ctx, clusterGroupUpgrade); err != nil {
if clusterGroupUpgrade.Status.Precaching != nil {
for _, v := range clusterGroupUpgrade.Status.Precaching.Status {
//nolint
if v == PrecacheStatePreparingToStart || v == PrecacheStateStarting || v == PrecacheStateActive {
err = r.updateStatus(ctx, clusterGroupUpgrade)
nextReconcile = requeueWithShortInterval()
return
}
// Set completion time only after post actions are executed with no errors
clusterGroupUpgrade.Status.Status.CompletedAt = metav1.Now()
}
} else {
// Upgrade has failed
r.Recorder.Event(clusterGroupUpgrade, corev1.EventTypeWarning, suceededCondition.Reason, suceededCondition.Message)
r.Log.Info("CGU has failed")
r.addClustersStatusOnTimeout(clusterGroupUpgrade)
// On failure we don't want to complete actions other than to delete the resources
err = r.deleteResources(ctx, clusterGroupUpgrade)
if err != nil {
return
}
}
} else if progressingCondition == nil || progressingCondition.Status == metav1.ConditionFalse {

// Update the clusters list based on the precaching results
clusters = r.filterFailedPrecachingClusters(clusterGroupUpgrade, clusters)

// Check if there were any issues with the precaching
if len(clusters) == 0 && len(clusterGroupUpgrade.Status.RemediationPlan) != 0 {
// We expected to remediate some clusters but currently have none
// There should already be a condition present describing the issue; we just need to set succeeded and requeue once
utils.SetStatusCondition(
&clusterGroupUpgrade.Status.Conditions,
utils.ConditionTypes.Progressing,
utils.ConditionReasons.Completed,
metav1.ConditionFalse,
"No clusters available for remediation (Precaching failed)",
)
utils.SetStatusCondition(
&clusterGroupUpgrade.Status.Conditions,
utils.ConditionTypes.Succeeded,
utils.ConditionReasons.Failed,
metav1.ConditionFalse,
"No clusters available for remediation (Precaching failed)",
)
// Requeue is not required since the succeeded condition will be checked right after this
r.updateStatus(ctx, clusterGroupUpgrade)
}

if !*clusterGroupUpgrade.Spec.Enable {
utils.SetStatusCondition(
Expand Down
4 changes: 0 additions & 4 deletions tests/kuttl/tests/upgrade-complete/03-assert.yaml
Expand Up @@ -45,7 +45,3 @@ status:
remediationPlan:
- - spoke1
- - spoke4
status:
currentBatchRemediationProgress:
spoke4:
state: Completed
60 changes: 60 additions & 0 deletions tests/kuttl/tests/upgrade-timeout/00-assert.yaml
@@ -0,0 +1,60 @@
apiVersion: ran.openshift.io/v1alpha1
kind: ClusterGroupUpgrade
metadata:
name: cgu-upgrade-complete
namespace: default
spec:
clusters:
- spoke1
- spoke4
enable: false
managedPolicies:
- policy1-common-cluster-version-policy
- policy2-common-pao-sub-policy
remediationStrategy:
maxConcurrency: 1
timeout: 240
status:
conditions:
- message: All selected clusters are valid
reason: ClusterSelectionCompleted
status: "True"
type: ClustersSelected
- message: Completed validation
reason: ValidationCompleted
status: "True"
type: Validated
- message: Not enabled
reason: NotEnabled
status: "False"
type: Progressing
copiedPolicies:
- cgu-upgrade-complete-policy1-common-cluster-versi-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-polic-kuttl
managedPoliciesContent:
policy2-common-pao-sub-policy: '[{"kind":"Subscription","name":"performance-addon-operator","apiVersion":"operators.coreos.com/v1alpha1","namespace":"openshift-performance-addon-operator"}]'
managedPoliciesForUpgrade:
- name: policy1-common-cluster-version-policy
namespace: default
- name: policy2-common-pao-sub-policy
namespace: default
managedPoliciesNs:
policy1-common-cluster-version-policy: default
policy2-common-pao-sub-policy: default
placementBindings:
- cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
placementRules:
- cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
remediationPlan:
- - spoke1
- - spoke4
safeResourceNames:
cgu-upgrade-complete-common-cluster-version-policy-config: cgu-upgrade-complete-common-cluster-version-policy-config-kuttl
cgu-upgrade-complete-common-pao-sub-policy-config: cgu-upgrade-complete-common-pao-sub-policy-config-kuttl
cgu-upgrade-complete-policy1-common-cluster-version-policy: cgu-upgrade-complete-policy1-common-cluster-versi-kuttl
cgu-upgrade-complete-policy1-common-cluster-version-policy-placement: cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
cgu-upgrade-complete-policy2-common-pao-sub-policy: cgu-upgrade-complete-policy2-common-pao-sub-polic-kuttl
cgu-upgrade-complete-policy2-common-pao-sub-policy-placement: cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
status: {}
24 changes: 24 additions & 0 deletions tests/kuttl/tests/upgrade-timeout/00-setup.yaml
@@ -0,0 +1,24 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep

commands:
# Create all the managed inform policies
- command: oc apply -f ../../../../deploy/acm/policies/all_policies/policy1-common-cluster-version-policy.yaml
namespaced: true
- command: oc apply -f ../../../../deploy/acm/policies/all_policies/policy2-common-pao-sub-policy.yaml
namespaced: true

# Patch the inform policies to reflect the compliance status.
- command: ../../../../deploy/acm/policies/patch-policies-status.sh default default
ignoreFailure: false

# Create all the child policies to map the inform policies above.
- command: oc apply --namespace=spoke1 -f ../../../../deploy/acm/policies/all_policies/child-policy1-common-cluster-version-policy.yaml
namespaced: true

- command: oc apply --namespace=spoke1 -f ../../../../deploy/acm/policies/all_policies/child-policy2-common-pao-sub-policy.yaml
namespaced: true

# Apply the UOCR.
- command: oc apply -f ../../../../deploy/upgrades/upgrade-complete/cgu-upgrade-complete.yaml
namespaced: true
65 changes: 65 additions & 0 deletions tests/kuttl/tests/upgrade-timeout/01-assert.yaml
@@ -0,0 +1,65 @@
apiVersion: ran.openshift.io/v1alpha1
kind: ClusterGroupUpgrade
metadata:
name: cgu-upgrade-complete
namespace: default
spec:
clusters:
- spoke1
- spoke4
enable: true
managedPolicies:
- policy1-common-cluster-version-policy
- policy2-common-pao-sub-policy
remediationStrategy:
maxConcurrency: 1
timeout: 240
status:
conditions:
- message: All selected clusters are valid
reason: ClusterSelectionCompleted
status: "True"
type: ClustersSelected
- message: Completed validation
reason: ValidationCompleted
status: "True"
type: Validated
- message: Remediating non-compliant policies
reason: InProgress
status: "True"
type: Progressing
copiedPolicies:
- cgu-upgrade-complete-policy1-common-cluster-versi-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-polic-kuttl
managedPoliciesContent:
policy2-common-pao-sub-policy: '[{"kind":"Subscription","name":"performance-addon-operator","apiVersion":"operators.coreos.com/v1alpha1","namespace":"openshift-performance-addon-operator"}]'
managedPoliciesForUpgrade:
- name: policy1-common-cluster-version-policy
namespace: default
- name: policy2-common-pao-sub-policy
namespace: default
managedPoliciesNs:
policy1-common-cluster-version-policy: default
policy2-common-pao-sub-policy: default
placementBindings:
- cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
placementRules:
- cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
remediationPlan:
- - spoke1
- - spoke4
safeResourceNames:
cgu-upgrade-complete-common-cluster-version-policy-config: cgu-upgrade-complete-common-cluster-version-policy-config-kuttl
cgu-upgrade-complete-common-pao-sub-policy-config: cgu-upgrade-complete-common-pao-sub-policy-config-kuttl
cgu-upgrade-complete-policy1-common-cluster-version-policy: cgu-upgrade-complete-policy1-common-cluster-versi-kuttl
cgu-upgrade-complete-policy1-common-cluster-version-policy-placement: cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
cgu-upgrade-complete-policy2-common-pao-sub-policy: cgu-upgrade-complete-policy2-common-pao-sub-polic-kuttl
cgu-upgrade-complete-policy2-common-pao-sub-policy-placement: cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
status:
currentBatch: 1
currentBatchRemediationProgress:
spoke1:
policyIndex: 0
state: InProgress
7 changes: 7 additions & 0 deletions tests/kuttl/tests/upgrade-timeout/01-start-upgrade.yaml
@@ -0,0 +1,7 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep

commands:
# Start the upgrade by enabling the UOCR.
- command: oc --namespace=default patch clustergroupupgrade.ran.openshift.io/cgu-upgrade-complete --patch '{"spec":{"enable":true}}' --type=merge
ignoreFailure: false
67 changes: 67 additions & 0 deletions tests/kuttl/tests/upgrade-timeout/02-assert.yaml
@@ -0,0 +1,67 @@
apiVersion: ran.openshift.io/v1alpha1
kind: ClusterGroupUpgrade
metadata:
name: cgu-upgrade-complete
namespace: default
spec:
clusters:
- spoke1
- spoke4
enable: true
managedPolicies:
- policy1-common-cluster-version-policy
- policy2-common-pao-sub-policy
remediationStrategy:
maxConcurrency: 1
timeout: 241
status:
conditions:
- message: All selected clusters are valid
reason: ClusterSelectionCompleted
status: "True"
- message: Completed validation
reason: ValidationCompleted
status: "True"
type: Validated
- message: Remediating non-compliant policies
reason: InProgress
status: "True"
type: Progressing
copiedPolicies:
- cgu-upgrade-complete-policy1-common-cluster-versi-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-polic-kuttl
managedPoliciesContent:
policy2-common-pao-sub-policy: '[{"kind":"Subscription","name":"performance-addon-operator","apiVersion":"operators.coreos.com/v1alpha1","namespace":"openshift-performance-addon-operator"}]'
clusters:
- name: spoke1
state: complete
managedPoliciesForUpgrade:
- name: policy1-common-cluster-version-policy
namespace: default
- name: policy2-common-pao-sub-policy
namespace: default
managedPoliciesNs:
policy1-common-cluster-version-policy: default
policy2-common-pao-sub-policy: default
placementBindings:
- cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
placementRules:
- cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
- cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
remediationPlan:
- - spoke1
- - spoke4
safeResourceNames:
cgu-upgrade-complete-common-cluster-version-policy-config: cgu-upgrade-complete-common-cluster-version-policy-config-kuttl
cgu-upgrade-complete-common-pao-sub-policy-config: cgu-upgrade-complete-common-pao-sub-policy-config-kuttl
cgu-upgrade-complete-policy1-common-cluster-version-policy: cgu-upgrade-complete-policy1-common-cluster-versi-kuttl
cgu-upgrade-complete-policy1-common-cluster-version-policy-placement: cgu-upgrade-complete-policy1-common-cluster-version-policy-placement-kuttl
cgu-upgrade-complete-policy2-common-pao-sub-policy: cgu-upgrade-complete-policy2-common-pao-sub-polic-kuttl
cgu-upgrade-complete-policy2-common-pao-sub-policy-placement: cgu-upgrade-complete-policy2-common-pao-sub-policy-placement-kuttl
status:
currentBatch: 2
currentBatchRemediationProgress:
spoke4:
policyIndex: 0
state: InProgress