Skip to content

Commit

Permalink
prep stage doc updates and unify logs and naming
Browse files Browse the repository at this point in the history
  • Loading branch information
pixelsoccupied committed May 16, 2024
1 parent 133ba4d commit 9b9eec7
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 189 deletions.
38 changes: 23 additions & 15 deletions controllers/prep_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,13 @@ func (r *ImageBasedUpgradeReconciler) launchPrecaching(ctx context.Context, imag
return nil
}

// getSeedManifestPath returns the path obtained by joining the stateroot path
// for osname (as reported by common.GetStaterootPath) with the seed data
// directory and the seed cluster info file name.
func getSeedManifestPath(osname string) string {
	staterootDir := common.GetStaterootPath(osname)
	// Location of the cluster info file relative to the stateroot.
	clusterInfoRelPath := filepath.Join(common.SeedDataDir, common.SeedClusterInfoFileName)
	return filepath.Join(staterootDir, clusterInfoRelPath)
}

// validateIBUSpec validates the fields in the IBU spec
func (r *ImageBasedUpgradeReconciler) validateIBUSpec(ctx context.Context, ibu *ibuv1.ImageBasedUpgrade) error {
// Check spec against this cluster's version and possibly exit early
Expand Down Expand Up @@ -376,26 +383,26 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *ibuv1
if _, err := prep.LaunchStaterootSetupJob(ctx, r.Client, ibu, r.Scheme, r.Log); err != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed launch stateroot job: %s", err.Error()), ibu)
}
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job `%s` in namespace `%s`", prep.StaterootSetupJobName, common.LcaNamespace), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job for stateroot setup. %s", getJobMetadataString(staterootSetupJob)), ibu)
}
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to get stateroot setup job: %s", err.Error()), ibu)
}

r.Log.Info("Verifying stateroot setup job status")

// job deletion not allowed
// job deletion is not allowed
if staterootSetupJob.GetDeletionTimestamp() != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot job is marked to be deleted, this is not allowed. job: %s, ns: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot job is marked to be deleted, this is not allowed. %s", getJobMetadataString(staterootSetupJob)), ibu)
}

// check .status
_, staterootSetupFinishedType := common.IsJobFinished(staterootSetupJob)
switch staterootSetupFinishedType {
case "":
common.LogPodLogs(staterootSetupJob, r.Log, r.Clientset)
return prepInProgressRequeue(r.Log, "Stateroot setup in progress", ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Stateroot setup job in progress. %s", getJobMetadataString(staterootSetupJob)), ibu)
case kbatch.JobFailed:
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot setup job could not complete. please check job logs for more, job_name: %s, job_ns: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot setup job failed to complete. %s", getJobMetadataString(staterootSetupJob)), ibu)
case kbatch.JobComplete:
r.Log.Info("Stateroot job completed successfully", "completion time", staterootSetupJob.Status.CompletionTime, "total time", staterootSetupJob.Status.CompletionTime.Sub(staterootSetupJob.Status.StartTime.Time))
}
Expand All @@ -408,26 +415,26 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *ibuv1
if err := r.launchPrecaching(ctx, precache.ImageListFile, ibu); err != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to launch precaching job: %s", err.Error()), ibu)
}
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job `%s` in namespace `%s`", precache.LcaPrecacheJobName, common.LcaNamespace), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job precache. %s", getJobMetadataString(precacheJob)), ibu)
}
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to get precache job: %s", err.Error()), ibu)
}

r.Log.Info("Verifying precache job status")

// job deletion not allowed
// job deletion is not allowed
if precacheJob.GetDeletionTimestamp() != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job is marked to be deleted, this not allowed. job: %s, ns: %s", precacheJob.GetName(), precacheJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job is marked to be deleted, this not allowed. %s", getJobMetadataString(precacheJob)), ibu)
}

// check .status
_, precacheFinishedType := common.IsJobFinished(precacheJob)
switch precacheFinishedType {
case "":
common.LogPodLogs(precacheJob, r.Log, r.Clientset) // pod logs
return prepInProgressRequeue(r.Log, fmt.Sprintf("Precache job in progress: %s", precache.GetPrecacheStatusFileContent()), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Precache job in progress. %s. %s", getJobMetadataString(precacheJob), precache.GetPrecacheStatusFileContent()), ibu)
case kbatch.JobFailed:
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job could not complete. please check job logs for more, job_name: %s, job_ns: %s", precacheJob.GetName(), precacheJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job failed to complete. %s", getJobMetadataString(precacheJob)), ibu)
case kbatch.JobComplete:
r.Log.Info("Precache job completed successfully", "completion time", precacheJob.Status.CompletionTime, "total time", precacheJob.Status.CompletionTime.Sub(precacheJob.Status.StartTime.Time))
}
Expand Down Expand Up @@ -459,9 +466,10 @@ func prepInProgressRequeue(log logr.Logger, msg string, ibu *ibuv1.ImageBasedUpg
return requeueWithShortInterval(), nil
}

func getSeedManifestPath(osname string) string {
return filepath.Join(
common.GetStaterootPath(osname),
filepath.Join(common.SeedDataDir, common.SeedClusterInfoFileName),
)
// getJobMetadataString formats a job's name and namespace as a short string
// that callers append to log and status messages. A nil job yields the
// placeholder text "job is nil" instead of dereferencing the pointer.
func getJobMetadataString(job *kbatch.Job) string {
	if job == nil {
		return "job is nil"
	}
	jobName, jobNamespace := job.GetName(), job.GetNamespace()
	return fmt.Sprintf("job-name: %s, job-namespace: %s", jobName, jobNamespace)
}
21 changes: 0 additions & 21 deletions docs/assets/precache_design.svg

This file was deleted.

105 changes: 64 additions & 41 deletions docs/image-based-upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ The Lifecycle Agent provides orchestration of an IBU on the target SNO via the `

- Prep
- This stage can only be set when the IBU is idle.
- During this stage, LCA does as much preparation as possible for the upgrade without impacting the current running version. This includes downloading the seed image, unpacking it as a new ostree stateroot and pulling all images specified by the image list built into the seed image,
refer to [precache-plugin](precache-plugin.md). If for whatever reason prep stage is interrupted (e.g pod restarts or system reboot), the stage will marked as fail and only to way to recover is to move back Idle stage (see above). But before moving to Idle,
please consider using [must-gather](must-gather.md) to allow for easier debugging later.
- During this stage,
LCA does as much preparation as possible for the upgrade without impacting the currently running version. This includes downloading the seed image, unpacking it as a new ostree stateroot and pulling all images specified by the image list built into the seed image. If for whatever reason the prep stage
is interrupted (e.g., a pod restart or system reboot), the stage will be marked as failed, and the only way to recover is to move back to the `Idle` stage (see above). Before moving to `Idle`, please consider using [must-gather](must-gather.md) to allow for easier debugging later.

- Upgrade
- This stage can only be set if the prep stage completed successfully.
Expand Down Expand Up @@ -260,7 +260,7 @@ The success path upgrade will progress through the following stages:

Idle -> Prep -> Upgrade -> Idle

#### Starting the Prep stage
### Starting the Prep stage

The administrator patches the imagebasedupgrade CR:

Expand Down Expand Up @@ -290,60 +290,83 @@ oc patch imagebasedupgrade upgrade -n openshift-lifecycle-agent --type='json' -p

The "Prep" stage will:

- Perform the following validations:
- If the oadpContent is populated, validate that the specified configmap has been applied and is valid
- If the extraManifests is populated, validate that the specified configmap has been applied and is valid
- If a required CRD is missing from the current stateroot, a warning message will be included in the IBU CRs with annotation `lca.openshift.io/warn-extramanifest-cm-unknown-crd`.
- Validate that the desired upgrade version matches the version of the seed image
- Validate the version of the LCA in the seed image is compatible with the version on the running SNO
- Pull the seed image
- Unpack the seed image and create a new ostree stateroot

> [!CAUTION]
> At this point of `Prep` stage, it is VERY important to let it run to completion.
> To help avoid unintended consequences of accidental deletion (e.g moving to `Idle` stage while `Prep` in progress), there are blocks in place to allow it to run its normal course before continuing with the deletion request.
> During this wait (could be up to several minutes), please refer to the pod logs for more information.

- Pull all images specified by the image list built into the seed image. Refer to [precache-plugin](precache-plugin.md)
1. Perform various validation steps on the IBU CR. This includes (but is not limited to) the following:
- If the oadpContent is populated, validate that the specified configmap has been applied and is valid
- If the extraManifests is populated, validate that the specified configmap has been applied and is valid
- If a required CRD is missing from the current cluster, a warning is added to the IBU CR as an annotation whose value contains the relevant details.

```yaml
metadata:
annotations:
lca.openshift.io/warn-extramanifest-cm-unknown-crd: '...'
```

> 📝 Warnings are not enforced, and it is up to the user to decide if it's safe to proceed with `Upgrade` stage.
- Validate the version of the LCA in the seed image is compatible with the version on the running SNO
2. Set up the new stateroot
- Pull the seed image
- Unpack the seed image and perform various validations, such as asserting that the desired upgrade version matches the version of the seed image
- Create a new ostree stateroot
> ⚠️ Caution
>
> At this point of the `Prep` stage (Step 2), it is VERY important to let it run to completion. To help avoid unintended consequences of accidental deletion (e.g., moving to the `Idle` stage while `Prep` is in progress), there are blocks in place to allow it to run its normal course before continuing with
the deletion request. During this wait (which could take up to several minutes), please refer to the pod logs for more information.
3. Pull all images specified by the image list built into the seed image to streamline the
upgrade process. This step is also referred to as `Precache`.

Upon completion, the condition will be updated to "Prep Completed"

> [!TIP]
> Stateroot setup (Step 2) is done using a kubernetes job called `lca-prep-stateroot-setup`.
>
> ```shell
> oc -n openshift-lifecycle-agent logs -f job/lca-prep-stateroot-setup
> ```
>
> Precache (Step 3) is done using a kubernetes job called `lca-prep-precache`.
>
> ```shell
> oc -n openshift-lifecycle-agent logs -f job/lca-prep-precache
> ```

Condition samples:

Prep in progress:

```console
conditions:
- lastTransitionTime: "2024-04-19T19:25:29Z"
- lastTransitionTime: "2024-05-15T17:12:29Z"
message: In progress
observedGeneration: 5
observedGeneration: 21
reason: InProgress
status: "False"
type: Idle
- lastTransitionTime: "2024-04-19T19:25:29Z"
message: Setting up stateroot
observedGeneration: 5
- lastTransitionTime: "2024-05-15T17:12:29Z"
message: 'Stateroot setup job in progress. job-name: lca-prep-stateroot-setup,
job-namespace: openshift-lifecycle-agent'
observedGeneration: 21
reason: InProgress
status: "True"
type: PrepInProgress
observedGeneration: 5
observedGeneration: 21
validNextStages:
- Idle
conditions:
- lastTransitionTime: "2024-04-19T19:25:29Z"
- lastTransitionTime: "2024-05-15T18:14:40Z"
message: In progress
observedGeneration: 5
observedGeneration: 29
reason: InProgress
status: "False"
type: Idle
- lastTransitionTime: "2024-04-19T19:25:29Z"
message: 'Precaching progress: total: 115 (pulled: 10, failed: 0)'
observedGeneration: 5
- lastTransitionTime: "2024-05-15T18:14:40Z"
message: 'Precache job in progress. job-name: lca-prep-precache, job-namespace:
openshift-lifecycle-agent. total: 136 (pulled: 2, failed: 0)'
observedGeneration: 29
reason: InProgress
status: "True"
type: PrepInProgress
observedGeneration: 5
observedGeneration: 29
validNextStages:
- Idle
```
Expand All @@ -352,31 +375,31 @@ Prep completed:

```console
conditions:
- lastTransitionTime: "2024-04-19T19:25:29Z"
- lastTransitionTime: "2024-05-15T14:40:31Z"
message: In progress
observedGeneration: 5
observedGeneration: 13
reason: InProgress
status: "False"
type: Idle
- lastTransitionTime: "2024-04-19T19:26:52Z"
- lastTransitionTime: "2024-05-15T14:45:11Z"
message: Prep completed
observedGeneration: 5
observedGeneration: 13
reason: Completed
status: "False"
type: PrepInProgress
- lastTransitionTime: "2024-04-19T19:26:52Z"
message: Prep completed successfully
observedGeneration: 5
- lastTransitionTime: "2024-05-15T14:45:11Z"
message: Prep stage completed successfully
observedGeneration: 13
reason: Completed
status: "True"
type: PrepCompleted
observedGeneration: 5
observedGeneration: 13
validNextStages:
- Idle
- Upgrade
```

#### Starting the Upgrade stage
### Starting the Upgrade stage

This is where the actual upgrade happens. It consists of three main steps: pre-pivot, pivot and post-pivot.
This stage can only be applied if the prep stage completed successfully.
Expand Down Expand Up @@ -534,7 +557,7 @@ troubleshooting.

See [Automatic Rollback Examples](examples.md#automatic-rollback-examples) for examples of IBU CR after an automatic rollback.

#### Configuring Automatic Rollback
### Configuring Automatic Rollback

There is an `lca-init-monitor.service` that runs post-reboot with a configurable timeout. When LCA marks
the upgrade complete, it shuts down this monitor. If this point is not reached within the configured timeout, the
Expand Down
83 changes: 0 additions & 83 deletions docs/precache-plugin.md

This file was deleted.

3 changes: 1 addition & 2 deletions internal/precache/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ import (
// LCA Resources
const (
LcaPrecacheServiceAccount string = "lifecycle-agent-controller-manager"
LcaPrecacheJobName string = "lca-precache-job"
LcaPrecacheConfigMapName string = "lca-precache-cm"
LcaPrecacheResourceName string = "lca-prep-precache"
LcaPrecacheFinalizer = "lca.openshift.io/precache-finalizer"
)

Expand Down
Loading

0 comments on commit 9b9eec7

Please sign in to comment.