Skip to content

Commit

Permalink
prep stage doc updates and unify logs and naming
Browse files Browse the repository at this point in the history
  • Loading branch information
pixelsoccupied committed May 15, 2024
1 parent 3bb9b3d commit 9d6981d
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 170 deletions.
12 changes: 6 additions & 6 deletions controllers/prep_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *ibuv1
if _, err := prep.LaunchStaterootSetupJob(ctx, r.Client, ibu, r.Scheme, r.Log); err != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed launch stateroot job: %s", err.Error()), ibu)
}
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job `%s` in namespace `%s`", prep.JobName, common.LcaNamespace), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job for stateroot setup. job-name:'%s', job-namespace '%s'", prep.JobName, common.LcaNamespace), ibu)
}
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to get stateroot setup job: %s", err.Error()), ibu)
}
Expand All @@ -393,9 +393,9 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *ibuv1
switch staterootSetupFinishedType {
case "":
common.LogPodLogs(staterootSetupJob, r.Log, r.Clientset)
return prepInProgressRequeue(r.Log, "Stateroot setup in progress", ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Stateroot setup job in progress. job-name: %s, job-namespace: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
case kbatch.JobFailed:
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot setup job could not complete. Please check job logs for more, job_name: %s, job_ns: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot setup job failed to complete. job-name: %s, job-namespace: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
case kbatch.JobComplete:
r.Log.Info("Stateroot job completed successfully", "completion time", staterootSetupJob.Status.CompletionTime, "total time", staterootSetupJob.Status.CompletionTime.Sub(staterootSetupJob.Status.StartTime.Time))
}
Expand All @@ -408,7 +408,7 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *ibuv1
if err := r.launchPrecaching(ctx, precache.ImageListFile, ibu); err != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to launch precaching job: %s", err.Error()), ibu)
}
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job `%s` in namespace `%s`", precache.LcaPrecacheJobName, common.LcaNamespace), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job precache. job-name: %s, job-namespace: %s", precache.LcaPrecacheResourceName, common.LcaNamespace), ibu)
}
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to get precache job: %s", err.Error()), ibu)
}
Expand All @@ -425,9 +425,9 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *ibuv1
switch precacheFinishedType {
case "":
common.LogPodLogs(precacheJob, r.Log, r.Clientset) // pod logs
return prepInProgressRequeue(r.Log, fmt.Sprintf("Precache job in progress: %s", precache.GetPrecacheStatusFileContent()), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Precache job in progress. job-name: %s, job-namespace: %s. %s", precacheJob.GetName(), precacheJob.GetNamespace(), precache.GetPrecacheStatusFileContent()), ibu)
case kbatch.JobFailed:
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job could not complete. Please check job logs for more, job_name: %s, job_ns: %s", precacheJob.GetName(), precacheJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job failed to complete. job-name: %s, job-namespace: %s", precacheJob.GetName(), precacheJob.GetNamespace()), ibu)
case kbatch.JobComplete:
r.Log.Info("Precache job completed successfully", "completion time", precacheJob.Status.CompletionTime, "total time", precacheJob.Status.CompletionTime.Sub(precacheJob.Status.StartTime.Time))
}
Expand Down
21 changes: 0 additions & 21 deletions docs/assets/precache_design.svg

This file was deleted.

98 changes: 64 additions & 34 deletions docs/image-based-upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,14 @@ The Lifecycle Agent provides orchestration of an IBU on the target SNO via the `

- Prep
- This stage can only be set when the IBU is idle.
- During this stage, LCA does as much preparation as possible for the upgrade without impacting the current running version. This includes downloading the seed image, unpacking it as a new ostree stateroot and pulling all images specified by the image list built into the seed image,
refer to [precache-plugin](precache-plugin.md). If for whatever reason prep stage is interrupted (e.g pod restarts or system reboot), the stage will marked as fail and only to way to recover is to move back Idle stage (see above). But before moving to Idle,
- During this stage,
LCA does as much preparation as possible for the upgrade without impacting the current running version.
This includes downloading the seed image,
unpacking it as a new ostree stateroot and pulling all images specified by the image list built into the seed image.
If for whatever reason the prep stage is interrupted (e.g. pod restarts or system reboot),
the stage will be marked as failed, and the only way to recover is to move back to the `Idle` stage
(see above).
But before moving to Idle,
please consider using [must-gather](must-gather.md) to allow for easier debugging later.

- Upgrade
Expand Down Expand Up @@ -260,7 +266,7 @@ The success path upgrade will progress through the following stages:

Idle -> Prep -> Upgrade -> Idle

#### Starting the Prep stage
### Starting the Prep stage

The administrator patches the imagebasedupgrade CR:

Expand Down Expand Up @@ -290,54 +296,78 @@ oc patch imagebasedupgrade upgrade -n openshift-lifecycle-agent --type='json' -p

The "Prep" stage will:

- Pull the seed image
- Perform the following validations:
- If the oadpContent is populated, validate that the specified configmap has been applied and is valid
- If the extraManifests is populated, validate that the specified configmap has been applied and is valid
- If a required CRD is missing from the current stateroot, a warning message will be included in the IBU CRs with annotation `lca.openshift.io/warn-extramanifest-cm-unknown-crd`.
- Validate that the desired upgrade version matches the version of the seed image
- Validate the version of the LCA in the seed image is compatible with the version on the running SNO
- Unpack the seed image and create a new ostree stateroot
- Pull all images specified by the image list built into the seed image. Refer to [precache-plugin](precache-plugin.md)
1. Perform the following validations:
- If the oadpContent is populated, validate that the specified configmap has been applied and is valid
- If the extraManifests is populated, validate that the specified configmap has been applied and is valid
- If a required CRD is missing from the current stateroot, a warning annotation will be added to the IBU CR, with useful information as its value.

```yaml
metadata:
annotations:
lca.openshift.io/warn-extramanifest-cm-unknown-crd: '...'
```

> 📝 Warnings are not enforced, and it is up to the user to decide whether it is safe to proceed with the `Upgrade` stage.
- Validate that the desired upgrade version matches the version of the seed image
- Validate the version of the LCA in the seed image is compatible with the version on the running SNO
2. Pull the seed image
3. Unpack the seed image and create a new ostree stateroot
4. Pull all images specified by the image list built into the seed image to streamline the
upgrade process. This step is also referred to as `Precache`.

Upon completion, the condition will be updated to "Prep Completed"

> [!TIP]
> Stateroot setup (Step 2 and Step 3) is done using a Kubernetes job `lca-prep-stateroot-setup`.
>
> ```shell
> oc -n openshift-lifecycle-agent logs -f job/lca-prep-stateroot-setup
> ```
>
> Precache (Step 4) is done using a Kubernetes job `lca-prep-precache`.
>
> ```shell
> oc -n openshift-lifecycle-agent logs -f job/lca-prep-precache
> ```
Condition samples:

Prep in progress:

```console
conditions:
- lastTransitionTime: "2024-04-19T19:25:29Z"
- lastTransitionTime: "2024-05-15T17:12:29Z"
message: In progress
observedGeneration: 5
observedGeneration: 21
reason: InProgress
status: "False"
type: Idle
- lastTransitionTime: "2024-04-19T19:25:29Z"
message: Setting up stateroot
observedGeneration: 5
- lastTransitionTime: "2024-05-15T17:12:29Z"
message: 'Stateroot setup job in progress. job-name: lca-prep-stateroot-setup,
job-namespace: openshift-lifecycle-agent'
observedGeneration: 21
reason: InProgress
status: "True"
type: PrepInProgress
observedGeneration: 5
observedGeneration: 21
validNextStages:
- Idle

conditions:
- lastTransitionTime: "2024-04-19T19:25:29Z"
- lastTransitionTime: "2024-05-15T18:14:40Z"
message: In progress
observedGeneration: 5
observedGeneration: 29
reason: InProgress
status: "False"
type: Idle
- lastTransitionTime: "2024-04-19T19:25:29Z"
message: 'Precaching progress: total: 115 (pulled: 10, failed: 0)'
observedGeneration: 5
- lastTransitionTime: "2024-05-15T18:14:40Z"
message: 'Precache job in progress. job-name: lca-prep-precache, job-namespace:
openshift-lifecycle-agent. total: 136 (pulled: 2, failed: 0)'
observedGeneration: 29
reason: InProgress
status: "True"
type: PrepInProgress
observedGeneration: 5
observedGeneration: 29
validNextStages:
- Idle
```
Expand All @@ -346,31 +376,31 @@ Prep completed:

```console
conditions:
- lastTransitionTime: "2024-04-19T19:25:29Z"
- lastTransitionTime: "2024-05-15T14:40:31Z"
message: In progress
observedGeneration: 5
observedGeneration: 13
reason: InProgress
status: "False"
type: Idle
- lastTransitionTime: "2024-04-19T19:26:52Z"
- lastTransitionTime: "2024-05-15T14:45:11Z"
message: Prep completed
observedGeneration: 5
observedGeneration: 13
reason: Completed
status: "False"
type: PrepInProgress
- lastTransitionTime: "2024-04-19T19:26:52Z"
message: Prep completed successfully
observedGeneration: 5
- lastTransitionTime: "2024-05-15T14:45:11Z"
message: Prep stage completed successfully
observedGeneration: 13
reason: Completed
status: "True"
type: PrepCompleted
observedGeneration: 5
observedGeneration: 13
validNextStages:
- Idle
- Upgrade
```

#### Starting the Upgrade stage
### Starting the Upgrade stage

This is where the actual upgrade happens. It consists of three main steps: pre-pivot, pivot and post-pivot.
This stage can only be applied if the prep stage completed successfully.
Expand Down Expand Up @@ -528,7 +558,7 @@ troubleshooting.

See [Automatic Rollback Examples](examples.md#automatic-rollback-examples) for examples of IBU CR after an automatic rollback.

#### Configuring Automatic Rollback
### Configuring Automatic Rollback

There is an `lca-init-monitor.service` that runs post-reboot with a configurable timeout. When LCA marks
the upgrade complete, it shuts down this monitor. If this point is not reached within the configured timeout, the
Expand Down
83 changes: 0 additions & 83 deletions docs/precache-plugin.md

This file was deleted.

3 changes: 1 addition & 2 deletions internal/precache/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ import (
// LCA Resources
const (
LcaPrecacheServiceAccount string = "lifecycle-agent-controller-manager"
LcaPrecacheJobName string = "lca-precache-job"
LcaPrecacheConfigMapName string = "lca-precache-cm"
LcaPrecacheResourceName string = "lca-prep-precache"
LcaPrecacheFinalizer = "lca.openshift.io/precache-finalizer"
)

Expand Down
12 changes: 6 additions & 6 deletions internal/precache/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ import (
func GetJob(ctx context.Context, c client.Client) (*batchv1.Job, error) {
job := &batchv1.Job{}
if err := c.Get(ctx, types.NamespacedName{
Name: LcaPrecacheJobName,
Name: LcaPrecacheResourceName,
Namespace: common.LcaNamespace,
}, job); err != nil {
return nil, err //nolint:wrapcheck
Expand All @@ -62,7 +62,7 @@ func renderConfigMap(imageList []string) *corev1.ConfigMap {

configMap := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: LcaPrecacheConfigMapName,
Name: LcaPrecacheResourceName,
Namespace: common.LcaNamespace,
},
Data: data,
Expand Down Expand Up @@ -144,7 +144,7 @@ func renderJob(config *Config, log logr.Logger, ibu *ibuv1.ImageBasedUpgrade, sc

job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: LcaPrecacheJobName,
Name: LcaPrecacheResourceName,
Namespace: common.LcaNamespace,
Annotations: map[string]string{
"app.kubernetes.io/name": "lifecycle-agent-precache",
Expand Down Expand Up @@ -206,7 +206,7 @@ func renderJob(config *Config, log logr.Logger, ibu *ibuv1.ImageBasedUpgrade, sc
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: LcaPrecacheConfigMapName,
Name: LcaPrecacheResourceName,
},
DefaultMode: &defaultMode,
},
Expand All @@ -233,7 +233,7 @@ func renderJob(config *Config, log logr.Logger, ibu *ibuv1.ImageBasedUpgrade, sc
func deleteConfigMap(ctx context.Context, c client.Client) error {
cm := corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: LcaPrecacheConfigMapName,
Name: LcaPrecacheResourceName,
Namespace: common.LcaNamespace,
},
}
Expand All @@ -255,7 +255,7 @@ func deleteJob(ctx context.Context, c client.Client) error {

precache := batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: LcaPrecacheJobName,
Name: LcaPrecacheResourceName,
Namespace: common.LcaNamespace,
},
}
Expand Down
Loading

0 comments on commit 9d6981d

Please sign in to comment.