Skip to content

Commit

Permalink
Merge pull request #8089 from openshift-cherrypick-robot/cherry-pick-…
Browse files Browse the repository at this point in the history
…8004-to-release-4.15

[release-4.15] OCPBUGS-30098: feat: add check for SNO bootstrap condition
  • Loading branch information
openshift-merge-bot[bot] committed Mar 15, 2024
2 parents 3743665 + da0baba commit 4edfd0e
Showing 1 changed file with 93 additions and 1 deletion.
94 changes: 93 additions & 1 deletion cmd/openshift-install/create.go
Expand Up @@ -18,16 +18,20 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/clientcmd"
clientwatch "k8s.io/client-go/tools/watch"

configv1 "github.com/openshift/api/config/v1"
operatorv1 "github.com/openshift/api/operator/v1"
configclient "github.com/openshift/client-go/config/clientset/versioned"
configinformers "github.com/openshift/client-go/config/informers/externalversions"
configlisters "github.com/openshift/client-go/config/listers/config/v1"
Expand Down Expand Up @@ -462,7 +466,15 @@ func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *cluster
return newAPIError(err)
}

return waitForBootstrapConfigMap(ctx, client)
if err := waitForBootstrapConfigMap(ctx, client); err != nil {
return err
}

if err := waitForStableSNOBootstrap(ctx, config); err != nil {
return newBootstrapError(err)
}

return nil
}

// waitForBootstrapConfigMap watches the configmaps in the kube-system namespace
Expand Down Expand Up @@ -519,6 +531,64 @@ func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset
return nil
}

// When bootstrap on SNO deployments, we should not remove the bootstrap node prematurely,
// here we make sure that the deployment is stable.
// Given the nature of single node we just need to make sure things such as etcd are in the proper state
// before continuing.
func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
timeout := 5 * time.Minute

// If we're not in a single node deployment, bail early
if isSNO, err := IsSingleNode(); err != nil {
logrus.Warningf("Can not determine if installing a Single Node cluster, continuing as normal install: %v", err)
return nil
} else if !isSNO {
return nil
}

snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

untilTime := time.Now().Add(timeout)
timezone, _ := untilTime.Zone()
logrus.Info("Detected Single Node deployment")
logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
timeout, untilTime.Format(time.Kitchen), timezone)

client, err := dynamic.NewForConfig(config)
if err != nil {
return fmt.Errorf("error creating dynamic client: %w", err)
}
gvr := schema.GroupVersionResource{
Group: operatorv1.SchemeGroupVersion.Group,
Version: operatorv1.SchemeGroupVersion.Version,
Resource: "etcds",
}
resourceClient := client.Resource(gvr)
// Validate the etcd operator has removed the bootstrap etcd member
return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
etcdOperator := &operatorv1.Etcd{}
etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
if err != nil {
// There might be service disruptions in SNO, we log those here but keep trying with in the time limit
logrus.Debugf("Error getting ETCD Cluster resource, retrying: %v", err)
return false, nil
}
err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
if err != nil {
// This error should not happen, if we do, we log the error and keep retrying until we hit the limit
logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
return false, nil
}
for _, condition := range etcdOperator.Status.Conditions {
if condition.Type == "EtcdBootstrapMemberRemoved" {
return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
}
}
return false, nil
})
}

// waitForInitializedCluster watches the ClusterVersion waiting for confirmation
// that the cluster has been initialized.
func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
Expand Down Expand Up @@ -870,3 +940,25 @@ func handleUnreachableAPIServer(config *rest.Config) error {

return nil
}

// IsSingleNode determines if we are in a single node configuration based off of the install config
// loaded from the asset store.
func IsSingleNode() (bool, error) {
assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
if err != nil {
return false, fmt.Errorf("error loading asset store: %w", err)
}
installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
if err != nil {
return false, fmt.Errorf("error loading installConfig: %w", err)
}
if installConfig == nil {
return false, fmt.Errorf("installConfig loaded from asset store was nil")
}

config := installConfig.(*installconfig.InstallConfig).Config
if machinePool := config.ControlPlane; machinePool != nil {
return *machinePool.Replicas == int64(1), nil
}
return false, nil
}

0 comments on commit 4edfd0e

Please sign in to comment.