From db22a8b5587d6d24ea536a6037b1245fa497e481 Mon Sep 17 00:00:00 2001 From: Mulham Raee Date: Mon, 3 Apr 2023 14:42:02 +0200 Subject: [PATCH] Check KAS loadbalancer health This enhances HCAvailable condition to better represent day2 state --- api/v1beta1/hostedcluster_conditions.go | 2 + .../hostedcontrolplane_controller.go | 76 +++++++++++++++++-- support/util/util.go | 13 ++++ test/e2e/util/oauth.go | 2 +- 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/api/v1beta1/hostedcluster_conditions.go b/api/v1beta1/hostedcluster_conditions.go index a1dab48321..d45d31a5f2 100644 --- a/api/v1beta1/hostedcluster_conditions.go +++ b/api/v1beta1/hostedcluster_conditions.go @@ -174,6 +174,8 @@ const ( ExternalDNSHostNotReachableReason = "ExternalDNSHostNotReachable" + KASLoadBalancerNotReachableReason = "KASLoadBalancerNotReachable" + ReconciliationPausedConditionReason = "ReconciliationPaused" ReconciliationInvalidPausedUntilConditionReason = "InvalidPausedUntilValue" ) diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go index 2d130401c6..e183e45b6b 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "math/big" + "net/http" "os" "sort" "strings" @@ -580,6 +581,7 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R kubeConfigAvailable := hostedControlPlane.Status.KubeConfig != nil etcdCondition := meta.FindStatusCondition(hostedControlPlane.Status.Conditions, string(hyperv1.EtcdAvailable)) kubeAPIServerCondition := meta.FindStatusCondition(hostedControlPlane.Status.Conditions, string(hyperv1.KubeAPIServerAvailable)) + healthCheckErr := r.healthCheckKASLoadBalancers(ctx, hostedControlPlane) status := 
metav1.ConditionFalse var reason, message string @@ -599,6 +601,9 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R case kubeAPIServerCondition != nil && kubeAPIServerCondition.Status == metav1.ConditionFalse: reason = kubeAPIServerCondition.Reason message = kubeAPIServerCondition.Message + case healthCheckErr != nil: + reason = hyperv1.KASLoadBalancerNotReachableReason + message = healthCheckErr.Error() default: reason = hyperv1.AsExpectedReason message = "" @@ -665,6 +670,75 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R return result, nil } +// healthCheckKASLoadBalancers performs a health check on the KubeAPI server /healthz endpoint using the public and private load balancer hostnames directly. +// This will detect if load balancers are down or deleted out of band. +// It returns nil when every probed endpoint reports healthy, otherwise the first lookup, provisioning or probe error encountered. +func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.Context, hcp *hyperv1.HostedControlPlane) error { + serviceStrategy := util.ServicePublishingStrategyByTypeForHCP(hcp, hyperv1.APIServer) + if serviceStrategy == nil { + return fmt.Errorf("APIServer service strategy not specified") + } + + if serviceStrategy.Type == hyperv1.Route { + internalRoute := manifests.KubeAPIServerInternalRoute(hcp.Namespace) + if err := r.Get(ctx, client.ObjectKeyFromObject(internalRoute), internalRoute); err != nil { + return fmt.Errorf("failed to get kube apiserver internal route: %w", err) + } + if len(internalRoute.Status.Ingress) == 0 || internalRoute.Status.Ingress[0].RouterCanonicalHostname == "" { + return fmt.Errorf("APIServer internal route not admitted") + } + + if err := healthCheckKASEndpoint(internalRoute.Status.Ingress[0].RouterCanonicalHostname, hcp); err != nil { + return err + } + } + + var kasServices []*corev1.Service + if util.IsPrivateHCP(hcp) { + kasServices = append(kasServices, manifests.PrivateRouterService(hcp.Namespace)) + if serviceStrategy.Type == hyperv1.LoadBalancer { + kasServices = 
append(kasServices, manifests.KubeAPIServerPrivateService(hcp.Namespace)) + } + } else if serviceStrategy.Type != hyperv1.Route { + kasServices = append(kasServices, manifests.KubeAPIServerService(hcp.Namespace)) + } + + for _, svc := range kasServices { + if err := r.Get(ctx, client.ObjectKeyFromObject(svc), svc); err != nil { + return fmt.Errorf("failed to get kube apiserver service: %w", err) + } + + if len(svc.Status.LoadBalancer.Ingress) == 0 || svc.Status.LoadBalancer.Ingress[0].Hostname == "" { + return fmt.Errorf("APIServer load balancer is not provisioned") + } + + if err := healthCheckKASEndpoint(svc.Status.LoadBalancer.Ingress[0].Hostname, hcp); err != nil { + return err + } + } + + return nil +} + +// healthCheckKASEndpoint issues a GET against https://<hostname>:<port>/healthz, where port comes from util.InternalAPIPortWithDefault(hcp, config.DefaultAPIServerPort), using a client that skips server certificate verification. +// It returns the request error if the call fails, or an error naming the hostname if the response status is not 200 OK. +func healthCheckKASEndpoint(hostname string, hcp *hyperv1.HostedControlPlane) error { + port := util.InternalAPIPortWithDefault(hcp, config.DefaultAPIServerPort) + healthEndpoint := fmt.Sprintf("https://%s:%d/healthz", hostname, port) + + resp, err := util.InsecureHTTPClient().Get(healthEndpoint) + if err != nil { + return err + } + // Always release the response body; this runs on every reconcile and an unclosed body leaks the underlying connection. + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("APIServer endpoint %s is not healthy", hostname) + } + return nil +} + func (r *HostedControlPlaneReconciler) validateConfigAndClusterCapabilities(hc *hyperv1.HostedControlPlane) error { for _, svc := range hc.Spec.Services { if svc.Type == hyperv1.Route && !r.ManagementClusterCapabilities.Has(capabilities.CapabilityRoute) { @@ -3998,10 +4072,5 @@ func (r *HostedControlPlaneReconciler) GetGuestClusterClient(ctx context.Context return nil, err } - clientset, err := kubernetes.NewForConfig(restConfig) - if err != nil { - return nil, err - } - - return clientset, nil + return kubernetes.NewForConfig(restConfig) } diff --git a/support/util/util.go b/support/util/util.go index fe42a8d0b7..b776be3797 100644 --- a/support/util/util.go +++ b/support/util/util.go @@ -4,11 +4,13 @@ import ( "bytes" "compress/gzip" "context" + "crypto/tls" "encoding/base64" "fmt" "hash/fnv" "io" 
"net" + "net/http" "strings" "time" @@ -168,6 +170,17 @@ func ResolveDNSHostname(ctx context.Context, hostName string) error { return err } +// InsecureHTTPClient return an http.Client which skips server certificate verification +func InsecureHTTPClient() *http.Client { + return &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + }, + } +} + // HashStruct takes a value, typically a string, and returns a 32-bit FNV-1a hashed version of the value as a string func HashStruct(o interface{}) string { hash := fnv.New32a() diff --git a/test/e2e/util/oauth.go b/test/e2e/util/oauth.go index 6347eebc0b..827e440859 100644 --- a/test/e2e/util/oauth.go +++ b/test/e2e/util/oauth.go @@ -106,7 +106,7 @@ func WaitForOAuthToken(t *testing.T, ctx context.Context, oauthRoute *routev1.Ro } var access_token string - err = wait.PollImmediateWithContext(ctx, time.Second, time.Minute, func(ctx context.Context) (done bool, err error) { + err = wait.PollImmediateWithContext(ctx, time.Second, time.Minute*2, func(ctx context.Context) (done bool, err error) { resp, err := httpClient.Do(request) if err != nil { t.Logf("Waiting for OAuth token request to succeed")