Skip to content

Commit

Permalink
Check KAS loadbalancer health
Browse files Browse the repository at this point in the history
This enhances HCAvailable condition to better represent day2 state
  • Loading branch information
muraee committed Mar 28, 2023
1 parent fd0c4ca commit f6927ce
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 7 deletions.
2 changes: 2 additions & 0 deletions api/v1beta1/hostedcluster_conditions.go
Expand Up @@ -174,6 +174,8 @@ const (

ExternalDNSHostNotReachableReason = "ExternalDNSHostNotReachable"

KASLoadBalancerNotReachableReason = "KASLoadBalancerNotReachable"

ReconciliationPausedConditionReason = "ReconciliationPaused"
ReconciliationInvalidPausedUntilConditionReason = "InvalidPausedUntilValue"
)
Expand Down
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"math/big"
"net/http"
"os"
"sort"
"strings"
Expand Down Expand Up @@ -580,6 +581,7 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R
kubeConfigAvailable := hostedControlPlane.Status.KubeConfig != nil
etcdCondition := meta.FindStatusCondition(hostedControlPlane.Status.Conditions, string(hyperv1.EtcdAvailable))
kubeAPIServerCondition := meta.FindStatusCondition(hostedControlPlane.Status.Conditions, string(hyperv1.KubeAPIServerAvailable))
healthCheckErr := r.healthCheckKASLoadBalancers(ctx, hostedControlPlane)

status := metav1.ConditionFalse
var reason, message string
Expand All @@ -599,6 +601,9 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R
case kubeAPIServerCondition != nil && kubeAPIServerCondition.Status == metav1.ConditionFalse:
reason = kubeAPIServerCondition.Reason
message = kubeAPIServerCondition.Message
case healthCheckErr != nil:
reason = hyperv1.KASLoadBalancerNotReachableReason
message = healthCheckErr.Error()
default:
reason = hyperv1.AsExpectedReason
message = ""
Expand Down Expand Up @@ -665,6 +670,70 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R
return result, nil
}

// healthCheckKASLoadBalancers probes the KubeAPI server /healthz endpoint through the
// public and private load balancer hostnames directly, so that load balancers which are
// down or were deleted out of band are detected.
//
// When the APIServer is published through a Route, the internal route's canonical router
// hostname is probed first. After that, the relevant LoadBalancer services are fetched and
// the hostname of each one's first ingress entry is probed.
//
// It returns an error if the APIServer publishing strategy is missing, if a route or
// service cannot be fetched, if a route or load balancer has no hostname yet, or if any
// probed endpoint fails its health check. It returns nil when every probe succeeds.
func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.Context, hcp *hyperv1.HostedControlPlane) error {
	strategy := util.ServicePublishingStrategyByTypeForHCP(hcp, hyperv1.APIServer)
	if strategy == nil {
		return fmt.Errorf("APIServer service strategy not specified")
	}

	if strategy.Type == hyperv1.Route {
		route := manifests.KubeAPIServerInternalRoute(hcp.Namespace)
		if err := r.Get(ctx, client.ObjectKeyFromObject(route), route); err != nil {
			return fmt.Errorf("failed to get kube apiserver internal route: %w", err)
		}

		// Only the first ingress entry is consulted; an empty canonical hostname means
		// the route has not been admitted.
		routeIngress := route.Status.Ingress
		if len(routeIngress) == 0 || routeIngress[0].RouterCanonicalHostname == "" {
			return fmt.Errorf("APIServer internal route not admitted")
		}

		if err := healthCheckKASEndpoint(routeIngress[0].RouterCanonicalHostname, hcp); err != nil {
			return err
		}
	}

	// Select which LoadBalancer services front the KAS for this publishing configuration.
	var lbServices []*corev1.Service
	switch {
	case util.IsPrivateHCP(hcp):
		lbServices = append(lbServices, manifests.PrivateRouterService(hcp.Namespace))
		if strategy.Type == hyperv1.LoadBalancer {
			lbServices = append(lbServices, manifests.KubeAPIServerPrivateService(hcp.Namespace))
		}
	case strategy.Type != hyperv1.Route:
		lbServices = append(lbServices, manifests.KubeAPIServerService(hcp.Namespace))
	}

	for _, lbService := range lbServices {
		if err := r.Get(ctx, client.ObjectKeyFromObject(lbService), lbService); err != nil {
			return fmt.Errorf("failed to get kube apiserver service: %w", err)
		}

		lbIngress := lbService.Status.LoadBalancer.Ingress
		if len(lbIngress) == 0 || lbIngress[0].Hostname == "" {
			return fmt.Errorf("APIServer load balancer is not provisioned")
		}

		if err := healthCheckKASEndpoint(lbIngress[0].Hostname, hcp); err != nil {
			return err
		}
	}

	return nil
}

// healthCheckKASEndpoint issues an HTTPS GET against the /healthz endpoint of the KubeAPI
// server reachable at hostname and checks that it answers 200 OK.
//
// The port is obtained from util.InternalAPIPortWithDefault, passing
// config.DefaultAPIServerPort as the default. The request is sent with
// util.InsecureHTTPClient, so the server certificate is not verified.
//
// It returns the client error unchanged if the request could not be completed, an error
// naming hostname if the response status is anything other than 200 OK, and nil otherwise.
func healthCheckKASEndpoint(hostname string, hcp *hyperv1.HostedControlPlane) error {
	port := util.InternalAPIPortWithDefault(hcp, config.DefaultAPIServerPort)
	healthEndpoint := fmt.Sprintf("https://%s:%d/healthz", hostname, port)

	resp, err := util.InsecureHTTPClient().Get(healthEndpoint)
	if err != nil {
		return err
	}
	// The body must be closed even though it is never read; otherwise every health check
	// leaks the response body and the connection backing it.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("APIServer endpoint %s is not healthy", hostname)
	}
	return nil
}

func (r *HostedControlPlaneReconciler) validateConfigAndClusterCapabilities(hc *hyperv1.HostedControlPlane) error {
for _, svc := range hc.Spec.Services {
if svc.Type == hyperv1.Route && !r.ManagementClusterCapabilities.Has(capabilities.CapabilityRoute) {
Expand Down Expand Up @@ -3998,10 +4067,5 @@ func (r *HostedControlPlaneReconciler) GetGuestClusterClient(ctx context.Context
return nil, err
}

clientset, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return nil, err
}

return clientset, nil
return kubernetes.NewForConfig(restConfig)
}
13 changes: 13 additions & 0 deletions support/util/util.go
Expand Up @@ -4,11 +4,13 @@ import (
"bytes"
"compress/gzip"
"context"
"crypto/tls"
"encoding/base64"
"fmt"
"hash/fnv"
"io"
"net"
"net/http"
"strings"
"time"

Expand Down Expand Up @@ -168,6 +170,17 @@ func ResolveDNSHostname(ctx context.Context, hostName string) error {
return err
}

// InsecureHTTPClient returns an http.Client which skips server certificate verification.
//
// The client carries a bounded overall request timeout so that a request to an
// unresponsive endpoint fails instead of blocking the caller indefinitely. Every call
// builds a new Transport, so idle connections are also given a timeout; without one, the
// connection pool of a discarded client would keep its idle connections open with no limit.
func InsecureHTTPClient() *http.Client {
	return &http.Client{
		// Upper bound for the whole exchange: connect, TLS handshake, request and response.
		Timeout: 10 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				InsecureSkipVerify: true,
			},
			// Reap idle keep-alive connections belonging to this per-call transport.
			IdleConnTimeout: 30 * time.Second,
		},
	}
}

// HashStruct takes a value, typically a string, and returns a 32-bit FNV-1a hashed version of the value as a string
func HashStruct(o interface{}) string {
hash := fnv.New32a()
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/util/oauth.go
Expand Up @@ -106,7 +106,7 @@ func WaitForOAuthToken(t *testing.T, ctx context.Context, oauthRoute *routev1.Ro
}

var access_token string
err = wait.PollImmediateWithContext(ctx, time.Second, time.Minute, func(ctx context.Context) (done bool, err error) {
err = wait.PollImmediateWithContext(ctx, time.Second, time.Minute*2, func(ctx context.Context) (done bool, err error) {
resp, err := httpClient.Do(request)
if err != nil {
t.Logf("Waiting for OAuth token request to succeed")
Expand Down

0 comments on commit f6927ce

Please sign in to comment.