Commit
Merge pull request #3697 from Nikokolas3270/OSD-19085
OSD-19085: Replaced hypershift_cluster_cores metric with hypershift_cluster_vcpus metric
openshift-merge-bot[bot] committed Apr 15, 2024
2 parents e446c10 + 96b8098 commit 0142eed
Showing 2 changed files with 156 additions and 86 deletions.
121 changes: 81 additions & 40 deletions hypershift-operator/controllers/nodepool/metrics/metrics.go
@@ -31,10 +31,14 @@ const (
CountByHClusterMetricName = "hypershift_hostedcluster_nodepools" // What about renaming it to hypershift_cluster_nodepools ?
countByHClusterMetricHelp = "Number of NodePools for a given HostedCluster"

CoresCountByHClusterMetricName = "hypershift_cluster_cores"
CoresCountByHClusterMetricHelp = "Number of cores for a given HostedCluster. " +
VCpusCountByHClusterMetricName = "hypershift_cluster_vcpus"
VCpusCountByHClusterMetricHelp = "Number of virtual CPUs as reported by the platform for a given HostedCluster. " +
"-1 if this number cannot be computed." // Be careful when changing this metric as it is used for billing the customers

VCpusComputationErrorByHClusterMetricName = "hypershift_cluster_vcpus_computation_error"
VCpusComputationErrorByHClusterMetricHelp = "Defined if and only if " + VCpusCountByHClusterMetricName + " is cannot be computed and is set to -1. " +
"Reason is given by the reason label which only takes a finite number of values."

TransitionDurationMetricName = "hypershift_nodepools_transition_seconds" // What about renaming it to hypershift_nodepools_transition_duration_seconds ?
transitionDurationMetricHelp = "Time in seconds it took for conditions to become true since the creation of the NodePool."

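Taken together, these constants describe a value-plus-error-reason pair: `hypershift_cluster_vcpus` reports -1 whenever the count cannot be computed, and `hypershift_cluster_vcpus_computation_error` is then emitted alongside it with a `reason` label (the reasons used in this change are "unsupported platform", "no AWS EC2 client", "spec.platform.aws missing in node pool", "unexpected AWS output" and "failed to call AWS"). Below is a self-contained sketch of that pattern with a toy collector; the metric, type and function names are illustrative, not the operator's actual wiring.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// clusterVCpus is a toy stand-in for the per-cluster data the operator aggregates:
// reason is set only when count is -1.
type clusterVCpus struct {
	count  float64
	reason string
}

// vcpusCollector mirrors the pattern adopted by this change: a gauge that carries
// the value (-1 on failure) plus a companion gauge, emitted only on failure,
// that carries the failure reason as a label.
type vcpusCollector struct {
	data      map[string]clusterVCpus
	vcpusDesc *prometheus.Desc
	errDesc   *prometheus.Desc
}

func newVcpusCollector(data map[string]clusterVCpus) *vcpusCollector {
	return &vcpusCollector{
		data: data,
		vcpusDesc: prometheus.NewDesc("example_cluster_vcpus",
			"Number of vCPUs for a cluster, -1 if it cannot be computed", []string{"name"}, nil),
		errDesc: prometheus.NewDesc("example_cluster_vcpus_computation_error",
			"Set to 1 with a reason label when the vCPU count is -1", []string{"name", "reason"}, nil),
	}
}

func (c *vcpusCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.vcpusDesc
	ch <- c.errDesc
}

func (c *vcpusCollector) Collect(ch chan<- prometheus.Metric) {
	for name, d := range c.data {
		ch <- prometheus.MustNewConstMetric(c.vcpusDesc, prometheus.GaugeValue, d.count, name)
		if d.reason != "" { // the error series only exists while the value is -1
			ch <- prometheus.MustNewConstMetric(c.errDesc, prometheus.GaugeValue, 1, name, d.reason)
		}
	}
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(newVcpusCollector(map[string]clusterVCpus{
		"healthy-cluster": {count: 24},
		"broken-cluster":  {count: -1, reason: "failed to call AWS"},
	}))

	families, _ := reg.Gather()
	for _, mf := range families {
		fmt.Println(mf.GetName(), "->", len(mf.GetMetric()), "series")
	}
}
```

Because the companion series only exists while the main gauge is -1, dashboards and alerts can join on it to explain gaps in the billing metric.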
@@ -88,11 +92,16 @@ var (
countByHClusterMetricHelp,
hclusterLabels, nil)

coresCountByHClusterMetricDesc = prometheus.NewDesc(
CoresCountByHClusterMetricName,
CoresCountByHClusterMetricHelp,
vCpusCountByHClusterMetricDesc = prometheus.NewDesc(
VCpusCountByHClusterMetricName,
VCpusCountByHClusterMetricHelp,
hclusterLabels, nil)

vCpusComputationErrorByHClusterMetricDesc = prometheus.NewDesc(
VCpusComputationErrorByHClusterMetricName,
VCpusComputationErrorByHClusterMetricHelp,
append(hclusterLabels, "reason"), nil)

// One time series per node pool for below metrics
nodePoolLabels = []string{"namespace", "name", "_id", "cluster_name", "platform"}

@@ -122,7 +131,7 @@ type nodePoolsMetricsCollector struct {
ec2Client ec2iface.EC2API
clock clock.Clock

ec2InstanceTypeToCoresCount map[string]int32
ec2InstanceTypeToVCpusCount map[string]int32

transitionDurationMetric *prometheus.HistogramVec

@@ -134,7 +143,7 @@ func createNodePoolsMetricsCollector(client client.Client, ec2Client ec2iface.EC
Client: client,
ec2Client: ec2Client,
clock: clock,
ec2InstanceTypeToCoresCount: make(map[string]int32),
ec2InstanceTypeToVCpusCount: make(map[string]int32),
transitionDurationMetric: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: TransitionDurationMetricName,
Help: transitionDurationMetricHelp,
@@ -157,12 +166,13 @@ func (c *nodePoolsMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
}

type hclusterData struct {
id string
namespace string
name string
platform hyperv1.PlatformType
nodePoolsCount int
coresCount int32
id string
namespace string
name string
platform hyperv1.PlatformType
nodePoolsCount int
vCpusCount int32
vCpusCountErrorReason string
}

func createFailureConditionToNodePoolsCountMap() *map[string]int {
@@ -181,27 +191,45 @@ func createFailureConditionToNodePoolsCountMap() *map[string]int {
return &res
}

func (c *nodePoolsMetricsCollector) resolveCoresCountPerNode(nodePool *hyperv1.NodePool, unresolvedEc2InstanceTypes *map[string]void) int32 {
if nodePool.Spec.Platform.Type != hyperv1.AWSPlatform || c.ec2Client == nil {
return -1
type vCpusDetail struct {
vCpusCount int32
vCpusCountErrorReason string // set only if vCpusCount == -1
}

func (c *nodePoolsMetricsCollector) retrieveVCpusDetailsPerNode(nodePool *hyperv1.NodePool, ec2InstanceTypeToResolutionErrorReason *map[string]string) vCpusDetail {
if nodePool.Spec.Platform.Type != hyperv1.AWSPlatform {
ctrllog.Log.Info("cannot retrieve the number of vCPUs for " + nodePool.Name + " node pool as its platform is not supported (supported platforms: AWS)")

return vCpusDetail{vCpusCount: -1, vCpusCountErrorReason: "unsupported platform"}
}

if c.ec2Client == nil {
errorReason := "no AWS EC2 client"
ctrllog.Log.Error(fmt.Errorf(errorReason), "cannot retrieve the number of vCPUs for "+nodePool.Name+" node pool as the client used to query AWS API is not properly initialized")

return vCpusDetail{vCpusCount: -1, vCpusCountErrorReason: errorReason}
}

awsPlatform := nodePool.Spec.Platform.AWS

if awsPlatform == nil {
return -1
errorReason := "spec.platform.aws missing in node pool"
ctrllog.Log.Error(fmt.Errorf(errorReason), "cannot retrieve the number of vCPUs for "+nodePool.Name+" node pool as its specification is inconsistent")

return vCpusDetail{vCpusCount: -1, vCpusCountErrorReason: errorReason}
}

ec2InstanceType := awsPlatform.InstanceType

if _, isUnresolved := (*unresolvedEc2InstanceTypes)[ec2InstanceType]; isUnresolved {
return -1
if unresolvedErrorReason, isUnresolved := (*ec2InstanceTypeToResolutionErrorReason)[ec2InstanceType]; isUnresolved {
return vCpusDetail{vCpusCount: -1, vCpusCountErrorReason: unresolvedErrorReason}
}

if coreCountPerNode, isCached := c.ec2InstanceTypeToCoresCount[ec2InstanceType]; isCached {
return coreCountPerNode
if vCpusCountPerNode, isCached := c.ec2InstanceTypeToVCpusCount[ec2InstanceType]; isCached {
return vCpusDetail{vCpusCount: vCpusCountPerNode}
}
awsInput := ec2.DescribeInstanceTypesInput{InstanceTypes: []*string{&ec2InstanceType}}
unresolvedErrorReason := ""

if awsOuput, err := c.ec2Client.DescribeInstanceTypes(&awsInput); awsOuput != nil && err == nil {
if len(awsOuput.InstanceTypes) == 1 {
@@ -212,34 +240,36 @@ func (c *nodePoolsMetricsCollector) resolveCoresCountPerNode(nodePool *hyperv1.N
cpuInfo := ec2InstanceTypeInfo.VCpuInfo

if instanceTypeInInfo != nil && *instanceTypeInInfo == ec2InstanceType && cpuInfo != nil {
coreCountPtr := cpuInfo.DefaultCores
vCpusCountPtr := cpuInfo.DefaultVCpus

if coreCountPtr != nil {
coreCount := int32(*coreCountPtr)
if vCpusCountPtr != nil {
vCpusCount := int32(*vCpusCountPtr)

c.ec2InstanceTypeToCoresCount[ec2InstanceType] = coreCount
c.ec2InstanceTypeToVCpusCount[ec2InstanceType] = vCpusCount

return coreCount
return vCpusDetail{vCpusCount: vCpusCount}
}
}
}
}

ctrllog.Log.Error(fmt.Errorf("unexpected AWS output"), "unexpected output for EC2 verb 'describe-instance-types' while querying the following EC2 instance type: "+ec2InstanceType)
unresolvedErrorReason = "unexpected AWS output"
ctrllog.Log.Error(fmt.Errorf(unresolvedErrorReason), "unexpected output for EC2 verb 'describe-instance-types' while querying the following EC2 instance type: "+ec2InstanceType)
} else {
ctrllog.Log.Error(err, "failed to call AWS to resolve the number of cores per node for the following EC2 instance type: "+ec2InstanceType)
unresolvedErrorReason = "failed to call AWS"
ctrllog.Log.Error(err, "failed to call AWS to resolve the number of vCpus per node for the following EC2 instance type: "+ec2InstanceType)
}

(*unresolvedEc2InstanceTypes)[ec2InstanceType] = void{}
(*ec2InstanceTypeToResolutionErrorReason)[ec2InstanceType] = unresolvedErrorReason

return -1
return vCpusDetail{vCpusCount: -1, vCpusCountErrorReason: unresolvedErrorReason}
}

func (c *nodePoolsMetricsCollector) Collect(ch chan<- prometheus.Metric) {
ctx := context.Background()
currentCollectTime := c.clock.Now()
log := ctrllog.Log
unresolvedEc2InstanceTypes := make(map[string]void)
ec2InstanceTypeToResolutionErrorReason := make(map[string]string)

// Data retrieved from objects other than node pools in below loops
hclusterPathToData := make(map[string]*hclusterData)
@@ -361,14 +391,15 @@ func (c *nodePoolsMetricsCollector) Collect(ch chan<- prometheus.Metric) {
// countByHClusterMetric - aggregation
hclusterData.nodePoolsCount += 1

// coresCountByHClusterMetric - aggregation
if hclusterData.coresCount >= 0 && nodePool.Status.Replicas > 0 {
nodePoolCoresCount := c.resolveCoresCountPerNode(nodePool, &unresolvedEc2InstanceTypes)
// vCpusCountByHClusterMetric - aggregation
if hclusterData.vCpusCount >= 0 && nodePool.Status.Replicas > 0 {
nodeVCpusDetails := c.retrieveVCpusDetailsPerNode(nodePool, &ec2InstanceTypeToResolutionErrorReason)

if nodePoolCoresCount >= 0 {
hclusterData.coresCount += nodePoolCoresCount * nodePool.Status.Replicas
if nodeVCpusDetails.vCpusCountErrorReason == "" {
hclusterData.vCpusCount += nodeVCpusDetails.vCpusCount * nodePool.Status.Replicas
} else {
hclusterData.coresCount = -1
hclusterData.vCpusCount = -1
hclusterData.vCpusCountErrorReason = nodeVCpusDetails.vCpusCountErrorReason
}
}
}
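Per HostedCluster, the aggregation above computes the sum over node pools of (vCPUs per node × status.replicas), and collapses the whole sum to -1 as soon as one node pool's count cannot be resolved. A tiny worked example with hypothetical node pools (per-node vCPU counts as EC2 reports them: m5.xlarge = 4, m5.2xlarge = 8):

```go
package main

import "fmt"

func main() {
	// Two hypothetical AWS node pools in one HostedCluster.
	pools := []struct {
		vcpusPerNode int32 // from EC2 DescribeInstanceTypes
		replicas     int32 // nodePool.Status.Replicas
	}{
		{vcpusPerNode: 4, replicas: 3}, // m5.xlarge, contributes 12 vCPUs
		{vcpusPerNode: 8, replicas: 2}, // m5.2xlarge, contributes 16 vCPUs
	}

	var total int32
	for _, p := range pools {
		total += p.vcpusPerNode * p.replicas
	}
	fmt.Println(total) // 28: what hypershift_cluster_vcpus would report for this cluster
}
```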
@@ -488,13 +519,23 @@ func (c *nodePoolsMetricsCollector) Collect(ch chan<- prometheus.Metric) {
hclusterLabelValues...,
)

// coresCountByHClusterMetric
// vCpusCountByHClusterMetric
ch <- prometheus.MustNewConstMetric(
coresCountByHClusterMetricDesc,
vCpusCountByHClusterMetricDesc,
prometheus.GaugeValue,
float64(hclusterData.coresCount),
float64(hclusterData.vCpusCount),
hclusterLabelValues...,
)

// vCpusComputationErrorByHClusterMetric
if hclusterData.vCpusCountErrorReason != "" {
ch <- prometheus.MustNewConstMetric(
vCpusComputationErrorByHClusterMetricDesc,
prometheus.GaugeValue,
1.0,
append(hclusterLabelValues, hclusterData.vCpusCountErrorReason)...,
)
}
}

// transitionDurationMetric
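For AWS node pools, the per-node count used in the aggregation comes from EC2's DescribeInstanceTypes call (VCpuInfo.DefaultVCpus) and is cached per instance type, as retrieveVCpusDetailsPerNode above shows. A minimal standalone sketch of that lookup, assuming aws-sdk-go v1 with default credentials and region configured:

```go
package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/ec2"
)

func main() {
	// Assumes default AWS credentials and region are available in the environment.
	client := ec2.New(session.Must(session.NewSession()))

	out, err := client.DescribeInstanceTypes(&ec2.DescribeInstanceTypesInput{
		InstanceTypes: []*string{aws.String("m5.xlarge")},
	})
	if err != nil {
		fmt.Println("failed to call AWS:", err) // one of the reasons the collector caches
		return
	}
	for _, it := range out.InstanceTypes {
		if it.VCpuInfo != nil && it.VCpuInfo.DefaultVCpus != nil {
			fmt.Printf("%s: %d vCPUs\n", aws.StringValue(it.InstanceType), *it.VCpuInfo.DefaultVCpus)
		}
	}
}
```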
