From 80a4ccbecba4295edd2d48f2339553500d91976f Mon Sep 17 00:00:00 2001
From: Michael Burke
Date: Fri, 10 Oct 2025 10:19:15 -0400
Subject: [PATCH] OSDOCS 16483 Docs for Custom TLS certificates & GPUs in
 Custom Metrics Autoscaler

---
 ...a-autoscaling-custom-trigger-prom-gpu.adoc | 40 +++++++++++++++++++
 .../nodes-cma-autoscaling-custom-trigger.adoc |  1 +
 2 files changed, 41 insertions(+)
 create mode 100644 modules/nodes-cma-autoscaling-custom-trigger-prom-gpu.adoc

diff --git a/modules/nodes-cma-autoscaling-custom-trigger-prom-gpu.adoc b/modules/nodes-cma-autoscaling-custom-trigger-prom-gpu.adoc
new file mode 100644
index 000000000000..0f0c1d42b15b
--- /dev/null
+++ b/modules/nodes-cma-autoscaling-custom-trigger-prom-gpu.adoc
@@ -0,0 +1,40 @@
+// Module included in the following assemblies:
+//
+// * nodes/cma/nodes-cma-autoscaling-custom-trigger.adoc
+
+:_mod-docs-content-type: CONCEPT
+[id="nodes-cma-autoscaling-custom-trigger-prom-gpu_{context}"]
+= Configuring GPU-based autoscaling with Prometheus and DCGM metrics
+
+You can use the Custom Metrics Autoscaler with NVIDIA Data Center GPU Manager (DCGM) metrics to scale workloads based on GPU utilization. This is particularly useful for AI and machine learning workloads that require GPU resources.
+
+.Example scaled object with a Prometheus target for GPU-based autoscaling
+[source,yaml,options="nowrap"]
+----
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: gpu-scaledobject
+  namespace: my-namespace
+spec:
+  scaleTargetRef:
+    kind: Deployment
+    name: gpu-deployment
+  minReplicaCount: 1 <1>
+  maxReplicaCount: 5 <2>
+  triggers:
+  - type: prometheus
+    metadata:
+      serverAddress: https://thanos-querier.openshift-monitoring.svc.cluster.local:9092
+      namespace: my-namespace
+      metricName: gpu_utilization
+      threshold: '90' <3>
+      query: sum(DCGM_FI_DEV_GPU_UTIL{instance=~".+", gpu=~".+"}) <4>
+      authModes: bearer
+      authenticationRef:
+        name: keda-trigger-auth-prometheus
+----
+<1> Specifies the minimum number of replicas to maintain. For GPU workloads, this should not be set to `0` to ensure that metrics continue to be collected.
+<2> Specifies the maximum number of replicas allowed during scale-up operations.
+<3> Specifies the GPU utilization percentage threshold that triggers scaling. When the value returned by the query, averaged across the current replicas, exceeds `90`, the autoscaler scales up the deployment.
+<4> Specifies a Prometheus query using NVIDIA DCGM metrics to monitor GPU utilization across all GPU devices. The `DCGM_FI_DEV_GPU_UTIL` metric provides GPU utilization percentages. Note that PromQL aggregation operators, such as `sum`, are lowercase.
diff --git a/nodes/cma/nodes-cma-autoscaling-custom-trigger.adoc b/nodes/cma/nodes-cma-autoscaling-custom-trigger.adoc
index 1003e696d1ed..6d098239cace 100644
--- a/nodes/cma/nodes-cma-autoscaling-custom-trigger.adoc
+++ b/nodes/cma/nodes-cma-autoscaling-custom-trigger.adoc
@@ -22,6 +22,7 @@ You can configure a certificate authority xref:../../nodes/cma/nodes-cma-autosca
 // assemblies.
 
 include::modules/nodes-cma-autoscaling-custom-trigger-prom.adoc[leveloffset=+1]
+include::modules/nodes-cma-autoscaling-custom-trigger-prom-gpu.adoc[leveloffset=+2]
 include::modules/nodes-cma-autoscaling-custom-prometheus-config.adoc[leveloffset=+2]
 
 [role="_additional-resources"]