Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,19 @@ You can use the instructions [here](https://docs.oracle.com/en-us/iaas/Content/C

**Images for NVIDIA shapes**

- [GPU driver 535.183.06 & CUDA 12.2](https://objectstorage.ca-toronto-1.oraclecloud.com/p/KOcEZeDpEAASLSKzumODnVr42mFwM_p9n1_Nra2FsV_F6BcpAkoH66HZxN4cCtIb/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.09.18-0)
- [GPU driver 560 & CUDA 12.6](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-GPU-560-CUDA-12.6-2025-03-05.01)

- [GPU driver 550.90.12 & CUDA 12.4](https://objectstorage.ca-toronto-1.oraclecloud.com/p/EDngSWYfn3HjrN0xbfBSVCctRVKVvNf3NOW7DdInKMtgiZwiUqy7PsA_xifmI1oq/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.09.18-0)

- [GPU driver 560.35.03 & CUDA 12.6](https://objectstorage.ca-toronto-1.oraclecloud.com/p/a_KKMCajcBpt9EfqgmnZbtUInpc6gdC5s2g1wz7b0KUCLW28DSvTKwMeOSgW5O0R/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-560-CUDA-12.6-2024.09.18-0)
- [GPU driver 570 & CUDA 12.8](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-GPU-570-CUDA-12.8-2025-03-06.01)

**Image for AMD shapes**

- [ROCm 6.2](https://objectstorage.us-ashburn-1.oraclecloud.com/p/tpswnRAUmrJ49uLAGk_ku6B13hyGzf_Gv1vrggtDWhOywSM5YGzoMPiO88gc3Cv-/n/imagegen/b/GPU-imaging/o/Ubuntu-22-OFED-5.9-0.5.6.0.127-ROCM-6.2-90-2024.08.12-0.oci)
- [ROCm 6.3](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-ROCM-632-2025-03-05.01)


### Deploy the cluster using the Oracle Cloud Resource Manager template
You can easily deploy the cluster using the **Deploy to Oracle Cloud** button below.

[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.2.0/oke-rdma-quickstart-v25.2.0.zip)
[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.3.0/oke-rdma-quickstart-v25.3.0.zip)

For the image ID, use the ID of the image that you imported in the previous step.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Please note depending on the shape and its configuration, some health checks wil
You can deploy the health checks with the Node Problem Detector Helm chart. The health check scripts are created as a `ConfigMap`, so please make sure you use the `values.yaml` from the link below.

```
helm install gpu-rdma-node-problem-detector oci://ghcr.io/deliveryhero/helm-charts/node-problem-detector --version 2.3.15 \
helm install gpu-rdma-node-problem-detector oci://ghcr.io/deliveryhero/helm-charts/node-problem-detector --version 2.3.18 \
-f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/node-problem-detector/values.yaml
```

Expand Down
1,256 changes: 884 additions & 372 deletions manifests/node-problem-detector/values.yaml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
# Copyright (c) 2024 Oracle Corporation and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl

# Exposes the chart version of the dcgm-exporter Helm release as a local value.
# The splat + one() form is used because the release resource is declared with
# `count`, so it may have zero instances.
locals {
dcgm_exporter_version = one(helm_release.dcgm_exporter[*].version)
}

resource "helm_release" "dcgm_exporter" {
count = var.install_dcgm_exporter && var.install_monitoring ? 1 : 0
count = var.install_dcgm_exporter && var.install_node_problem_detector_kube_prometheus_stack ? 1 : 0
depends_on = [helm_release.prometheus]
namespace = var.monitoring_namespace
name = "dcgm-exporter"
chart = "dcgm-exporter"
repository = "https://nvidia.github.io/dcgm-exporter/helm-charts"
version = var.dcgm_exporter_chart_version
values = ["${file("./files/kube-dcgm-exporter-values.yaml")}"]
values = ["${file("./files/dcgm-exporter/values.yaml")}"]
create_namespace = false
recreate_pods = true
force_update = true
Expand Down
Binary file added terraform/files/.DS_Store
Binary file not shown.
118 changes: 118 additions & 0 deletions terraform/files/dcgm-exporter/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Helm values for the dcgm-exporter chart (passed to the helm_release through
# Terraform's file() function).

# Scrape-hint annotations placed on every exporter pod.
podAnnotations:
  prometheus.io/scrape: 'true'
  prometheus.io/port: '9400'

# ServiceMonitor settings. The `release` label below is presumably what the
# kube-prometheus-stack Prometheus uses to select this monitor -- confirm
# against the stack's serviceMonitorSelector.
serviceMonitor:
  apiVersion: 'monitoring.coreos.com/v1'
  enabled: true
  interval: 15s
  honorLabels: false
  additionalLabels:
    release: kube-prometheus-stack
  relabelings:
    # Copy the name of the node hosting the pod into a `nodename` label.
    - sourceLabels:
        - __meta_kubernetes_pod_node_name
      separator: ';'
      regex: '^(.*)$'
      targetLabel: nodename
      replacement: '$1'
      action: replace
    # NOTE(review): the three rules below read node-level discovery labels.
    # These are presumably only populated when node metadata is attached to
    # the discovered targets -- confirm that they resolve to non-empty values.
    - sourceLabels:
        - __meta_kubernetes_node_provider_id
      targetLabel: instance_id
      action: replace
    - sourceLabels:
        - __meta_kubernetes_node_label_oci_oraclecloud_com_host_serial_number
      targetLabel: host_serial_number
      action: replace
    - sourceLabels:
        - __meta_kubernetes_node_label_node_kubernetes_io_instance_type
      targetLabel: instance_shape
      action: replace

# Schedule the exporter only on nodes labelled nvidia.com/gpu=true.
nodeSelector:
  nvidia.com/gpu: 'true'

# A bare `Exists` operator with no key tolerates every taint.
tolerations:
  - operator: Exists

# Metric list handed to the exporter. Everything inside the literal block
# below is part of the value (including its own '#' comment lines), so it is
# kept verbatim.
customMetrics: |
  # Format
  # If line starts with a '#' it is considered a comment
  # DCGM FIELD, Prometheus metric type, help message

  # Clocks
  DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
  DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
  # DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).

  # Temperature
  DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
  DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).

  # Power
  DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
  DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).

  # PCIE
  DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
  DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
  DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.

  # Utilization (the sample period varies depending on the product)
  DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
  DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
  DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
  DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).

  # Errors and violations
  DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
  DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
  DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
  DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
  DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
  DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
  DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
  DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).
  # Memory usage
  DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
  DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).

  # ECC
  DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
  DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
  DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
  DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.

  # Retired pages
  # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
  # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
  # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.

  # NVLink
  DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
  DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
  DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
  DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
  DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes

  # VGPU License status
  DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status

  # Remapped rows
  DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
  DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
  DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

  # Static configuration information. These appear as labels on the other metrics
  DCGM_FI_DRIVER_VERSION, label, Driver Version
  DCGM_FI_NVML_VERSION, label, NVML Version
  DCGM_FI_DEV_BRAND, label, Device Brand
  DCGM_FI_DEV_SERIAL, label, Device Serial Number
  # DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
  # DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
  # DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
  # DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
  # DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
  DCGM_FI_DEV_ROW_REMAP_PENDING, gauge, Whether remapping of rows is pending
  DCGM_FI_DEV_COUNT, gauge, Number of Devices on the node
  DCGM_FI_DEV_MEM_MAX_OP_TEMP, gauge, Maximum operating temperature for the memory of this GPU
  DCGM_FI_DEV_GPU_MAX_OP_TEMP, gauge, Maximum operating temperature for this GPU
  DCGM_FI_DEV_SLOWDOWN_TEMP, gauge, Slowdown temperature for the device
  DCGM_FI_DEV_SHUTDOWN_TEMP, gauge, Shutdown temperature for the device
56 changes: 56 additions & 0 deletions terraform/files/grafana/alerts/cpu-profile.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Grafana alert rule: fires when the "CpuProfile" problem gauge is above zero.
apiVersion: 1
groups:
  - orgId: 1
    name: Node Problem Detector
    folder: OKE
    interval: 1m
    rules:
      - uid: oke_npd_cpu_profile
        title: CPU Profile
        # The rule's outcome is taken from expression C below.
        condition: C
        data:
          # A: instant query against the `prometheus` datasource.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              editorMode: code
              expr: 'problem_gauge{reason="CpuProfileHasIssues",type="CpuProfile"}'
              instant: true
              intervalMs: 300000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A (evaluator: gt 0).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        # The condition must hold for 5m before the rule fires.
        for: 5m
        # NOTE(review): annotations and labels are empty, so notifications
        # presumably carry no summary text or routing labels -- confirm
        # that this is intended.
        annotations: {}
        labels: {}
        isPaused: false
56 changes: 56 additions & 0 deletions terraform/files/grafana/alerts/gpu-bad-pages.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Grafana alert rule: fires when the "GpuBadPages" problem gauge is above zero.
apiVersion: 1
groups:
  - orgId: 1
    name: Node Problem Detector
    folder: OKE
    interval: 1m
    rules:
      - uid: oke_npd_gpu_bad_pages
        title: GPU Bad Pages
        # The rule's outcome is taken from expression C below.
        condition: C
        data:
          # A: instant query against the `prometheus` datasource.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              editorMode: code
              expr: 'problem_gauge{reason="GpuBadPagesHasIssues",type="GpuBadPages"}'
              instant: true
              intervalMs: 300000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A (evaluator: gt 0).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        # The condition must hold for 5m before the rule fires.
        for: 5m
        # NOTE(review): annotations and labels are empty, so notifications
        # presumably carry no summary text or routing labels -- confirm
        # that this is intended.
        annotations: {}
        labels: {}
        isPaused: false
56 changes: 56 additions & 0 deletions terraform/files/grafana/alerts/gpu-bus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Grafana alert rule: fires when the "GpuBus" problem gauge is above zero.
apiVersion: 1
groups:
  - orgId: 1
    name: Node Problem Detector
    folder: OKE
    interval: 1m
    rules:
      - uid: oke_npd_gpu_bus
        title: GPU Bus
        # The rule's outcome is taken from expression C below.
        condition: C
        data:
          # A: instant query against the `prometheus` datasource.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              editorMode: code
              expr: 'problem_gauge{reason="GpuBusHasIssues",type="GpuBus"}'
              instant: true
              intervalMs: 300000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A (evaluator: gt 0).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        # The condition must hold for 5m before the rule fires.
        for: 5m
        # NOTE(review): annotations and labels are empty, so notifications
        # presumably carry no summary text or routing labels -- confirm
        # that this is intended.
        annotations: {}
        labels: {}
        isPaused: false
Loading