Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions test/extended/node/dra/nvidia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ The tests automatically install the DRA driver if needed. This section is for ma
### Step 1: Add NVIDIA Helm Repository

```bash
helm repo add nvidia https://nvidia.github.io/gpu-operator
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update
helm repo update
```

Expand All @@ -143,24 +143,24 @@ This label ensures the DRA kubelet plugin only runs on GPU nodes and works aroun

```bash
# Create namespace
oc create namespace nvidia-dra-driver-gpu
oc create namespace nvidia-dra-driver

# Grant SCC permissions
oc adm policy add-scc-to-user privileged \
-z nvidia-dra-driver-gpu-service-account-controller \
-n nvidia-dra-driver-gpu
-n nvidia-dra-driver
oc adm policy add-scc-to-user privileged \
-z nvidia-dra-driver-gpu-service-account-kubeletplugin \
-n nvidia-dra-driver-gpu
-n nvidia-dra-driver
oc adm policy add-scc-to-user privileged \
-z compute-domain-daemon-service-account \
-n nvidia-dra-driver-gpu
-n nvidia-dra-driver

# Install via Helm (pinned to version used by tests)
# Version can be overridden via NVIDIA_DRA_DRIVER_VERSION environment variable
NVIDIA_DRA_DRIVER_VERSION=${NVIDIA_DRA_DRIVER_VERSION:-25.12.0}
helm install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
--namespace nvidia-dra-driver-gpu \
helm install nvidia-dra-driver nvidia/nvidia-dra-driver-gpu \
--namespace nvidia-dra-driver \
--version ${NVIDIA_DRA_DRIVER_VERSION} \
--set nvidiaDriverRoot=/run/nvidia/driver \
--set gpuResourcesEnabledOverride=true \
Expand All @@ -179,7 +179,7 @@ helm install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \

```bash
# Check DRA driver pods
oc get pods -n nvidia-dra-driver-gpu
oc get pods -n nvidia-dra-driver
# Expected: All pods should be Running

# Verify ResourceSlices are published
Expand All @@ -191,8 +191,8 @@ oc get resourceslices

```bash
# Uninstall DRA Driver
helm uninstall nvidia-dra-driver-gpu -n nvidia-dra-driver-gpu --wait --timeout 5m
oc delete namespace nvidia-dra-driver-gpu
helm uninstall nvidia-dra-driver -n nvidia-dra-driver --wait --timeout 5m
oc delete namespace nvidia-dra-driver

# Remove SCC permissions
oc delete clusterrolebinding \
Expand Down Expand Up @@ -230,13 +230,13 @@ oc delete clusterrolebinding \
**Solution**:
```bash
# Check DRA driver logs
oc logs -n nvidia-dra-driver-gpu -l app.kubernetes.io/name=nvidia-dra-driver-gpu --all-containers
oc logs -n nvidia-dra-driver -l app.kubernetes.io/name=nvidia-dra-driver-gpu --all-containers

# Verify SCC permissions
oc describe scc privileged | grep nvidia-dra-driver-gpu

# Restart DRA driver if needed
oc delete pod -n nvidia-dra-driver-gpu -l app.kubernetes.io/name=nvidia-dra-driver-gpu
oc delete pod -n nvidia-dra-driver -l app.kubernetes.io/name=nvidia-dra-driver-gpu
```

## References
Expand Down
27 changes: 18 additions & 9 deletions test/extended/node/dra/nvidia/prerequisites_installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ const (
gpuOperatorNamespace = "nvidia-gpu-operator"

// DRA Driver constants
draDriverNamespace = "nvidia-dra-driver-gpu"
draDriverRelease = "nvidia-dra-driver-gpu"
draDriverNamespace = "nvidia-dra-driver"
draDriverRelease = "nvidia-dra-driver"
Comment thread
coderabbitai[bot] marked this conversation as resolved.
draDriverChart = "nvidia/nvidia-dra-driver-gpu"
draDriverDefaultVersion = "25.12.0"
draDriverControllerSA = "nvidia-dra-driver-gpu-service-account-controller"
Expand Down Expand Up @@ -114,18 +114,27 @@ func (pi *PrerequisitesInstaller) ensureHelm(ctx context.Context) error {
func (pi *PrerequisitesInstaller) addHelmRepoForDRADriver(ctx context.Context) error {
framework.Logf("Adding NVIDIA Helm repository for DRA driver")

// Add repo
cmd := exec.CommandContext(ctx, "helm", "repo", "add", "nvidia", "https://nvidia.github.io/gpu-operator")
// Add repo; --force-update overwrites any existing "nvidia" entry, which may still point at a different URL
cmd := exec.CommandContext(ctx, "helm", "repo", "add", "nvidia", "https://helm.ngc.nvidia.com/nvidia", "--force-update")
output, err := cmd.CombinedOutput()
if err != nil && !strings.Contains(string(output), "already exists") {
if err != nil {
return fmt.Errorf("failed to add helm repo: %w\nOutput: %s", err, string(output))
}

// Update repo
cmd = exec.CommandContext(ctx, "helm", "repo", "update")
output, err = cmd.CombinedOutput()
// Update repo with retry logic (NVIDIA repo can be flaky)
var lastOutput []byte
var lastErr error
err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
cmd := exec.CommandContext(ctx, "helm", "repo", "update", "nvidia")
lastOutput, lastErr = cmd.CombinedOutput()
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if lastErr != nil {
framework.Logf("Helm repo update failed: %v\nOutput: %s\nRetrying...", lastErr, string(lastOutput))
return false, nil
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
return true, nil
})
if err != nil {
return fmt.Errorf("failed to update helm repo: %w\nOutput: %s", err, string(output))
return fmt.Errorf("failed to update helm repo after retries: %w\nLast error: %v\nLast output: %s", err, lastErr, string(lastOutput))
}

framework.Logf("NVIDIA Helm repository added and updated")
Expand Down