From fec632c15302cb263bcccc2932a15b09d204857d Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sat, 18 Jan 2025 22:26:53 +0000
Subject: [PATCH] [CI] Switch away from kubernetes executor mode

We are running into reliability problems with the kubernetes executor
mode, currently hypothesized to be due to a complex interaction where
konnectivity-agent pods die and subsequently kill in-process execs that
go through the k8s control plane. The plan is to switch back to the
in-container executor mode for now while we sort out the issue upstream
with the konnectivity developers.
---
 premerge/linux_container_pod_template.yaml | 29 ------------
 premerge/linux_runners_values.yaml         | 54 +++++-----------
 premerge/main.tf                           | 14 +-----
 3 files changed, 12 insertions(+), 85 deletions(-)
 delete mode 100644 premerge/linux_container_pod_template.yaml

diff --git a/premerge/linux_container_pod_template.yaml b/premerge/linux_container_pod_template.yaml
deleted file mode 100644
index fb52ffae3..000000000
--- a/premerge/linux_container_pod_template.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-metadata:
-  annotations:
-    cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
-spec:
-  tolerations:
-  - key: "premerge-platform"
-    operator: "Equal"
-    value: "linux"
-    effect: "NoSchedule"
-  nodeSelector:
-    premerge-platform: linux
-  containers:
-  - name: $job
-    resources:
-      # The container is always scheduled on the same pod as the runner.
-      # Since we use the runner requests.cpu for scheduling/autoscaling,
-      # the request here should be set to something small.
-      #
-      # The limit however should be the number of cores of the node. Any limit
-      # inferior to the number of core could slow down the job.
-      #
-      # For memory however, the request/limits shall be correct.
-      # It's not used for scheduling, but can be used by k8 for OOM kill.
-      requests:
-        cpu: "100m"
-        memory: "230Gi"
-      limits:
-        cpu: 64
-        memory: "256Gi"
diff --git a/premerge/linux_runners_values.yaml b/premerge/linux_runners_values.yaml
index 6c2e8b825..304fc71e7 100644
--- a/premerge/linux_runners_values.yaml
+++ b/premerge/linux_runners_values.yaml
@@ -4,17 +4,6 @@ githubConfigSecret: "github-token"
 minRunners: 0
 maxRunners: 4
 
-containerMode:
-  type: "kubernetes"
-  kubernetesModeWorkVolumeClaim:
-    accessModes: ["ReadWriteOnce"]
-    storageClassName: "standard-rwo"
-    resources:
-      requests:
-        storage: "200Gi"
-  kubernetesModeServiceAccount:
-    annotations:
-
 template:
   metadata:
     annotations:
@@ -29,49 +18,28 @@ template:
       premerge-platform: linux
     containers:
       - name: runner
-        image: ghcr.io/actions/actions-runner:latest
-        command: ["/home/runner/run.sh"]
+        image: ghcr.io/llvm/ci-ubuntu-22.04-agent:latest
+        command: ["/home/gha/actions-runner/run.sh"]
         resources:
-          # The container will be scheduled on the same node as this runner.
-          # This means if we don't set the CPU request high-enough here, 2
-          # containers will be scheduled on the same pod, meaning 2 jobs.
+          # If we don't set the CPU request high enough here, 2 runners might
+          # be scheduled on the same node, meaning 2 jobs, and they will starve
+          # each other.
           #
           # This number should be:
           #  - greater than number_of_cores / 2:
           #    A value lower than that could allow the scheduler to put 2
-          #    runners in the same pod. Meaning 2 containers in the same pod.
-          #    Meaning 2 jobs sharing the resources.
+          #    runners on the same node. Meaning 2 jobs sharing the resources of
+          #    a single node.
          #  - lower than number_of_cores:
           #    Each pod has some basic services running (metrics for ex). Those
           #    already require some amount of CPU (~0.5). This means we don't
           #    exactly have N cores to allocate, but N - epsilon.
           #
-          # Memory however shall be handled at the container level. The runner
-          # itself doesn't need much, just using something enough not to get
-          # OOM killed.
+          # We also need to request sufficient memory to not get OOM killed.
           requests:
             cpu: 55
-            memory: "8Gi"
+            memory: "200Gi"
           limits:
             cpu: 64
-            memory: "8Gi"
-        env:
-          - name: ACTIONS_RUNNER_CONTAINER_HOOKS
-            value: /home/runner/k8s/index.js
-          - name: ACTIONS_RUNNER_POD_NAME
-            valueFrom:
-              fieldRef:
-                fieldPath: metadata.name
-          - name: ACTIONS_RUNNER_REQUIRE_JOB_CONTAINER
-            value: "true"
-          - name: ACTIONS_RUNNER_CONTAINER_HOOK_TEMPLATE
-            value: "/home/runner/pod-config/linux-container-pod-template.yaml"
-        volumeMounts:
-          - name: container-pod-config
-            mountPath: /home/runner/pod-config
-    securityContext:
-      fsGroup: 123
-    volumes:
-      - name: container-pod-config
-        configMap:
-          name: linux-container-pod-template
+            memory: "256Gi"
+
diff --git a/premerge/main.tf b/premerge/main.tf
index a595f2181..9e2789329 100644
--- a/premerge/main.tf
+++ b/premerge/main.tf
@@ -199,17 +199,6 @@ resource "kubernetes_secret" "windows_github_pat" {
   }
 }
 
-resource "kubernetes_config_map" "linux_container_pod_template" {
-  metadata {
-    name      = "linux-container-pod-template"
-    namespace = "llvm-premerge-linux-runners"
-  }
-
-  data = {
-    "linux-container-pod-template.yaml" : "${file("linux_container_pod_template.yaml")}"
-  }
-}
-
 resource "helm_release" "github_actions_runner_controller" {
   name       = "llvm-premerge-controller"
   namespace  = "llvm-premerge-controller"
@@ -235,9 +224,8 @@ resource "helm_release" "github_actions_runner_set_linux" {
 
   depends_on = [
     kubernetes_namespace.llvm_premerge_linux_runners,
-    kubernetes_config_map.linux_container_pod_template,
-    kubernetes_secret.linux_github_pat,
     helm_release.github_actions_runner_controller,
+    kubernetes_secret.linux_github_pat
   ]
 }
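
For reference, the runner spec that falls out of the linux_runners_values.yaml
hunks above reduces to a single container that executes jobs itself, with no
container hooks, pod template, or extra volumes. A sketch of the resulting
containers section, assembled from the new (+) side of the diff; lines outside
the hunk context are omitted, so this is indicative rather than the complete
file:

  containers:
    - name: runner
      image: ghcr.io/llvm/ci-ubuntu-22.04-agent:latest
      command: ["/home/gha/actions-runner/run.sh"]
      resources:
        requests:
          cpu: 55            # > 64/2, so two runners can never fit on one
                             # 64-core node
          memory: "200Gi"    # large enough that the job is not OOM killed
        limits:
          cpu: 64            # the full node; a lower cap would slow jobs down
          memory: "256Gi"

With the containerMode section removed, the scale set should run each job
directly inside this runner container, so job lifetime no longer depends on
execs proxied through the k8s control plane.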