From fec632c15302cb263bcccc2932a15b09d204857d Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sat, 18 Jan 2025 22:26:53 +0000
Subject: [PATCH] [CI] Switch away from kubernetes executor mode

We are running into reliability problems with the kubernetes executor
mode, currently hypothesized to be due to a complex interaction where
konnectivity-agent pods die and subsequently kill in-process execs that
go through the k8s control plane. The plan is to switch back to the
in-container executor mode for now while we sort out the issue upstream
with the konnectivity developers.
---
 premerge/linux_container_pod_template.yaml | 29 ------------
 premerge/linux_runners_values.yaml         | 54 +++++-----------
 premerge/main.tf                           | 14 +-----
 3 files changed, 12 insertions(+), 85 deletions(-)
 delete mode 100644 premerge/linux_container_pod_template.yaml

diff --git a/premerge/linux_container_pod_template.yaml b/premerge/linux_container_pod_template.yaml
deleted file mode 100644
index fb52ffae3..000000000
--- a/premerge/linux_container_pod_template.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-metadata:
-  annotations:
-    cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
-spec:
-  tolerations:
-  - key: "premerge-platform"
-    operator: "Equal"
-    value: "linux"
-    effect: "NoSchedule"
-  nodeSelector:
-    premerge-platform: linux
-  containers:
-  - name: $job
-    resources:
-      # The container is always scheduled on the same pod as the runner.
-      # Since we use the runner requests.cpu for scheduling/autoscaling,
-      # the request here should be set to something small.
-      #
-      # The limit however should be the number of cores of the node. Any limit
-      # inferior to the number of core could slow down the job.
-      #
-      # For memory however, the request/limits shall be correct.
-      # It's not used for scheduling, but can be used by k8 for OOM kill.
-      requests:
-        cpu: "100m"
-        memory: "230Gi"
-      limits:
-        cpu: 64
-        memory: "256Gi"
diff --git a/premerge/linux_runners_values.yaml b/premerge/linux_runners_values.yaml
index 6c2e8b825..304fc71e7 100644
--- a/premerge/linux_runners_values.yaml
+++ b/premerge/linux_runners_values.yaml
@@ -4,17 +4,6 @@ githubConfigSecret: "github-token"
 minRunners: 0
 maxRunners: 4
 
-containerMode:
-  type: "kubernetes"
-  kubernetesModeWorkVolumeClaim:
-    accessModes: ["ReadWriteOnce"]
-    storageClassName: "standard-rwo"
-    resources:
-      requests:
-        storage: "200Gi"
-  kubernetesModeServiceAccount:
-    annotations:
-
 template:
   metadata:
     annotations:
@@ -29,49 +18,28 @@ template:
       premerge-platform: linux
     containers:
       - name: runner
-        image: ghcr.io/actions/actions-runner:latest
-        command: ["/home/runner/run.sh"]
+        image: ghcr.io/llvm/ci-ubuntu-22.04-agent:latest
+        command: ["/home/gha/actions-runner/run.sh"]
         resources:
-          # The container will be scheduled on the same node as this runner.
-          # This means if we don't set the CPU request high-enough here, 2
-          # containers will be scheduled on the same pod, meaning 2 jobs.
+          # If we don't set the CPU request high enough here, 2 runners might
+          # be scheduled on the same node, meaning 2 jobs, and they will starve
+          # each other.
           #
           # This number should be:
           #  - greater than number_of_cores / 2:
           #    A value lower than that could allow the scheduler to put 2
-          #    runners in the same pod. Meaning 2 containers in the same pod.
-          #    Meaning 2 jobs sharing the resources.
+          #    runners on the same node. Meaning 2 jobs sharing the resources of
+          #    a single node.
          #  - lower than number_of_cores:
           #    Each pod has some basic services running (metrics for ex). Those
           #    already require some amount of CPU (~0.5). This means we don't
           #    exactly have N cores to allocate, but N - epsilon.
           #
-          # Memory however shall be handled at the container level. The runner
-          # itself doesn't need much, just using something enough not to get
-          # OOM killed.
+          # We also need to request sufficient memory to not get OOM killed.
           requests:
             cpu: 55
-            memory: "8Gi"
+            memory: "200Gi"
           limits:
             cpu: 64
-            memory: "8Gi"
-        env:
-          - name: ACTIONS_RUNNER_CONTAINER_HOOKS
-            value: /home/runner/k8s/index.js
-          - name: ACTIONS_RUNNER_POD_NAME
-            valueFrom:
-              fieldRef:
-                fieldPath: metadata.name
-          - name: ACTIONS_RUNNER_REQUIRE_JOB_CONTAINER
-            value: "true"
-          - name: ACTIONS_RUNNER_CONTAINER_HOOK_TEMPLATE
-            value: "/home/runner/pod-config/linux-container-pod-template.yaml"
-        volumeMounts:
-          - name: container-pod-config
-            mountPath: /home/runner/pod-config
-    securityContext:
-      fsGroup: 123
-    volumes:
-      - name: container-pod-config
-        configMap:
-          name: linux-container-pod-template
+            memory: "256Gi"
+
diff --git a/premerge/main.tf b/premerge/main.tf
index a595f2181..9e2789329 100644
--- a/premerge/main.tf
+++ b/premerge/main.tf
@@ -199,17 +199,6 @@ resource "kubernetes_secret" "windows_github_pat" {
   }
 }
 
-resource "kubernetes_config_map" "linux_container_pod_template" {
-  metadata {
-    name      = "linux-container-pod-template"
-    namespace = "llvm-premerge-linux-runners"
-  }
-
-  data = {
-    "linux-container-pod-template.yaml" : "${file("linux_container_pod_template.yaml")}"
-  }
-}
-
 resource "helm_release" "github_actions_runner_controller" {
   name       = "llvm-premerge-controller"
   namespace  = "llvm-premerge-controller"
@@ -235,9 +224,8 @@ resource "helm_release" "github_actions_runner_set_linux" {
 
   depends_on = [
     kubernetes_namespace.llvm_premerge_linux_runners,
-    kubernetes_config_map.linux_container_pod_template,
-    kubernetes_secret.linux_github_pat,
     helm_release.github_actions_runner_controller,
+    kubernetes_secret.linux_github_pat
   ]
 }
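
For reference, the runner spec that falls out of the linux_runners_values.yaml
hunks above reduces to a single container that executes jobs itself, with no
container hooks, pod template, or extra volumes. A sketch of the resulting
containers section, assembled from the new (+) side of the diff; lines outside
the hunk context are omitted, so this is indicative rather than the complete
file:

  containers:
    - name: runner
      image: ghcr.io/llvm/ci-ubuntu-22.04-agent:latest
      command: ["/home/gha/actions-runner/run.sh"]
      resources:
        requests:
          cpu: 55            # > 64/2, so two runners can never fit on one
                             # 64-core node
          memory: "200Gi"    # large enough that the job is not OOM killed
        limits:
          cpu: 64            # the full node; a lower cap would slow jobs down
          memory: "256Gi"

With the containerMode section removed, the scale set should run each job
directly inside this runner container, so job lifetime no longer depends on
execs proxied through the k8s control plane.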