Commit: moved deployment to kubernetes files
Summary:
Terraform does not support deployments on GCP using a GPU at the moment.
So we need to deploy such cases using plain Kubernetes configuration files.
The buildbot mlir-nvidia is configured in `deployment-mlir-nvidia.yaml` in this
folder.

Reviewers: tra

Subscribers: mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, Kayjukh, jurahul, msifontes

Differential Revision: https://reviews.llvm.org/D82434
ChristianKuehnel committed Jun 25, 2020
1 parent d99cf8c commit 2446cff
Showing 3 changed files with 87 additions and 81 deletions.
22 changes: 21 additions & 1 deletion buildbot/google/terraform/README.md
@@ -51,4 +51,24 @@ In the [deployment](buildbot/google/terraform/main.tf) `mlir-nvidia` it is
used as a volume of type `secret` and then mounted at `/secrets`. During the
runtime of the docker container, the script
[run.sh](../docker/buildbot-mlir-nvidia/run.sh) reads the secret from the file
`/secrets/token` and uses it to create the worker configuration.


# Using GPUs on Google Cloud

Terraform does not currently support GCP deployments that use a GPU.
Such cases therefore need to be deployed with plain Kubernetes configuration files.
See this [issue](https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149)
for more details.
The buildbot mlir-nvidia is configured in `deployment-mlir-nvidia.yaml` in this
folder.

For all non-GPU cases, add a `"kubernetes_deployment"` resource to `main.tf`.
The contents are identical to the Kubernetes file; only the markup differs.
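
For illustration, a minimal `kubernetes_deployment` resource in `main.tf` might look like the sketch below. The resource name, labels, and image here are hypothetical placeholders, not the actual buildbot configuration:

```hcl
resource "kubernetes_deployment" "example-worker" {
  metadata {
    name = "example-worker"
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "example-worker"
      }
    }
    template {
      metadata {
        labels = {
          app = "example-worker"
        }
      }
      spec {
        container {
          name  = "example-worker"
          # Pin a version tag so you know exactly what is deployed.
          image = "gcr.io/example-project/example-worker:1"
        }
      }
    }
  }
}
```

The nesting mirrors the Kubernetes `Deployment` schema one-to-one, which is why translating between the two formats is mostly mechanical.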

Kubernetes files are also declarative, so you can re-deploy them after making
a change. They can be deployed with:
```bash
kubectl apply -f myfile.yaml
```
63 changes: 63 additions & 0 deletions buildbot/google/terraform/deployment-mlir-nvidia.yaml
@@ -0,0 +1,63 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mlir-nvidia
spec:
  # number of instances we want to run
  replicas: 1
  selector:
    matchLabels:
      app: buildbot-mlir-nvidia
  # define strategy for updating the images
  strategy:
    rollingUpdate:
      # do not deploy more replicas, as the buildbot server
      # can't handle multiple workers with the same credentials
      maxSurge: 0
      # Allow to have 0 replicas during updates.
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: buildbot-mlir-nvidia
    spec:
      containers:
      # the image and version we want to run
      - image: gcr.io/sanitizer-bots/buildbot-mlir-nvidia:9
        name: mlir-nvidia
        # Reserve "<number of cores> - 1" for this image; Kubernetes
        # itself needs less than 1 core for its management tools.
        resources:
          limits:
            cpu: "15"
            memory: 10Gi
            # also request to use the GPU
            # Note: this does not work in terraform at the moment
            nvidia.com/gpu: "1"
          requests:
            cpu: "15"
            memory: 10Gi
            nvidia.com/gpu: "1"
        volumeMounts:
        # mount the secrets into a folder
        - mountPath: /secrets
          mountPropagation: None
          name: buildbot-token
      # specify the node pool on which to deploy
      nodeSelector:
        pool: nvidia-16core-pool
      restartPolicy: Always
      # FIXME: do we need this if we requested a GPU?
      #tolerations:
      #- effect: NoSchedule
      #  key: nvidia.com/gpu
      #  operator: Equal
      #  value: present
      volumes:
      # declare the secret as a volume so we can mount it
      - name: buildbot-token
        secret:
          optional: false
          secretName: password-mlir-nvidia
83 changes: 3 additions & 80 deletions buildbot/google/terraform/main.tf
@@ -24,6 +24,9 @@ resource "google_container_cluster" "primary" {
}

# Create machines for mlir-nvidia
# Note: The buildbot mlir-nvidia is deployed using a Kubernetes file. See
# the README.md for details on GPUs.

resource "google_container_node_pool" "nvidia_16core_pool_nodes" {
name = "nvidia-16core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
@@ -62,83 +65,3 @@ resource "google_container_node_pool" "nvidia_16core_pool_nodes" {
}
}
}


resource "kubernetes_deployment" "mlir-nvidia" {
  # FIXME: move to kubernetes yaml file, as terraform does not support GPU
  # resources on GKE.

  metadata {
    name = "mlir-nvidia"
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "buildbot-mlir-nvidia"
      }
    }

    template {
      metadata {
        labels = {
          app = "buildbot-mlir-nvidia"
        }
      }
      spec {
        container {
          name = "mlir-nvidia"
          # Specify a version number for the docker image; this ensures
          # you're deploying the right version of the image.
          image = "${var.gcp_config.gcr_prefix}/buildbot-mlir-nvidia:3"

          resources {
            requests {
              cpu    = 15
              memory = "10Gi"
            }
            limits {
              cpu    = 15
              memory = "10Gi"
              # FIXME: does not work in terraform
              # https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149
              # We probably need to use native Kubernetes for all deployments
              # with GPUs until this is implemented.
              # nvidia.com/gpu = 1
            }
          }

          volume_mount {
            mount_path = "/secrets"
            name       = "buildbot-token"
          }
        }
        volume {
          name = "buildbot-token"
          secret {
            secret_name = "buildbot-token-mlir-nvidia"
          }
        }
        # Nodes with a GPU automatically get a "taint". We need to
        # "tolerate" this taint, otherwise we can't deploy to that node.
        # This is a safeguard to only deploy containers that require GPUs
        # to machines with GPUs. More details:
        # * https://cloud.google.com/kubernetes-engine/docs/how-to/gpus
        # * https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "present"
          effect   = "NoSchedule"
        }
        # select which machines to deploy to, this is using the node pool
        # defined above
        node_selector = {
          pool = "nvidia-16core-pool"
        }
      }
    }
  }
}
