diff --git a/buildbot/google/terraform/README.md b/buildbot/google/terraform/README.md index cc5988a66..fb82c6303 100644 --- a/buildbot/google/terraform/README.md +++ b/buildbot/google/terraform/README.md @@ -51,4 +51,24 @@ In the [deployment](buildbot/google/terraform/main.tf) `mlir-nvidia` it is used as a volume of type `secret` and then mounted at `/secrets`. During the runtime of the docker container, the script [run.sh](../docker/buildbot-mlir-nvidia/run.sh) reads the secret from the file -`/secrets/token` and uses it to create the worker configuration. \ No newline at end of file +`/secrets/token` and uses it to create the worker configuration. + + +# Using GPUs on Google Cloud + +Terraform does not support deployments on GCP using a GPU at the moment. +So we need to deploy such cases using plain Kubernetes configuration files. +See this [issue](https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149) +for more details. +The buildbot mlir-nvidia is configured in `deployment-mlir-nvidia.yaml` in this +folder. + +For all non-GPU cases add a `"kubernetes_deployment"` to `main.tf`. +The contents are identical to the Kubernetes file, just the markup is +different. + +Kubernetes files are also declarative, so you can re-deploy them when you make +a change. 
They can be deployed with: +```bash +kubectl apply -f myfile.yaml +``` diff --git a/buildbot/google/terraform/deployment-mlir-nvidia.yaml b/buildbot/google/terraform/deployment-mlir-nvidia.yaml new file mode 100644 index 000000000..5e68d6aab --- /dev/null +++ b/buildbot/google/terraform/deployment-mlir-nvidia.yaml @@ -0,0 +1,63 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mlir-nvidia +spec: + # number of instances we want to run + replicas: 1 + selector: + matchLabels: + app: buildbot-mlir-nvidia + # define strategy for updating the images + strategy: + rollingUpdate: + # do not deploy more replicas, as the buildbot server + # can't handle multiple workers with the same credentials + maxSurge: 0 + # Allow to have 0 replicas during updates. + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app: buildbot-mlir-nvidia + spec: + containers: + # the image and version we want to run + - image: gcr.io/sanitizer-bots/buildbot-mlir-nvidia:9 + name: mlir-nvidia + # reserve "-1" for this image, kubernetes also + # needs <1 core for management tools + resources: + limits: + cpu: "15" + memory: 10Gi + # also request to use the GPU + # Note: this does not work in terraform at the moment + nvidia.com/gpu: "1" + requests: + cpu: "15" + memory: 10Gi + nvidia.com/gpu: "1" + volumeMounts: + # mount the secrets into a folder + - mountPath: /secrets + mountPropagation: None + name: buildbot-token + # specify the node pool on which to deploy + nodeSelector: + pool: nvidia-16core-pool + restartPolicy: Always + # FIXME: do we need this if we requested a GPU? 
+ #tolerations: +- effect: NoSchedule + # key: nvidia.com/gpu + # operator: Equal + # value: present + volumes: + # declare the secret as a volume so we can mount it + - name: buildbot-token + secret: + optional: false + secretName: password-mlir-nvidia diff --git a/buildbot/google/terraform/main.tf b/buildbot/google/terraform/main.tf index de66b26dd..795464033 100644 --- a/buildbot/google/terraform/main.tf +++ b/buildbot/google/terraform/main.tf @@ -24,6 +24,9 @@ resource "google_container_cluster" "primary" { } # Create machines for mlir-nvidia +# Note: The buildbot mlir-nvidia is deployed using a Kubernetes file. See +# the README.md for details on GPUs. + resource "google_container_node_pool" "nvidia_16core_pool_nodes" { name = "nvidia-16core-pool" # specify a zone here (e.g. "-a") to avoid a redundant deployment @@ -62,83 +65,3 @@ resource "google_container_node_pool" "nvidia_16core_pool_nodes" { } } } - - - resource "kubernetes_deployment" "mlir-nvidia" { -# FIXME: move to kubernetes yaml file, as terraform does not support GPU -# resources on GKE. - - metadata { - name = "mlir-nvidia" - } - - spec { - replicas = 1 - - selector { - match_labels = { - app = "buildbot-mlir-nvidia" - } - } - - template { - metadata { - labels = { - app = "buildbot-mlir-nvidia" - } - } - spec { - container { - name = "mlir-nvidia" - # Specify version number for docker image, this ensures sure you're - # deploying the right version of the image. - image = "${var.gcp_config.gcr_prefix}/buildbot-mlir-nvidia:3" - - resources { - requests { - cpu = 15 - memory = "10Gi" - } - limits { - cpu = 15 - memory = "10Gi" - # FIXME: does not work in terraform - # https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149 - # We probably need to use native Kubernetes for all deployments - # with GPUs until this is implemented. 
- # nvidia.com/gpu = 1 - } - } - - volume_mount { - mount_path = "/secrets" - name = "buildbot-token" - } - } - volume { - name = "buildbot-token" - secret { - secret_name = "buildbot-token-mlir-nvidia" - } - } - # Nodes with a GPU are automatically get a "taint". We need to - # "tolerate" this taint, otherwise we can't deploy to that node. - # This is a safe guard to only deploy container that require GPUs - # to machines with GPUs. More details: - # * https://cloud.google.com/kubernetes-engine/docs/how-to/gpus - # * https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ - toleration { - key = "nvidia.com/gpu" - operator = "Equal" - value = "present" - effect = "NoSchedule" - } - # select which machines to deploy to, this is using the node pool - # defined above - node_selector = { - pool = "nvidia-16core-pool" - } - } - } - } -}