helm: Add K8S probes to retriever-usvc (#244)

lianhao · web-flow · commit af47b3c0fec0 · 2024-07-31T11:26:57.000+08:00
- Add K8S probes to retriever-usvc
- Remove redundant values in xx-values.yaml
Signed-off-by: Lianhao Lu <lianhao.lu@intel.com>
diff --git a/helm-charts/chatqna/gaudi-values.yaml b/helm-charts/chatqna/gaudi-values.yaml
@@ -1,34 +1,6 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# Default values for chatqna.
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-replicaCount: 1
-
-image:
-  repository: opea/chatqna:latest
-  pullPolicy: IfNotPresent
-  # Overrides the image tag whose default is the chart appVersion.
-  # tag: "1.0"
-
-port: 8888
-service:
-  type: ClusterIP
-  port: 8888
-
-securityContext:
-  readOnlyRootFilesystem: true
-  allowPrivilegeEscalation: false
-  runAsNonRoot: true
-  runAsUser: 1000
-  capabilities:
-    drop:
-    - ALL
-  seccompProfile:
-    type: RuntimeDefault
-
 tei:
   image:
     repository: ghcr.io/huggingface/tei-gaudi
@@ -39,22 +11,14 @@ tei:
 
 # To override values in subchart tgi
 tgi:
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-  # LLM_MODEL_ID: /data/OpenCodeInterpreter-DS-6.7B
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
     tag: "2.0.1"
   resources:
     limits:
       habana.ai/gaudi: 1
-
-global:
-  http_proxy:
-  https_proxy:
-  no_proxy:
-  HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
-  LANGCHAIN_TRACING_V2: false
-  LANGCHAIN_API_KEY: "insert-your-langchain-key-here"
-  # set modelUseHostPath to host directory if you want to use hostPath volume for model storage
-  # comment out modeluseHostPath if you want to download the model from huggingface
-  modelUseHostPath: /mnt/opea-models
+  extraArgs:
+    - "--max-input-length"
+    - "1024"
+    - "--max-total-tokens"
+    - "2048"
diff --git a/helm-charts/common/retriever-usvc/templates/deployment.yaml b/helm-charts/common/retriever-usvc/templates/deployment.yaml
@@ -48,19 +48,17 @@ spec:
           volumeMounts:
             - mountPath: /tmp
               name: tmp
-          {{- if not .Values.noProbe }}
+          {{- if .Values.livenessProbe }}
+          livenessProbe:
+            {{- toYaml .Values.livenessProbe | nindent 12 }}
+          {{- end }}
+          {{- if .Values.readinessProbe }}
+          readinessProbe:
+            {{- toYaml .Values.readinessProbe | nindent 12 }}
+          {{- end }}
+          {{- if .Values.startupProbe }}
           startupProbe:
-            exec:
-              command:
-              - curl
-              {{- if .Values.TEI_EMBEDDING_ENDPOINT }}
-              - {{ .Values.TEI_EMBEDDING_ENDPOINT }}
-              {{- else }}
-              - http://{{ .Release.Name }}-tei
-              {{- end }}
-            initialDelaySeconds: 5
-            periodSeconds: 5
-            failureThreshold: 120
+            {{- toYaml .Values.startupProbe | nindent 12 }}
           {{- end }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
diff --git a/helm-charts/common/retriever-usvc/templates/tests/test-pod.yaml b/helm-charts/common/retriever-usvc/templates/tests/test-pod.yaml
@@ -15,12 +15,17 @@ spec:
     - name: curl
       #image: alpine/curl
       image: python:3.10.14
-      command: ['sh', '-c']
+      command: ['bash', '-c']
       args:
         - |
           your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)");
-          curl http://{{ include "retriever-usvc.fullname" . }}:{{ .Values.service.port }}/v1/retrieval  -sS --fail-with-body \
+          max_retry=20;
+          for ((i=1; i<=max_retry; i++)); do
+            curl http://{{ include "retriever-usvc.fullname" . }}:{{ .Values.service.port }}/v1/retrieval  -sS --fail-with-body \
             -X POST \
             -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
-            -H 'Content-Type: application/json'
+            -H 'Content-Type: application/json' && break;
+            sleep 10;
+          done;
+          if [ $i -gt $max_retry ]; then echo "retriever test failed."; exit 1; fi
   restartPolicy: Never
diff --git a/helm-charts/common/retriever-usvc/values.yaml b/helm-charts/common/retriever-usvc/values.yaml
@@ -55,6 +55,27 @@ resources: {}
   #   cpu: 100m
   #   memory: 128Mi
 
+livenessProbe:  # each probe is rendered by the deployment template only when its value is non-empty
+  httpGet:
+    path: /v1/health_check  # absolute path; do not rely on the kubelet adding the leading slash
+    port: retriever-usvc  # named port; presumably declared on the container in the deployment template
+  initialDelaySeconds: 5
+  periodSeconds: 5
+  failureThreshold: 24  # 24 failures x 5s period = ~120s of failed checks before restart
+readinessProbe:  # no failureThreshold set here, so the Kubernetes default applies
+  httpGet:
+    path: /v1/health_check
+    port: retriever-usvc
+  initialDelaySeconds: 5
+  periodSeconds: 5
+startupProbe:
+  httpGet:
+    path: /v1/health_check
+    port: retriever-usvc
+  initialDelaySeconds: 5
+  periodSeconds: 5
+  failureThreshold: 120  # 120 failures x 5s period = up to ~600s allowed for startup
+
+
 nodeSelector: {}
 
 tolerations: []
diff --git a/helm-charts/common/tgi/nv-values.yaml b/helm-charts/common/tgi/nv-values.yaml
@@ -5,56 +5,10 @@
 # This is a YAML-formatted file.
 # Declare variables to be passed into your templates.
 
-replicaCount: 1
-
-port: 2080
-
 image:
   repository: ghcr.io/huggingface/text-generation-inference
-  pullPolicy: IfNotPresent
-  # Overrides the image tag whose default is the chart appVersion.
   tag: "2.0"
 
-imagePullSecrets: []
-nameOverride: ""
-fullnameOverride: ""
-
-podAnnotations: {}
-
-podSecurityContext: {}
-  # fsGroup: 2000
-
-securityContext:
-  readOnlyRootFilesystem: true
-  allowPrivilegeEscalation: false
-  runAsNonRoot: true
-  runAsUser: 1000
-  capabilities:
-    drop:
-    - ALL
-  seccompProfile:
-    type: RuntimeDefault
-
-service:
-  type: ClusterIP
-
 resources:
   limits:
     nvidia.com/gpu: 1
-
-nodeSelector: {}
-
-tolerations: []
-
-affinity: {}
-
-LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-
-global:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
-  # set modelUseHostPath to host directory if you want to use hostPath volume for model storage
-  # comment out modeluseHostPath if you want to download the model from huggingface
-  modelUseHostPath: /mnt/opea-models
diff --git a/manifests/common/retriever-usvc.yaml b/manifests/common/retriever-usvc.yaml
@@ -106,6 +106,26 @@ spec:
           volumeMounts:
             - mountPath: /tmp
               name: tmp
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /v1/health_check  # absolute path, matching the chart's probe values
+              port: retriever-usvc
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /v1/health_check
+              port: retriever-usvc
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /v1/health_check
+              port: retriever-usvc
+            initialDelaySeconds: 5
+            periodSeconds: 5
           resources:
             {}
       volumes:
diff --git a/manifests/common/tgi_nv.yaml b/manifests/common/tgi_nv.yaml
@@ -16,10 +16,7 @@ metadata:
 data:
   MODEL_ID: "Intel/neural-chat-7b-v3-3"
   PORT: "2080"
-  HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
   HF_TOKEN: "insert-your-huggingface-token-here"
-  MAX_INPUT_TOKENS: "1024"
-  MAX_TOTAL_TOKENS: "4096"
   http_proxy: ""
   https_proxy: ""
   no_proxy: ""
@@ -102,6 +99,23 @@ spec:
             - name: http
               containerPort: 2080
               protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          readinessProbe:
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          startupProbe:
+            failureThreshold: 120
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
           resources:
             limits:
               nvidia.com/gpu: 1