diff --git a/CodeGen/README.md b/CodeGen/README.md index 479ad11f29..9aebba4472 100644 --- a/CodeGen/README.md +++ b/CodeGen/README.md @@ -106,19 +106,58 @@ flowchart LR This CodeGen example can be deployed manually on various hardware platforms using Docker Compose or Kubernetes. Select the appropriate guide based on your target environment: -| Hardware | Deployment Mode | Guide Link | -| :-------------- | :------------------- | :----------------------------------------------------------------------- | -| Intel Xeon CPU | Single Node (Docker) | [Xeon Docker Compose Guide](./docker_compose/intel/cpu/xeon/README.md) | -| Intel Gaudi HPU | Single Node (Docker) | [Gaudi Docker Compose Guide](./docker_compose/intel/hpu/gaudi/README.md) | -| AMD EPYC CPU | Single Node (Docker) | [EPYC Docker Compose Guide](./docker_compose/amd/cpu/epyc/README.md) | -| AMD ROCm GPU | Single Node (Docker) | [ROCm Docker Compose Guide](./docker_compose/amd/gpu/rocm/README.md) | -| Intel Xeon CPU | Kubernetes (Helm) | [Kubernetes Helm Guide](./kubernetes/helm/README.md) | -| Intel Gaudi HPU | Kubernetes (Helm) | [Kubernetes Helm Guide](./kubernetes/helm/README.md) | -| Intel Xeon CPU | Kubernetes (GMC) | [Kubernetes GMC Guide](./kubernetes/gmc/README.md) | -| Intel Gaudi HPU | Kubernetes (GMC) | [Kubernetes GMC Guide](./kubernetes/gmc/README.md) | +| Hardware | Deployment Mode | Guide Link | +| :-------------- | :----------------------------------- | :--------------------------------------------------------------------------------------- | +| Intel Xeon CPU | Single Node (Docker) | [Xeon Docker Compose Guide](./docker_compose/intel/cpu/xeon/README.md) | +| Intel Xeon CPU | Single Node (Docker) with Monitoring | [Xeon Docker Compose with Monitoring Guide](./docker_compose/intel/cpu/xeon/README.md) | +| Intel Gaudi HPU | Single Node (Docker) | [Gaudi Docker Compose Guide](./docker_compose/intel/hpu/gaudi/README.md) | +| Intel Gaudi HPU | Single Node (Docker) with Monitoring | [Gaudi Docker 
Compose with Monitoring Guide](./docker_compose/intel/hpu/gaudi/README.md) | +| AMD EPYC CPU | Single Node (Docker) | [EPYC Docker Compose Guide](./docker_compose/amd/cpu/epyc/README.md) | +| AMD ROCm GPU | Single Node (Docker) | [ROCm Docker Compose Guide](./docker_compose/amd/gpu/rocm/README.md) | +| Intel Xeon CPU | Kubernetes (Helm) | [Kubernetes Helm Guide](./kubernetes/helm/README.md) | +| Intel Gaudi HPU | Kubernetes (Helm) | [Kubernetes Helm Guide](./kubernetes/helm/README.md) | +| Intel Xeon CPU | Kubernetes (GMC) | [Kubernetes GMC Guide](./kubernetes/gmc/README.md) | +| Intel Gaudi HPU | Kubernetes (GMC) | [Kubernetes GMC Guide](./kubernetes/gmc/README.md) | _Note: Building custom microservice images can be done using the resources in [GenAIComps](https://github.com/opea-project/GenAIComps)._ +## Monitoring + +The CodeGen example supports monitoring capabilities for Intel Xeon and Intel Gaudi platforms. Monitoring includes: + +- **Prometheus**: For metrics collection and querying +- **Grafana**: For visualization and dashboards +- **Node Exporter**: For system metrics collection + +### Monitoring Features + +- Real-time metrics collection from all CodeGen microservices +- Pre-configured dashboards for: + - vLLM/TGI performance metrics + - CodeGen MegaService metrics + - System resource utilization + - Node-level metrics + +### Enabling Monitoring + +Monitoring can be enabled by using the `compose.monitoring.yaml` file along with the main compose file: + +```bash +# For Intel Xeon +docker compose -f compose.yaml -f compose.monitoring.yaml up -d + +# For Intel Gaudi +docker compose -f compose.yaml -f compose.monitoring.yaml up -d +``` + +### Accessing Monitoring Services + +Once deployed with monitoring, you can access: + +- **Prometheus**: `http://${HOST_IP}:9090` +- **Grafana**: `http://${HOST_IP}:3000` (username: `admin`, password: `admin`) +- **Node Exporter**: `http://${HOST_IP}:9100` + ## Benchmarking Guides for evaluating the performance and accuracy 
of this CodeGen deployment are available: diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md index f8cb1e03ff..ad0db68bff 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/README.md +++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md @@ -49,7 +49,8 @@ This uses the default vLLM-based deployment using `compose.yaml`. # export https_proxy="your_https_proxy" # export no_proxy="localhost,127.0.0.1,${HOST_IP}" # Add other hosts if necessary source intel/set_env.sh - cd /intel/cpu/xeon + cd intel/cpu/xeon + bash grafana/dashboards/download_opea_dashboard.sh ``` _Note: The compose file might read additional variables from set_env.sh. Ensure all required variables like ports (`LLM_SERVICE_PORT`, `MEGA_SERVICE_PORT`, etc.) are set if not using defaults from the compose file._ @@ -146,7 +147,7 @@ Key parameters are configured via environment variables set before running `dock Most of these parameters are in `set_env.sh`, you can either modify this file or overwrite the env variables by setting them. ```shell -source CodeGen/docker_compose/set_env.sh +source CodeGen/docker_compose/intel/set_env.sh ``` #### Compose Files @@ -252,7 +253,63 @@ Users can interact with the backend service using the `Neural Copilot` VS Code e - **"Container name is in use"**: Stop existing containers (`docker compose down`) or change `container_name` in the compose file. - **Resource Issues:** CodeGen models can be memory-intensive. Monitor host RAM usage. Increase Docker resources if needed. -## Stopping the Application +## Monitoring Deployment + +To enable monitoring for the CodeGen application, you can use the monitoring Docker Compose file along with the main deployment. 
+ +### Option #1: Default Deployment (without monitoring) + +To deploy the CodeGen services without monitoring, execute: + +```bash +docker compose up -d +``` + +### Option #2: Deployment with Monitoring + +> NOTE: To enable monitoring, the `compose.monitoring.yaml` file needs to be merged along with the default `compose.yaml` file. + +To deploy with monitoring: + +```bash +bash grafana/dashboards/download_opea_dashboard.sh +docker compose -f compose.yaml -f compose.monitoring.yaml up -d +``` + +### Accessing Monitoring Services + +Once deployed with monitoring, you can access: + +- **Prometheus**: `http://${HOST_IP}:9090` +- **Grafana**: `http://${HOST_IP}:3000` (username: `admin`, password: `admin`) +- **Node Exporter**: `http://${HOST_IP}:9100` + +### Monitoring Components + +The monitoring stack includes: + +- **Prometheus**: For metrics collection and querying +- **Grafana**: For visualization and dashboards +- **Node Exporter**: For system metrics collection + +### Monitoring Dashboards + +The following dashboards are automatically downloaded and configured: + +- vLLM Dashboard +- TGI Dashboard +- CodeGen MegaService Dashboard +- Node Exporter Dashboard + +### Stopping the Application + +If monitoring is enabled, execute the following command: + +```bash +docker compose -f compose.yaml -f compose.monitoring.yaml down +``` + +If monitoring is not enabled, execute: ```bash docker compose down # for vLLM (compose.yaml) diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.monitoring.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.monitoring.yaml new file mode 100644 index 0000000000..dea34085b3 --- /dev/null +++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.monitoring.yaml @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + prometheus: + image: prom/prometheus:v2.52.0 + container_name: opea_prometheus + user: root + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml + - 
./prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yaml' + ports: + - '9090:9090' + ipc: host + restart: unless-stopped + + grafana: + image: grafana/grafana:11.0.0 + container_name: grafana + volumes: + - ./grafana_data:/var/lib/grafana + - ./grafana/dashboards:/var/lib/grafana/dashboards + - ./grafana/provisioning:/etc/grafana/provisioning + user: root + environment: + GF_SECURITY_ADMIN_PASSWORD: admin + GF_RENDERING_CALLBACK_URL: http://grafana:3000/ + GF_LOG_FILTERS: rendering:debug + no_proxy: ${no_proxy} + host_ip: ${host_ip} + depends_on: + - prometheus + ports: + - '3000:3000' + ipc: host + restart: unless-stopped + + node-exporter: + image: prom/node-exporter + container_name: node-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - --collector.filesystem.ignored-mount-points + - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)" + environment: + no_proxy: ${no_proxy} + ports: + - 9100:9100 + restart: always + deploy: + mode: global diff --git a/CodeGen/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh b/CodeGen/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh new file mode 100644 index 0000000000..48a4d78cf9 --- /dev/null +++ b/CodeGen/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" +if ls *.json 1> /dev/null 2>&1; then + rm *.json +fi + +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json +wget 
https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/codegen_megaservice_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json diff --git a/CodeGen/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml b/CodeGen/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml new file mode 100644 index 0000000000..13922a769b --- /dev/null +++ b/CodeGen/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: 1 + +providers: +- name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards + options: + path: /var/lib/grafana/dashboards diff --git a/CodeGen/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml b/CodeGen/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 0000000000..a206521d67 --- /dev/null +++ b/CodeGen/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,54 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# config file version +apiVersion: 1 + +# list of datasources that should be deleted from the database +deleteDatasources: + - name: Prometheus + orgId: 1 + +# list of datasources to insert/update depending +# what's available in the database +datasources: + # name of the datasource. Required +- name: Prometheus + # datasource type. Required + type: prometheus + # access mode. direct or proxy. Required + access: proxy + # org id. 
will default to orgId 1 if not specified + orgId: 1 + # url + url: http://$host_ip:9090 + # database password, if used + password: + # database user, if used + user: + # database name, if used + database: + # enable/disable basic auth + basicAuth: false + # basic auth username, if used + basicAuthUser: + # basic auth password, if used + basicAuthPassword: + # enable/disable with credentials headers + withCredentials: + # mark as default datasource. Max one per org + isDefault: true + # fields that will be converted to json and stored in json_data + jsonData: + httpMethod: GET + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: false + # json object of data that will be encrypted. + secureJsonData: + tlsCACert: "..." + tlsClientCert: "..." + tlsClientKey: "..." + version: 1 + # allow users to edit datasources from the UI. + editable: true diff --git a/CodeGen/docker_compose/intel/cpu/xeon/prometheus.yaml b/CodeGen/docker_compose/intel/cpu/xeon/prometheus.yaml new file mode 100644 index 0000000000..27d0940e09 --- /dev/null +++ b/CodeGen/docker_compose/intel/cpu/xeon/prometheus.yaml @@ -0,0 +1,27 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL} +global: + scrape_interval: 5s + external_labels: + monitor: "my-monitor" +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["opea_prometheus:9090"] + - job_name: "vllm" + metrics_path: /metrics + static_configs: + - targets: ["vllm-server:80"] + - job_name: "tgi" + metrics_path: /metrics + static_configs: + - targets: [ "tgi-service:80" ] + - job_name: "codegen-backend-server" + metrics_path: /metrics + static_configs: + - targets: ["codegen-xeon-backend-server:7778"] + - job_name: "prometheus-node-exporter" + metrics_path: /metrics + static_configs: + - targets: ["node-exporter:9100"] diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md 
b/CodeGen/docker_compose/intel/hpu/gaudi/README.md index 0f1e438cf8..0bb562b954 100644 --- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md +++ b/CodeGen/docker_compose/intel/hpu/gaudi/README.md @@ -49,7 +49,10 @@ This uses the default vLLM-based deployment using `compose.yaml`. # export https_proxy="your_https_proxy" # export no_proxy="localhost,127.0.0.1,${HOST_IP}" # Add other hosts if necessary source intel/set_env.sh - cd /intel/hpu/gaudi + cd intel/hpu/gaudi + cd grafana/dashboards + bash download_opea_dashboard.sh + cd ../.. ``` _Note: The compose file might read additional variables from set_env.sh. Ensure all required variables like ports (`LLM_SERVICE_PORT`, `MEGA_SERVICE_PORT`, etc.) are set if not using defaults from the compose file._ @@ -228,7 +231,62 @@ Use the `Neural Copilot` extension configured with the CodeGen backend URL: `htt - **Model Download Issues:** Check `HF_TOKEN`, internet access, proxy settings. Check LLM service logs. - **Connection Errors:** Verify `HOST_IP`, ports, and proxy settings. Use `docker ps` and check service logs. -## Stopping the Application +## Monitoring Deployment + +To enable monitoring for the CodeGen application on Gaudi, you can use the monitoring Docker Compose file along with the main deployment. + +### Option #1: Default Deployment (without monitoring) + +To deploy the CodeGen services without monitoring, execute: + +```bash +docker compose up -d +``` + +### Option #2: Deployment with Monitoring + +> NOTE: To enable monitoring, the `compose.monitoring.yaml` file needs to be merged along with the default `compose.yaml` file. 
+ +To deploy with monitoring: + +```bash +docker compose -f compose.yaml -f compose.monitoring.yaml up -d +``` + +### Accessing Monitoring Services + +Once deployed with monitoring, you can access: + +- **Prometheus**: `http://${HOST_IP}:9090` +- **Grafana**: `http://${HOST_IP}:3000` (username: `admin`, password: `admin`) +- **Node Exporter**: `http://${HOST_IP}:9100` + +### Monitoring Components + +The monitoring stack includes: + +- **Prometheus**: For metrics collection and querying +- **Grafana**: For visualization and dashboards +- **Node Exporter**: For system metrics collection + +### Monitoring Dashboards + +The following dashboards are automatically downloaded and configured: + +- vLLM Dashboard +- TGI Dashboard +- CodeGen MegaService Dashboard +- Node Exporter Dashboard + +### Stopping the Application + +If monitoring is enabled, execute the following command: + +```bash +docker compose -f compose.yaml -f compose.monitoring.yaml down +``` + +If monitoring is not enabled, execute: ```bash docker compose down # for vLLM (compose.yaml) diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/compose.monitoring.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/compose.monitoring.yaml new file mode 100644 index 0000000000..cd891a445d --- /dev/null +++ b/CodeGen/docker_compose/intel/hpu/gaudi/compose.monitoring.yaml @@ -0,0 +1,73 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + prometheus: + image: prom/prometheus:v2.52.0 + container_name: opea_prometheus + user: root + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yaml' + ports: + - '9090:9090' + ipc: host + restart: unless-stopped + + grafana: + image: grafana/grafana:11.0.0 + container_name: grafana + volumes: + - ./grafana_data:/var/lib/grafana + - ./grafana/dashboards:/var/lib/grafana/dashboards + - ./grafana/provisioning:/etc/grafana/provisioning + user: root + 
environment: + GF_SECURITY_ADMIN_PASSWORD: admin + GF_RENDERING_CALLBACK_URL: http://grafana:3000/ + GF_LOG_FILTERS: rendering:debug + no_proxy: ${no_proxy} + host_ip: ${host_ip} + depends_on: + - prometheus + ports: + - '3000:3000' + ipc: host + restart: unless-stopped + + node-exporter: + image: prom/node-exporter + container_name: node-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - --collector.filesystem.ignored-mount-points + - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)" + environment: + no_proxy: ${no_proxy} + ports: + - 9100:9100 + restart: always + deploy: + mode: global + + gaudi-metrics-exporter: + image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:latest + privileged: true + container_name: gaudi-metrics-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + - /dev:/dev + deploy: + mode: global + ports: + - 41611:41611 + restart: unless-stopped diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh b/CodeGen/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh new file mode 100644 index 0000000000..40f2cb12a6 --- /dev/null +++ b/CodeGen/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +if ls *.json 1> /dev/null 2>&1; then + rm *.json +fi + +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json +wget 
https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/codegen_megaservice_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana_v2.json diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml new file mode 100644 index 0000000000..13922a769b --- /dev/null +++ b/CodeGen/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: 1 + +providers: +- name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards + options: + path: /var/lib/grafana/dashboards diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml b/CodeGen/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 0000000000..a206521d67 --- /dev/null +++ b/CodeGen/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,54 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# config file version +apiVersion: 1 + +# list of datasources that should be deleted from the database +deleteDatasources: + - name: Prometheus + orgId: 1 + +# list of datasources to insert/update depending +# what's available in the database +datasources: + # name of the datasource. Required +- name: Prometheus + # datasource type. Required + type: prometheus + # access mode. direct or proxy. Required + access: proxy + # org id. 
will default to orgId 1 if not specified + orgId: 1 + # url + url: http://$host_ip:9090 + # database password, if used + password: + # database user, if used + user: + # database name, if used + database: + # enable/disable basic auth + basicAuth: false + # basic auth username, if used + basicAuthUser: + # basic auth password, if used + basicAuthPassword: + # enable/disable with credentials headers + withCredentials: + # mark as default datasource. Max one per org + isDefault: true + # fields that will be converted to json and stored in json_data + jsonData: + httpMethod: GET + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: false + # json object of data that will be encrypted. + secureJsonData: + tlsCACert: "..." + tlsClientCert: "..." + tlsClientKey: "..." + version: 1 + # allow users to edit datasources from the UI. + editable: true diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/prometheus.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/prometheus.yaml new file mode 100644 index 0000000000..f259e2f7f9 --- /dev/null +++ b/CodeGen/docker_compose/intel/hpu/gaudi/prometheus.yaml @@ -0,0 +1,32 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL} +global: + scrape_interval: 5s + external_labels: + monitor: "my-monitor" +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["opea_prometheus:9090"] + - job_name: "vllm" + metrics_path: /metrics + static_configs: + - targets: ["codegen-vllm-gaudi-server:80"] + - job_name: "tgi" + metrics_path: /metrics + static_configs: + - targets: ["codegen-tgi-gaudi-server:80"] + - job_name: "codegen-backend-server" + metrics_path: /metrics + static_configs: + - targets: ["codegen-gaudi-backend-server:7778"] + - job_name: "prometheus-node-exporter" + metrics_path: /metrics + static_configs: + - targets: ["node-exporter:9100"] + - job_name: "gaudi-metrics-exporter" + scrape_interval: 30s + 
metrics_path: /metrics + static_configs: + - targets: [ "gaudi-metrics-exporter:41611" ] diff --git a/CodeGen/docker_compose/intel/set_env.sh b/CodeGen/docker_compose/intel/set_env.sh index 543e9fee88..5e08df213e 100644 --- a/CodeGen/docker_compose/intel/set_env.sh +++ b/CodeGen/docker_compose/intel/set_env.sh @@ -51,3 +51,9 @@ export DATAPREP_ENDPOINT="http://${HOST_IP}:${DATAPREP_REDIS_PORT}/v1/dataprep" export LOGFLAG=false export MODEL_CACHE=${model_cache:-"./data"} export NUM_CARDS=1 + + +# Set network proxy settings +export no_proxy="${no_proxy},${HOST_IP},vllm-server,codegen-xeon-backend-server,codegen-xeon-ui-server,redis-vector-db,dataprep-redis-server,tei-embedding-serving,tei-embedding-server,retriever-redis,opea_prometheus,grafana,node-exporter,$JAEGER_IP" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" +export http_proxy=$http_proxy +export https_proxy=$https_proxy diff --git a/CodeGen/tests/test_compose_on_gaudi.sh b/CodeGen/tests/test_compose_on_gaudi.sh index 25cf3a945d..5e6f48a75c 100644 --- a/CodeGen/tests/test_compose_on_gaudi.sh +++ b/CodeGen/tests/test_compose_on_gaudi.sh @@ -38,8 +38,10 @@ function start_services() { export no_proxy="localhost,127.0.0.1,$ip_address" cd $WORKPATH/docker_compose/intel/hpu/gaudi + # download grafana dashboard + bash grafana/dashboards/download_opea_dashboard.sh # Start Docker Containers - docker compose -f ${compose_file} up -d | tee ${LOG_PATH}/start_services_with_compose.log + docker compose -f ${compose_file} -f compose.monitoring.yaml up -d | tee ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do @@ -150,7 +152,7 @@ function stop_docker() { local compose_file="$1" cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose -f ${compose_file} down + docker compose -f ${compose_file} -f compose.monitoring.yaml down } function main() { diff --git a/CodeGen/tests/test_compose_on_xeon.sh b/CodeGen/tests/test_compose_on_xeon.sh index 9effe97a4a..678108c494 100644 --- 
a/CodeGen/tests/test_compose_on_xeon.sh +++ b/CodeGen/tests/test_compose_on_xeon.sh @@ -41,8 +41,11 @@ function start_services() { export no_proxy="localhost,127.0.0.1,$ip_address" cd $WORKPATH/docker_compose/intel/cpu/xeon/ + # download grafana dashboard + bash grafana/dashboards/download_opea_dashboard.sh + + # Start Docker Containers - docker compose -f ${compose_file} up -d > ${LOG_PATH}/start_services_with_compose.log + docker compose -f ${compose_file} -f compose.monitoring.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do @@ -161,7 +164,7 @@ function stop_docker() { local compose_file="$1" cd $WORKPATH/docker_compose/intel/cpu/xeon/ - docker compose -f ${compose_file} down + docker compose -f ${compose_file} -f compose.monitoring.yaml down } function main() { diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/README.md b/CodeTrans/docker_compose/intel/cpu/xeon/README.md index 131f03a999..43e76c8e6c 100755 --- a/CodeTrans/docker_compose/intel/cpu/xeon/README.md +++ b/CodeTrans/docker_compose/intel/cpu/xeon/README.md @@ -54,6 +54,8 @@ Consult the section on [CodeTrans Service configuration](#codetrans-configuratio ### Deploy the Services Using Docker Compose +#### Option #1 + To deploy the CodeTrans services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute the command below. It uses the 'compose.yaml' file. ```bash @@ -61,6 +63,19 @@ cd cpu/xeon docker compose -f compose.yaml up -d ``` +#### Option #2 + +> NOTE: To enable monitoring, the `compose.monitoring.yaml` file needs to be merged along with the default `compose.yaml` file. 
+ +To deploy with monitoring: + +```bash +cd cpu/xeon/ +# download grafana dashboard +bash grafana/dashboards/download_opea_dashboard.sh +docker compose -f compose.yaml -f compose.monitoring.yaml up -d +``` + > **Note**: developers should build docker image from source when: > > - Developing off the git main branch (as the container's ports in the repo may be different > from the published docker image). @@ -117,6 +132,15 @@ To stop the containers associated with the deployment, execute the following com docker compose -f compose.yaml down ``` +If monitoring is enabled, execute the following command: + +```bash +cd cpu/xeon/ +# download grafana dashboard +bash grafana/dashboards/download_opea_dashboard.sh +docker compose -f compose.yaml -f compose.monitoring.yaml down +``` + ## Configuration Parameters Key parameters are configured via environment variables set before running `docker compose up`. @@ -137,11 +161,12 @@ Key parameters are configured via environment variables set before running `dock In the context of deploying a CodeTrans pipeline on an Intel® Xeon® platform, we can pick and choose different large language model serving frameworks. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git). -| File | Description | -| -------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [compose.yaml](./compose.yaml) | Default compose file using vllm as serving framework and redis as vector database. | -| [compose_tgi.yaml](./compose_tgi.yaml) | The LLM serving framework is TGI. 
All other configurations remain the same as the default. | -| [compose_remote.yaml](./compose_remote.yaml) | The LLM used is hosted on a remote server and an endpoint is used to access this model. vLLM is the serving framework. Additional environment variables need to be set before running. See [instructions](#running-llm-models-with-remote-endpoints) below. | +| File | Description | +| ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [compose.yaml](./compose.yaml) | Default compose file using vllm as serving framework and redis as vector database. | +| [compose_tgi.yaml](./compose_tgi.yaml) | The LLM serving framework is TGI. All other configurations remain the same as the default. | +| [compose_remote.yaml](./compose_remote.yaml) | The LLM used is hosted on a remote server and an endpoint is used to access this model. vLLM is the serving framework. Additional environment variables need to be set before running. See [instructions](#running-llm-models-with-remote-endpoints) below. | +| [compose.monitoring.yaml](./compose.monitoring.yaml) | Helper file for monitoring features. 
Can be used along with any compose files | ### Running LLM models with remote endpoints diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.monitoring.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.monitoring.yaml new file mode 100644 index 0000000000..dea34085b3 --- /dev/null +++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.monitoring.yaml @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + prometheus: + image: prom/prometheus:v2.52.0 + container_name: opea_prometheus + user: root + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yaml' + ports: + - '9090:9090' + ipc: host + restart: unless-stopped + + grafana: + image: grafana/grafana:11.0.0 + container_name: grafana + volumes: + - ./grafana_data:/var/lib/grafana + - ./grafana/dashboards:/var/lib/grafana/dashboards + - ./grafana/provisioning:/etc/grafana/provisioning + user: root + environment: + GF_SECURITY_ADMIN_PASSWORD: admin + GF_RENDERING_CALLBACK_URL: http://grafana:3000/ + GF_LOG_FILTERS: rendering:debug + no_proxy: ${no_proxy} + host_ip: ${host_ip} + depends_on: + - prometheus + ports: + - '3000:3000' + ipc: host + restart: unless-stopped + + node-exporter: + image: prom/node-exporter + container_name: node-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - --collector.filesystem.ignored-mount-points + - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)" + environment: + no_proxy: ${no_proxy} + ports: + - 9100:9100 + restart: always + deploy: + mode: global diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh 
b/CodeTrans/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh new file mode 100644 index 0000000000..47d4f84587 --- /dev/null +++ b/CodeTrans/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" +if ls *.json 1> /dev/null 2>&1; then + rm *.json +fi + +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/codetrans_megaservice_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml new file mode 100644 index 0000000000..13922a769b --- /dev/null +++ b/CodeTrans/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: 1 + +providers: +- name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards + options: + path: /var/lib/grafana/dashboards diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml b/CodeTrans/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 0000000000..a206521d67 --- /dev/null +++ 
b/CodeTrans/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,54 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# config file version +apiVersion: 1 + +# list of datasources that should be deleted from the database +deleteDatasources: + - name: Prometheus + orgId: 1 + +# list of datasources to insert/update depending +# what's available in the database +datasources: + # name of the datasource. Required +- name: Prometheus + # datasource type. Required + type: prometheus + # access mode. direct or proxy. Required + access: proxy + # org id. will default to orgId 1 if not specified + orgId: 1 + # url + url: http://$host_ip:9090 + # database password, if used + password: + # database user, if used + user: + # database name, if used + database: + # enable/disable basic auth + basicAuth: false + # basic auth username, if used + basicAuthUser: + # basic auth password, if used + basicAuthPassword: + # enable/disable with credentials headers + withCredentials: + # mark as default datasource. Max one per org + isDefault: true + # fields that will be converted to json and stored in json_data + jsonData: + httpMethod: GET + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: false + # json object of data that will be encrypted. + secureJsonData: + tlsCACert: "..." + tlsClientCert: "..." + tlsClientKey: "..." + version: 1 + # allow users to edit datasources from the UI. 
+ editable: true diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/prometheus.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/prometheus.yaml new file mode 100644 index 0000000000..57bbf6e0db --- /dev/null +++ b/CodeTrans/docker_compose/intel/cpu/xeon/prometheus.yaml @@ -0,0 +1,23 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL} +global: + scrape_interval: 5s + external_labels: + monitor: "my-monitor" +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["opea_prometheus:9090"] + - job_name: "vllm" + metrics_path: /metrics + static_configs: + - targets: ["codetrans-xeon-vllm-service:80"] + - job_name: "codetrans-backend-server" + metrics_path: /metrics + static_configs: + - targets: ["codetrans-xeon-backend-server:7777"] + - job_name: "prometheus-node-exporter" + metrics_path: /metrics + static_configs: + - targets: ["node-exporter:9100"] diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md index 2597e1aeb6..830e9f7a7c 100755 --- a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md @@ -54,6 +54,8 @@ Consult the section on [CodeTrans Service configuration](#codetrans-configuratio ### Deploy the Services Using Docker Compose +#### Option #1 + To deploy the CodeTrans services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute the command below. It uses the 'compose.yaml' file. ```bash @@ -61,6 +63,19 @@ cd hpu/gaudi docker compose -f compose.yaml up -d ``` +#### Option #2 + +> NOTE: To enable monitoring, the `compose.monitoring.yaml` file needs to be merged along with the default `compose.yaml` file. 
+ +To deploy with monitoring: + +```bash +cd hpu/gaudi/ +# download grafana dashboard +bash grafana/dashboards/download_opea_dashboard.sh +docker compose -f compose.yaml -f compose.monitoring.yaml up -d +``` + > **Note**: developers should build docker image from source when: > > - Developing off the git main branch (as the container's ports in the repo may be different > from the published docker image). @@ -117,6 +132,13 @@ To stop the containers associated with the deployment, execute the following com docker compose -f compose.yaml down ``` +If monitoring is enabled, execute the following command: + +```bash +cd hpu/gaudi/ +docker compose -f compose.yaml -f compose.monitoring.yaml down +``` + ## Configuration Parameters Key parameters are configured via environment variables set before running `docker compose up`. diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.monitoring.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.monitoring.yaml new file mode 100644 index 0000000000..691671e656 --- /dev/null +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.monitoring.yaml @@ -0,0 +1,75 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + prometheus: + image: prom/prometheus:v2.52.0 + container_name: opea_prometheus + user: root + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yaml' + ports: + - '9090:9090' + ipc: host + restart: unless-stopped + + grafana: + image: grafana/grafana:11.0.0 + container_name: grafana + volumes: + - ./grafana_data:/var/lib/grafana + - ./grafana/dashboards:/var/lib/grafana/dashboards + - ./grafana/provisioning:/etc/grafana/provisioning + user: root + environment: + GF_SECURITY_ADMIN_PASSWORD: admin + GF_RENDERING_CALLBACK_URL: http://grafana:3000/ + GF_LOG_FILTERS: rendering:debug + 
no_proxy: ${no_proxy} + host_ip: ${host_ip} + depends_on: + - prometheus + ports: + - '3000:3000' + ipc: host + restart: unless-stopped + + node-exporter: + image: prom/node-exporter + container_name: node-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--path.udev.data=/rootfs/run/udev/data' + - --collector.filesystem.ignored-mount-points + - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)" + environment: + no_proxy: ${no_proxy} + ports: + - 9100:9100 + restart: always + deploy: + mode: global + + gaudi-metrics-exporter: + image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:latest + privileged: true + container_name: gaudi-metrics-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + - /dev:/dev + deploy: + mode: global + ports: + - 41611:41611 + restart: unless-stopped diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh b/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh new file mode 100644 index 0000000000..b601762739 --- /dev/null +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" +if ls *.json 1> /dev/null 2>&1; then + rm *.json +fi + +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json +wget 
https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana_v2.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/codetrans_megaservice_grafana.json +wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml new file mode 100644 index 0000000000..13922a769b --- /dev/null +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: 1 + +providers: +- name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards + options: + path: /var/lib/grafana/dashboards diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml b/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 0000000000..a206521d67 --- /dev/null +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,54 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# config file version +apiVersion: 1 + +# list of datasources that should be deleted from the database +deleteDatasources: + - name: Prometheus + orgId: 1 + +# list of datasources to insert/update depending +# what's available in the database +datasources: + # name of the datasource. Required +- name: Prometheus + # datasource type. Required + type: prometheus + # access mode. direct or proxy. Required + access: proxy + # org id. 
will default to orgId 1 if not specified + orgId: 1 + # url + url: http://$host_ip:9090 + # database password, if used + password: + # database user, if used + user: + # database name, if used + database: + # enable/disable basic auth + basicAuth: false + # basic auth username, if used + basicAuthUser: + # basic auth password, if used + basicAuthPassword: + # enable/disable with credentials headers + withCredentials: + # mark as default datasource. Max one per org + isDefault: true + # fields that will be converted to json and stored in json_data + jsonData: + httpMethod: GET + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: false + # json object of data that will be encrypted. + secureJsonData: + tlsCACert: "..." + tlsClientCert: "..." + tlsClientKey: "..." + version: 1 + # allow users to edit datasources from the UI. + editable: true diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/prometheus.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/prometheus.yaml new file mode 100644 index 0000000000..a9c3b5fc14 --- /dev/null +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/prometheus.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL} +global: + scrape_interval: 5s + external_labels: + monitor: "my-monitor" +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["opea_prometheus:9090"] + - job_name: "vllm" + metrics_path: /metrics + static_configs: + - targets: ["codetrans-gaudi-vllm-service:80"] + - job_name: "codetrans-backend-server" + metrics_path: /metrics + static_configs: + - targets: ["codetrans-gaudi-backend-server:7777"] + - job_name: "prometheus-node-exporter" + scrape_interval: 30s + scrape_timeout: 25s + metrics_path: /metrics + static_configs: + - targets: ["node-exporter:9100"] + - job_name: "gaudi-metrics-exporter" + scrape_interval: 30s + metrics_path: /metrics + static_configs: + - targets: 
["gaudi-metrics-exporter:41611"] diff --git a/CodeTrans/docker_compose/intel/set_env.sh b/CodeTrans/docker_compose/intel/set_env.sh index 04c4048c52..0c2d0883c5 100644 --- a/CodeTrans/docker_compose/intel/set_env.sh +++ b/CodeTrans/docker_compose/intel/set_env.sh @@ -24,3 +24,9 @@ export FRONTEND_SERVICE_PORT=5173 export BACKEND_SERVICE_NAME=codetrans export BACKEND_SERVICE_IP=${host_ip} export BACKEND_SERVICE_PORT=7777 + + +# Set network proxy settings +export no_proxy="${no_proxy},${HOST_IP},vllm-server,codetrans-xeon-backend-server,codetrans-xeon-ui-server,redis-vector-db,dataprep-redis-server,tei-embedding-serving,tei-embedding-server,retriever-redis,opea_prometheus,grafana,node-exporter,$JAEGER_IP" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" +export http_proxy=$http_proxy +export https_proxy=$https_proxy diff --git a/CodeTrans/tests/test_compose_on_gaudi.sh b/CodeTrans/tests/test_compose_on_gaudi.sh index ad5f9da9fc..edfca863d7 100644 --- a/CodeTrans/tests/test_compose_on_gaudi.sh +++ b/CodeTrans/tests/test_compose_on_gaudi.sh @@ -42,8 +42,11 @@ function start_services() { sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env + # download grafana dashboard + bash grafana/dashboards/download_opea_dashboard.sh + # Start Docker Containers - docker compose up -d > ${LOG_PATH}/start_services_with_compose.log + docker compose -f compose.yaml -f compose.monitoring.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do @@ -117,7 +120,7 @@ function validate_megaservice() { function stop_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose -f compose.yaml stop && docker compose rm -f + docker compose -f compose.yaml -f compose.monitoring.yaml stop && docker compose rm -f } function main() { diff --git a/CodeTrans/tests/test_compose_on_xeon.sh b/CodeTrans/tests/test_compose_on_xeon.sh index 8e42d903a5..bbd75208e8 100644 --- a/CodeTrans/tests/test_compose_on_xeon.sh +++ 
b/CodeTrans/tests/test_compose_on_xeon.sh @@ -40,10 +40,13 @@ function start_services() { source set_env.sh cd cpu/xeon/ + # download grafana dashboard + bash grafana/dashboards/download_opea_dashboard.sh + sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env # Start Docker Containers - docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + docker compose -f compose.yaml -f compose.monitoring.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do @@ -118,7 +121,7 @@ function validate_megaservice() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon/ - docker compose -f compose.yaml stop && docker compose rm -f + docker compose -f compose.yaml -f compose.monitoring.yaml down } function main() { diff --git a/CodeTrans/tests/test_compose_tgi_on_gaudi.sh b/CodeTrans/tests/test_compose_tgi_on_gaudi.sh index 9b4a7e2c9e..964f6b6f01 100644 --- a/CodeTrans/tests/test_compose_tgi_on_gaudi.sh +++ b/CodeTrans/tests/test_compose_tgi_on_gaudi.sh @@ -43,8 +43,11 @@ function start_services() { sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env + # download grafana dashboard + bash grafana/dashboards/download_opea_dashboard.sh + # Start Docker Containers - docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + docker compose -f compose_tgi.yaml -f compose.monitoring.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do @@ -127,7 +130,7 @@ function validate_megaservice() { function stop_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi/ - docker compose -f compose_tgi.yaml stop && docker compose rm -f + docker compose -f compose_tgi.yaml -f compose.monitoring.yaml stop && docker compose rm -f } function main() { diff --git a/CodeTrans/tests/test_compose_tgi_on_xeon.sh b/CodeTrans/tests/test_compose_tgi_on_xeon.sh index c66e8b4537..2957093520 100644 --- a/CodeTrans/tests/test_compose_tgi_on_xeon.sh 
+++ b/CodeTrans/tests/test_compose_tgi_on_xeon.sh @@ -41,10 +41,13 @@ function start_services() { source set_env.sh cd cpu/xeon/ + # download grafana dashboard + bash grafana/dashboards/download_opea_dashboard.sh + sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env # Start Docker Containers - docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + docker compose -f compose_tgi.yaml -f compose.monitoring.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 100 ]]; do @@ -127,7 +130,7 @@ function validate_megaservice() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon/ - docker compose -f compose_tgi.yaml stop && docker compose rm -f + docker compose -f compose_tgi.yaml -f compose.monitoring.yaml stop && docker compose rm -f } function main() {