diff --git a/deploy/helm/moai-inference-framework/files/dashboards/amd-gpu-usage-monitor.json b/deploy/helm/moai-inference-framework/files/dashboards/amd-gpu-usage-monitor.json new file mode 100644 index 00000000..85c8d31b --- /dev/null +++ b/deploy/helm/moai-inference-framework/files/dashboards/amd-gpu-usage-monitor.json @@ -0,0 +1,389 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "
GPU Index ➡️
", + "mode": "html" + }, + "pluginVersion": "12.3.0", + "title": "", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 20, + "x": 4, + "y": 0 + }, + "id": 2, + "maxDataPoints": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "vertical", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^gpu_id$/", + "values": true + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "group by(gpu_id) (gpu_gfx_activity)", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "gpu_id" + } + ] + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 2 + }, + "id": 3, + "maxDataPoints": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "12.3.0", + "repeat": "host", + "repeatDirection": "v", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "group by(hostname) (gpu_gfx_activity{hostname=\"$host\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [ + { + "options": { + "/": { + "color": "blue", + "index": 0, + "text": "Idle" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 20, + "x": 4, + "y": 2 + }, + "id": 1, + "maxPerRow": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/.*/", + "values": false + }, + "showPercentChange": false, + "text": { + "titleSize": 18, + "valueSize": 16 + }, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.3.0", + "repeat": "host", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(label_join(gpu_gfx_activity{hostname=\"$host\"}, \"pod_display\", \"/\", \"namespace\", \"pod\"), \"resource\", \" : GPU \", \"hostname\", \"gpu_id\")", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "pod_display", + "resource" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "pod_display": false, + "resource": false + }, + "includeByName": {}, + "indexByName": { + "pod_display": 1, + "resource": 0 + }, + "renameByName": { + "hostname": "" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": false, + "field": "resource" + } + ] + } + }, + { + "id": "transpose", + "options": {} + }, + { + "id": "filterFieldsByName", + "options": { + "byVariable": false, + "include": { + "pattern": "^(?!Field$).+" + } + } + } + ], + "type": "stat" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "allowCustomValue": false, + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(gpu_gfx_activity,hostname)", + "includeAll": true, + "multi": true, + "name": "host", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gpu_gfx_activity,hostname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "AMD GPU Usage Monitor", + "uid": "amd-gpu-usage-monitor", + "version": 1 +} \ No newline at end of file