@@ -20,6 +20,7 @@ package llmdinferencesim
 
 import (
     "context"
+    "math"
     "strconv"
     "strings"
     "sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
         return err
     }
 
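+    // The token histograms below reuse vLLM's 1-2-5 bucket layout, capped at
+    // the model's maximum context length (see build125Buckets further down).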
+    s.requestPromptTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_prompt_tokens",
+            Help:      "Number of prefill tokens processed.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestPromptTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+        return err
+    }
+
+    s.requestGenerationTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_generation_tokens",
+            Help:      "Number of generation tokens processed.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+        return err
+    }
+
+    s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_params_max_tokens",
+            Help:      "Histogram of the max_tokens request parameter.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+        return err
+    }
+
+    s.requestSuccessTotal = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Subsystem: "",
+            Name:      "vllm:request_success_total",
+            Help:      "Count of successfully processed requests.",
+        },
+        []string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+    )
+    if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+        s.logger.Error(err, "Prometheus request_success_total counter register failed")
+        return err
+    }
+
     s.setInitialPrometheusMetrics()
 
     return nil
@@ -102,16 +158,18 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
-    var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+    var nRunningReqs, nWaitingReqs, kvCacheUsage, requestSuccessTotal float64
     if s.config.FakeMetrics != nil {
         nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
         nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
         kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+        requestSuccessTotal = float64(s.config.FakeMetrics.RequestSuccessTotal)
     }
     modelName := s.getDisplayedModelName(s.config.Model)
     s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
     s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
     s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
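+    // Fake request successes are all attributed to the default "stop" finish reason.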
+    s.requestSuccessTotal.WithLabelValues(modelName, "stop").Add(requestSuccessTotal)
 
     if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
         for _, metrics := range s.config.FakeMetrics.LoraMetrics {
@@ -198,6 +256,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
     go s.runningRequestsUpdater(ctx)
     go s.lorasUpdater(ctx)
     go s.kvCacheUsageUpdater(ctx)
+    go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +341,71 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
         s.logger.Error(nil, "Zero model reference", "model", lora)
     }
 }
+
+// recordRequestUpdater listens on requestSuccessChan and records request-level
+// metrics for each successfully completed request until the context is cancelled.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case event := <-s.requestSuccessChan:
+            s.recordRequestMetricsOnSuccess(
+                event.PromptTokens,
+                event.GenerationTokens,
+                event.MaxTokens,
+                event.FinishReason,
+            )
+        }
+    }
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed
+// request, which is sent through requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+    // PromptTokens is the number of input (prompt) tokens in the request
+    PromptTokens int
+    // GenerationTokens is the number of generated (output) tokens in the response
+    GenerationTokens int
+    // MaxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+    MaxTokens *int64
+    // FinishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+    FinishReason string
+}
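+
+// A producer side might look like this (hypothetical handler snippet; the
+// token counts and req.MaxTokens field are assumptions, not part of this change):
+//
+//    s.requestSuccessChan <- requestSuccessEvent{
+//        PromptTokens:     promptTokenCount,
+//        GenerationTokens: generatedTokenCount,
+//        MaxTokens:        req.MaxTokens,
+//        FinishReason:     "stop",
+//    }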
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+    generationTokens int, maxTokens *int64, finishReason string) {
+    modelName := s.getDisplayedModelName(s.config.Model)
+    s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+    s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+    if maxTokens != nil {
+        s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+    }
+    s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
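+//
+// For example, build125Buckets(1000) returns
+// [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000].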
+func build125Buckets(maxValue int) []float64 {
+    var buckets []float64
+    exponent := 0
+    mantissa := []int{1, 2, 5}
+
+    for {
+        complete := true
+        for _, m := range mantissa {
+            value := m * int(math.Pow10(exponent))
+            if value <= maxValue {
+                buckets = append(buckets, float64(value))
+                complete = false
+            }
+        }
+        if complete {
+            break
+        }
+        exponent++
+    }
+    return buckets
+}