@@ -20,6 +20,7 @@ package llmdinferencesim
 
 import (
     "context"
+    "math"
     "strconv"
     "strings"
     "sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
         return err
     }
 
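+    // The token histograms below reuse vLLM's 1-2-5 bucket layout, capped at
+    // the model's maximum context length (see build125Buckets further down).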
+    s.requestPromptTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_prompt_tokens",
+            Help:      "Number of prefill tokens processed.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestPromptTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+        return err
+    }
+
+    s.requestGenerationTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_generation_tokens",
+            Help:      "Number of generation tokens processed.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+        return err
+    }
+
+    s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_params_max_tokens",
+            Help:      "Histogram of the max_tokens request parameter.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+        return err
+    }
+
+    s.requestSuccessTotal = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Subsystem: "",
+            Name:      "vllm:request_success_total",
+            Help:      "Count of successfully processed requests.",
+        },
+        []string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+    )
+    if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+        s.logger.Error(err, "Prometheus request_success_total counter register failed")
+        return err
+    }
+
     s.setInitialPrometheusMetrics()
 
     return nil
@@ -102,16 +158,18 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
-    var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+    var nRunningReqs, nWaitingReqs, kvCacheUsage, requestSuccessTotal float64
     if s.config.FakeMetrics != nil {
         nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
         nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
         kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+        requestSuccessTotal = float64(s.config.FakeMetrics.RequestSuccessTotal)
     }
     modelName := s.getDisplayedModelName(s.config.Model)
     s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
     s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
     s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
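+    // Fake request successes are all attributed to the default "stop" finish reason.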
+    s.requestSuccessTotal.WithLabelValues(modelName, "stop").Add(requestSuccessTotal)
 
     if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
         for _, metrics := range s.config.FakeMetrics.LoraMetrics {
@@ -198,6 +256,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
     go s.runningRequestsUpdater(ctx)
     go s.lorasUpdater(ctx)
     go s.kvCacheUsageUpdater(ctx)
+    go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +341,71 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
         s.logger.Error(nil, "Zero model reference", "model", lora)
     }
 }
+
+// recordRequestUpdater listens on requestSuccessChan and records request-level
+// metrics for each successfully completed request until the context is cancelled.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case event := <-s.requestSuccessChan:
+            s.recordRequestMetricsOnSuccess(
+                event.PromptTokens,
+                event.GenerationTokens,
+                event.MaxTokens,
+                event.FinishReason,
+            )
+        }
+    }
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed
+// request, which is sent through requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+    // PromptTokens is the number of input (prompt) tokens in the request
+    PromptTokens int
+    // GenerationTokens is the number of generated (output) tokens in the response
+    GenerationTokens int
+    // MaxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+    MaxTokens *int64
+    // FinishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+    FinishReason string
+}
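+
+// A producer side might look like this (hypothetical handler snippet; the
+// token counts and req.MaxTokens field are assumptions, not part of this change):
+//
+//    s.requestSuccessChan <- requestSuccessEvent{
+//        PromptTokens:     promptTokenCount,
+//        GenerationTokens: generatedTokenCount,
+//        MaxTokens:        req.MaxTokens,
+//        FinishReason:     "stop",
+//    }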
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+    generationTokens int, maxTokens *int64, finishReason string) {
+    modelName := s.getDisplayedModelName(s.config.Model)
+    s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+    s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+    if maxTokens != nil {
+        s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+    }
+    s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
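+//
+// For example, build125Buckets(1000) returns
+// [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000].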
+func build125Buckets(maxValue int) []float64 {
+    var buckets []float64
+    exponent := 0
+    mantissa := []int{1, 2, 5}
+
+    for {
+        complete := true
+        for _, m := range mantissa {
+            value := m * int(math.Pow10(exponent))
+            if value <= maxValue {
+                buckets = append(buckets, float64(value))
+                complete = false
+            }
+        }
+        if complete {
+            break
+        }
+        exponent++
+    }
+    return buckets
+}