Skip to content

Commit

Permalink
Export tier metrics (#18678)
Browse files Browse the repository at this point in the history
minio_node_tier_ttlb_seconds - Distribution of time to last byte for streaming objects from warm tier
minio_node_tier_requests_success - Number of requests to download object from warm tier that were successful
minio_node_tier_requests_failure - Number of requests to download object from warm tier that failed
  • Loading branch information
krisis committed Dec 21, 2023
1 parent b1a109a commit 56b7045
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 2 deletions.
6 changes: 5 additions & 1 deletion cmd/bucket-lifecycle.go
Expand Up @@ -507,9 +507,13 @@ func auditTierActions(ctx context.Context, tier string, bytes int64) func(err er
}

if err == nil {
op.TimeToResponseNS = time.Since(startTime).Nanoseconds()
since := time.Since(startTime)
op.TimeToResponseNS = since.Nanoseconds()
globalTierMetrics.Observe(tier, since)
globalTierMetrics.logSuccess(tier)
} else {
op.Error = err.Error()
globalTierMetrics.logFailure(tier)
}

logger.GetReqInfo(ctx).AppendTags("tierStats", op)
Expand Down
16 changes: 16 additions & 0 deletions cmd/metrics-v2.go
Expand Up @@ -90,6 +90,7 @@ func init() {
getNetworkMetrics(),
getMinioVersionMetrics(),
getS3TTFBMetric(),
getTierMetrics(),
getNotificationMetrics(),
getDistLockMetrics(),
getIAMNodeMetrics(),
Expand Down Expand Up @@ -155,6 +156,7 @@ const (
usageSubsystem MetricSubsystem = "usage"
quotaSubsystem MetricSubsystem = "quota"
ilmSubsystem MetricSubsystem = "ilm"
tierSubsystem MetricSubsystem = "tier"
scannerSubsystem MetricSubsystem = "scanner"
iamSubsystem MetricSubsystem = "iam"
kmsSubsystem MetricSubsystem = "kms"
Expand Down Expand Up @@ -246,6 +248,7 @@ const (
sizeDistribution = "size_distribution"
versionDistribution = "version_distribution"
ttfbDistribution = "seconds_distribution"
ttlbDistribution = "ttlb_seconds_distribution"

lastActivityTime = "last_activity_nano_seconds"
startTime = "starttime_seconds"
Expand All @@ -262,6 +265,9 @@ const (
transitionedObjects MetricName = "transitioned_objects"
transitionedVersions MetricName = "transitioned_versions"

tierRequestsSuccess MetricName = "requests_success"
tierRequestsFailure MetricName = "requests_failure"

kmsOnline = "online"
kmsRequestsSuccess = "request_success"
kmsRequestsError = "request_error"
Expand Down Expand Up @@ -1658,6 +1664,16 @@ func getS3TTFBMetric() *MetricsGroup {
return mg
}

// getTierMetrics builds the metrics group for remote (warm) tier metrics.
// The group is created with a 10 second cacheInterval and a single read
// callback that returns whatever globalTierMetrics.Report() produces.
func getTierMetrics() *MetricsGroup {
	group := &MetricsGroup{cacheInterval: 10 * time.Second}

	// The callback does not use its context; Report reads in-memory state only.
	readTierMetrics := func(_ context.Context) []Metric {
		return globalTierMetrics.Report()
	}
	group.RegisterRead(readTierMetrics)

	return group
}

func getTransitionPendingTasksMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Expand Down
94 changes: 93 additions & 1 deletion cmd/tier.go
@@ -1,4 +1,4 @@
// Copyright (c) 2015-2021 MinIO, Inc.
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
Expand Down Expand Up @@ -27,11 +27,13 @@ import (
"path"
"strings"
"sync"
"time"

"github.com/minio/madmin-go/v3"
"github.com/minio/minio/internal/crypto"
"github.com/minio/minio/internal/hash"
"github.com/minio/minio/internal/kms"
"github.com/prometheus/client_golang/prometheus"
)

//go:generate msgp -file $GOFILE
Expand Down Expand Up @@ -80,6 +82,96 @@ type TierConfigMgr struct {
Tiers map[string]madmin.TierConfig `json:"tiers"`
}

// tierMetrics holds the per-tier statistics exported as tier metrics:
// success/failure request counters keyed by tier name, and a histogram
// vector that is labelled by tier.
type tierMetrics struct {
	// The embedded lock guards requestsCount only; histogram is accessed
	// without taking it.
	sync.RWMutex

	// requestsCount maps a tier name to its request counters.
	requestsCount map[string]struct {
		success, failure int64
	}

	// histogram records observed durations (in seconds) per tier.
	histogram *prometheus.HistogramVec
}

// globalTierMetrics is the package-wide tierMetrics instance. It starts with
// an empty counter map and a histogram vector labelled by "tier".
var globalTierMetrics = tierMetrics{
	requestsCount: map[string]struct {
		success, failure int64
	}{},
	histogram: prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "tier_ttlb_seconds",
			Help: "Time taken by requests served by warm tier",
			// Bucket upper bounds in seconds: 10ms up to 30 minutes.
			Buckets: []float64{
				0.01, 0.1, 1, 2, 5, 10,
				60,      // 1 minute
				5 * 60,  // 5 minutes
				15 * 60, // 15 minutes
				30 * 60, // 30 minutes
			},
		},
		[]string{"tier"},
	),
}

// Observe records dur, converted to seconds, in the histogram series
// labelled with the given tier name.
func (t *tierMetrics) Observe(tier string, dur time.Duration) {
	labels := prometheus.Labels{"tier": tier}
	seconds := dur.Seconds()
	t.histogram.With(labels).Observe(seconds)
}

// logSuccess increments the success counter of the given tier while
// holding the write lock.
func (t *tierMetrics) logSuccess(tier string) {
	t.Lock()
	defer t.Unlock()

	// Map values are not addressable: read the entry (zero value if the
	// tier has no entry yet), update the copy and store it back.
	counters := t.requestsCount[tier]
	counters.success++
	t.requestsCount[tier] = counters
}

// logFailure increments the failure counter of the given tier while
// holding the write lock.
func (t *tierMetrics) logFailure(tier string) {
	t.Lock()
	defer t.Unlock()

	// Map values are not addressable: read the entry (zero value if the
	// tier has no entry yet), update the copy and store it back.
	counters := t.requestsCount[tier]
	counters.failure++
	t.requestsCount[tier] = counters
}

// tierTTLBMD describes the time-to-last-byte distribution metric:
// {minio_node}_{tier}_{ttlb_seconds_distribution}
var tierTTLBMD = MetricDescription{
	Type:      gaugeMetric,
	Namespace: nodeMetricNamespace,
	Subsystem: tierSubsystem,
	Name:      ttlbDistribution,
	Help:      "Distribution of time to last byte for objects downloaded from warm tier",
}

// tierRequestsSuccessMD describes the successful-requests counter:
// {minio_node}_{tier}_{requests_success}
var tierRequestsSuccessMD = MetricDescription{
	Type:      counterMetric,
	Namespace: nodeMetricNamespace,
	Subsystem: tierSubsystem,
	Name:      tierRequestsSuccess,
	Help:      "Number of requests to download object from warm tier that were successful",
}

// tierRequestsFailureMD describes the failed-requests counter:
// {minio_node}_{tier}_{requests_failure}
var tierRequestsFailureMD = MetricDescription{
	Type:      counterMetric,
	Namespace: nodeMetricNamespace,
	Subsystem: tierSubsystem,
	Name:      tierRequestsFailure,
	Help:      "Number of requests to download object from warm tier that failed",
}

// Report returns the current tier metrics: first the entries produced by
// getHistogramMetrics for the TTLB histogram, then, for every tier present in
// requestsCount, one success metric followed by one failure metric, each
// labelled with the tier name. Tiers are visited in map iteration order.
func (t *tierMetrics) Report() []Metric {
	out := getHistogramMetrics(t.histogram, tierTTLBMD)

	// counter appends a single tier-labelled metric; every metric gets its
	// own label map.
	counter := func(md MetricDescription, tierName string, n int64) {
		out = append(out, Metric{
			Description:    md,
			Value:          float64(n),
			VariableLabels: map[string]string{"tier": tierName},
		})
	}

	// The read lock covers only the counter map, not the histogram above.
	t.RLock()
	defer t.RUnlock()
	for tierName, counts := range t.requestsCount {
		counter(tierRequestsSuccessMD, tierName, counts.success)
		counter(tierRequestsFailureMD, tierName, counts.failure)
	}
	return out
}

// IsTierValid returns true if there exists a remote tier by name tierName,
// otherwise returns false.
func (config *TierConfigMgr) IsTierValid(tierName string) bool {
Expand Down
52 changes: 52 additions & 0 deletions cmd/tier_test.go
@@ -0,0 +1,52 @@
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
"testing"
"time"
)

// TestTierMetrics records one observation plus a known number of successes
// and failures for a single tier on globalTierMetrics, then checks that the
// success and failure values returned by Report add up to those numbers.
func TestTierMetrics(t *testing.T) {
	const (
		tier        = "WARM-1"
		wantSuccess = 10
		wantFailure = 5
	)

	globalTierMetrics.Observe(tier, 200*time.Millisecond)
	for n := wantSuccess; n > 0; n-- {
		globalTierMetrics.logSuccess(tier)
	}
	for n := wantFailure; n > 0; n-- {
		globalTierMetrics.logFailure(tier)
	}

	// Sum the counters by metric name; other entries (e.g. histogram
	// metrics) are ignored.
	var gotSuccess, gotFailure float64
	for _, m := range globalTierMetrics.Report() {
		if name := m.Description.Name; name == tierRequestsSuccess {
			gotSuccess += m.Value
		} else if name == tierRequestsFailure {
			gotFailure += m.Value
		}
	}

	if int(gotSuccess) != wantSuccess {
		t.Fatalf("Expected %d successes but got %f", wantSuccess, gotSuccess)
	}
	if int(gotFailure) != wantFailure {
		t.Fatalf("Expected %d failures but got %f", wantFailure, gotFailure)
	}
}
8 changes: 8 additions & 0 deletions docs/metrics/prometheus/list.md
Expand Up @@ -200,6 +200,14 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc
| `minio_node_ilm_transition_missed_immediate_tasks` | Number of missed immediate ILM transition tasks. |
| `minio_node_ilm_versions_scanned` | Total number of object versions checked for ilm actions since server start. |

## Tier Metrics

| Name | Description |
|:---------------------------------------------------|:----------------------------------------------------------------------------|
| `minio_node_tier_ttlb_seconds_distribution` | Distribution of time to last byte for objects downloaded from warm tier |
| `minio_node_tier_requests_success` | Number of requests to download object from warm tier that were successful |
| `minio_node_tier_requests_failure` | Number of requests to download object from warm tier that failed |

## System Metrics

| Name | Description |
Expand Down

0 comments on commit 56b7045

Please sign in to comment.