Skip to content

Commit

Permalink
Merge pull request thanos-io#7194 from xBazilio/retry-downsample-errors
Browse files Browse the repository at this point in the history
  • Loading branch information
fpetkovski authored and nicolastakashi committed Mar 23, 2024
2 parents f731719 + 2623e49 commit eab1db8
Show file tree
Hide file tree
Showing 20 changed files with 563 additions and 338 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Expand Up @@ -19,6 +19,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#7122](https://github.com/thanos-io/thanos/pull/7122) Store Gateway: Fix lazy expanded postings estimate base cardinality using posting group with remove keys.

### Added
- [#7194](https://github.com/thanos-io/thanos/pull/7194) Downsample: retry objstore related errors
- [#7105](https://github.com/thanos-io/thanos/pull/7105) Rule: add flag `--query.enable-x-functions` to allow usage of extended promql functions (xrate, xincrease, xdelta) in loaded rules
- [#6867](https://github.com/thanos-io/thanos/pull/6867) Query UI: Tenant input box added to the Query UI, in order to be able to specify which tenant the query should use.
- [#7175](https://github.com/thanos-io/thanos/pull/7175): Query: Add `--query.mode=distributed` which enables the new distributed mode of the Thanos query engine.
Expand All @@ -28,6 +29,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
### Changed

- [#7123](https://github.com/thanos-io/thanos/pull/7123) Rule: Change default Alertmanager API version to v2.
- [#7222](https://github.com/thanos-io/thanos/pull/7222) Automatically detect memory limits and configure GOMEMLIMIT to match.

### Removed

Expand Down
40 changes: 40 additions & 0 deletions cmd/thanos/config.go
Expand Up @@ -13,6 +13,7 @@ import (
"strings"
"time"

"github.com/KimMachineGun/automemlimit/memlimit"
extflag "github.com/efficientgo/tools/extkingpin"
"github.com/pkg/errors"

Expand Down Expand Up @@ -283,3 +284,42 @@ func parseFlagLabels(s []string) (labels.Labels, error) {
sort.Sort(lset)
return lset, nil
}

// goMemLimitConfig carries the command-line settings that control automatic
// GOMEMLIMIT configuration of the Go runtime.
type goMemLimitConfig struct {
	// enableAutoGoMemlimit turns the automatic GOMEMLIMIT setup on or off.
	enableAutoGoMemlimit bool
	// memlimitRatio is the fraction of the detected memory that is handed to
	// the runtime as its limit; it is validated to lie in (0, 1].
	memlimitRatio float64
}

// registerFlag declares the automatic GOMEMLIMIT flags on cmd and binds their
// parsed values to the receiver's fields. The receiver is returned so the call
// can be chained.
func (gml *goMemLimitConfig) registerFlag(cmd extkingpin.FlagClause) *goMemLimitConfig {
	// Opt-in switch for the feature; disabled unless explicitly requested.
	// NOTE(review): the help text names the compact component, but the flag is
	// attached to whatever clause the caller passes in — confirm the wording.
	enableFlag := cmd.Flag(
		"enable-auto-gomemlimit",
		"Enable go runtime to automatically limit memory consumption by compact component. This is an experimental feature.",
	)
	enableFlag.Default("false").BoolVar(&gml.enableAutoGoMemlimit)

	// Share of the detected memory that becomes the runtime limit.
	ratioFlag := cmd.Flag(
		"auto-gomemlimit.ratio",
		"The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory.",
	)
	ratioFlag.Default("0.9").FloatVar(&gml.memlimitRatio)

	return gml
}

// configureGoAutoMemLimit validates the configured ratio and, when the feature
// is enabled, asks the memlimit package to set GOMEMLIMIT for the process.
//
// The ratio is validated even when the feature is disabled, so an invalid
// --auto-gomemlimit.ratio is always reported. It returns an error when the
// ratio is outside (0, 1] (including NaN) or when memlimit fails to apply the
// limit; otherwise it returns nil.
func configureGoAutoMemLimit(common goMemLimitConfig) error {
	// Expressed as a negated in-range check on purpose: every ordered
	// comparison with NaN is false, so the previous
	// "ratio <= 0 || ratio > 1" form let a NaN ratio slip through validation.
	if ratio := common.memlimitRatio; !(ratio > 0.0 && ratio <= 1.0) {
		return errors.New("--auto-gomemlimit.ratio must be greater than 0 and less than or equal to 1.")
	}

	if !common.enableAutoGoMemlimit {
		return nil
	}

	// The limit source is memlimit.FromCgroup with memlimit.FromSystem as the
	// fallback provider. The value returned on success is not needed here.
	_, err := memlimit.SetGoMemLimitWithOpts(
		memlimit.WithRatio(common.memlimitRatio),
		memlimit.WithProvider(
			memlimit.ApplyFallback(
				memlimit.FromCgroup,
				memlimit.FromSystem,
			),
		),
	)
	if err != nil {
		return errors.Wrap(err, "Failed to set GOMEMLIMIT automatically")
	}

	return nil
}
5 changes: 3 additions & 2 deletions cmd/thanos/downsample.go
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/prometheus/tsdb"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/thanos-io/thanos/pkg/compact"

"github.com/thanos-io/objstore"
"github.com/thanos-io/objstore/client"
Expand Down Expand Up @@ -358,7 +359,7 @@ func processDownsampling(

err := block.Download(ctx, logger, bkt, m.ULID, bdir, objstore.WithFetchConcurrency(blockFilesConcurrency))
if err != nil {
return errors.Wrapf(err, "download block %s", m.ULID)
return compact.NewRetryError(errors.Wrapf(err, "download block %s", m.ULID))
}
level.Info(logger).Log("msg", "downloaded block", "id", m.ULID, "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds())

Expand Down Expand Up @@ -419,7 +420,7 @@ func processDownsampling(

err = block.Upload(ctx, logger, bkt, resdir, hashFunc)
if err != nil {
return errors.Wrapf(err, "upload downsampled block %s", id)
return compact.NewRetryError(errors.Wrapf(err, "upload downsampled block %s", id))
}

level.Info(logger).Log("msg", "uploaded block", "id", id, "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds())
Expand Down
9 changes: 9 additions & 0 deletions cmd/thanos/main.go
Expand Up @@ -49,6 +49,10 @@ func main() {
Default(logging.LogFormatLogfmt).Enum(logging.LogFormatLogfmt, logging.LogFormatJSON)
tracingConfig := extkingpin.RegisterCommonTracingFlags(app)

goMemLimitConf := goMemLimitConfig{}

goMemLimitConf.registerFlag(app)

registerSidecar(app)
registerStore(app)
registerQuery(app)
Expand All @@ -61,6 +65,11 @@ func main() {
cmd, setup := app.Parse()
logger := logging.NewLogger(*logLevel, *logFormat, *debugName)

if err := configureGoAutoMemLimit(goMemLimitConf); err != nil {
level.Error(logger).Log("msg", "failed to configure Go runtime memory limits", "err", err)
os.Exit(1)
}

// Running in container with limits but with empty/wrong value of GOMAXPROCS env var could lead to throttling by cpu
// maxprocs will automate adjustment by using cgroups info about cpu limit if it set as value for runtime.GOMAXPROCS.
undo, err := maxprocs.Set(maxprocs.Logger(func(template string, args ...interface{}) {
Expand Down
7 changes: 6 additions & 1 deletion cmd/thanos/rule.go
Expand Up @@ -951,7 +951,12 @@ func queryFuncCreator(
queryAPIClients := grpcEndpointSet.GetQueryAPIClients()
for _, i := range rand.Perm(len(queryAPIClients)) {
e := query.NewRemoteEngine(logger, queryAPIClients[i], query.Opts{})
q, err := e.NewInstantQuery(ctx, nil, qs, t)
expr, err := parser.ParseExpr(qs)
if err != nil {
level.Error(logger).Log("err", err, "query", qs)
continue
}
q, err := e.NewInstantQuery(ctx, nil, expr, t)
if err != nil {
level.Error(logger).Log("err", err, "query", qs)
continue
Expand Down
6 changes: 6 additions & 0 deletions docs/components/compact.md
Expand Up @@ -279,6 +279,9 @@ usage: thanos compact [<flags>]
Continuously compacts blocks in an object store bucket.
Flags:
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--block-discovery-strategy="concurrent"
One of concurrent, recursive. When set to
concurrent, stores will concurrently issue
Expand Down Expand Up @@ -375,6 +378,9 @@ Flags:
non-downsampled data is not efficient and useful
e.g it is not possible to render all samples for
a human eye anyway
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
--hash-func= Specify which hash function to use when
calculating the hashes of produced files.
If no function has been specified, it does not
Expand Down
6 changes: 6 additions & 0 deletions docs/components/query-frontend.md
Expand Up @@ -199,10 +199,16 @@ Query frontend command implements a service deployed in front of queriers to
improve query parallelization and caching.
Flags:
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--cache-compression-type=""
Use compression in results cache.
Supported values are: 'snappy' and '' (disable
compression).
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
-h, --help Show context-sensitive help (also try
--help-long and --help-man).
--http-address="0.0.0.0:10902"
Expand Down
6 changes: 6 additions & 0 deletions docs/components/query.md
Expand Up @@ -294,6 +294,12 @@ Flags:
--alert.query-url=ALERT.QUERY-URL
The external Thanos Query URL that would be set
in all alerts 'Source' field.
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
--endpoint=<endpoint> ... Addresses of statically configured Thanos
API servers (repeatable). The scheme may be
prefixed with 'dns+' or 'dnssrv+' to detect
Expand Down
6 changes: 6 additions & 0 deletions docs/components/receive.md
Expand Up @@ -297,6 +297,12 @@ usage: thanos receive [<flags>]
Accept Prometheus remote write API requests and write to local tsdb.
Flags:
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
--grpc-address="0.0.0.0:10901"
Listen ip:port address for gRPC endpoints
(StoreAPI). Make sure this address is routable
Expand Down
6 changes: 6 additions & 0 deletions docs/components/rule.md
Expand Up @@ -316,7 +316,13 @@ Flags:
lookups. The port defaults to 9093 or the
SRV record's value. The URL path is used as a
prefix for the regular Alertmanager API path.
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--data-dir="data/" data directory
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
--eval-interval=1m The default evaluation interval to use.
--for-grace-period=10m Minimum duration between alert and restored
"for" state. This is maintained only for alerts
Expand Down
6 changes: 6 additions & 0 deletions docs/components/sidecar.md
Expand Up @@ -76,6 +76,12 @@ usage: thanos sidecar [<flags>]
Sidecar for Prometheus server.
Flags:
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
--grpc-address="0.0.0.0:10901"
Listen ip:port address for gRPC endpoints
(StoreAPI). Make sure this address is routable
Expand Down
8 changes: 8 additions & 0 deletions docs/components/store.md
Expand Up @@ -29,6 +29,9 @@ Store node giving access to blocks in a bucket provider. Now supported GCS, S3,
Azure, Swift, Tencent COS and Aliyun OSS.
Flags:
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the
detected maximum container or system memory.
--block-discovery-strategy="concurrent"
One of concurrent, recursive. When set to
concurrent, stores will concurrently issue
Expand Down Expand Up @@ -69,6 +72,9 @@ Flags:
cause the store to read them. For such use
cases use Prometheus + sidecar. Ignored if
--no-cache-index-header option is specified.
--enable-auto-gomemlimit Enable go runtime to automatically limit memory
consumption by compact component. This is an
experimental feature.
--grpc-address="0.0.0.0:10901"
Listen ip:port address for gRPC endpoints
(StoreAPI). Make sure this address is routable
Expand Down Expand Up @@ -375,6 +381,8 @@ While the remaining settings are **optional**:
The `redis` index cache allows to use [Redis](https://redis.io) as cache backend. This cache type is configured using `--index-cache.config-file` to reference the configuration file or `--index-cache.config` to put yaml config directly:

```yaml mdox-exec="go run scripts/cfggen/main.go --name=cacheutil.RedisClientConfig"
type: REDIS
config:
addr: ""
Expand Down

0 comments on commit eab1db8

Please sign in to comment.