112130: cli: add job traces for traceable jobs to the debug zip r=adityamaru a=adityamaru

This change teaches the debug zip to collect the traces for traceable jobs (backup, restore, import, pcr) that are in a running or reverting state at the time the zip is collected. These traces are dumped in a `jobs/<jobID>/<timestamp>/trace.zip` file and rely on the existing `tracing/zipper` that is used by `cockroach debug job-trace` to collect the required information.

Informs: cockroachdb#111886
Release note (cli change): `cockroach debug zip` will now collect the inflight traces of traceable jobs such as backup, restore, import, and c2c, and dump them in a `jobs/` subdirectory in the zip.
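
For illustration, once a bundle has been collected with the new flag (for example `cockroach debug zip --include-running-job-traces debug.zip`, where `debug.zip` is just an example output path), the job traces can be located by listing the archive entries under the `jobs/` subdirectory. A minimal standalone sketch using only the Go standard library; the path layout follows the description above and nothing in this snippet is part of the change itself:

package main

import (
	"archive/zip"
	"fmt"
	"log"
	"strings"
)

func main() {
	// "debug.zip" is an assumed bundle name; use whatever output path was
	// passed to `cockroach debug zip`.
	r, err := zip.OpenReader("debug.zip")
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()
	for _, f := range r.File {
		// Running-job traces land under .../jobs/<jobID>/<timestamp>/trace.zip.
		if strings.Contains(f.Name, "/jobs/") && strings.HasSuffix(f.Name, "/trace.zip") {
			fmt.Println(f.Name)
		}
	}
}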

112624: logictest: disable metamorphic testing for mixed version test r=rafiss a=rafiss

We've seen that these tests can sometimes get quite slow. One theory is that metamorphic test settings are the cause, so this PR disables them to determine whether that helps with test stability.

Informs: cockroachdb#112621
Release note: None

Co-authored-by: adityamaru <adityamaru@gmail.com>
Co-authored-by: Rafi Shamim <rafi@cockroachlabs.com>
3 people committed Oct 18, 2023
3 parents 282ff9a + 0506ef5 + 48eceb7 commit 2cb7c2a
Showing 37 changed files with 371 additions and 134 deletions.
3 changes: 3 additions & 0 deletions pkg/cli/BUILD.bazel
@@ -359,6 +359,7 @@ go_test(
deps = [
"//pkg/base",
"//pkg/build",
"//pkg/ccl/backupccl",
"//pkg/cli/clicfg",
"//pkg/cli/clienturl",
"//pkg/cli/clierror",
@@ -374,6 +375,7 @@ go_test(
"//pkg/gossip",
"//pkg/jobs",
"//pkg/jobs/jobspb",
"//pkg/jobs/jobstest",
"//pkg/keys",
"//pkg/kv",
"//pkg/kv/kvclient/kvtenant",
@@ -402,6 +404,7 @@ go_test(
"//pkg/storage",
"//pkg/testutils",
"//pkg/testutils/datapathutils",
"//pkg/testutils/jobutils",
"//pkg/testutils/listenerutil",
"//pkg/testutils/serverutils",
"//pkg/testutils/skip",
9 changes: 9 additions & 0 deletions pkg/cli/cliflags/flags.go
@@ -1690,6 +1690,15 @@ require any additional stop-the-world operations to be collected.
`,
}

ZipIncludeRunningJobTraces = FlagInfo{
Name: "include-running-job-traces",
Description: `
Include information about each running, traceable job in jobs/*/*/trace.zip
files. This involves collecting cluster-wide traces for each running job in the
cluster.
`,
}

ZipCPUProfileDuration = FlagInfo{
Name: "cpu-profile-duration",
Description: `
4 changes: 2 additions & 2 deletions pkg/cli/clisqlcfg/context.go
@@ -184,7 +184,7 @@ func (c *Context) MakeConn(url string) (clisqlclient.Conn, error) {
// By default, all connections will use the underlying driver to infer
// result types. This should be set back to false for any use case where the
// results are only shown for textual display.
conn.SetAlwaysInferResultTypes(true)
_ = conn.SetAlwaysInferResultTypes(true)

return conn, nil
}
@@ -198,7 +198,7 @@ func (c *Context) Run(ctx context.Context, conn clisqlclient.Conn) error {
// Anything using a SQL shell (e.g. `cockroach sql` or `demo`), only needs
// to show results in text format, so the underlying driver doesn't need to
// infer types.
conn.SetAlwaysInferResultTypes(false)
_ = conn.SetAlwaysInferResultTypes(false)

// Open the connection to make sure everything is OK before running any
// statements. Performs authentication.
5 changes: 3 additions & 2 deletions pkg/cli/clisqlclient/api.go
@@ -70,8 +70,9 @@ type Conn interface {

// SetAlwaysInferResultTypes configures the alwaysInferResultTypes flag, which
// determines if the client should use the underlying driver to infer result
// types.
SetAlwaysInferResultTypes(b bool)
// types. It returns a method that can be used to reset the configuration to
// its previous value.
SetAlwaysInferResultTypes(b bool) func()

// GetServerMetadata returns details about the CockroachDB node
// this connection is connected to.
6 changes: 5 additions & 1 deletion pkg/cli/clisqlclient/conn.go
@@ -164,8 +164,12 @@ func (c *sqlConn) SetMissingPassword(missing bool) {
}

// SetAlwaysInferResultTypes implements the Conn interface.
func (c *sqlConn) SetAlwaysInferResultTypes(b bool) {
func (c *sqlConn) SetAlwaysInferResultTypes(b bool) func() {
oldVal := c.alwaysInferResultTypes
c.alwaysInferResultTypes = b
return func() {
c.alwaysInferResultTypes = oldVal
}
}

// EnsureConn (re-)establishes the connection to the server.
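
The interface change above adopts a common save-and-restore idiom: the setter returns a closure that restores the previous value, which call sites can simply `defer` (as the new `dumpTraceableJobTraces` helper does further down). A minimal standalone sketch of the pattern with hypothetical names, not CockroachDB code:

package main

import "fmt"

// settings stands in for sqlConn's configuration; the names are hypothetical.
type settings struct {
	alwaysInferResultTypes bool
}

// SetAlwaysInferResultTypes flips the flag and returns a closure that restores
// the previous value, mirroring the revised Conn interface above.
func (s *settings) SetAlwaysInferResultTypes(b bool) func() {
	old := s.alwaysInferResultTypes
	s.alwaysInferResultTypes = b
	return func() { s.alwaysInferResultTypes = old }
}

func main() {
	s := &settings{}
	reset := s.SetAlwaysInferResultTypes(true)
	defer reset()
	fmt.Println(s.alwaysInferResultTypes) // true; reset() restores false on return
}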
7 changes: 7 additions & 0 deletions pkg/cli/context.go
@@ -363,6 +363,10 @@ type zipContext struct {
// latency.
includeStacks bool

// includeRunningJobTraces includes the active traces of each running
// Traceable job in individual jobs/*/*/trace.zip files.
includeRunningJobTraces bool

// The log/heap/etc files to include.
files fileSelection
}
@@ -381,6 +385,9 @@ func setZipContextDefaults() {
// Goroutine stack dumps require a "stop the world" operation on the server side,
// which impacts performance and SQL service latency.
zipCtx.includeStacks = true
// Collecting job traces for running Traceable jobs involves fetching
// cluster-wide traces for each job.
zipCtx.includeRunningJobTraces = false
zipCtx.cpuProfDuration = 5 * time.Second
zipCtx.concurrency = 15

1 change: 1 addition & 0 deletions pkg/cli/debug_job_trace_test.go
@@ -77,6 +77,7 @@ func (r *traceSpanResumer) CollectProfile(_ context.Context, _ interface{}) erro
func TestDebugJobTrace(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
defer jobs.ResetConstructors()()

ctx := context.Background()
argsFn := func(args *base.TestServerArgs) {
1 change: 1 addition & 0 deletions pkg/cli/flags.go
@@ -702,6 +702,7 @@ func init() {
cliflagcfg.IntFlag(f, &zipCtx.concurrency, cliflags.ZipConcurrency)
cliflagcfg.BoolFlag(f, &zipCtx.includeRangeInfo, cliflags.ZipIncludeRangeInfo)
cliflagcfg.BoolFlag(f, &zipCtx.includeStacks, cliflags.ZipIncludeGoroutineStacks)
cliflagcfg.BoolFlag(f, &zipCtx.includeRunningJobTraces, cliflags.ZipIncludeRunningJobTraces)
}
// List-files + Zip commands.
for _, cmd := range []*cobra.Command{debugZipCmd, debugListFilesCmd} {
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1
@@ -263,3 +263,4 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1_excluded
@@ -166,3 +166,4 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial2
@@ -165,3 +165,4 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip
@@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_concurrent
@@ -1,5 +1,6 @@
zip
----
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
[cluster] creating output file /dev/null...
[cluster] creating output file /dev/null: done
[cluster] discovering virtual clusters...
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks
@@ -125,4 +125,5 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-goroutine-stacks=f
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
[cluster] NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle. Use the --include-goroutine-stacks flag to enable the fetching of this data.
@@ -161,3 +161,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_include_goroutine_stacks
@@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_include_range_info
@@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info /dev/nu
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_shared_process_virtualization
@@ -286,3 +286,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
@@ -286,3 +286,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
105 changes: 104 additions & 1 deletion pkg/cli/zip.go
@@ -12,6 +12,7 @@ package cli

import (
"context"
"database/sql/driver"
"fmt"
"io"
"net"
@@ -23,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cli/cliflags"
"github.com/cockroachdb/cockroach/pkg/cli/clisqlclient"
"github.com/cockroachdb/cockroach/pkg/cli/clisqlexec"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/profiler"
@@ -31,7 +33,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
tracezipper "github.com/cockroachdb/cockroach/pkg/util/tracing/zipper"
"github.com/cockroachdb/errors"
"github.com/jackc/pgconn"
"github.com/marusama/semaphore"
@@ -253,7 +259,7 @@ func runDebugZip(cmd *cobra.Command, args []string) (retErr error) {
sqlConn, err := makeTenantSQLClient(ctx, "cockroach zip", useSystemDb, tenant.TenantName)
// The zip output is sent directly into a text file, so the results should
// be scanned into strings.
sqlConn.SetAlwaysInferResultTypes(false)
_ = sqlConn.SetAlwaysInferResultTypes(false)
if err != nil {
_ = s.fail(errors.Wrap(err, "unable to open a SQL session. Debug information will be incomplete"))
} else {
@@ -344,6 +350,12 @@ done
}
}

if !zipCtx.includeRunningJobTraces {
zr.info("NOTE: Omitted traces of running jobs from this debug zip bundle." +
" Use the --" + cliflags.ZipIncludeRunningJobTraces.Name + " flag to enable the fetching of this" +
" data.")
}

if !zipCtx.includeStacks {
zr.info("NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle." +
" Use the --" + cliflags.ZipIncludeGoroutineStacks.Name + " flag to enable the fetching of this" +
@@ -377,6 +389,97 @@ func maybeAddProfileSuffix(name string) string {
return name
}

type jobTrace struct {
jobID jobspb.JobID
traceID tracingpb.TraceID
}

// dumpTraceableJobTraces collects the traces for "traceable" jobs that are in
// a running or reverting state. The job types in this list are the ones that
// have explicitly implemented the TraceableJob interface.
func (zc *debugZipContext) dumpTraceableJobTraces() error {
ctx := context.Background()
rows, err := zc.firstNodeSQLConn.Query(ctx,
`WITH
latestprogress AS (
SELECT job_id, value
FROM system.job_info AS progress
WHERE info_key = 'legacy_progress'
ORDER BY written desc
),
jobpage AS (
SELECT id
FROM system.jobs@jobs_status_created_idx
WHERE (job_type IN ($1, $2, $3, $4)) AND (status IN ($5, $6))
ORDER BY id
)
SELECT distinct (id), latestprogress.value AS progress
FROM jobpage AS j
INNER JOIN latestprogress ON j.id = latestprogress.job_id;`,
jobspb.TypeBackup.String(),
jobspb.TypeRestore.String(),
jobspb.TypeImport.String(),
jobspb.TypeReplicationStreamIngestion.String(),
"running",
"reverting",
)
if err != nil {
return err
}
defer func() {
if rows != nil {
if err := rows.Close(); err != nil {
log.Warningf(ctx, "failed to close job rows with error: %v", err)
}
}
}()
vals := make([]driver.Value, 2)
jobTraces := make([]jobTrace, 0)
for err = rows.Next(vals); err == nil; err = rows.Next(vals) {
jobID, ok := vals[0].(int64)
if !ok {
return errors.New("failed to parse jobID")
}
progressBytes, ok := vals[1].([]byte)
if !ok {
return errors.New("failed to parse progress bytes")
}
progress := &jobspb.Progress{}
if err := protoutil.Unmarshal(progressBytes, progress); err != nil {
return err
}
jobTraces = append(jobTraces, jobTrace{jobID: jobspb.JobID(jobID), traceID: progress.TraceID})
}

func() {
// Debug zip collection sets this to false since results from the query are
// all dumped into txt files. In our case we parse the results of the query
// with their respective types and pre-process the information before
// dumping into a zip file.
reset := zc.firstNodeSQLConn.SetAlwaysInferResultTypes(true)
defer reset()
for _, jobTrace := range jobTraces {
inflightTraceZipper := tracezipper.MakeSQLConnInflightTraceZipper(zc.firstNodeSQLConn.GetDriverConn())
jobZip, err := inflightTraceZipper.Zip(ctx, int64(jobTrace.traceID))
if err != nil {
log.Warningf(ctx, "failed to collect inflight trace zip for job %d: %v", jobTrace.jobID, err)
continue
}

ts := timeutil.Now().Format(`20060102150405`)
name := fmt.Sprintf("%s/jobs/%d/%s/trace.zip", zc.prefix, jobTrace.jobID, ts)
s := zc.clusterPrinter.start("requesting traces for job %d", jobTrace.jobID)
if err := zc.z.createRaw(s, name, jobZip); err != nil {
log.Warningf(ctx, "failed to write inflight trace zip for job %d to file %s: %v",
jobTrace.jobID, name, err)
continue
}
}
}()

return nil
}

// dumpTableDataForZip runs the specified SQL query and stores the
// result. Errors encountered while running the SQL query are stored
// in an error file in the zip file, and dumpTableDataForZip() returns
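
A note on the `20060102150405` literal used to build the per-job trace path above: it is Go's reference-time layout, so each trace is written under a directory named with a compact `yyyymmddhhmmss` timestamp. A tiny standalone illustration (the fixed date is chosen only to make the output deterministic):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Go formats times against the reference time Mon Jan 2 15:04:05 2006,
	// so this layout yields a compact yyyymmddhhmmss string.
	t := time.Date(2023, time.October, 18, 15, 4, 5, 0, time.UTC)
	fmt.Println(t.Format("20060102150405")) // 20231018150405
}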
7 changes: 7 additions & 0 deletions pkg/cli/zip_cluster_wide.go
@@ -188,5 +188,12 @@ func (zc *debugZipContext) collectClusterData(
}
}

if zipCtx.includeRunningJobTraces {
zc.clusterPrinter.info("collecting the inflight traces for jobs")
if err := zc.dumpTraceableJobTraces(); err != nil {
return &serverpb.NodesListResponse{}, nil, err
}
}

return nodesList, livenessByNodeID, nil
}
