diff --git a/pkg/cli/BUILD.bazel b/pkg/cli/BUILD.bazel index 29aca1fc5388..f82b76c73e3c 100644 --- a/pkg/cli/BUILD.bazel +++ b/pkg/cli/BUILD.bazel @@ -359,6 +359,7 @@ go_test( deps = [ "//pkg/base", "//pkg/build", + "//pkg/ccl/backupccl", "//pkg/cli/clicfg", "//pkg/cli/clienturl", "//pkg/cli/clierror", @@ -374,6 +375,7 @@ go_test( "//pkg/gossip", "//pkg/jobs", "//pkg/jobs/jobspb", + "//pkg/jobs/jobstest", "//pkg/keys", "//pkg/kv", "//pkg/kv/kvclient/kvtenant", @@ -402,6 +404,7 @@ go_test( "//pkg/storage", "//pkg/testutils", "//pkg/testutils/datapathutils", + "//pkg/testutils/jobutils", "//pkg/testutils/listenerutil", "//pkg/testutils/serverutils", "//pkg/testutils/skip", diff --git a/pkg/cli/cliflags/flags.go b/pkg/cli/cliflags/flags.go index 683936383781..cd2b82d1b23a 100644 --- a/pkg/cli/cliflags/flags.go +++ b/pkg/cli/cliflags/flags.go @@ -1690,6 +1690,15 @@ require any additional stop-the-world operations to be collected. `, } + ZipIncludeRunningJobTraces = FlagInfo{ + Name: "include-running-job-traces", + Description: ` +Include information about each running, traceable job in jobs/*/*/trace.zip +files. This involves collecting cluster-wide traces for each running job in the +cluster. +`, + } + ZipCPUProfileDuration = FlagInfo{ Name: "cpu-profile-duration", Description: ` diff --git a/pkg/cli/clisqlcfg/context.go b/pkg/cli/clisqlcfg/context.go index 01d7e7a77ae8..9cfb86d3433a 100644 --- a/pkg/cli/clisqlcfg/context.go +++ b/pkg/cli/clisqlcfg/context.go @@ -184,7 +184,7 @@ func (c *Context) MakeConn(url string) (clisqlclient.Conn, error) { // By default, all connections will use the underlying driver to infer // result types. This should be set back to false for any use case where the // results are only shown for textual display. - conn.SetAlwaysInferResultTypes(true) + _ = conn.SetAlwaysInferResultTypes(true) return conn, nil } @@ -198,7 +198,7 @@ func (c *Context) Run(ctx context.Context, conn clisqlclient.Conn) error { // Anything using a SQL shell (e.g. `cockroach sql` or `demo`), only needs // to show results in text format, so the underlying driver doesn't need to // infer types. - conn.SetAlwaysInferResultTypes(false) + _ = conn.SetAlwaysInferResultTypes(false) // Open the connection to make sure everything is OK before running any // statements. Performs authentication. diff --git a/pkg/cli/clisqlclient/api.go b/pkg/cli/clisqlclient/api.go index 32519bbcb0d6..b7828744ee85 100644 --- a/pkg/cli/clisqlclient/api.go +++ b/pkg/cli/clisqlclient/api.go @@ -70,8 +70,9 @@ type Conn interface { // SetAlwaysInferResultTypes configures the alwaysInferResultTypes flag, which // determines if the client should use the underlying driver to infer result - // types. - SetAlwaysInferResultTypes(b bool) + // types. It returns a method that can be used to reset the configuration to + // its previous value. + SetAlwaysInferResultTypes(b bool) func() // GetServerMetadata returns details about the CockroachDB node // this connection is connected to. diff --git a/pkg/cli/clisqlclient/conn.go b/pkg/cli/clisqlclient/conn.go index e10a45eeb7b4..1247345fe6d9 100644 --- a/pkg/cli/clisqlclient/conn.go +++ b/pkg/cli/clisqlclient/conn.go @@ -164,8 +164,12 @@ func (c *sqlConn) SetMissingPassword(missing bool) { } // SetAlwaysInferResultTypes implements the Conn interface. -func (c *sqlConn) SetAlwaysInferResultTypes(b bool) { +func (c *sqlConn) SetAlwaysInferResultTypes(b bool) func() { + oldVal := c.alwaysInferResultTypes c.alwaysInferResultTypes = b + return func() { + c.alwaysInferResultTypes = oldVal + } } // EnsureConn (re-)establishes the connection to the server. diff --git a/pkg/cli/context.go b/pkg/cli/context.go index c33ae90e8975..021140e3b779 100644 --- a/pkg/cli/context.go +++ b/pkg/cli/context.go @@ -363,6 +363,10 @@ type zipContext struct { // latency. includeStacks bool + // includeRunningJobTraces includes the active traces of each running + // Traceable job in individual jobs/*/ranges/trace.zip files. + includeRunningJobTraces bool + // The log/heap/etc files to include. files fileSelection } @@ -381,6 +385,9 @@ func setZipContextDefaults() { // Goroutine stack dumps require a "stop the world" operation on the server side, // which impacts performance and SQL service latency. zipCtx.includeStacks = true + // Job traces for running Traceable jobs involves fetching cluster wide traces + // for each job. + zipCtx.includeRunningJobTraces = false zipCtx.cpuProfDuration = 5 * time.Second zipCtx.concurrency = 15 diff --git a/pkg/cli/debug_job_trace_test.go b/pkg/cli/debug_job_trace_test.go index cd02e98f3c51..7b03a7a6a69e 100644 --- a/pkg/cli/debug_job_trace_test.go +++ b/pkg/cli/debug_job_trace_test.go @@ -77,6 +77,7 @@ func (r *traceSpanResumer) CollectProfile(_ context.Context, _ interface{}) erro func TestDebugJobTrace(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + defer jobs.ResetConstructors()() ctx := context.Background() argsFn := func(args *base.TestServerArgs) { diff --git a/pkg/cli/flags.go b/pkg/cli/flags.go index b93b372d2515..8a48741510f6 100644 --- a/pkg/cli/flags.go +++ b/pkg/cli/flags.go @@ -702,6 +702,7 @@ func init() { cliflagcfg.IntFlag(f, &zipCtx.concurrency, cliflags.ZipConcurrency) cliflagcfg.BoolFlag(f, &zipCtx.includeRangeInfo, cliflags.ZipIncludeRangeInfo) cliflagcfg.BoolFlag(f, &zipCtx.includeStacks, cliflags.ZipIncludeGoroutineStacks) + cliflagcfg.BoolFlag(f, &zipCtx.includeRunningJobTraces, cliflags.ZipIncludeRunningJobTraces) } // List-files + Zip commands. for _, cmd := range []*cobra.Command{debugZipCmd, debugListFilesCmd} { diff --git a/pkg/cli/testdata/zip/partial1 b/pkg/cli/testdata/zip/partial1 index 014309a8e748..205ee46c296b 100644 --- a/pkg/cli/testdata/zip/partial1 +++ b/pkg/cli/testdata/zip/partial1 @@ -263,3 +263,4 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/partial1_excluded b/pkg/cli/testdata/zip/partial1_excluded index a1cfd380abf6..3e986c5bae64 100644 --- a/pkg/cli/testdata/zip/partial1_excluded +++ b/pkg/cli/testdata/zip/partial1_excluded @@ -166,3 +166,4 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/partial2 b/pkg/cli/testdata/zip/partial2 index d6ddc4bd52bb..7e810329edf0 100644 --- a/pkg/cli/testdata/zip/partial2 +++ b/pkg/cli/testdata/zip/partial2 @@ -165,3 +165,4 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip b/pkg/cli/testdata/zip/testzip index b3e8fff2cc5b..cbc363a36831 100644 --- a/pkg/cli/testdata/zip/testzip +++ b/pkg/cli/testdata/zip/testzip @@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_concurrent b/pkg/cli/testdata/zip/testzip_concurrent index 0e420180df2f..846957639693 100644 --- a/pkg/cli/testdata/zip/testzip_concurrent +++ b/pkg/cli/testdata/zip/testzip_concurrent @@ -1,5 +1,6 @@ zip ---- +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. [cluster] creating output file /dev/null... [cluster] creating output file /dev/null: done [cluster] discovering virtual clusters... diff --git a/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks b/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks index 0e38d6616a69..70824f9df00f 100644 --- a/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks +++ b/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks @@ -125,4 +125,5 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-goroutine-stacks=f [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. [cluster] NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle. Use the --include-goroutine-stacks flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_external_process_virtualization b/pkg/cli/testdata/zip/testzip_external_process_virtualization index 184a175a2429..8a529bb04db0 100644 --- a/pkg/cli/testdata/zip/testzip_external_process_virtualization +++ b/pkg/cli/testdata/zip/testzip_external_process_virtualization @@ -161,3 +161,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_include_goroutine_stacks b/pkg/cli/testdata/zip/testzip_include_goroutine_stacks index b3e8fff2cc5b..cbc363a36831 100644 --- a/pkg/cli/testdata/zip/testzip_include_goroutine_stacks +++ b/pkg/cli/testdata/zip/testzip_include_goroutine_stacks @@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_include_range_info b/pkg/cli/testdata/zip/testzip_include_range_info index 75d1d8f92a53..44aa6093f060 100644 --- a/pkg/cli/testdata/zip/testzip_include_range_info +++ b/pkg/cli/testdata/zip/testzip_include_range_info @@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info /dev/nu [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_shared_process_virtualization b/pkg/cli/testdata/zip/testzip_shared_process_virtualization index 5aaad4c2caa5..aa484a72b4ba 100644 --- a/pkg/cli/testdata/zip/testzip_shared_process_virtualization +++ b/pkg/cli/testdata/zip/testzip_shared_process_virtualization @@ -286,3 +286,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant b/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant index 5aaad4c2caa5..aa484a72b4ba 100644 --- a/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant +++ b/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant @@ -286,3 +286,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/zip.go b/pkg/cli/zip.go index 352b1cef55ef..18c32f896a15 100644 --- a/pkg/cli/zip.go +++ b/pkg/cli/zip.go @@ -12,6 +12,7 @@ package cli import ( "context" + "database/sql/driver" "fmt" "io" "net" @@ -23,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cli/cliflags" "github.com/cockroachdb/cockroach/pkg/cli/clisqlclient" "github.com/cockroachdb/cockroach/pkg/cli/clisqlexec" + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server/profiler" @@ -31,7 +33,11 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants" "github.com/cockroachdb/cockroach/pkg/util/buildutil" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + tracezipper "github.com/cockroachdb/cockroach/pkg/util/tracing/zipper" "github.com/cockroachdb/errors" "github.com/jackc/pgconn" "github.com/marusama/semaphore" @@ -253,7 +259,7 @@ func runDebugZip(cmd *cobra.Command, args []string) (retErr error) { sqlConn, err := makeTenantSQLClient(ctx, "cockroach zip", useSystemDb, tenant.TenantName) // The zip output is sent directly into a text file, so the results should // be scanned into strings. - sqlConn.SetAlwaysInferResultTypes(false) + _ = sqlConn.SetAlwaysInferResultTypes(false) if err != nil { _ = s.fail(errors.Wrap(err, "unable to open a SQL session. Debug information will be incomplete")) } else { @@ -344,6 +350,12 @@ done } } + if !zipCtx.includeRunningJobTraces { + zr.info("NOTE: Omitted traces of running jobs from this debug zip bundle." + + " Use the --" + cliflags.ZipIncludeRunningJobTraces.Name + " flag to enable the fetching of this" + + " data.") + } + if !zipCtx.includeStacks { zr.info("NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle." + " Use the --" + cliflags.ZipIncludeGoroutineStacks.Name + " flag to enable the fetching of this" + @@ -377,6 +389,97 @@ func maybeAddProfileSuffix(name string) string { return name } +type jobTrace struct { + jobID jobspb.JobID + traceID tracingpb.TraceID +} + +// dumpTraceableJobTraces collects the traces for some "traceable" jobs that are +// in a running state. The job types in this list are the ones that have +// explicitly implemented the TraceableJob interface. +func (zc *debugZipContext) dumpTraceableJobTraces() error { + ctx := context.Background() + rows, err := zc.firstNodeSQLConn.Query(ctx, + `WITH +latestprogress AS ( + SELECT job_id, value + FROM system.job_info AS progress + WHERE info_key = 'legacy_progress' + ORDER BY written desc +), +jobpage AS ( + SELECT id + FROM system.jobs@jobs_status_created_idx + WHERE (job_type IN ($1, $2, $3, $4)) AND (status IN ($5, $6)) + ORDER BY id +) +SELECT distinct (id), latestprogress.value AS progress +FROM jobpage AS j +INNER JOIN latestprogress ON j.id = latestprogress.job_id;`, + jobspb.TypeBackup.String(), + jobspb.TypeRestore.String(), + jobspb.TypeImport.String(), + jobspb.TypeReplicationStreamIngestion.String(), + "running", + "reverting", + ) + if err != nil { + return err + } + defer func() { + if rows != nil { + if err := rows.Close(); err != nil { + log.Warningf(ctx, "failed to close with error: %v", err) + } + } + }() + vals := make([]driver.Value, 2) + jobTraces := make([]jobTrace, 0) + for err = rows.Next(vals); err == nil; err = rows.Next(vals) { + jobID, ok := vals[0].(int64) + if !ok { + return errors.New("failed to parse jobID") + } + progressBytes, ok := vals[1].([]byte) + if !ok { + return errors.New("failed to parse progress bytes") + } + progress := &jobspb.Progress{} + if err := protoutil.Unmarshal(progressBytes, progress); err != nil { + return err + } + jobTraces = append(jobTraces, jobTrace{jobID: jobspb.JobID(jobID), traceID: progress.TraceID}) + } + + func() { + // Debug zip collection sets this to false since results from the query are + // all dumped into txt files. In our case we parse the results of the query + // with their respective types and pre-process the information before + // dumping into a zip file. + reset := zc.firstNodeSQLConn.SetAlwaysInferResultTypes(true) + defer reset() + for _, jobTrace := range jobTraces { + inflightTraceZipper := tracezipper.MakeSQLConnInflightTraceZipper(zc.firstNodeSQLConn.GetDriverConn()) + jobZip, err := inflightTraceZipper.Zip(ctx, int64(jobTrace.traceID)) + if err != nil { + log.Warningf(ctx, "failed to collect inflight trace zip for job %d: %v", jobTrace.jobID, err) + continue + } + + ts := timeutil.Now().Format(`20060102150405`) + name := fmt.Sprintf("%s/jobs/%d/%s/trace.zip", zc.prefix, jobTrace.jobID, ts) + s := zc.clusterPrinter.start("requesting traces for job %d", jobTrace.jobID) + if err := zc.z.createRaw(s, name, jobZip); err != nil { + log.Warningf(ctx, "failed to write inflight trace zip for job %d to file %s: %v", + jobTrace.jobID, name, err) + continue + } + } + }() + + return nil +} + // dumpTableDataForZip runs the specified SQL query and stores the // result. Errors encountered while running the SQL query are stored // in an error file in the zip file, and dumpTableDataForZip() returns diff --git a/pkg/cli/zip_cluster_wide.go b/pkg/cli/zip_cluster_wide.go index 660514cabe72..68ab826b748f 100644 --- a/pkg/cli/zip_cluster_wide.go +++ b/pkg/cli/zip_cluster_wide.go @@ -188,5 +188,12 @@ func (zc *debugZipContext) collectClusterData( } } + if zipCtx.includeRunningJobTraces { + zc.clusterPrinter.info("collecting the inflight traces for jobs") + if err := zc.dumpTraceableJobTraces(); err != nil { + return &serverpb.NodesListResponse{}, nil, err + } + } + return nodesList, livenessByNodeID, nil } diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go index 951299d44066..ad001f0b1cf6 100644 --- a/pkg/cli/zip_test.go +++ b/pkg/cli/zip_test.go @@ -28,17 +28,24 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" + _ "github.com/cockroachdb/cockroach/pkg/ccl/backupccl" + "github.com/cockroachdb/cockroach/pkg/jobs" + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security/username" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/datapathutils" + "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/skip" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -46,6 +53,7 @@ import ( "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) // TestZipContainsAllInternalTables verifies that we don't add new internal tables @@ -870,3 +878,106 @@ func TestNodeRangeSelection(t *testing.T) { } } } + +func TestZipJobTrace(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + defer jobs.ResetConstructors()() + + s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{ + Insecure: true, + Knobs: base.TestingKnobs{ + JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(), + }, + }) + defer s.Stopper().Stop(context.Background()) + blockCh := make(chan struct{}) + jobs.RegisterConstructor(jobspb.TypeImport, + func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { + return jobstest.FakeResumer{ + OnResume: func(ctx context.Context) error { + <-blockCh + return nil + }, + } + }, jobs.UsesTenantCostControl) + runner := sqlutils.MakeSQLRunner(sqlDB) + dir, cleanupFn := testutils.TempDir(t) + defer cleanupFn() + zipName := filepath.Join(dir, "test.zip") + + // Run a backup that completes, we should not see a trace for this job. + runner.Exec(t, `CREATE TABLE foo (id INT)`) + runner.Exec(t, `BACKUP TABLE foo INTO 'userfile:///completes'`) + + // Run a restore that completes, we should not see a trace for this job. + runner.Exec(t, `CREATE DATABASE test`) + runner.Exec(t, `RESTORE TABLE foo FROM LATEST IN 'userfile:///completes' WITH into_db = 'test'`) + + triggerJobAndWaitForRun := func(jobQuery string) jobspb.JobID { + var jobID jobspb.JobID + runner.QueryRow(t, jobQuery).Scan(&jobID) + jobutils.WaitForJobToRun(t, runner, jobID) + return jobID + } + + sqlURL := url.URL{ + Scheme: "postgres", + User: url.User(username.RootUser), + Host: s.AdvSQLAddr(), + RawQuery: "sslmode=disable", + } + sqlConn := sqlConnCtx.MakeSQLConn(io.Discard, io.Discard, sqlURL.String()) + defer func() { + if err := sqlConn.Close(); err != nil { + t.Fatal(err) + } + }() + + var expectedFilesList strings.Builder + var importJobID jobspb.JobID + var importJobID2 jobspb.JobID + func() { + out, err := os.Create(zipName) + if err != nil { + t.Fatal(err) + } + z := newZipper(out) + defer func() { + if err := z.close(); err != nil { + t.Fatal(err) + } + }() + + runner.Exec(t, `CREATE TABLE x (id INT PRIMARY KEY, n INT, s STRING)`) + importJobID = triggerJobAndWaitForRun(`IMPORT INTO x CSV DATA ('workload:///csv/bank/bank?rows=100&version=1.0.0') WITH detached`) + importJobID2 = triggerJobAndWaitForRun(`IMPORT INTO x CSV DATA ('workload:///csv/bank/bank?rows=100&version=1.0.0') WITH detached`) + expectedFilesList.WriteString(fmt.Sprintf("/jobs/%d/.*/trace.zip\n", importJobID)) + expectedFilesList.WriteString(fmt.Sprintf("/jobs/%d/.*/trace.zip\n", importJobID2)) + + zr := zipCtx.newZipReporter("test") + zc := debugZipContext{ + z: z, + clusterPrinter: zr, + timeout: 3 * time.Second, + firstNodeSQLConn: sqlConn, + } + if err := zc.dumpTraceableJobTraces(); err != nil { + t.Fatal(err) + } + }() + + r, err := zip.OpenReader(zipName) + if err != nil { + t.Fatal(err) + } + defer func() { _ = r.Close() }() + var fileList strings.Builder + for _, f := range r.File { + fmt.Fprintln(&fileList, f.Name) + } + require.Regexp(t, expectedFilesList.String(), fileList.String()) + close(blockCh) + jobutils.WaitForJobToSucceed(t, runner, importJobID) + jobutils.WaitForJobToSucceed(t, runner, importJobID2) +} diff --git a/pkg/jobs/BUILD.bazel b/pkg/jobs/BUILD.bazel index 879cd4b7c940..2f360a87ef02 100644 --- a/pkg/jobs/BUILD.bazel +++ b/pkg/jobs/BUILD.bazel @@ -41,7 +41,6 @@ go_library( "//pkg/scheduledjobs", "//pkg/security/username", "//pkg/server/telemetry", - "//pkg/server/tracedumper", "//pkg/settings", "//pkg/settings/cluster", "//pkg/sql/catalog", diff --git a/pkg/jobs/delegate_control_test.go b/pkg/jobs/delegate_control_test.go index ef68b09bab98..4dc8516a2656 100644 --- a/pkg/jobs/delegate_control_test.go +++ b/pkg/jobs/delegate_control_test.go @@ -159,7 +159,7 @@ func TestJobsControlForSchedules(t *testing.T) { // As such, the job does not undergo usual job state transitions // (e.g. pause-request -> paused). RegisterConstructor(jobspb.TypeImport, func(job *Job, _ *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(_ context.Context) error { <-blockResume return nil @@ -273,7 +273,7 @@ func TestFilterJobsControlForSchedules(t *testing.T) { // Our resume never completes any jobs, until this test completes. RegisterConstructor(jobspb.TypeImport, func(job *Job, _ *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(_ context.Context) error { <-blockResume return nil @@ -406,7 +406,7 @@ func TestJobControlByType(t *testing.T) { // Make the jobs of each type controllable. for _, jobType := range allJobTypes { RegisterConstructor(jobType, func(job *Job, _ *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { <-ctx.Done() return nil diff --git a/pkg/jobs/helpers_test.go b/pkg/jobs/helpers_test.go index 6904a448e592..8144b712175a 100644 --- a/pkg/jobs/helpers_test.go +++ b/pkg/jobs/helpers_test.go @@ -19,62 +19,9 @@ import ( "github.com/cockroachdb/errors" ) -// FakeResumer calls optional callbacks during the job lifecycle. -type FakeResumer struct { - OnResume func(context.Context) error - FailOrCancel func(context.Context) error - Success func() error - PauseRequest onPauseRequestFunc - TraceRealSpan bool -} - -func (d FakeResumer) ForceRealSpan() bool { - return d.TraceRealSpan -} - -func (d FakeResumer) DumpTraceAfterRun() bool { - return true -} - -var _ Resumer = FakeResumer{} - -func (d FakeResumer) Resume(ctx context.Context, execCtx interface{}) error { - if d.OnResume != nil { - if err := d.OnResume(ctx); err != nil { - return err - } - } - if d.Success != nil { - return d.Success() - } - return nil -} - -func (d FakeResumer) OnFailOrCancel(ctx context.Context, _ interface{}, _ error) error { - if d.FailOrCancel != nil { - return d.FailOrCancel(ctx) - } - return nil -} - -func (d FakeResumer) CollectProfile(_ context.Context, _ interface{}) error { - return nil -} - // OnPauseRequestFunc forwards the definition for use in tests. type OnPauseRequestFunc = onPauseRequestFunc -var _ PauseRequester = FakeResumer{} - -func (d FakeResumer) OnPauseRequest( - ctx context.Context, execCtx interface{}, txn isql.Txn, details *jobspb.Progress, -) error { - if d.PauseRequest == nil { - return nil - } - return d.PauseRequest(ctx, execCtx, txn, details) -} - func (r *Registry) CancelRequested(ctx context.Context, txn isql.Txn, id jobspb.JobID) error { return r.cancelRequested(ctx, txn, id) } diff --git a/pkg/jobs/jobs_test.go b/pkg/jobs/jobs_test.go index ccffa3f72e06..924b914ec1e0 100644 --- a/pkg/jobs/jobs_test.go +++ b/pkg/jobs/jobs_test.go @@ -29,6 +29,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/keyvisualizer" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb" @@ -207,7 +208,7 @@ func noopPauseRequestFunc( return nil } -var _ jobs.TraceableJob = (*jobs.FakeResumer)(nil) +var _ jobs.TraceableJob = (*jobstest.FakeResumer)(nil) func (rts *registryTestSuite) setUp(t *testing.T) { rts.ctx = context.Background() @@ -261,7 +262,7 @@ func (rts *registryTestSuite) setUp(t *testing.T) { rts.onPauseRequest = noopPauseRequestFunc jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ TraceRealSpan: rts.traceRealSpan, OnResume: func(ctx context.Context) error { t.Log("Starting resume") @@ -1151,7 +1152,7 @@ func TestRegistryLifecycle(t *testing.T) { resumerJob := make(chan *jobs.Job, 1) jobs.RegisterConstructor( jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumerJob <- j return nil @@ -2248,7 +2249,7 @@ func TestShowJobWhenComplete(t *testing.T) { defer close(done) jobs.RegisterConstructor( jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { select { case <-ctx.Done(): @@ -2412,7 +2413,7 @@ func TestJobInTxn(t *testing.T) { }, ) jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { t.Logf("Resuming job: %+v", job.Payload()) atomic.AddInt32(&hasRun, 1) @@ -2452,7 +2453,7 @@ func TestJobInTxn(t *testing.T) { }, ) jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(_ context.Context) error { return errors.New("RESTORE failed") }, @@ -2550,7 +2551,7 @@ func TestStartableJobMixedVersion(t *testing.T) { require.NoError(t, err) jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{} + return jobstest.FakeResumer{} }, jobs.UsesTenantCostControl) var j *jobs.StartableJob jobID := jr.MakeJobID() @@ -2591,7 +2592,7 @@ func TestStartableJob(t *testing.T) { return func() { resumeFunc.Store(prev) } } jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return resumeFunc.Load().(func(ctx context.Context) error)(ctx) }, @@ -2777,7 +2778,7 @@ func TestStartableJobTxnRetry(t *testing.T) { defer s.Stopper().Stop(ctx) jr := s.JobRegistry().(*jobs.Registry) jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{} + return jobstest.FakeResumer{} }, jobs.UsesTenantCostControl) rec := jobs.Record{ Details: jobspb.RestoreDetails{}, @@ -2819,7 +2820,7 @@ func TestRegistryTestingNudgeAdoptionQueue(t *testing.T) { defer jobs.ResetConstructors()() resuming := make(chan struct{}) jobs.RegisterConstructor(jobspb.TypeBackup, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resuming <- struct{}{} return nil @@ -2905,7 +2906,7 @@ func TestMetrics(t *testing.T) { fakeBackupMetrics := makeFakeMetrics() jobs.RegisterConstructor(jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { defer fakeBackupMetrics.N.Inc(1) return waitForErr(ctx) @@ -2919,7 +2920,7 @@ func TestMetrics(t *testing.T) { ) jobs.RegisterConstructor(jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return waitForErr(ctx) }, @@ -3139,7 +3140,7 @@ func TestLoseLeaseDuringExecution(t *testing.T) { defer jobs.ResetConstructors()() resumed := make(chan error, 1) jobs.RegisterConstructor(jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { defer close(resumed) _, err := s.InternalExecutor().(isql.Executor).Exec( @@ -3209,7 +3210,7 @@ func TestPauseReason(t *testing.T) { defer close(done) resumeSignaler := newResumeStartedSignaler() jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumeSignaler.SignalResumeStarted() select { @@ -3464,7 +3465,7 @@ func TestPausepoints(t *testing.T) { defer s.Stopper().Stop(ctx) idb := s.InternalDB().(isql.DB) jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { if err := registry.CheckPausepoint("test_pause_foo"); err != nil { return err @@ -3604,7 +3605,7 @@ func TestJobTypeMetrics(t *testing.T) { for typ := range typeToRecord { jobs.RegisterConstructor(typ, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { <-ctx.Done() return ctx.Err() diff --git a/pkg/jobs/jobstest/BUILD.bazel b/pkg/jobs/jobstest/BUILD.bazel index 00d4a20af21c..cfbf48b6c574 100644 --- a/pkg/jobs/jobstest/BUILD.bazel +++ b/pkg/jobs/jobstest/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "jobstest", srcs = [ "logutils.go", + "resumer.go", "utils.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/jobs/jobstest", @@ -13,6 +14,7 @@ go_library( "//pkg/jobs/jobspb", "//pkg/scheduledjobs", "//pkg/sql/catalog/systemschema", + "//pkg/sql/isql", "//pkg/sql/sem/tree", "//pkg/testutils", "//pkg/util/log", diff --git a/pkg/jobs/jobstest/resumer.go b/pkg/jobs/jobstest/resumer.go new file mode 100644 index 000000000000..090d44553755 --- /dev/null +++ b/pkg/jobs/jobstest/resumer.go @@ -0,0 +1,67 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package jobstest + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/sql/isql" +) + +// FakeResumer calls optional callbacks during the job lifecycle. +type FakeResumer struct { + OnResume func(context.Context) error + FailOrCancel func(context.Context) error + Success func() error + PauseRequest func(ctx context.Context, planHookState interface{}, txn isql.Txn, progress *jobspb.Progress) error + TraceRealSpan bool +} + +func (d FakeResumer) ForceRealSpan() bool { + return d.TraceRealSpan +} + +func (d FakeResumer) DumpTraceAfterRun() bool { + return true +} + +func (d FakeResumer) Resume(ctx context.Context, _ interface{}) error { + if d.OnResume != nil { + if err := d.OnResume(ctx); err != nil { + return err + } + } + if d.Success != nil { + return d.Success() + } + return nil +} + +func (d FakeResumer) OnFailOrCancel(ctx context.Context, _ interface{}, _ error) error { + if d.FailOrCancel != nil { + return d.FailOrCancel(ctx) + } + return nil +} + +func (d FakeResumer) CollectProfile(_ context.Context, _ interface{}) error { + return nil +} + +func (d FakeResumer) OnPauseRequest( + ctx context.Context, execCtx interface{}, txn isql.Txn, details *jobspb.Progress, +) error { + if d.PauseRequest == nil { + return nil + } + return d.PauseRequest(ctx, execCtx, txn, details) +} diff --git a/pkg/jobs/registry.go b/pkg/jobs/registry.go index 7b23edfd762c..2c2a8a7f5677 100644 --- a/pkg/jobs/registry.go +++ b/pkg/jobs/registry.go @@ -26,7 +26,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/server/telemetry" - "github.com/cockroachdb/cockroach/pkg/server/tracedumper" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb" "github.com/cockroachdb/cockroach/pkg/sql/isql" @@ -107,7 +106,6 @@ type Registry struct { settings *cluster.Settings execCtx jobExecCtxMaker metrics Metrics - td *tracedumper.TraceDumper knobs TestingKnobs // adoptionChan is used to nudge the registry to resume claimed jobs and @@ -226,7 +224,6 @@ func MakeRegistry( histogramWindowInterval time.Duration, execCtxFn jobExecCtxMaker, preventAdoptionFile string, - td *tracedumper.TraceDumper, knobs *TestingKnobs, ) *Registry { r := &Registry{ @@ -241,7 +238,6 @@ func MakeRegistry( execCtx: execCtxFn, preventAdoptionFile: preventAdoptionFile, preventAdoptionLogEvery: log.Every(time.Minute), - td: td, // Use a non-zero buffer to allow queueing of notifications. // The writing method will use a default case to avoid blocking // if a notification is already queued. diff --git a/pkg/jobs/registry_external_test.go b/pkg/jobs/registry_external_test.go index 175f71f3279f..7f15bae0904a 100644 --- a/pkg/jobs/registry_external_test.go +++ b/pkg/jobs/registry_external_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -364,7 +365,7 @@ func TestGCDurationControl(t *testing.T) { } jobs.RegisterConstructor(jobspb.TypeImport, func(_ *jobs.Job, cs *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{} + return jobstest.FakeResumer{} }, jobs.UsesTenantCostControl) s, sqlDB, _ := serverutils.StartServer(t, args) defer s.Stopper().Stop(ctx) @@ -443,7 +444,7 @@ func TestErrorsPopulatedOnRetry(t *testing.T) { return ctx.Err() } } - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: execFn, FailOrCancel: execFn, } diff --git a/pkg/jobs/registry_test.go b/pkg/jobs/registry_test.go index 294340923ed7..002f6ac989f5 100644 --- a/pkg/jobs/registry_test.go +++ b/pkg/jobs/registry_test.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/keyvisualizer" "github.com/cockroachdb/cockroach/pkg/kv" @@ -344,7 +345,7 @@ func TestCreateJobWritesToJobInfo(t *testing.T) { r := s.JobRegistry().(*Registry) RegisterConstructor(jobspb.TypeImport, func(job *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return nil }, @@ -531,7 +532,7 @@ func TestBatchJobsCreation(t *testing.T) { r := s.JobRegistry().(*Registry) RegisterConstructor(jobspb.TypeImport, func(job *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return nil }, @@ -728,7 +729,7 @@ func TestRetriesWithExponentialBackoff(t *testing.T) { bti.adopted = bti.registry.metrics.AdoptIterations bti.resumed = bti.registry.metrics.ResumedJobs RegisterConstructor(jobspb.TypeImport, func(job *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { if bti.done.Load().(bool) { return nil @@ -1040,7 +1041,7 @@ func TestExponentialBackoffSettings(t *testing.T) { tdb = sqlutils.MakeSQLRunner(sdb) // Create and run a dummy job. RegisterConstructor(jobspb.TypeImport, func(_ *Job, cs *cluster.Settings) Resumer { - return FakeResumer{} + return jobstest.FakeResumer{} }, UsesTenantCostControl) registry := s.JobRegistry().(*Registry) id := registry.MakeJobID() @@ -1179,7 +1180,7 @@ func TestRunWithoutLoop(t *testing.T) { atomic.AddInt64(counter, 1) } } - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { maybeIncrementCounter(&successDone, &ran) if shouldFail { @@ -1259,7 +1260,7 @@ func TestJobIdleness(t *testing.T) { resumeErrChan := make(chan error) defer close(resumeErrChan) RegisterConstructor(jobspb.TypeImport, func(_ *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumeStartChan <- struct{}{} return <-resumeErrChan @@ -1485,7 +1486,7 @@ func TestGetClaimedResumerFromRegistry(t *testing.T) { defer close(resumeErrChan) var counter int RegisterConstructor(jobspb.TypeImport, func(_ *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumeStartChan <- struct{}{} return <-resumeErrChan diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 66e34929c750..349a875fd0c7 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -201,7 +201,6 @@ go_library( "//pkg/server/systemconfigwatcher", "//pkg/server/telemetry", "//pkg/server/tenantsettingswatcher", - "//pkg/server/tracedumper", "//pkg/settings", "//pkg/settings/cluster", "//pkg/spanconfig", diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index e567db52e431..194fd07b4e96 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -63,7 +63,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/server/settingswatcher" "github.com/cockroachdb/cockroach/pkg/server/status" "github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher" - "github.com/cockroachdb/cockroach/pkg/server/tracedumper" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/spanconfig" @@ -646,7 +645,6 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { jobsKnobs = cfg.TestingKnobs.JobsTestingKnobs.(*jobs.TestingKnobs) } - td := tracedumper.NewTraceDumper(ctx, cfg.InflightTraceDirName, cfg.Settings) *jobRegistry = *jobs.MakeRegistry( ctx, cfg.AmbientCtx, @@ -663,7 +661,6 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { return sql.MakeJobExecContext(ctx, opName, user, &sql.MemoryMetrics{}, execCfg) }, jobAdoptionStopFile, - td, jobsKnobs, ) } diff --git a/pkg/server/tracedumper/BUILD.bazel b/pkg/server/tracedumper/BUILD.bazel index 5b1a88ad66ce..619fde7e16ce 100644 --- a/pkg/server/tracedumper/BUILD.bazel +++ b/pkg/server/tracedumper/BUILD.bazel @@ -10,11 +10,8 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/server/dumpstore", - "//pkg/settings", - "//pkg/settings/cluster", "//pkg/sql/isql", "//pkg/util/log", - "//pkg/util/timeutil", "//pkg/util/tracing/zipper", "@com_github_cockroachdb_errors//:errors", ], diff --git a/pkg/server/tracedumper/tracedumper.go b/pkg/server/tracedumper/tracedumper.go index 8d9910669679..669acd17a560 100644 --- a/pkg/server/tracedumper/tracedumper.go +++ b/pkg/server/tracedumper/tracedumper.go @@ -18,11 +18,8 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/server/dumpstore" - "github.com/cockroachdb/cockroach/pkg/settings" - "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing/zipper" "github.com/cockroachdb/errors" ) @@ -32,17 +29,6 @@ const ( timeFormat = "2006-01-02T15_04_05.000" ) -var ( - totalDumpSizeLimit = settings.RegisterByteSizeSetting( - settings.ApplicationLevel, - "server.job_trace.total_dump_size_limit", - "total size of job trace dumps to be kept. "+ - "Dumps are GC'ed in the order of creation time. The latest dump is "+ - "always kept even if its size exceeds the limit.", - 500<<20, // 500MiB - ) -) - // TraceDumper can be used to dump a zip file containing cluster wide inflight // trace spans for a particular trace, to a configured dir. type TraceDumper struct { @@ -106,20 +92,3 @@ func (t *TraceDumper) Dump(ctx context.Context, name string, traceID int64, ie i log.Errorf(ctx, "failed to dump trace %v", err) } } - -// NewTraceDumper returns a TraceDumper. -// -// dir is the directory in which dumps are stored. -func NewTraceDumper(ctx context.Context, dir string, st *cluster.Settings) *TraceDumper { - if dir == "" { - return nil - } - - log.Infof(ctx, "writing job trace dumps to %s", log.SafeManaged(dir)) - - td := &TraceDumper{ - currentTime: timeutil.Now, - store: dumpstore.NewStore(dir, totalDumpSizeLimit, st), - } - return td -} diff --git a/pkg/sql/copy/copy_test.go b/pkg/sql/copy/copy_test.go index 3cca3a5a6ba3..16d3f71b8039 100644 --- a/pkg/sql/copy/copy_test.go +++ b/pkg/sql/copy/copy_test.go @@ -339,7 +339,7 @@ func TestCopyFromTransaction(t *testing.T) { tconn := sqlConnCtx.MakeSQLConn(io.Discard, io.Discard, url.String()) tc.testf(tconn, func(tconn clisqlclient.Conn) { // Without this everything comes back as strings - tconn.SetAlwaysInferResultTypes(true) + _ = tconn.SetAlwaysInferResultTypes(true) // Put each test in its own db so they can be parallelized. err := tconn.Exec(ctx, fmt.Sprintf("CREATE DATABASE %s; USE %s", tc.name, tc.name)) require.NoError(t, err) diff --git a/pkg/sql/logictest/logic.go b/pkg/sql/logictest/logic.go index 55aa45cfacf9..8e32b2f5852c 100644 --- a/pkg/sql/logictest/logic.go +++ b/pkg/sql/logictest/logic.go @@ -1303,11 +1303,14 @@ func (t *logicTest) newTestServerCluster(bootstrapBinaryPath, upgradeBinaryPath testserver.CockroachLogsDirOpt(logsDir), } if strings.Contains(upgradeBinaryPath, "cockroach-short") { - // If we're using a cockroach-short binary, that means it was - // locally built, so we need to opt-out of version offsetting to - // better simulate a real upgrade path. opts = append(opts, testserver.EnvVarOpt([]string{ + // If we're using a cockroach-short binary, that means it was + // locally built, so we need to opt-out of version offsetting to + // better simulate a real upgrade path. "COCKROACH_TESTING_FORCE_RELEASE_BRANCH=true", + // The build is made during testing, so it has metamorphic constants. + // We disable them here so that the test is more stable. + "COCKROACH_INTERNAL_DISABLE_METAMORPHIC_TESTING=true", })) }