From bbd601fd06de8fd63e00a90c09a110b0bf8b3a91 Mon Sep 17 00:00:00 2001 From: adityamaru Date: Tue, 17 Oct 2023 15:36:46 -0400 Subject: [PATCH 1/3] jobstest: move FakeResumer to jobstest Release note: None --- pkg/jobs/delegate_control_test.go | 6 +-- pkg/jobs/helpers_test.go | 53 ----------------------- pkg/jobs/jobs_test.go | 33 ++++++++------- pkg/jobs/jobstest/BUILD.bazel | 2 + pkg/jobs/jobstest/resumer.go | 67 ++++++++++++++++++++++++++++++ pkg/jobs/registry_external_test.go | 5 ++- pkg/jobs/registry_test.go | 15 +++---- 7 files changed, 100 insertions(+), 81 deletions(-) create mode 100644 pkg/jobs/jobstest/resumer.go diff --git a/pkg/jobs/delegate_control_test.go b/pkg/jobs/delegate_control_test.go index ef68b09bab98..4dc8516a2656 100644 --- a/pkg/jobs/delegate_control_test.go +++ b/pkg/jobs/delegate_control_test.go @@ -159,7 +159,7 @@ func TestJobsControlForSchedules(t *testing.T) { // As such, the job does not undergo usual job state transitions // (e.g. pause-request -> paused). RegisterConstructor(jobspb.TypeImport, func(job *Job, _ *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(_ context.Context) error { <-blockResume return nil @@ -273,7 +273,7 @@ func TestFilterJobsControlForSchedules(t *testing.T) { // Our resume never completes any jobs, until this test completes. RegisterConstructor(jobspb.TypeImport, func(job *Job, _ *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(_ context.Context) error { <-blockResume return nil @@ -406,7 +406,7 @@ func TestJobControlByType(t *testing.T) { // Make the jobs of each type controllable. 
for _, jobType := range allJobTypes { RegisterConstructor(jobType, func(job *Job, _ *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { <-ctx.Done() return nil diff --git a/pkg/jobs/helpers_test.go b/pkg/jobs/helpers_test.go index 6904a448e592..8144b712175a 100644 --- a/pkg/jobs/helpers_test.go +++ b/pkg/jobs/helpers_test.go @@ -19,62 +19,9 @@ import ( "github.com/cockroachdb/errors" ) -// FakeResumer calls optional callbacks during the job lifecycle. -type FakeResumer struct { - OnResume func(context.Context) error - FailOrCancel func(context.Context) error - Success func() error - PauseRequest onPauseRequestFunc - TraceRealSpan bool -} - -func (d FakeResumer) ForceRealSpan() bool { - return d.TraceRealSpan -} - -func (d FakeResumer) DumpTraceAfterRun() bool { - return true -} - -var _ Resumer = FakeResumer{} - -func (d FakeResumer) Resume(ctx context.Context, execCtx interface{}) error { - if d.OnResume != nil { - if err := d.OnResume(ctx); err != nil { - return err - } - } - if d.Success != nil { - return d.Success() - } - return nil -} - -func (d FakeResumer) OnFailOrCancel(ctx context.Context, _ interface{}, _ error) error { - if d.FailOrCancel != nil { - return d.FailOrCancel(ctx) - } - return nil -} - -func (d FakeResumer) CollectProfile(_ context.Context, _ interface{}) error { - return nil -} - // OnPauseRequestFunc forwards the definition for use in tests. 
type OnPauseRequestFunc = onPauseRequestFunc -var _ PauseRequester = FakeResumer{} - -func (d FakeResumer) OnPauseRequest( - ctx context.Context, execCtx interface{}, txn isql.Txn, details *jobspb.Progress, -) error { - if d.PauseRequest == nil { - return nil - } - return d.PauseRequest(ctx, execCtx, txn, details) -} - func (r *Registry) CancelRequested(ctx context.Context, txn isql.Txn, id jobspb.JobID) error { return r.cancelRequested(ctx, txn, id) } diff --git a/pkg/jobs/jobs_test.go b/pkg/jobs/jobs_test.go index ccffa3f72e06..924b914ec1e0 100644 --- a/pkg/jobs/jobs_test.go +++ b/pkg/jobs/jobs_test.go @@ -29,6 +29,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/keyvisualizer" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb" @@ -207,7 +208,7 @@ func noopPauseRequestFunc( return nil } -var _ jobs.TraceableJob = (*jobs.FakeResumer)(nil) +var _ jobs.TraceableJob = (*jobstest.FakeResumer)(nil) func (rts *registryTestSuite) setUp(t *testing.T) { rts.ctx = context.Background() @@ -261,7 +262,7 @@ func (rts *registryTestSuite) setUp(t *testing.T) { rts.onPauseRequest = noopPauseRequestFunc jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ TraceRealSpan: rts.traceRealSpan, OnResume: func(ctx context.Context) error { t.Log("Starting resume") @@ -1151,7 +1152,7 @@ func TestRegistryLifecycle(t *testing.T) { resumerJob := make(chan *jobs.Job, 1) jobs.RegisterConstructor( jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumerJob <- j return nil @@ -2248,7 +2249,7 @@ 
func TestShowJobWhenComplete(t *testing.T) { defer close(done) jobs.RegisterConstructor( jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { select { case <-ctx.Done(): @@ -2412,7 +2413,7 @@ func TestJobInTxn(t *testing.T) { }, ) jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { t.Logf("Resuming job: %+v", job.Payload()) atomic.AddInt32(&hasRun, 1) @@ -2452,7 +2453,7 @@ func TestJobInTxn(t *testing.T) { }, ) jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(_ context.Context) error { return errors.New("RESTORE failed") }, @@ -2550,7 +2551,7 @@ func TestStartableJobMixedVersion(t *testing.T) { require.NoError(t, err) jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{} + return jobstest.FakeResumer{} }, jobs.UsesTenantCostControl) var j *jobs.StartableJob jobID := jr.MakeJobID() @@ -2591,7 +2592,7 @@ func TestStartableJob(t *testing.T) { return func() { resumeFunc.Store(prev) } } jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return resumeFunc.Load().(func(ctx context.Context) error)(ctx) }, @@ -2777,7 +2778,7 @@ func TestStartableJobTxnRetry(t *testing.T) { defer s.Stopper().Stop(ctx) jr := s.JobRegistry().(*jobs.Registry) jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{} + return jobstest.FakeResumer{} }, jobs.UsesTenantCostControl) rec := 
jobs.Record{ Details: jobspb.RestoreDetails{}, @@ -2819,7 +2820,7 @@ func TestRegistryTestingNudgeAdoptionQueue(t *testing.T) { defer jobs.ResetConstructors()() resuming := make(chan struct{}) jobs.RegisterConstructor(jobspb.TypeBackup, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resuming <- struct{}{} return nil @@ -2905,7 +2906,7 @@ func TestMetrics(t *testing.T) { fakeBackupMetrics := makeFakeMetrics() jobs.RegisterConstructor(jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { defer fakeBackupMetrics.N.Inc(1) return waitForErr(ctx) @@ -2919,7 +2920,7 @@ func TestMetrics(t *testing.T) { ) jobs.RegisterConstructor(jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return waitForErr(ctx) }, @@ -3139,7 +3140,7 @@ func TestLoseLeaseDuringExecution(t *testing.T) { defer jobs.ResetConstructors()() resumed := make(chan error, 1) jobs.RegisterConstructor(jobspb.TypeBackup, func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { defer close(resumed) _, err := s.InternalExecutor().(isql.Executor).Exec( @@ -3209,7 +3210,7 @@ func TestPauseReason(t *testing.T) { defer close(done) resumeSignaler := newResumeStartedSignaler() jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumeSignaler.SignalResumeStarted() select { @@ -3464,7 +3465,7 @@ func TestPausepoints(t *testing.T) { defer s.Stopper().Stop(ctx) idb := s.InternalDB().(isql.DB) 
jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { if err := registry.CheckPausepoint("test_pause_foo"); err != nil { return err @@ -3604,7 +3605,7 @@ func TestJobTypeMetrics(t *testing.T) { for typ := range typeToRecord { jobs.RegisterConstructor(typ, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { <-ctx.Done() return ctx.Err() diff --git a/pkg/jobs/jobstest/BUILD.bazel b/pkg/jobs/jobstest/BUILD.bazel index 00d4a20af21c..cfbf48b6c574 100644 --- a/pkg/jobs/jobstest/BUILD.bazel +++ b/pkg/jobs/jobstest/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "jobstest", srcs = [ "logutils.go", + "resumer.go", "utils.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/jobs/jobstest", @@ -13,6 +14,7 @@ go_library( "//pkg/jobs/jobspb", "//pkg/scheduledjobs", "//pkg/sql/catalog/systemschema", + "//pkg/sql/isql", "//pkg/sql/sem/tree", "//pkg/testutils", "//pkg/util/log", diff --git a/pkg/jobs/jobstest/resumer.go b/pkg/jobs/jobstest/resumer.go new file mode 100644 index 000000000000..090d44553755 --- /dev/null +++ b/pkg/jobs/jobstest/resumer.go @@ -0,0 +1,67 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package jobstest + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/sql/isql" +) + +// FakeResumer calls optional callbacks during the job lifecycle. 
+type FakeResumer struct { + OnResume func(context.Context) error + FailOrCancel func(context.Context) error + Success func() error + PauseRequest func(ctx context.Context, planHookState interface{}, txn isql.Txn, progress *jobspb.Progress) error + TraceRealSpan bool +} + +func (d FakeResumer) ForceRealSpan() bool { + return d.TraceRealSpan +} + +func (d FakeResumer) DumpTraceAfterRun() bool { + return true +} + +func (d FakeResumer) Resume(ctx context.Context, _ interface{}) error { + if d.OnResume != nil { + if err := d.OnResume(ctx); err != nil { + return err + } + } + if d.Success != nil { + return d.Success() + } + return nil +} + +func (d FakeResumer) OnFailOrCancel(ctx context.Context, _ interface{}, _ error) error { + if d.FailOrCancel != nil { + return d.FailOrCancel(ctx) + } + return nil +} + +func (d FakeResumer) CollectProfile(_ context.Context, _ interface{}) error { + return nil +} + +func (d FakeResumer) OnPauseRequest( + ctx context.Context, execCtx interface{}, txn isql.Txn, details *jobspb.Progress, +) error { + if d.PauseRequest == nil { + return nil + } + return d.PauseRequest(ctx, execCtx, txn, details) +} diff --git a/pkg/jobs/registry_external_test.go b/pkg/jobs/registry_external_test.go index 175f71f3279f..7f15bae0904a 100644 --- a/pkg/jobs/registry_external_test.go +++ b/pkg/jobs/registry_external_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -364,7 +365,7 @@ func TestGCDurationControl(t *testing.T) { } jobs.RegisterConstructor(jobspb.TypeImport, func(_ *jobs.Job, cs *cluster.Settings) jobs.Resumer { - return jobs.FakeResumer{} + return jobstest.FakeResumer{} }, jobs.UsesTenantCostControl) s, sqlDB, 
_ := serverutils.StartServer(t, args) defer s.Stopper().Stop(ctx) @@ -443,7 +444,7 @@ func TestErrorsPopulatedOnRetry(t *testing.T) { return ctx.Err() } } - return jobs.FakeResumer{ + return jobstest.FakeResumer{ OnResume: execFn, FailOrCancel: execFn, } diff --git a/pkg/jobs/registry_test.go b/pkg/jobs/registry_test.go index 294340923ed7..002f6ac989f5 100644 --- a/pkg/jobs/registry_test.go +++ b/pkg/jobs/registry_test.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/keyvisualizer" "github.com/cockroachdb/cockroach/pkg/kv" @@ -344,7 +345,7 @@ func TestCreateJobWritesToJobInfo(t *testing.T) { r := s.JobRegistry().(*Registry) RegisterConstructor(jobspb.TypeImport, func(job *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return nil }, @@ -531,7 +532,7 @@ func TestBatchJobsCreation(t *testing.T) { r := s.JobRegistry().(*Registry) RegisterConstructor(jobspb.TypeImport, func(job *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { return nil }, @@ -728,7 +729,7 @@ func TestRetriesWithExponentialBackoff(t *testing.T) { bti.adopted = bti.registry.metrics.AdoptIterations bti.resumed = bti.registry.metrics.ResumedJobs RegisterConstructor(jobspb.TypeImport, func(job *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { if bti.done.Load().(bool) { return nil @@ -1040,7 +1041,7 @@ func TestExponentialBackoffSettings(t *testing.T) { tdb = sqlutils.MakeSQLRunner(sdb) // Create and run a dummy job. 
RegisterConstructor(jobspb.TypeImport, func(_ *Job, cs *cluster.Settings) Resumer { - return FakeResumer{} + return jobstest.FakeResumer{} }, UsesTenantCostControl) registry := s.JobRegistry().(*Registry) id := registry.MakeJobID() @@ -1179,7 +1180,7 @@ func TestRunWithoutLoop(t *testing.T) { atomic.AddInt64(counter, 1) } } - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { maybeIncrementCounter(&successDone, &ran) if shouldFail { @@ -1259,7 +1260,7 @@ func TestJobIdleness(t *testing.T) { resumeErrChan := make(chan error) defer close(resumeErrChan) RegisterConstructor(jobspb.TypeImport, func(_ *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumeStartChan <- struct{}{} return <-resumeErrChan @@ -1485,7 +1486,7 @@ func TestGetClaimedResumerFromRegistry(t *testing.T) { defer close(resumeErrChan) var counter int RegisterConstructor(jobspb.TypeImport, func(_ *Job, cs *cluster.Settings) Resumer { - return FakeResumer{ + return jobstest.FakeResumer{ OnResume: func(ctx context.Context) error { resumeStartChan <- struct{}{} return <-resumeErrChan From 0506ef533188251704e25b0c6e05f7501ec85059 Mon Sep 17 00:00:00 2001 From: adityamaru Date: Tue, 10 Oct 2023 17:10:35 -0400 Subject: [PATCH 2/3] cli: add job traces for traceable jobs to the debug zip This change teaches the debug zip to collect the traces for traceable jobs (backup, restore, import, pcr) that are in a running or reverting state at the time the zip is collected. These traces are dumped in `jobs/*/*/trace.zip` files and rely on the existing `tracing/zipper` that is used by `cockroach debug job-trace` to collect the required information. Note this functionality is default off. 
Release note (cli change): `cockroach debug zip` has an additional flag that is default off `include-running-job-traces` that will enable collecting the inflight traces of traceable jobs such as backup, restore, import, c2c and dump them in a `jobs/` subdirectory in the zip. --- pkg/cli/BUILD.bazel | 3 + pkg/cli/cliflags/flags.go | 9 ++ pkg/cli/clisqlcfg/context.go | 4 +- pkg/cli/clisqlclient/api.go | 5 +- pkg/cli/clisqlclient/conn.go | 6 +- pkg/cli/context.go | 7 ++ pkg/cli/debug_job_trace_test.go | 1 + pkg/cli/flags.go | 1 + pkg/cli/testdata/zip/partial1 | 1 + pkg/cli/testdata/zip/partial1_excluded | 1 + pkg/cli/testdata/zip/partial2 | 1 + pkg/cli/testdata/zip/testzip | 1 + pkg/cli/testdata/zip/testzip_concurrent | 1 + .../zip/testzip_exclude_goroutine_stacks | 1 + .../testzip_external_process_virtualization | 1 + .../zip/testzip_include_goroutine_stacks | 1 + .../testdata/zip/testzip_include_range_info | 1 + .../zip/testzip_shared_process_virtualization | 1 + ...process_virtualization_with_default_tenant | 1 + pkg/cli/zip.go | 105 ++++++++++++++++- pkg/cli/zip_cluster_wide.go | 7 ++ pkg/cli/zip_test.go | 111 ++++++++++++++++++ pkg/jobs/BUILD.bazel | 1 - pkg/jobs/registry.go | 4 - pkg/server/BUILD.bazel | 1 - pkg/server/server_sql.go | 3 - pkg/server/tracedumper/BUILD.bazel | 3 - pkg/server/tracedumper/tracedumper.go | 31 ----- pkg/sql/copy/copy_test.go | 2 +- 29 files changed, 265 insertions(+), 50 deletions(-) diff --git a/pkg/cli/BUILD.bazel b/pkg/cli/BUILD.bazel index 29aca1fc5388..f82b76c73e3c 100644 --- a/pkg/cli/BUILD.bazel +++ b/pkg/cli/BUILD.bazel @@ -359,6 +359,7 @@ go_test( deps = [ "//pkg/base", "//pkg/build", + "//pkg/ccl/backupccl", "//pkg/cli/clicfg", "//pkg/cli/clienturl", "//pkg/cli/clierror", @@ -374,6 +375,7 @@ go_test( "//pkg/gossip", "//pkg/jobs", "//pkg/jobs/jobspb", + "//pkg/jobs/jobstest", "//pkg/keys", "//pkg/kv", "//pkg/kv/kvclient/kvtenant", @@ -402,6 +404,7 @@ go_test( "//pkg/storage", "//pkg/testutils", 
"//pkg/testutils/datapathutils", + "//pkg/testutils/jobutils", "//pkg/testutils/listenerutil", "//pkg/testutils/serverutils", "//pkg/testutils/skip", diff --git a/pkg/cli/cliflags/flags.go b/pkg/cli/cliflags/flags.go index 683936383781..cd2b82d1b23a 100644 --- a/pkg/cli/cliflags/flags.go +++ b/pkg/cli/cliflags/flags.go @@ -1690,6 +1690,15 @@ require any additional stop-the-world operations to be collected. `, } + ZipIncludeRunningJobTraces = FlagInfo{ + Name: "include-running-job-traces", + Description: ` +Include information about each running, traceable job in jobs/*/*/trace.zip +files. This involves collecting cluster-wide traces for each running job in the +cluster. +`, + } + ZipCPUProfileDuration = FlagInfo{ Name: "cpu-profile-duration", Description: ` diff --git a/pkg/cli/clisqlcfg/context.go b/pkg/cli/clisqlcfg/context.go index 01d7e7a77ae8..9cfb86d3433a 100644 --- a/pkg/cli/clisqlcfg/context.go +++ b/pkg/cli/clisqlcfg/context.go @@ -184,7 +184,7 @@ func (c *Context) MakeConn(url string) (clisqlclient.Conn, error) { // By default, all connections will use the underlying driver to infer // result types. This should be set back to false for any use case where the // results are only shown for textual display. - conn.SetAlwaysInferResultTypes(true) + _ = conn.SetAlwaysInferResultTypes(true) return conn, nil } @@ -198,7 +198,7 @@ func (c *Context) Run(ctx context.Context, conn clisqlclient.Conn) error { // Anything using a SQL shell (e.g. `cockroach sql` or `demo`), only needs // to show results in text format, so the underlying driver doesn't need to // infer types. - conn.SetAlwaysInferResultTypes(false) + _ = conn.SetAlwaysInferResultTypes(false) // Open the connection to make sure everything is OK before running any // statements. Performs authentication. 
diff --git a/pkg/cli/clisqlclient/api.go b/pkg/cli/clisqlclient/api.go index 32519bbcb0d6..b7828744ee85 100644 --- a/pkg/cli/clisqlclient/api.go +++ b/pkg/cli/clisqlclient/api.go @@ -70,8 +70,9 @@ type Conn interface { // SetAlwaysInferResultTypes configures the alwaysInferResultTypes flag, which // determines if the client should use the underlying driver to infer result - // types. - SetAlwaysInferResultTypes(b bool) + // types. It returns a method that can be used to reset the configuration to + // its previous value. + SetAlwaysInferResultTypes(b bool) func() // GetServerMetadata returns details about the CockroachDB node // this connection is connected to. diff --git a/pkg/cli/clisqlclient/conn.go b/pkg/cli/clisqlclient/conn.go index e10a45eeb7b4..1247345fe6d9 100644 --- a/pkg/cli/clisqlclient/conn.go +++ b/pkg/cli/clisqlclient/conn.go @@ -164,8 +164,12 @@ func (c *sqlConn) SetMissingPassword(missing bool) { } // SetAlwaysInferResultTypes implements the Conn interface. -func (c *sqlConn) SetAlwaysInferResultTypes(b bool) { +func (c *sqlConn) SetAlwaysInferResultTypes(b bool) func() { + oldVal := c.alwaysInferResultTypes c.alwaysInferResultTypes = b + return func() { + c.alwaysInferResultTypes = oldVal + } } // EnsureConn (re-)establishes the connection to the server. diff --git a/pkg/cli/context.go b/pkg/cli/context.go index c33ae90e8975..021140e3b779 100644 --- a/pkg/cli/context.go +++ b/pkg/cli/context.go @@ -363,6 +363,10 @@ type zipContext struct { // latency. includeStacks bool + // includeRunningJobTraces includes the active traces of each running + // Traceable job in individual jobs/*/ranges/trace.zip files. + includeRunningJobTraces bool + // The log/heap/etc files to include. files fileSelection } @@ -381,6 +385,9 @@ func setZipContextDefaults() { // Goroutine stack dumps require a "stop the world" operation on the server side, // which impacts performance and SQL service latency. 
zipCtx.includeStacks = true + // Job traces for running Traceable jobs involves fetching cluster wide traces + // for each job. + zipCtx.includeRunningJobTraces = false zipCtx.cpuProfDuration = 5 * time.Second zipCtx.concurrency = 15 diff --git a/pkg/cli/debug_job_trace_test.go b/pkg/cli/debug_job_trace_test.go index cd02e98f3c51..7b03a7a6a69e 100644 --- a/pkg/cli/debug_job_trace_test.go +++ b/pkg/cli/debug_job_trace_test.go @@ -77,6 +77,7 @@ func (r *traceSpanResumer) CollectProfile(_ context.Context, _ interface{}) erro func TestDebugJobTrace(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + defer jobs.ResetConstructors()() ctx := context.Background() argsFn := func(args *base.TestServerArgs) { diff --git a/pkg/cli/flags.go b/pkg/cli/flags.go index b93b372d2515..8a48741510f6 100644 --- a/pkg/cli/flags.go +++ b/pkg/cli/flags.go @@ -702,6 +702,7 @@ func init() { cliflagcfg.IntFlag(f, &zipCtx.concurrency, cliflags.ZipConcurrency) cliflagcfg.BoolFlag(f, &zipCtx.includeRangeInfo, cliflags.ZipIncludeRangeInfo) cliflagcfg.BoolFlag(f, &zipCtx.includeStacks, cliflags.ZipIncludeGoroutineStacks) + cliflagcfg.BoolFlag(f, &zipCtx.includeRunningJobTraces, cliflags.ZipIncludeRunningJobTraces) } // List-files + Zip commands. for _, cmd := range []*cobra.Command{debugZipCmd, debugListFilesCmd} { diff --git a/pkg/cli/testdata/zip/partial1 b/pkg/cli/testdata/zip/partial1 index 014309a8e748..205ee46c296b 100644 --- a/pkg/cli/testdata/zip/partial1 +++ b/pkg/cli/testdata/zip/partial1 @@ -263,3 +263,4 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. 
Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/partial1_excluded b/pkg/cli/testdata/zip/partial1_excluded index a1cfd380abf6..3e986c5bae64 100644 --- a/pkg/cli/testdata/zip/partial1_excluded +++ b/pkg/cli/testdata/zip/partial1_excluded @@ -166,3 +166,4 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/partial2 b/pkg/cli/testdata/zip/partial2 index d6ddc4bd52bb..7e810329edf0 100644 --- a/pkg/cli/testdata/zip/partial2 +++ b/pkg/cli/testdata/zip/partial2 @@ -165,3 +165,4 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip b/pkg/cli/testdata/zip/testzip index b3e8fff2cc5b..cbc363a36831 100644 --- a/pkg/cli/testdata/zip/testzip +++ b/pkg/cli/testdata/zip/testzip @@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... 
done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_concurrent b/pkg/cli/testdata/zip/testzip_concurrent index 0e420180df2f..846957639693 100644 --- a/pkg/cli/testdata/zip/testzip_concurrent +++ b/pkg/cli/testdata/zip/testzip_concurrent @@ -1,5 +1,6 @@ zip ---- +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. [cluster] creating output file /dev/null... [cluster] creating output file /dev/null: done [cluster] discovering virtual clusters... diff --git a/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks b/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks index 0e38d6616a69..70824f9df00f 100644 --- a/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks +++ b/pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks @@ -125,4 +125,5 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-goroutine-stacks=f [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. [cluster] NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle. Use the --include-goroutine-stacks flag to enable the fetching of this data. 
diff --git a/pkg/cli/testdata/zip/testzip_external_process_virtualization b/pkg/cli/testdata/zip/testzip_external_process_virtualization index 184a175a2429..8a529bb04db0 100644 --- a/pkg/cli/testdata/zip/testzip_external_process_virtualization +++ b/pkg/cli/testdata/zip/testzip_external_process_virtualization @@ -161,3 +161,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_include_goroutine_stacks b/pkg/cli/testdata/zip/testzip_include_goroutine_stacks index b3e8fff2cc5b..cbc363a36831 100644 --- a/pkg/cli/testdata/zip/testzip_include_goroutine_stacks +++ b/pkg/cli/testdata/zip/testzip_include_goroutine_stacks @@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. 
diff --git a/pkg/cli/testdata/zip/testzip_include_range_info b/pkg/cli/testdata/zip/testzip_include_range_info index 75d1d8f92a53..44aa6093f060 100644 --- a/pkg/cli/testdata/zip/testzip_include_range_info +++ b/pkg/cli/testdata/zip/testzip_include_range_info @@ -126,3 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info /dev/nu [cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/testdata/zip/testzip_shared_process_virtualization b/pkg/cli/testdata/zip/testzip_shared_process_virtualization index 5aaad4c2caa5..aa484a72b4ba 100644 --- a/pkg/cli/testdata/zip/testzip_shared_process_virtualization +++ b/pkg/cli/testdata/zip/testzip_shared_process_virtualization @@ -286,3 +286,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. 
diff --git a/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant b/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant index 5aaad4c2caa5..aa484a72b4ba 100644 --- a/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant +++ b/pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant @@ -286,3 +286,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done [cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done [cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done +[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data. diff --git a/pkg/cli/zip.go b/pkg/cli/zip.go index 352b1cef55ef..18c32f896a15 100644 --- a/pkg/cli/zip.go +++ b/pkg/cli/zip.go @@ -12,6 +12,7 @@ package cli import ( "context" + "database/sql/driver" "fmt" "io" "net" @@ -23,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cli/cliflags" "github.com/cockroachdb/cockroach/pkg/cli/clisqlclient" "github.com/cockroachdb/cockroach/pkg/cli/clisqlexec" + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server/profiler" @@ -31,7 +33,11 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants" "github.com/cockroachdb/cockroach/pkg/util/buildutil" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + 
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + tracezipper "github.com/cockroachdb/cockroach/pkg/util/tracing/zipper" "github.com/cockroachdb/errors" "github.com/jackc/pgconn" "github.com/marusama/semaphore" @@ -253,7 +259,7 @@ func runDebugZip(cmd *cobra.Command, args []string) (retErr error) { sqlConn, err := makeTenantSQLClient(ctx, "cockroach zip", useSystemDb, tenant.TenantName) // The zip output is sent directly into a text file, so the results should // be scanned into strings. - sqlConn.SetAlwaysInferResultTypes(false) + _ = sqlConn.SetAlwaysInferResultTypes(false) if err != nil { _ = s.fail(errors.Wrap(err, "unable to open a SQL session. Debug information will be incomplete")) } else { @@ -344,6 +350,12 @@ done } } + if !zipCtx.includeRunningJobTraces { + zr.info("NOTE: Omitted traces of running jobs from this debug zip bundle." + + " Use the --" + cliflags.ZipIncludeRunningJobTraces.Name + " flag to enable the fetching of this" + + " data.") + } + if !zipCtx.includeStacks { zr.info("NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle." + " Use the --" + cliflags.ZipIncludeGoroutineStacks.Name + " flag to enable the fetching of this" + @@ -377,6 +389,97 @@ func maybeAddProfileSuffix(name string) string { return name } +type jobTrace struct { + jobID jobspb.JobID + traceID tracingpb.TraceID +} + +// dumpTraceableJobTraces collects the traces for some "traceable" jobs that are +// in a running state. The job types in this list are the ones that have +// explicitly implemented the TraceableJob interface. 
+func (zc *debugZipContext) dumpTraceableJobTraces() error { + ctx := context.Background() + rows, err := zc.firstNodeSQLConn.Query(ctx, + `WITH +latestprogress AS ( + SELECT job_id, value + FROM system.job_info AS progress + WHERE info_key = 'legacy_progress' + ORDER BY written desc +), +jobpage AS ( + SELECT id + FROM system.jobs@jobs_status_created_idx + WHERE (job_type IN ($1, $2, $3, $4)) AND (status IN ($5, $6)) + ORDER BY id +) +SELECT distinct (id), latestprogress.value AS progress +FROM jobpage AS j +INNER JOIN latestprogress ON j.id = latestprogress.job_id;`, + jobspb.TypeBackup.String(), + jobspb.TypeRestore.String(), + jobspb.TypeImport.String(), + jobspb.TypeReplicationStreamIngestion.String(), + "running", + "reverting", + ) + if err != nil { + return err + } + defer func() { + if rows != nil { + if err := rows.Close(); err != nil { + log.Warningf(ctx, "failed to close with error: %v", err) + } + } + }() + vals := make([]driver.Value, 2) + jobTraces := make([]jobTrace, 0) + for err = rows.Next(vals); err == nil; err = rows.Next(vals) { + jobID, ok := vals[0].(int64) + if !ok { + return errors.New("failed to parse jobID") + } + progressBytes, ok := vals[1].([]byte) + if !ok { + return errors.New("failed to parse progress bytes") + } + progress := &jobspb.Progress{} + if err := protoutil.Unmarshal(progressBytes, progress); err != nil { + return err + } + jobTraces = append(jobTraces, jobTrace{jobID: jobspb.JobID(jobID), traceID: progress.TraceID}) + } + + func() { + // Debug zip collection sets this to false since results from the query are + // all dumped into txt files. In our case we parse the results of the query + // with their respective types and pre-process the information before + // dumping into a zip file. 
+ reset := zc.firstNodeSQLConn.SetAlwaysInferResultTypes(true) + defer reset() + for _, jobTrace := range jobTraces { + inflightTraceZipper := tracezipper.MakeSQLConnInflightTraceZipper(zc.firstNodeSQLConn.GetDriverConn()) + jobZip, err := inflightTraceZipper.Zip(ctx, int64(jobTrace.traceID)) + if err != nil { + log.Warningf(ctx, "failed to collect inflight trace zip for job %d: %v", jobTrace.jobID, err) + continue + } + + ts := timeutil.Now().Format(`20060102150405`) + name := fmt.Sprintf("%s/jobs/%d/%s/trace.zip", zc.prefix, jobTrace.jobID, ts) + s := zc.clusterPrinter.start("requesting traces for job %d", jobTrace.jobID) + if err := zc.z.createRaw(s, name, jobZip); err != nil { + log.Warningf(ctx, "failed to write inflight trace zip for job %d to file %s: %v", + jobTrace.jobID, name, err) + continue + } + } + }() + + return nil +} + // dumpTableDataForZip runs the specified SQL query and stores the // result. Errors encountered while running the SQL query are stored // in an error file in the zip file, and dumpTableDataForZip() returns diff --git a/pkg/cli/zip_cluster_wide.go b/pkg/cli/zip_cluster_wide.go index 660514cabe72..68ab826b748f 100644 --- a/pkg/cli/zip_cluster_wide.go +++ b/pkg/cli/zip_cluster_wide.go @@ -188,5 +188,12 @@ func (zc *debugZipContext) collectClusterData( } } + if zipCtx.includeRunningJobTraces { + zc.clusterPrinter.info("collecting the inflight traces for jobs") + if err := zc.dumpTraceableJobTraces(); err != nil { + return &serverpb.NodesListResponse{}, nil, err + } + } + return nodesList, livenessByNodeID, nil } diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go index 951299d44066..ad001f0b1cf6 100644 --- a/pkg/cli/zip_test.go +++ b/pkg/cli/zip_test.go @@ -28,17 +28,24 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" + _ "github.com/cockroachdb/cockroach/pkg/ccl/backupccl" + "github.com/cockroachdb/cockroach/pkg/jobs" + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + 
"github.com/cockroachdb/cockroach/pkg/jobs/jobstest" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security/username" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/datapathutils" + "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/skip" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -46,6 +53,7 @@ import ( "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) // TestZipContainsAllInternalTables verifies that we don't add new internal tables @@ -870,3 +878,106 @@ func TestNodeRangeSelection(t *testing.T) { } } } + +func TestZipJobTrace(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + defer jobs.ResetConstructors()() + + s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{ + Insecure: true, + Knobs: base.TestingKnobs{ + JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(), + }, + }) + defer s.Stopper().Stop(context.Background()) + blockCh := make(chan struct{}) + jobs.RegisterConstructor(jobspb.TypeImport, + func(j *jobs.Job, _ *cluster.Settings) jobs.Resumer { + return jobstest.FakeResumer{ + OnResume: func(ctx context.Context) error { + <-blockCh + return nil + }, + } + }, jobs.UsesTenantCostControl) + runner := 
sqlutils.MakeSQLRunner(sqlDB) + dir, cleanupFn := testutils.TempDir(t) + defer cleanupFn() + zipName := filepath.Join(dir, "test.zip") + + // Run a backup that completes, we should not see a trace for this job. + runner.Exec(t, `CREATE TABLE foo (id INT)`) + runner.Exec(t, `BACKUP TABLE foo INTO 'userfile:///completes'`) + + // Run a restore that completes, we should not see a trace for this job. + runner.Exec(t, `CREATE DATABASE test`) + runner.Exec(t, `RESTORE TABLE foo FROM LATEST IN 'userfile:///completes' WITH into_db = 'test'`) + + triggerJobAndWaitForRun := func(jobQuery string) jobspb.JobID { + var jobID jobspb.JobID + runner.QueryRow(t, jobQuery).Scan(&jobID) + jobutils.WaitForJobToRun(t, runner, jobID) + return jobID + } + + sqlURL := url.URL{ + Scheme: "postgres", + User: url.User(username.RootUser), + Host: s.AdvSQLAddr(), + RawQuery: "sslmode=disable", + } + sqlConn := sqlConnCtx.MakeSQLConn(io.Discard, io.Discard, sqlURL.String()) + defer func() { + if err := sqlConn.Close(); err != nil { + t.Fatal(err) + } + }() + + var expectedFilesList strings.Builder + var importJobID jobspb.JobID + var importJobID2 jobspb.JobID + func() { + out, err := os.Create(zipName) + if err != nil { + t.Fatal(err) + } + z := newZipper(out) + defer func() { + if err := z.close(); err != nil { + t.Fatal(err) + } + }() + + runner.Exec(t, `CREATE TABLE x (id INT PRIMARY KEY, n INT, s STRING)`) + importJobID = triggerJobAndWaitForRun(`IMPORT INTO x CSV DATA ('workload:///csv/bank/bank?rows=100&version=1.0.0') WITH detached`) + importJobID2 = triggerJobAndWaitForRun(`IMPORT INTO x CSV DATA ('workload:///csv/bank/bank?rows=100&version=1.0.0') WITH detached`) + expectedFilesList.WriteString(fmt.Sprintf("/jobs/%d/.*/trace.zip\n", importJobID)) + expectedFilesList.WriteString(fmt.Sprintf("/jobs/%d/.*/trace.zip\n", importJobID2)) + + zr := zipCtx.newZipReporter("test") + zc := debugZipContext{ + z: z, + clusterPrinter: zr, + timeout: 3 * time.Second, + firstNodeSQLConn: sqlConn, + } + 
if err := zc.dumpTraceableJobTraces(); err != nil { + t.Fatal(err) + } + }() + + r, err := zip.OpenReader(zipName) + if err != nil { + t.Fatal(err) + } + defer func() { _ = r.Close() }() + var fileList strings.Builder + for _, f := range r.File { + fmt.Fprintln(&fileList, f.Name) + } + require.Regexp(t, expectedFilesList.String(), fileList.String()) + close(blockCh) + jobutils.WaitForJobToSucceed(t, runner, importJobID) + jobutils.WaitForJobToSucceed(t, runner, importJobID2) +} diff --git a/pkg/jobs/BUILD.bazel b/pkg/jobs/BUILD.bazel index 879cd4b7c940..2f360a87ef02 100644 --- a/pkg/jobs/BUILD.bazel +++ b/pkg/jobs/BUILD.bazel @@ -41,7 +41,6 @@ go_library( "//pkg/scheduledjobs", "//pkg/security/username", "//pkg/server/telemetry", - "//pkg/server/tracedumper", "//pkg/settings", "//pkg/settings/cluster", "//pkg/sql/catalog", diff --git a/pkg/jobs/registry.go b/pkg/jobs/registry.go index 7b23edfd762c..2c2a8a7f5677 100644 --- a/pkg/jobs/registry.go +++ b/pkg/jobs/registry.go @@ -26,7 +26,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/server/telemetry" - "github.com/cockroachdb/cockroach/pkg/server/tracedumper" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb" "github.com/cockroachdb/cockroach/pkg/sql/isql" @@ -107,7 +106,6 @@ type Registry struct { settings *cluster.Settings execCtx jobExecCtxMaker metrics Metrics - td *tracedumper.TraceDumper knobs TestingKnobs // adoptionChan is used to nudge the registry to resume claimed jobs and @@ -226,7 +224,6 @@ func MakeRegistry( histogramWindowInterval time.Duration, execCtxFn jobExecCtxMaker, preventAdoptionFile string, - td *tracedumper.TraceDumper, knobs *TestingKnobs, ) *Registry { r := &Registry{ @@ -241,7 +238,6 @@ func MakeRegistry( execCtx: execCtxFn, preventAdoptionFile: preventAdoptionFile, preventAdoptionLogEvery: log.Every(time.Minute), - td: 
td, // Use a non-zero buffer to allow queueing of notifications. // The writing method will use a default case to avoid blocking // if a notification is already queued. diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 66e34929c750..349a875fd0c7 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -201,7 +201,6 @@ go_library( "//pkg/server/systemconfigwatcher", "//pkg/server/telemetry", "//pkg/server/tenantsettingswatcher", - "//pkg/server/tracedumper", "//pkg/settings", "//pkg/settings/cluster", "//pkg/spanconfig", diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index e567db52e431..194fd07b4e96 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -63,7 +63,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/server/settingswatcher" "github.com/cockroachdb/cockroach/pkg/server/status" "github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher" - "github.com/cockroachdb/cockroach/pkg/server/tracedumper" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/spanconfig" @@ -646,7 +645,6 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { jobsKnobs = cfg.TestingKnobs.JobsTestingKnobs.(*jobs.TestingKnobs) } - td := tracedumper.NewTraceDumper(ctx, cfg.InflightTraceDirName, cfg.Settings) *jobRegistry = *jobs.MakeRegistry( ctx, cfg.AmbientCtx, @@ -663,7 +661,6 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { return sql.MakeJobExecContext(ctx, opName, user, &sql.MemoryMetrics{}, execCfg) }, jobAdoptionStopFile, - td, jobsKnobs, ) } diff --git a/pkg/server/tracedumper/BUILD.bazel b/pkg/server/tracedumper/BUILD.bazel index 5b1a88ad66ce..619fde7e16ce 100644 --- a/pkg/server/tracedumper/BUILD.bazel +++ b/pkg/server/tracedumper/BUILD.bazel @@ -10,11 +10,8 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/server/dumpstore", - 
"//pkg/settings", - "//pkg/settings/cluster", "//pkg/sql/isql", "//pkg/util/log", - "//pkg/util/timeutil", "//pkg/util/tracing/zipper", "@com_github_cockroachdb_errors//:errors", ], diff --git a/pkg/server/tracedumper/tracedumper.go b/pkg/server/tracedumper/tracedumper.go index 8d9910669679..669acd17a560 100644 --- a/pkg/server/tracedumper/tracedumper.go +++ b/pkg/server/tracedumper/tracedumper.go @@ -18,11 +18,8 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/server/dumpstore" - "github.com/cockroachdb/cockroach/pkg/settings" - "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing/zipper" "github.com/cockroachdb/errors" ) @@ -32,17 +29,6 @@ const ( timeFormat = "2006-01-02T15_04_05.000" ) -var ( - totalDumpSizeLimit = settings.RegisterByteSizeSetting( - settings.ApplicationLevel, - "server.job_trace.total_dump_size_limit", - "total size of job trace dumps to be kept. "+ - "Dumps are GC'ed in the order of creation time. The latest dump is "+ - "always kept even if its size exceeds the limit.", - 500<<20, // 500MiB - ) -) - // TraceDumper can be used to dump a zip file containing cluster wide inflight // trace spans for a particular trace, to a configured dir. type TraceDumper struct { @@ -106,20 +92,3 @@ func (t *TraceDumper) Dump(ctx context.Context, name string, traceID int64, ie i log.Errorf(ctx, "failed to dump trace %v", err) } } - -// NewTraceDumper returns a TraceDumper. -// -// dir is the directory in which dumps are stored. 
-func NewTraceDumper(ctx context.Context, dir string, st *cluster.Settings) *TraceDumper { - if dir == "" { - return nil - } - - log.Infof(ctx, "writing job trace dumps to %s", log.SafeManaged(dir)) - - td := &TraceDumper{ - currentTime: timeutil.Now, - store: dumpstore.NewStore(dir, totalDumpSizeLimit, st), - } - return td -} diff --git a/pkg/sql/copy/copy_test.go b/pkg/sql/copy/copy_test.go index 3cca3a5a6ba3..16d3f71b8039 100644 --- a/pkg/sql/copy/copy_test.go +++ b/pkg/sql/copy/copy_test.go @@ -339,7 +339,7 @@ func TestCopyFromTransaction(t *testing.T) { tconn := sqlConnCtx.MakeSQLConn(io.Discard, io.Discard, url.String()) tc.testf(tconn, func(tconn clisqlclient.Conn) { // Without this everything comes back as strings - tconn.SetAlwaysInferResultTypes(true) + _ = tconn.SetAlwaysInferResultTypes(true) // Put each test in its own db so they can be parallelized. err := tconn.Exec(ctx, fmt.Sprintf("CREATE DATABASE %s; USE %s", tc.name, tc.name)) require.NoError(t, err) From 48eceb79a0b15554ab6486daf717994002ed4d04 Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Wed, 18 Oct 2023 13:21:11 -0400 Subject: [PATCH 3/3] logictest: disable metamorphic testing for mixed version test We've seen that these tests can get quite slow sometimes. One theory is that this is caused by metamorphic test settings, so this PR disables them so we can determine if it helps with test stability. 
Release note: None --- pkg/sql/logictest/logic.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pkg/sql/logictest/logic.go b/pkg/sql/logictest/logic.go index 55aa45cfacf9..8e32b2f5852c 100644 --- a/pkg/sql/logictest/logic.go +++ b/pkg/sql/logictest/logic.go @@ -1303,11 +1303,14 @@ func (t *logicTest) newTestServerCluster(bootstrapBinaryPath, upgradeBinaryPath testserver.CockroachLogsDirOpt(logsDir), } if strings.Contains(upgradeBinaryPath, "cockroach-short") { - // If we're using a cockroach-short binary, that means it was - // locally built, so we need to opt-out of version offsetting to - // better simulate a real upgrade path. opts = append(opts, testserver.EnvVarOpt([]string{ + // If we're using a cockroach-short binary, that means it was + // locally built, so we need to opt-out of version offsetting to + // better simulate a real upgrade path. "COCKROACH_TESTING_FORCE_RELEASE_BRANCH=true", + // The build is made during testing, so it has metamorphic constants. + // We disable them here so that the test is more stable. + "COCKROACH_INTERNAL_DISABLE_METAMORPHIC_TESTING=true", })) }