
ddl: support checkpoint for ingest mode #42769

Merged
merged 21 commits into from Apr 12, 2023
Changes from 13 commits

Commits (21)
2b185f6
ddl: support checkpoint for ingest mode
tangenta Apr 3, 2023
73ddb5d
ddl: fix data inconsistency issue
tangenta Apr 4, 2023
31a774a
fix test for build and refine code
tangenta Apr 4, 2023
e0f7dbb
make job.RowCount more accurate
tangenta Apr 4, 2023
6a9b52e
Merge remote-tracking branch 'upstream/master' into add-index-checkpo…
tangenta Apr 4, 2023
d85eb4b
Merge remote-tracking branch 'upstream/master' into add-index-checkpo…
tangenta Apr 4, 2023
d3241b7
move checkpoint to ingest package
tangenta Apr 5, 2023
0b5b1ab
add part of test for checkpoint manager
tangenta Apr 6, 2023
a8d490f
add more test for checkpoint manager
tangenta Apr 6, 2023
c14674e
fix linter
tangenta Apr 6, 2023
a378b11
Merge remote-tracking branch 'upstream/master' into add-index-checkpo…
tangenta Apr 6, 2023
1abad96
update bazel
tangenta Apr 7, 2023
b4b454d
add comments
tangenta Apr 10, 2023
9a3ca75
remove redundant file
tangenta Apr 11, 2023
87987c6
refine code
tangenta Apr 11, 2023
92dc8b5
Merge remote-tracking branch 'upstream/master' into add-index-checkpo…
tangenta Apr 11, 2023
b08d9b9
Merge remote-tracking branch 'upstream/master' into add-index-checkpo…
tangenta Apr 12, 2023
a562d83
use checkpoint manager only if distributed exec is disabled
tangenta Apr 12, 2023
f3f2b55
update the checkpoint flush interval to 10 min
tangenta Apr 12, 2023
e535400
add task ID allocator
tangenta Apr 12, 2023
9ef599f
Merge remote-tracking branch 'upstream/master' into add-index-checkpo…
tangenta Apr 12, 2023
3 changes: 3 additions & 0 deletions br/pkg/lightning/backend/backend.go
@@ -108,6 +108,9 @@ type EngineConfig struct {
TableInfo *checkpoints.TidbTableInfo
// local backend specified configuration
Local LocalEngineConfig
// KeepSortDir indicates whether to keep the temporary sort directory
// when opening the engine, instead of removing it.
KeepSortDir bool
}

// LocalEngineConfig is the configuration used for local backend in OpenEngine.
6 changes: 4 additions & 2 deletions br/pkg/lightning/backend/local/local.go
@@ -851,8 +851,10 @@ func (local *Local) OpenEngine(ctx context.Context, cfg *backend.EngineConfig, e
}

sstDir := engineSSTDir(local.LocalStoreDir, engineUUID)
if err := os.RemoveAll(sstDir); err != nil {
return errors.Trace(err)
if !cfg.KeepSortDir {
if err := os.RemoveAll(sstDir); err != nil {
return errors.Trace(err)
}
}
if !common.IsDirExists(sstDir) {
if err := os.Mkdir(sstDir, 0o750); err != nil {
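The KeepSortDir flag added above lets the caller keep whatever data is already in the engine's temporary sort directory instead of wiping it when the engine is opened, which is what a checkpoint-based resume of an add-index job needs. Below is a minimal sketch of such a caller; the helper name, the resuming flag, and the import paths are illustrative assumptions, the OpenEngine parameter list is truncated in the hunk header so the engineUUID parameter is a guess, and only the EngineConfig field and the skipped os.RemoveAll behavior come from the diff itself.

import (
	"context"

	"github.com/google/uuid"
	"github.com/pingcap/tidb/br/pkg/lightning/backend"
	"github.com/pingcap/tidb/br/pkg/lightning/backend/local"
	"github.com/pingcap/tidb/br/pkg/lightning/checkpoints"
)

// openEngineKeepingSortDir is an illustrative helper, not code from this PR.
// With KeepSortDir set, OpenEngine skips os.RemoveAll(sstDir), so SST data
// sorted before a restart survives and can be reused after resume.
func openEngineKeepingSortDir(ctx context.Context, be *local.Local,
	tblInfo *checkpoints.TidbTableInfo, engineUUID uuid.UUID, resuming bool) error {
	cfg := &backend.EngineConfig{
		TableInfo:   tblInfo,
		KeepSortDir: resuming, // keep the temporary sort directory on resume
	}
	return be.OpenEngine(ctx, cfg, engineUUID)
}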
1 change: 1 addition & 0 deletions ddl/BUILD.bazel
@@ -161,6 +161,7 @@ go_test(
"attributes_sql_test.go",
"backfilling_test.go",
"cancel_test.go",
"checkpoint_test.go",
"cluster_test.go",
"column_change_test.go",
"column_modify_test.go",
7 changes: 6 additions & 1 deletion ddl/backfilling.go
@@ -185,6 +185,7 @@ type backfillResult struct {
taskID int
addedCount int
scanCount int
totalCount int
nextKey kv.Key
err error
}
@@ -596,7 +597,11 @@ func handleOneResult(result *backfillResult, scheduler backfillScheduler, consum
scheduler.drainTasks() // Make it quit early.
return result.err
}
*totalAddedCount += int64(result.addedCount)
if result.totalCount > 0 {
*totalAddedCount = int64(result.totalCount)
} else {
*totalAddedCount += int64(result.addedCount)
}
reorgCtx := consumer.dc.getReorgCtx(reorgInfo.Job.ID)
reorgCtx.setRowCount(*totalAddedCount)
keeper.updateNextKey(result.taskID, result.nextKey)
67 changes: 47 additions & 20 deletions ddl/backfilling_scheduler.go
@@ -70,16 +70,17 @@ type txnBackfillScheduler struct {
workers []*backfillWorker
wg sync.WaitGroup

taskCh chan *reorgBackfillTask
resultCh chan *backfillResult
closed bool
taskCh chan *reorgBackfillTask
resultCh chan *backfillResult
taskMaxID int
closed bool
}

func newBackfillScheduler(ctx context.Context, info *reorgInfo, sessPool *sess.Pool,
tp backfillerType, tbl table.PhysicalTable, sessCtx sessionctx.Context,
jobCtx *JobContext) (backfillScheduler, error) {
if tp == typeAddIndexWorker && info.ReorgMeta.ReorgTp == model.ReorgTypeLitMerge {
return newIngestBackfillScheduler(ctx, info, tbl), nil
return newIngestBackfillScheduler(ctx, info, sessPool, tbl), nil
}
return newTxnBackfillScheduler(ctx, info, sessPool, tp, tbl, sessCtx, jobCtx)
}
@@ -110,6 +111,8 @@ func (b *txnBackfillScheduler) setupWorkers() error {
}

func (b *txnBackfillScheduler) sendTask(task *reorgBackfillTask) {
b.taskMaxID++
Review comment (Member): Why reallocate the task ID?

Reply (tangenta, author): Because it needs to be unique during the lifetime of the DDL job, instead of a task batch.

task.id = b.taskMaxID
b.taskCh <- task
}
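The review exchange above explains why the scheduler now reallocates task IDs itself: the ID has to stay unique for the whole lifetime of the DDL job, not just within one batch of tasks, so both schedulers keep a taskMaxID counter that is bumped in sendTask. The sketch below only illustrates that idea; a later commit in this PR adds a real task ID allocator, and this is not that code.

// taskIDAllocator sketches the intent behind taskMaxID: a counter owned by the
// scheduler (and therefore by the DDL job), so task IDs never repeat across batches.
type taskIDAllocator struct {
	next int
}

// alloc returns the next job-wide unique task ID.
func (a *taskIDAllocator) alloc() int {
	a.next++
	return a.next
}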

@@ -254,6 +257,7 @@ type ingestBackfillScheduler struct {
type ingestBackfillScheduler struct {
ctx context.Context
reorgInfo *reorgInfo
sessPool *sess.Pool
tbl table.PhysicalTable

closed bool
@@ -262,24 +266,31 @@ type ingestBackfillScheduler struct {
resultCh chan *backfillResult

copReqSenderPool *copReqSenderPool
taskMaxID int

writerPool *workerpool.WorkerPool[idxRecResult]
writerMaxID int
poolErr chan error
backendCtx *ingest.BackendContext

checkpointMgr ingest.CheckpointManager
}

func newIngestBackfillScheduler(ctx context.Context, info *reorgInfo, tbl table.PhysicalTable) *ingestBackfillScheduler {
func newIngestBackfillScheduler(ctx context.Context, info *reorgInfo,
sessPool *sess.Pool, tbl table.PhysicalTable) *ingestBackfillScheduler {
return &ingestBackfillScheduler{
ctx: ctx,
reorgInfo: info,
sessPool: sessPool,
tbl: tbl,
taskCh: make(chan *reorgBackfillTask, backfillTaskChanSize),
resultCh: make(chan *backfillResult, backfillTaskChanSize),
poolErr: make(chan error),
}
}

const checkpointUpdateInterval = 5 * time.Second

func (b *ingestBackfillScheduler) setupWorkers() error {
job := b.reorgInfo.Job
bc, ok := ingest.LitBackCtxMgr.Load(job.ID)
@@ -288,6 +299,12 @@ func (b *ingestBackfillScheduler) setupWorkers() error {
return errors.Trace(errors.New("cannot get lightning backend"))
}
b.backendCtx = bc
mgr, err := ingest.NewCentralizedCheckpointManager(b.ctx, bc, b.sessPool, job.ID,
b.reorgInfo.currElement.ID, checkpointUpdateInterval)
if err != nil {
return errors.Trace(err)
}
b.checkpointMgr = mgr
copReqSenderPool, err := b.createCopReqSenderPool()
if err != nil {
return errors.Trace(err)
@@ -311,8 +328,21 @@ func (b *ingestBackfillScheduler) close(force bool) {
return
}
close(b.taskCh)
b.copReqSenderPool.close(force)
b.writerPool.ReleaseAndWait()
if b.copReqSenderPool != nil {
b.copReqSenderPool.close(force)
}
if b.writerPool != nil {
b.writerPool.ReleaseAndWait()
}
if b.checkpointMgr != nil {
b.checkpointMgr.Close()
}
// Get the latest status after all workers are closed so that the result is more accurate.
cnt, nextKey := b.checkpointMgr.Status()
b.resultCh <- &backfillResult{
totalCount: cnt,
nextKey: nextKey,
}
close(b.resultCh)
if !force {
jobID := b.reorgInfo.ID
@@ -325,6 +355,8 @@ func (b *ingestBackfillScheduler) close(force bool) {
}

func (b *ingestBackfillScheduler) sendTask(task *reorgBackfillTask) {
b.taskMaxID++
task.id = b.taskMaxID
b.taskCh <- task
}

@@ -375,7 +407,8 @@ func (b *ingestBackfillScheduler) createWorker() workerpool.Worker[idxRecResult]
return nil
}
worker, err := newAddIndexIngestWorker(b.tbl, reorgInfo.d, ei, b.resultCh, job.ID,
reorgInfo.SchemaName, b.reorgInfo.currElement.ID, b.writerMaxID, b.copReqSenderPool, sessCtx)
reorgInfo.SchemaName, b.reorgInfo.currElement.ID, b.writerMaxID,
b.copReqSenderPool, sessCtx, b.checkpointMgr)
if err != nil {
// Return an error only if it is the first worker.
if b.writerMaxID == 0 {
@@ -407,7 +440,7 @@ func (b *ingestBackfillScheduler) createCopReqSenderPool() (*copReqSenderPool, e
logutil.BgLogger().Warn("[ddl-ingest] cannot init cop request sender", zap.Error(err))
return nil, err
}
return newCopReqSenderPool(b.ctx, copCtx, sessCtx.GetStore(), b.taskCh), nil
return newCopReqSenderPool(b.ctx, copCtx, sessCtx.GetStore(), b.taskCh, b.checkpointMgr), nil
}

func (b *ingestBackfillScheduler) expectedWorkerSize() (readerSize int, writerSize int) {
Expand Down Expand Up @@ -439,30 +472,24 @@ func (w *addIndexIngestWorker) HandleTask(rs idxRecResult) {
w.resultCh <- result
return
}
count, nextKey, err := w.WriteLocal(&rs)
count, err := w.WriteLocal(&rs)
if err != nil {
result.err = err
w.resultCh <- result
return
}
if count == 0 {
logutil.BgLogger().Info("[ddl-ingest] finish a cop-request task", zap.Int("id", rs.id))
if bc, ok := ingest.LitBackCtxMgr.Load(w.jobID); ok {
err := bc.Flush(w.index.Meta().ID)
if err != nil {
result.err = err
w.resultCh <- result
}
}
return
}
result.scanCount = count
result.addedCount = count
cnt, nextKey := w.checkpointMgr.Status()
result.totalCount = cnt
result.nextKey = nextKey
w.metricCounter.Add(float64(count))
w.metricCounter.Add(float64(cnt))
if ResultCounterForTest != nil && result.err == nil {
ResultCounterForTest.Add(1)
}
result.err = w.checkpointMgr.UpdateCurrent(rs.id, count)
w.resultCh <- result
}

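Taken together, the scheduler changes above wire the checkpoint manager into three spots: it is created in setupWorkers via ingest.NewCentralizedCheckpointManager, each ingest writer reports per-task progress with UpdateCurrent and reads the job-wide Status when building its result, and close reads Status one final time after all workers have stopped so the reported row count and next key are as accurate as possible. The walk-through below condenses that flow for reference; only the method names come from the diff, everything else is a placeholder.

// ingestFlowSketch is illustrative only; the real wiring is in
// ingestBackfillScheduler above.
func ingestFlowSketch(mgr ingest.CheckpointManager, taskIDs []int, rowsPerTask int) (int, kv.Key, error) {
	for _, id := range taskIDs {
		// ... rows for the task are read and written to the local backend here ...
		if err := mgr.UpdateCurrent(id, rowsPerTask); err != nil { // record per-task progress
			return 0, nil, err
		}
	}
	mgr.Close()                    // flush any pending checkpoint state
	total, nextKey := mgr.Status() // final, most accurate progress for the job
	return total, nextKey, nil
}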
21 changes: 21 additions & 0 deletions ddl/checkpoint_test.go
@@ -0,0 +1,21 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ddl

import "testing"

func TestCheckpointManager(t *testing.T) {

}
2 changes: 1 addition & 1 deletion ddl/dist_backfilling.go
@@ -270,7 +270,7 @@ func (bwm *backfilWorkerManager) waitFinalResult(resultCh <-chan *backfillResult
}

if ingestBackendCtx != nil && i%workerCnt == 0 {
err := ingestBackendCtx.Flush(eleID)
_, err := ingestBackendCtx.Flush(eleID)
if err != nil {
bwm.unsyncErr = err
return
22 changes: 21 additions & 1 deletion ddl/export_test.go
@@ -51,7 +51,7 @@ func FetchChunk4Test(copCtx *copContext, tbl table.PhysicalTable, startKey, endK
}
taskCh := make(chan *reorgBackfillTask, 5)
resultCh := make(chan idxRecResult, 5)
pool := newCopReqSenderPool(context.Background(), copCtx, store, taskCh)
pool := newCopReqSenderPool(context.Background(), copCtx, store, taskCh, &dummyCheckpointMgr{})
pool.chunkSender = &resultChanForTest{ch: resultCh}
pool.adjustSize(1)
pool.tasksCh <- task
@@ -67,3 +67,23 @@ func ConvertRowToHandleAndIndexDatum(row chunk.Row, copCtx *copContext) (kv.Hand
handle, err := buildHandle(handleData, copCtx.tblInfo, copCtx.pkInfo, &stmtctx.StatementContext{TimeZone: time.Local})
return handle, idxData, err
}

type dummyCheckpointMgr struct{}

func (d *dummyCheckpointMgr) Status() (_ int, _ kv.Key) {
return 0, nil
}

func (d *dummyCheckpointMgr) IsComplete(_ int, _, _ kv.Key) bool {
return false
}

func (d *dummyCheckpointMgr) Register(_ int, _, _ kv.Key) {}

func (d *dummyCheckpointMgr) UpdateTotal(_ int, _ int, _ bool) {}

func (d *dummyCheckpointMgr) UpdateCurrent(_ int, _ int) error {
return nil
}

func (d *dummyCheckpointMgr) Close() {}
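
The dummyCheckpointMgr stub above implies the shape of the ingest.CheckpointManager interface used throughout this PR. The declaration below is reconstructed from the stub's method set and from how Status and UpdateCurrent are called in the scheduler hunks; it is an inference for reference, not a copy of the real declaration in the ingest package, and the method comments describe presumed semantics only.

// CheckpointManager as inferred from dummyCheckpointMgr; the real interface in
// the ingest package may differ in comments or carry additional methods.
type CheckpointManager interface {
	// Status reports rows accounted for so far and the next key to scan from.
	Status() (int, kv.Key)
	// IsComplete reports whether a task's key range is already covered by the checkpoint.
	IsComplete(taskID int, start, end kv.Key) bool
	// Register records a task's key range with the manager.
	Register(taskID int, start, end kv.Key)
	// UpdateTotal updates the expected row count for a task; the bool marks the final update.
	UpdateTotal(taskID int, count int, last bool)
	// UpdateCurrent records rows processed for a task since the last update.
	UpdateCurrent(taskID int, count int) error
	// Close flushes pending state and releases the manager.
	Close()
}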