*: provide a variable to ignore the real-time stats in the planner (#43988)

close #46080
pingcap · Sep 1, 2023 · 7119410 · 7119410
1 parent 3036b6d
commit 7119410
Show file tree

Hide file tree

Showing 22 changed files with 332 additions and 165 deletions.
diff --git a/planner/cardinality/BUILD.bazel b/planner/cardinality/BUILD.bazel
@@ -59,7 +59,7 @@ go_test(
     data = glob(["testdata/**"]),
     embed = [":cardinality"],
     flaky = True,
-    shard_count = 31,
+    shard_count = 32,
     deps = [
         "//config",
         "//domain",

diff --git a/planner/cardinality/row_count_column.go b/planner/cardinality/row_count_column.go
@@ -282,9 +282,13 @@ func GetColumnRowCount(sctx sessionctx.Context, c *statistics.Column, ranges []*
 		// If the current table row count has changed, we should scale the row count accordingly.
 		cnt *= c.GetIncreaseFactor(realtimeRowCount)
 
+		histNDV := c.NDV
+		if c.StatsVer == statistics.Version2 {
+			histNDV = histNDV - int64(c.TopN.Num())
+		}
 		// handling the out-of-range part
 		if (c.OutOfRange(lowVal) && !lowVal.IsNull()) || c.OutOfRange(highVal) {
-			cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount)
+			cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount, histNDV)
 		}
 
 		if debugTrace {

diff --git a/planner/cardinality/row_count_index.go b/planner/cardinality/row_count_index.go
@@ -320,9 +320,13 @@ func getIndexRowCountForStatsV2(sctx sessionctx.Context, idx *statistics.Index,
 		// If the current table row count has changed, we should scale the row count accordingly.
 		count *= idx.GetIncreaseFactor(realtimeRowCount)
 
+		histNDV := idx.NDV
+		if idx.StatsVer == statistics.Version2 {
+			histNDV = histNDV - int64(idx.TopN.Num())
+		}
 		// handling the out-of-range part
 		if (outOfRangeOnIndex(idx, l) && !(isSingleCol && lowIsNull)) || outOfRangeOnIndex(idx, r) {
-			count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount)
+			count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV)
 		}
 
 		if debugTrace {

diff --git a/planner/cardinality/row_count_test.go b/planner/cardinality/row_count_test.go
@@ -33,7 +33,7 @@ func TestPseudoTable(t *testing.T) {
 		State:     model.StatePublic,
 	}
 	ti.Columns = append(ti.Columns, colInfo)
-	tbl := statistics.PseudoTable(ti)
+	tbl := statistics.PseudoTable(ti, false)
 	require.Len(t, tbl.Columns, 1)
 	require.Greater(t, tbl.RealtimeCount, int64(0))
 	sctx := mock.NewContext()
@@ -50,7 +50,7 @@ func TestPseudoTable(t *testing.T) {
 		Hidden:    true,
 		State:     model.StatePublic,
 	})
-	tbl = statistics.PseudoTable(ti)
+	tbl = statistics.PseudoTable(ti, false)
 	// We added a hidden column. The pseudo table still only have one column.
 	require.Equal(t, len(tbl.Columns), 1)
 }
diff --git a/planner/cardinality/selectivity_test.go b/planner/cardinality/selectivity_test.go
@@ -173,11 +173,15 @@ func TestOutOfRangeEstimationAfterDelete(t *testing.T) {
 	testKit.MustExec("drop table if exists t")
 	testKit.MustExec("create table t(a int unsigned)")
 	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+	// [300, 900)
+	// 5 rows for each value, 3000 rows in total.
 	for i := 0; i < 3000; i++ {
-		testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300)) // [300, 900)
+		testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300))
 	}
 	require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll))
 	testKit.MustExec("analyze table t with 1 samplerate, 0 topn")
+	// Data in [300, 500), 1000 rows in total, are deleted.
+	// 2000 rows left.
 	testKit.MustExec("delete from t where a < 500")
 	require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll))
 	require.Nil(t, h.Update(dom.InfoSchema()))
@@ -193,9 +197,15 @@ func TestOutOfRangeEstimationAfterDelete(t *testing.T) {
 	for i := range input {
 		testdata.OnRecord(func() {
 			output[i].SQL = input[i]
-			output[i].Result = testdata.ConvertRowsToStrings(testKit.MustQuery(input[i]).Rows())
 		})
-		testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
+		if strings.HasPrefix(input[i], "explain") {
+			testdata.OnRecord(func() {
+				output[i].Result = testdata.ConvertRowsToStrings(testKit.MustQuery(input[i]).Rows())
+			})
+			testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
+		} else {
+			testKit.MustExec(input[i])
+		}
 	}
 }
 
@@ -1321,3 +1331,71 @@ func TestCrossValidationSelectivity(t *testing.T) {
 		"└─Selection 0.00 cop[tikv]  gt(test.t.c, 1000)",
 		"  └─TableRangeScan 2.00 cop[tikv] table:t range:(1 0,1 1000), keep order:false"))
 }
+
+func TestIgnoreRealtimeStats(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	testKit := testkit.NewTestKit(t, store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int, b int, index ib(b))")
+	h := dom.StatsHandle()
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+
+	// 1. Insert 11 rows of data without ANALYZE.
+	testKit.MustExec("insert into t values(1,1),(1,2),(1,3),(1,4),(1,5),(2,1),(2,2),(2,3),(2,4),(2,5),(3,1)")
+	require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll))
+	require.Nil(t, h.Update(dom.InfoSchema()))
+
+	// 1-1. use real-time stats.
+	// From the real-time stats, we are able to know the total count is 11.
+	testKit.MustExec("set @@tidb_opt_objective = 'moderate'")
+	testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(
+		"TableReader_7 0.00 root  data:Selection_6",
+		"└─Selection_6 0.00 cop[tikv]  eq(test.t.a, 1), gt(test.t.b, 2)",
+		"  └─TableFullScan_5 11.00 cop[tikv] table:t keep order:false, stats:pseudo",
+	))
+
+	// 1-2. ignore real-time stats.
+	// Use pseudo stats table. The total row count is 10000.
+	testKit.MustExec("set @@tidb_opt_objective = 'determinate'")
+	testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(
+		"TableReader_7 3.33 root  data:Selection_6",
+		"└─Selection_6 3.33 cop[tikv]  eq(test.t.a, 1), gt(test.t.b, 2)",
+		"  └─TableFullScan_5 10000.00 cop[tikv] table:t keep order:false, stats:pseudo",
+	))
+
+	// 2. After ANALYZE.
+	testKit.MustExec("analyze table t with 1 samplerate")
+	require.Nil(t, h.Update(dom.InfoSchema()))
+
+	// The execution plans are the same whether or not we ignore the real-time stats.
+	analyzedPlan := []string{
+		"TableReader_7 2.73 root  data:Selection_6",
+		"└─Selection_6 2.73 cop[tikv]  eq(test.t.a, 1), gt(test.t.b, 2)",
+		"  └─TableFullScan_5 11.00 cop[tikv] table:t keep order:false",
+	}
+	testKit.MustExec("set @@tidb_opt_objective = 'moderate'")
+	testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(analyzedPlan...))
+	testKit.MustExec("set @@tidb_opt_objective = 'determinate'")
+	testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(analyzedPlan...))
+
+	// 3. Insert another 4 rows of data.
+	testKit.MustExec("insert into t values(3,2),(3,3),(3,4),(3,5)")
+	require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll))
+	require.Nil(t, h.Update(dom.InfoSchema()))
+
+	// 3-1. use real-time stats.
+	// From the real-time stats, we are able to know the total count is 15.
+	// Selectivity is not changed: 15 * (2.73 / 11) = 3.72
+	testKit.MustExec("set @@tidb_opt_objective = 'moderate'")
+	testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(
+		"TableReader_7 3.72 root  data:Selection_6",
+		"└─Selection_6 3.72 cop[tikv]  eq(test.t.a, 1), gt(test.t.b, 2)",
+		"  └─TableFullScan_5 15.00 cop[tikv] table:t keep order:false",
+	))
+
+	// 3-2. ignore real-time stats.
+	// The execution plan is the same as case 2.
+	testKit.MustExec("set @@tidb_opt_objective = 'determinate'")
+	testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(analyzedPlan...))
+}
diff --git a/planner/cardinality/testdata/cardinality_suite_in.json b/planner/cardinality/testdata/cardinality_suite_in.json
@@ -111,7 +111,12 @@
       "explain format = 'brief' select * from t where a > 900 and a < 1000",
       "explain format = 'brief' select * from t where a > 900 and a < 1100",
       "explain format = 'brief' select * from t where a > 200 and a < 300",
-      "explain format = 'brief' select * from t where a > 100 and a < 300"
+      "explain format = 'brief' select * from t where a > 100 and a < 300",
+      "set @@tidb_opt_objective = 'determinate'",
+      "explain format = 'brief' select * from t where a <= 300",
+      "explain format = 'brief' select * from t where a <= 500",
+      "explain format = 'brief' select * from t where a > 900",
+      "explain format = 'brief' select * from t where a <= 900"
     ]
   },
   {

diff --git a/planner/cardinality/testdata/cardinality_suite_out.json b/planner/cardinality/testdata/cardinality_suite_out.json
@@ -234,6 +234,42 @@
           "└─Selection 832.49 cop[tikv]  gt(test.t.a, 100), lt(test.t.a, 300)",
           "  └─TableFullScan 2000.00 cop[tikv] table:t keep order:false"
         ]
+      },
+      {
+        "SQL": "set @@tidb_opt_objective = 'determinate'",
+        "Result": null
+      },
+      {
+        "SQL": "explain format = 'brief' select * from t where a <= 300",
+        "Result": [
+          "TableReader 10.00 root  data:Selection",
+          "└─Selection 10.00 cop[tikv]  le(test.t.a, 300)",
+          "  └─TableFullScan 3000.00 cop[tikv] table:t keep order:false"
+        ]
+      },
+      {
+        "SQL": "explain format = 'brief' select * from t where a <= 500",
+        "Result": [
+          "TableReader 1010.00 root  data:Selection",
+          "└─Selection 1010.00 cop[tikv]  le(test.t.a, 500)",
+          "  └─TableFullScan 3000.00 cop[tikv] table:t keep order:false"
+        ]
+      },
+      {
+        "SQL": "explain format = 'brief' select * from t where a > 900",
+        "Result": [
+          "TableReader 5.00 root  data:Selection",
+          "└─Selection 5.00 cop[tikv]  gt(test.t.a, 900)",
+          "  └─TableFullScan 3000.00 cop[tikv] table:t keep order:false"
+        ]
+      },
+      {
+        "SQL": "explain format = 'brief' select * from t where a <= 900",
+        "Result": [
+          "TableReader 3000.00 root  data:Selection",
+          "└─Selection 3000.00 cop[tikv]  le(test.t.a, 900)",
+          "  └─TableFullScan 3000.00 cop[tikv] table:t keep order:false"
+        ]
       }
     ]
   },

diff --git a/planner/core/logical_plan_builder.go b/planner/core/logical_plan_builder.go
@@ -4695,6 +4695,7 @@ func (ds *DataSource) AddExtraPhysTblIDColumn() *expression.Column {
 // 1. tidb-server started and statistics handle has not been initialized.
 // 2. table row count from statistics is zero.
 // 3. statistics is outdated.
+// Note: please also update getLatestVersionFromStatsTable() when logic in this function changes.
 func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) *statistics.Table {
 	statsHandle := domain.GetDomain(ctx).StatsHandle()
 	var usePartitionStats, countIs0, pseudoStatsForUninitialized, pseudoStatsForOutdated bool
@@ -4717,7 +4718,7 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64)
 	}
 	// 1. tidb-server started and statistics handle has not been initialized.
 	if statsHandle == nil {
-		return statistics.PseudoTable(tblInfo)
+		return statistics.PseudoTable(tblInfo, false)
 	}
 
 	if pid == tblInfo.ID || ctx.GetSessionVars().StmtCtx.UseDynamicPartitionPrune() {
@@ -4727,11 +4728,35 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64)
 		statsTbl = statsHandle.GetPartitionStats(tblInfo, pid, cache.WithTableStatsByQuery())
 	}
 
+	allowPseudoTblTriggerLoading := false
+	// In OptObjectiveDeterminate mode, we need to ignore the real-time stats.
+	// To achieve this, we copy the statsTbl and reset the real-time stats fields (set ModifyCount to 0 and set
+	// RealtimeCount to the row count from the ANALYZE, which is fetched from loaded stats in GetAnalyzeRowCount()).
+	if ctx.GetSessionVars().GetOptObjective() == variable.OptObjectiveDeterminate {
+		analyzeCount := max(int64(statsTbl.GetAnalyzeRowCount()), 0)
+		// If the two fields are already the values we want, we don't need to modify it, and also we don't need to copy.
+		if statsTbl.RealtimeCount != analyzeCount || statsTbl.ModifyCount != 0 {
+			// Here is a case that needs special care:
+			// The original stats table from the stats cache is not a pseudo table, but the analyze row count is 0 (probably
+			// because no col/idx stats are loaded), which will make it a pseudo table according to rule 2 below.
+			// Normally, a pseudo table won't trigger stats loading since we assume it means "no stats available", but
+			// in such a case, we need it to be able to trigger stats loading.
+			// That's why we use the special allowPseudoTblTriggerLoading flag here.
+			if !statsTbl.Pseudo && statsTbl.RealtimeCount > 0 && analyzeCount == 0 {
+				allowPseudoTblTriggerLoading = true
+			}
+			// Copy it so we can modify the ModifyCount and the RealtimeCount safely.
+			statsTbl = statsTbl.ShallowCopy()
+			statsTbl.RealtimeCount = analyzeCount
+			statsTbl.ModifyCount = 0
+		}
+	}
+
 	// 2. table row count from statistics is zero.
 	if statsTbl.RealtimeCount == 0 {
 		countIs0 = true
 		core_metrics.PseudoEstimationNotAvailable.Inc()
-		return statistics.PseudoTable(tblInfo)
+		return statistics.PseudoTable(tblInfo, allowPseudoTblTriggerLoading)
 	}
 
 	// 3. statistics is uninitialized or outdated.
@@ -4751,6 +4776,44 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64)
 	return statsTbl
 }
 
+// getLatestVersionFromStatsTable fetches the statistics of the table (or partition) identified by tblInfo and pid,
+// and returns the max LastUpdateVersion among all of its Columns and Indices. It returns 0 when the stats handle is
+// uninitialized or the row count is zero. Its overall logic is quite similar to getStatsTable(), but plan cache
+// matching only needs the latest version, so compared to getStatsTable() this function can save some copies,
+// memory allocations and unnecessary checks. Also, this function won't trigger metrics changes.
+func getLatestVersionFromStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) (version uint64) {
+	statsHandle := domain.GetDomain(ctx).StatsHandle()
+	// 1. tidb-server started and statistics handle has not been initialized. Pseudo stats table.
+	if statsHandle == nil {
+		return 0
+	}
+
+	var statsTbl *statistics.Table
+	if pid == tblInfo.ID || ctx.GetSessionVars().StmtCtx.UseDynamicPartitionPrune() {
+		statsTbl = statsHandle.GetTableStats(tblInfo, cache.WithTableStatsByQuery())
+	} else {
+		statsTbl = statsHandle.GetPartitionStats(tblInfo, pid, cache.WithTableStatsByQuery())
+	}
+
+	// 2. Row count is zero (the ANALYZE row count is used instead in determinate mode). Pseudo stats table.
+	realtimeRowCount := statsTbl.RealtimeCount
+	if ctx.GetSessionVars().GetOptObjective() == variable.OptObjectiveDeterminate {
+		realtimeRowCount = max(int64(statsTbl.GetAnalyzeRowCount()), 0)
+	}
+	if realtimeRowCount == 0 {
+		return 0
+	}
+
+	// 3. Not pseudo stats table. Return the max LastUpdateVersion among all Columns and Indices
+	for _, col := range statsTbl.Columns {
+		version = max(version, col.LastUpdateVersion)
+	}
+	for _, idx := range statsTbl.Indices {
+		version = max(version, idx.LastUpdateVersion)
+	}
+	return version
+}
+
 func (b *PlanBuilder) tryBuildCTE(ctx context.Context, tn *ast.TableName, asName *model.CIStr) (LogicalPlan, error) {
 	for i := len(b.outerCTEs) - 1; i >= 0; i-- {
 		cte := b.outerCTEs[i]

diff --git a/planner/core/plan_cache_utils.go b/planner/core/plan_cache_utils.go
@@ -36,7 +36,6 @@ import (
 	"github.com/pingcap/tidb/sessionctx"
 	"github.com/pingcap/tidb/sessionctx/stmtctx"
 	"github.com/pingcap/tidb/sessionctx/variable"
-	"github.com/pingcap/tidb/statistics"
 	"github.com/pingcap/tidb/types"
 	driver "github.com/pingcap/tidb/types/parser_driver"
 	"github.com/pingcap/tidb/util/codec"
@@ -478,24 +477,6 @@ func GetPreparedStmt(stmt *ast.ExecuteStmt, vars *variable.SessionVars) (*PlanCa
 	return nil, ErrStmtNotFound
 }
 
-func tableStatsVersionForPlanCache(tStats *statistics.Table) (tableStatsVer uint64) {
-	if tStats == nil {
-		return 0
-	}
-	// use the max version of all columns and indices as the table stats version
-	for _, col := range tStats.Columns {
-		if col.LastUpdateVersion > tableStatsVer {
-			tableStatsVer = col.LastUpdateVersion
-		}
-	}
-	for _, idx := range tStats.Indices {
-		if idx.LastUpdateVersion > tableStatsVer {
-			tableStatsVer = idx.LastUpdateVersion
-		}
-	}
-	return tableStatsVer
-}
-
 // GetMatchOpts get options to fetch plan or generate new plan
 // we can add more options here
 func GetMatchOpts(sctx sessionctx.Context, is infoschema.InfoSchema, stmt *PlanCacheStmt, params []expression.Expression) (*utilpc.PlanCacheMatchOpts, error) {
@@ -508,8 +489,7 @@ func GetMatchOpts(sctx sessionctx.Context, is infoschema.InfoSchema, stmt *PlanC
 			if err != nil { // CTE in this case
 				continue
 			}
-			tStats := getStatsTable(sctx, t.Meta(), t.Meta().ID)
-			statsVerHash += tableStatsVersionForPlanCache(tStats) // use '+' as the hash function for simplicity
+			statsVerHash += getLatestVersionFromStatsTable(sctx, t.Meta(), t.Meta().ID) // use '+' as the hash function for simplicity
 		}
 
 		for _, node := range stmt.QueryFeatures.limits {

diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go
@@ -1807,10 +1807,10 @@ func (b *PlanBuilder) buildPhysicalIndexLookUpReader(_ context.Context, dbName m
 		Ranges:           ranger.FullRange(),
 		physicalTableID:  physicalID,
 		isPartition:      isPartition,
-		tblColHists:      &(statistics.PseudoTable(tblInfo)).HistColl,
+		tblColHists:      &(statistics.PseudoTable(tblInfo, false)).HistColl,
 	}.Init(b.ctx, b.getSelectOffset())
 	// There is no alternative plan choices, so just use pseudo stats to avoid panic.
-	is.SetStats(&property.StatsInfo{HistColl: &(statistics.PseudoTable(tblInfo)).HistColl})
+	is.SetStats(&property.StatsInfo{HistColl: &(statistics.PseudoTable(tblInfo, false)).HistColl})
 	if hasCommonCols {
 		for _, c := range commonInfos {
 			is.Columns = append(is.Columns, c.ColumnInfo)
@@ -1826,7 +1826,7 @@ func (b *PlanBuilder) buildPhysicalIndexLookUpReader(_ context.Context, dbName m
 		DBName:          dbName,
 		physicalTableID: physicalID,
 		isPartition:     isPartition,
-		tblColHists:     &(statistics.PseudoTable(tblInfo)).HistColl,
+		tblColHists:     &(statistics.PseudoTable(tblInfo, false)).HistColl,
 	}.Init(b.ctx, b.getSelectOffset())
 	ts.SetSchema(idxColSchema)
 	ts.Columns = ExpandVirtualColumn(ts.Columns, ts.schema, ts.Table.Columns)

diff --git a/planner/core/stats.go b/planner/core/stats.go
@@ -68,7 +68,7 @@ func (p *LogicalMemTable) DeriveStats(_ []*property.StatsInfo, selfSchema *expre
 	if p.StatsInfo() != nil {
 		return p.StatsInfo(), nil
 	}
-	statsTable := statistics.PseudoTable(p.TableInfo)
+	statsTable := statistics.PseudoTable(p.TableInfo, false)
 	stats := &property.StatsInfo{
 		RowCount:     float64(statsTable.RealtimeCount),
 		ColNDVs:      make(map[int64]float64, len(p.TableInfo.Columns)),