Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

planner, statistics: fix the inconsistent est when table has no stats #52427

Merged
merged 5 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/planner/cardinality/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":cardinality"],
flaky = True,
shard_count = 27,
shard_count = 28,
deps = [
"//pkg/config",
"//pkg/domain",
Expand Down
36 changes: 36 additions & 0 deletions pkg/planner/cardinality/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1310,3 +1310,39 @@ func TestSubsetIdxCardinality(t *testing.T) {
testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
}
}

func TestBuiltinInEstWithoutStats(t *testing.T) {
store, dom := testkit.CreateMockStoreAndDomain(t)
tk := testkit.NewTestKit(t, store)
h := dom.StatsHandle()

tk.MustExec("use test")
tk.MustExec("create table t(a int)")
require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
tk.MustExec("insert into t values(1), (2), (3), (4), (5), (6), (7), (8), (9), (10)")
require.NoError(t, h.DumpStatsDeltaToKV(true))
is := dom.InfoSchema()
require.NoError(t, h.Update(is))

tk.MustQuery("explain format='brief' select * from t where a in (1, 2, 3, 4, 5, 6, 7, 8)").Check(testkit.Rows(
"TableReader 0.08 root data:Selection",
"└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)",
" └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo",
))

h.Clear()
require.NoError(t, h.InitStatsLite(is))
tk.MustQuery("explain format='brief' select * from t where a in (1, 2, 3, 4, 5, 6, 7, 8)").Check(testkit.Rows(
"TableReader 0.08 root data:Selection",
"└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)",
" └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo",
))

h.Clear()
require.NoError(t, h.InitStats(is))
tk.MustQuery("explain format='brief' select * from t where a in (1, 2, 3, 4, 5, 6, 7, 8)").Check(testkit.Rows(
"TableReader 0.08 root data:Selection",
"└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)",
" └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo",
))
}
10 changes: 10 additions & 0 deletions pkg/planner/core/collect_column_stats_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,21 @@ func (c *columnStatsUsageCollector) addHistNeededColumns(ds *DataSource) {
colIDSet := intset.NewFastIntSet()

for _, col := range columns {
// If the column is plan-generated one, Skip it.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "plan-generated" mean?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Such that ID=-1 means ExtraHandleCol

// TODO: we may need to consider the ExtraHandle.
if col.ID < 0 {
continue
}
tblColID := model.TableItemID{TableID: ds.physicalTableID, ID: col.ID, IsIndex: false}
colIDSet.Insert(int(col.ID))
c.histNeededCols[tblColID] = true
}
for _, col := range ds.Columns {
// If the column is plan-generated one, Skip it.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto.

// TODO: we may need to consider the ExtraHandle.
if col.ID < 0 {
continue
}
if !colIDSet.Has(int(col.ID)) && !col.Hidden {
tblColID := model.TableItemID{TableID: ds.physicalTableID, ID: col.ID, IsIndex: false}
if _, ok := c.histNeededCols[tblColID]; ok {
Expand Down
8 changes: 5 additions & 3 deletions pkg/statistics/handle/storage/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,9 @@ func CleanFakeItemsForShowHistInFlights(statsCache statstypes.StatsCache) int {
if item.IsIndex {
_, loadNeeded = tbl.IndexIsLoadNeeded(item.ID)
} else {
_, loadNeeded = tbl.ColumnIsLoadNeeded(item.ID, true)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We keep the async load unchanged. Wait for later patch to handle it.

var analyzed bool
_, loadNeeded, analyzed = tbl.ColumnIsLoadNeeded(item.ID, true)
loadNeeded = loadNeeded && analyzed
}
if !loadNeeded {
statistics.HistogramNeededItems.Delete(item)
Expand All @@ -589,8 +591,8 @@ func loadNeededColumnHistograms(sctx sessionctx.Context, statsCache statstypes.S
return nil
}
var colInfo *model.ColumnInfo
_, loadNeeded := tbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded {
_, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
statistics.HistogramNeededItems.Delete(col)
return nil
}
Expand Down
18 changes: 16 additions & 2 deletions pkg/statistics/handle/syncload/stats_syncload.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func (s *statsSyncLoad) removeHistLoadedColumns(neededItems []model.StatsLoadIte
}
continue
}
_, loadNeeded := tbl.ColumnIsLoadNeeded(item.ID, item.FullLoad)
_, loadNeeded, _ := tbl.ColumnIsLoadNeeded(item.ID, item.FullLoad)
if loadNeeded {
remainedItems = append(remainedItems, item)
}
Expand Down Expand Up @@ -269,7 +269,7 @@ func (s *statsSyncLoad) handleOneItemTask(sctx sessionctx.Context, task *statsty
wrapper.idxInfo = tbl.ColAndIdxExistenceMap.GetIndex(item.ID)
}
} else {
col, loadNeeded := tbl.ColumnIsLoadNeeded(item.ID, task.Item.FullLoad)
col, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(item.ID, task.Item.FullLoad)
if !loadNeeded {
return result, nil
}
Expand All @@ -278,6 +278,20 @@ func (s *statsSyncLoad) handleOneItemTask(sctx sessionctx.Context, task *statsty
} else {
wrapper.colInfo = tbl.ColAndIdxExistenceMap.GetCol(item.ID)
}
// If this column is not analyzed yet and we don't have it in memory.
// We create a fake one for the pseudo estimation.
if loadNeeded && !analyzed {
wrapper.col = &statistics.Column{
PhysicalID: item.TableID,
Info: wrapper.colInfo,
Histogram: *statistics.NewHistogram(item.ID, 0, 0, 0, &wrapper.colInfo.FieldType, 0, 0),
IsHandle: tbl.IsPkIsHandle && mysql.HasPriKeyFlag(wrapper.colInfo.GetFlag()),
}
if s.updateCachedItem(item, wrapper.col, wrapper.idx, task.Item.FullLoad) {
return result, nil
}
return nil, nil
}
}
t := time.Now()
needUpdate := false
Expand Down
22 changes: 17 additions & 5 deletions pkg/statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,25 +629,37 @@ func (t *Table) GetStatsHealthy() (int64, bool) {
// The Column should be visible in the table and really has analyzed statistics in the stroage.
// Also, if the stats has been loaded into the memory, we also don't need to load it.
// We return the Column together with the checking result, to avoid accessing the map multiple times.
func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool) {
// The first bool is whether we have it in memory. The second bool is whether this column has stats in the system table or not.
func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool) {
if t.Pseudo {
return nil, false, false
}
col, ok := t.Columns[id]
hasAnalyzed := t.ColAndIdxExistenceMap.HasAnalyzed(id, false)

// If it's not analyzed yet. Don't need to load it.
// If it's not analyzed yet.
if !hasAnalyzed {
return nil, false
// If we don't have it in memory, we create a fake hist for pseudo estimation (see handleOneItemTask()).
if !ok {
// If we don't have this column. We skip it.
// It's something ridiculous. But it's possible that the stats don't have some ColumnInfo.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this means this column has stats but doesn't have ColumnInfo.
In what case would this happen?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Such that alter table add column ddl event is not handled by the StatsHandle. Currently, the statistis.Table is initialized by the stats. not by the model.TableInfo

// We need to find a way to maintain it more correctly.
return nil, t.ColAndIdxExistenceMap.Has(id, false), false
}
// Otherwise we don't need to load it.
return nil, false, false
}

// Restore the condition from the simplified form:
// 1. !ok && hasAnalyzed => need load
// 2. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 3. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if !ok || (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
return col, true
return col, true, true
}

// Otherwise don't need load it.
return col, false
return col, false, true
}

// IndexIsLoadNeeded checks whether the index needs trigger the async/sync load.
Expand Down