opt: hoist uncorrelated equality subqueries

Uncorrelated subqueries that are in equality expressions with a variable are
now hoisted. When these expressions exist in a filter, hoisting the subquery
can allow the main query to plan a lookup join, rather than an inefficient
full-table scan.

For example, consider the table and query:

    CREATE TABLE t (
      a INT,
      INDEX (a)
    );

    SELECT * FROM t WHERE a = (SELECT max(a) FROM t);

Prior to this commit, the query plan for this query required a full table
scan:

    select
     ├── columns: a:1
     ├── scan t@t_a_idx
     │    ├── columns: a:1
     │    └── constraint: /1/2: (/NULL - ]
     └── filters
          └── eq
               ├── a:1
               └── subquery
                    └── scalar-group-by
                         ├── columns: max:9
                         ├── scan t@t_a_idx,rev
                         │    ├── columns: a:5
                         │    ├── constraint: /5/6: (/NULL - ]
                         │    └── limit: 1(rev)
                         └── aggregations
                              └── const-agg [as=max:9, outer=(5)]
                                   └── a:5

By hoisting the subquery, the full table scan is replaced with a lookup join:

    project
     ├── columns: a:1
     └── inner-join (lookup t@t_a_idx)
          ├── columns: a:1 max:9
          ├── key columns: [9] = [1]
          ├── scalar-group-by
          │    ├── columns: max:9
          │    ├── scan t@t_a_idx,rev
          │    │    ├── columns: a:5
          │    │    ├── constraint: /5/6: (/NULL - ]
          │    │    └── limit: 1(rev)
          │    └── aggregations
          │         └── const-agg [as=max:9, outer=(5)]
          │              └── a:5
          └── filters (true)

This hoisting is enabled by default, but can be disabled by setting the
`optimizer_hoist_uncorrelated_equality_subqueries` session setting to `false`.

Fixes cockroachdb#83392
Informs cockroachdb#51820
Informs cockroachdb#93829
Informs cockroachdb#100855

Release note (performance improvement): Queries that have subqueries in
equality expressions are now more efficiently planned by the optimizer.
mgartner · Apr 17, 2023 · 41fc214 · 41fc214
1 parent 7f55244
commit 41fc214
Show file tree

Hide file tree

Showing 19 changed files with 671 additions and 416 deletions.
diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go
@@ -3514,6 +3514,10 @@ func (m *sessionDataMutator) SetOptimizerAlwaysUseHistograms(val bool) {
 	m.data.OptimizerAlwaysUseHistograms = val
 }
 
+func (m *sessionDataMutator) SetOptimizerHoistUncorrelatedEqualitySubqueries(val bool) {
+	m.data.OptimizerHoistUncorrelatedEqualitySubqueries = val
+}
+
 func (m *sessionDataMutator) SetEnableCreateStatsUsingExtremes(val bool) {
 	m.data.EnableCreateStatsUsingExtremes = val
 }

diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema
@@ -5278,6 +5278,7 @@ on_update_rehome_row_enabled                          on
 opt_split_scan_limit                                  2048
 optimizer                                             on
 optimizer_always_use_histograms                       on
+optimizer_hoist_uncorrelated_equality_subqueries      on
 optimizer_use_forecasts                               on
 optimizer_use_histograms                              on
 optimizer_use_improved_disjunction_stats              on

diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog
@@ -2759,6 +2759,7 @@ null_ordered_last                                     off                 NULL
 on_update_rehome_row_enabled                          on                  NULL      NULL        NULL        string
 opt_split_scan_limit                                  2048                NULL      NULL        NULL        string
 optimizer_always_use_histograms                       on                  NULL      NULL        NULL        string
+optimizer_hoist_uncorrelated_equality_subqueries      on                  NULL      NULL        NULL        string
 optimizer_use_forecasts                               on                  NULL      NULL        NULL        string
 optimizer_use_histograms                              on                  NULL      NULL        NULL        string
 optimizer_use_improved_disjunction_stats              on                  NULL      NULL        NULL        string
@@ -2913,6 +2914,7 @@ null_ordered_last                                     off                 NULL
 on_update_rehome_row_enabled                          on                  NULL  user     NULL      on                  on
 opt_split_scan_limit                                  2048                NULL  user     NULL      2048                2048
 optimizer_always_use_histograms                       on                  NULL  user     NULL      on                  on
+optimizer_hoist_uncorrelated_equality_subqueries      on                  NULL  user     NULL      on                  on
 optimizer_use_forecasts                               on                  NULL  user     NULL      on                  on
 optimizer_use_histograms                              on                  NULL  user     NULL      on                  on
 optimizer_use_improved_disjunction_stats              on                  NULL  user     NULL      on                  on
@@ -3067,6 +3069,7 @@ on_update_rehome_row_enabled                          NULL    NULL     NULL
 opt_split_scan_limit                                  NULL    NULL     NULL     NULL        NULL
 optimizer                                             NULL    NULL     NULL     NULL        NULL
 optimizer_always_use_histograms                       NULL    NULL     NULL     NULL        NULL
+optimizer_hoist_uncorrelated_equality_subqueries      NULL    NULL     NULL     NULL        NULL
 optimizer_use_forecasts                               NULL    NULL     NULL     NULL        NULL
 optimizer_use_histograms                              NULL    NULL     NULL     NULL        NULL
 optimizer_use_improved_disjunction_stats              NULL    NULL     NULL     NULL        NULL

diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source
@@ -113,6 +113,7 @@ null_ordered_last                                     off
 on_update_rehome_row_enabled                          on
 opt_split_scan_limit                                  2048
 optimizer_always_use_histograms                       on
+optimizer_hoist_uncorrelated_equality_subqueries      on
 optimizer_use_forecasts                               on
 optimizer_use_histograms                              on
 optimizer_use_improved_disjunction_stats              on

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/subquery b/pkg/sql/opt/exec/execbuilder/testdata/subquery
@@ -121,64 +121,58 @@ vectorized: true
 • root
 │ columns: (a, b, c)
 │
-├── • filter
+├── • project
 │   │ columns: (a, b, c)
-│   │ estimated row count: 333 (missing stats)
-│   │ filter: a = @S2
 │   │
-│   └── • scan
-│         columns: (a, b, c)
-│         estimated row count: 1,000 (missing stats)
-│         table: abc@abc_pkey
-│         spans: FULL SCAN
-│
-├── • subquery
-│   │ id: @S1
-│   │ original sql: (SELECT * FROM abc WHERE c = (a + 3))
-│   │ exec mode: one row
-│   │
-│   └── • render
-│       │ columns: (column16)
-│       │ render column16: true
+│   └── • lookup join (inner)
+│       │ columns: (any_not_null, a, b, c)
+│       │ estimated row count: 1 (missing stats)
+│       │ table: abc@abc_pkey
+│       │ equality: (any_not_null) = (a)
+│       │ equality cols are key
 │       │
-│       └── • limit
-│           │ columns: (a, c)
-│           │ count: 1
+│       └── • group (scalar)
+│           │ columns: (any_not_null)
+│           │ estimated row count: 1 (missing stats)
+│           │ aggregate 0: any_not_null(a)
 │           │
-│           └── • filter
-│               │ columns: (a, c)
-│               │ estimated row count: 330 (missing stats)
-│               │ filter: c = (a + 3)
+│           └── • limit
+│               │ columns: (a)
+│               │ count: 1
 │               │
-│               └── • scan
-│                     columns: (a, c)
-│                     estimated row count: 1,000 (missing stats)
-│                     table: abc@abc_pkey
-│                     spans: FULL SCAN (SOFT LIMIT)
+│               └── • filter
+│                   │ columns: (a)
+│                   │ ordering: -a
+│                   │ estimated row count: 333 (missing stats)
+│                   │ filter: COALESCE(@S1, false)
+│                   │
+│                   └── • revscan
+│                         columns: (a)
+│                         ordering: -a
+│                         estimated row count: 1,000 (missing stats)
+│                         table: abc@abc_pkey
+│                         spans: FULL SCAN (SOFT LIMIT)
 │
 └── • subquery
-    │ id: @S2
-    │ original sql: (SELECT max(a) FROM abc WHERE EXISTS (SELECT * FROM abc WHERE c = (a + 3)))
+    │ id: @S1
+    │ original sql: (SELECT * FROM abc WHERE c = (a + 3))
     │ exec mode: one row
     │
-    └── • group (scalar)
-        │ columns: (any_not_null)
-        │ estimated row count: 1 (missing stats)
-        │ aggregate 0: any_not_null(a)
+    └── • render
+        │ columns: (column16)
+        │ render column16: true
         │
         └── • limit
-            │ columns: (a)
+            │ columns: (a, c)
             │ count: 1
             │
             └── • filter
-                │ columns: (a)
-                │ ordering: -a
-                │ estimated row count: 333 (missing stats)
-                │ filter: COALESCE(@S1, false)
+                │ columns: (a, c)
+                │ estimated row count: 330 (missing stats)
+                │ filter: c = (a + 3)
                 │
-                └── • revscan
-                      columns: (a)
-                      ordering: -a
+                └── • scan
+                      columns: (a, c)
                       estimated row count: 1,000 (missing stats)
                       table: abc@abc_pkey
                       spans: FULL SCAN (SOFT LIMIT)

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/tpch_vec b/pkg/sql/opt/exec/execbuilder/testdata/tpch_vec
@@ -20848,17 +20848,20 @@ EXPLAIN (VEC) SELECT s_suppkey, s_name, s_address, s_phone, total_revenue FROM s
 ----
 │
 └ Node 1
-  └ *colexecjoin.mergeJoinInnerOp
-    ├ *colfetcher.ColBatchScan
+  └ *rowexec.joinReader
     └ *colexec.sortOp
-      └ *colexecsel.selEQFloat64Float64Op
-        └ *colexecbase.castOpNullAny
-          └ *colexecbase.constNullOp
-            └ *colexec.hashAggregator
-              └ *colexecproj.projMultFloat64Float64Op
-                └ *colexecprojconst.projMinusFloat64ConstFloat64Op
-                  └ *colfetcher.ColIndexJoin
-                    └ *colfetcher.ColBatchScan
+      └ *colexecjoin.hashJoiner
+        ├ *colexec.hashAggregator
+        │ └ *colexecproj.projMultFloat64Float64Op
+        │   └ *colexecprojconst.projMinusFloat64ConstFloat64Op
+        │     └ *colfetcher.ColIndexJoin
+        │       └ *colfetcher.ColBatchScan
+        └ *colexec.orderedAggregator
+          └ *colexec.hashAggregator
+            └ *colexecproj.projMultFloat64Float64Op
+              └ *colexecprojconst.projMinusFloat64ConstFloat64Op
+                └ *colfetcher.ColIndexJoin
+                  └ *colfetcher.ColBatchScan
 
 statement ok
 DROP VIEW revenue0

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/udf b/pkg/sql/opt/exec/execbuilder/testdata/udf
@@ -116,48 +116,48 @@ EXPLAIN (VERBOSE) SELECT * FROM sub3 WHERE sub_fn() = 3 AND (SELECT max(a) FROM
 distribution: local
 vectorized: true
 ·
-• root
+• project
 │ columns: (a)
 │
-├── • filter
-│   │ columns: (a)
-│   │ estimated row count: 111 (missing stats)
-│   │ filter: (sub_fn() = 3) AND (a = @S1)
-│   │
-│   └── • scan
-│         columns: (a)
-│         estimated row count: 1,000 (missing stats)
-│         table: sub3@sub3_pkey
-│         spans: FULL SCAN
-│
-└── • subquery
-    │ id: @S1
-    │ original sql: (SELECT max(a) FROM sub2)
-    │ exec mode: one row
+└── • lookup join (inner)
+    │ columns: (any_not_null, a)
+    │ estimated row count: 1 (missing stats)
+    │ table: sub3@sub3_pkey
+    │ equality: (any_not_null) = (a)
+    │ equality cols are key
+    │ pred: sub_fn() = 3
     │
-    └── • group (scalar)
+    └── • filter
         │ columns: (any_not_null)
-        │ estimated row count: 1 (missing stats)
-        │ aggregate 0: any_not_null(a)
+        │ estimated row count: 0 (missing stats)
+        │ filter: sub_fn() = 3
         │
-        └── • revscan
-              columns: (a)
-              estimated row count: 1 (missing stats)
-              table: sub2@sub2_pkey
-              spans: LIMITED SCAN
-              limit: 1
+        └── • group (scalar)
+            │ columns: (any_not_null)
+            │ estimated row count: 1 (missing stats)
+            │ aggregate 0: any_not_null(a)
+            │
+            └── • revscan
+                  columns: (a)
+                  estimated row count: 1 (missing stats)
+                  table: sub2@sub2_pkey
+                  spans: LIMITED SCAN
+                  limit: 1
+
+statement ok
+CREATE FUNCTION sub_fn_lt() RETURNS INT LANGUAGE SQL AS 'SELECT a FROM sub1 WHERE a < (SELECT max(a) FROM sub2)'
 
 # The uncorrelated subquery in the UDF body is executed only once.
 query T kvtrace
-SELECT sub_fn()
+SELECT sub_fn_lt()
 ----
 Scan /Table/112/{1-2}
 Scan /Table/113/{1-2}
 
 # The uncorrelated subquery in the UDF body is executed only once per row
 # produced by generate_series.
 query T kvtrace
-SELECT sub_fn() FROM generate_series(1, 3)
+SELECT sub_fn_lt() FROM generate_series(1, 3)
 ----
 Scan /Table/112/{1-2}
 Scan /Table/113/{1-2}
@@ -174,12 +174,12 @@ CREATE FUNCTION sub_fn2() RETURNS INT LANGUAGE SQL AS 'SELECT a FROM sub1 WHERE
 query T kvtrace
 SELECT sub_fn2() FROM generate_series(1, 3)
 ----
-Scan /Table/112/{1-2}
 Scan /Table/113/1/30/0
-Scan /Table/112/{1-2}
+Scan /Table/112/1/30/0
 Scan /Table/113/1/30/0
-Scan /Table/112/{1-2}
+Scan /Table/112/1/30/0
 Scan /Table/113/1/30/0
+Scan /Table/112/1/30/0
 
 statement ok
 CREATE FUNCTION sub_fn3() RETURNS INT LANGUAGE SQL AS 'SELECT a FROM sub1 WHERE EXISTS (SELECT a FROM sub2 WHERE a = 30)'

diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go
@@ -163,6 +163,7 @@ type Memo struct {
 	useLimitOrderingForStreamingGroupBy    bool
 	useImprovedSplitDisjunctionForJoins    bool
 	alwaysUseHistograms                    bool
+	hoistUncorrelatedEqualitySubqueries    bool
 
 	// curRank is the highest currently in-use scalar expression rank.
 	curRank opt.ScalarRank
@@ -221,6 +222,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
 		useLimitOrderingForStreamingGroupBy:    evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy,
 		useImprovedSplitDisjunctionForJoins:    evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins,
 		alwaysUseHistograms:                    evalCtx.SessionData().OptimizerAlwaysUseHistograms,
+		hoistUncorrelatedEqualitySubqueries:    evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries,
 	}
 	m.metadata.Init()
 	m.logPropsBuilder.init(ctx, evalCtx, m)
@@ -362,7 +364,8 @@ func (m *Memo) IsStale(
 		m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats ||
 		m.useLimitOrderingForStreamingGroupBy != evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy ||
 		m.useImprovedSplitDisjunctionForJoins != evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins ||
-		m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms {
+		m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms ||
+		m.hoistUncorrelatedEqualitySubqueries != evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries {
 		return true, nil
 	}
 

diff --git a/pkg/sql/opt/memo/memo_test.go b/pkg/sql/opt/memo/memo_test.go
@@ -354,6 +354,12 @@ func TestMemoIsStale(t *testing.T) {
 	evalCtx.SessionData().OptimizerAlwaysUseHistograms = false
 	notStale()
 
+	// Stale optimizer_hoist_uncorrelated_equality_subqueries.
+	evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = true
+	stale()
+	evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = false
+	notStale()
+
 	// Stale data sources and schema. Create new catalog so that data sources are
 	// recreated and can be modified independently.
 	catalog = testcat.New()