From 38cc955c043406336c3dbc328c3c280667a90667 Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Thu, 21 May 2020 02:11:56 +0800 Subject: [PATCH 01/10] partition-pruning --- partition-pruning.md | 147 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 1 deletion(-) diff --git a/partition-pruning.md b/partition-pruning.md index b10ffd9b8c86..496821e15179 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -3,4 +3,149 @@ title: 分区裁剪 category: performance --- -# 分区裁剪 \ No newline at end of file +# 分区裁剪 + +分区裁剪通过分析查询语句中的过滤条件,只选择可能满足条件的分区,不扫描匹配不上的分区,进而显著地减少计算的数据量。 + +## 分区表 + +分区裁剪是只有当目标表为分区表时,才可以进行的一种优化方式。[分区表](/partitioned-table.md) 是 TiDB 中常见的性能优化的手段,Range 分区可以用于解决业务中大量删除带来的性能问题,Hash 分区则可以用于大量写入场景下的数据打散。 + +## 分区裁剪的使用场景 + +### 场景一 + +分区裁剪需要使用分区表上面的查询条件,如果查询条件不能下推到分区表,则相应的查询语句无法使用分区裁剪。 + +```sql +create table t1 (x int) partition by range (x) ( + partition p0 values less than (5), + partition p1 values less than (10)); +create table t2 (x int); + +explain select * from t1 left join t2 on t1.x = t2.x where t2.x > 5; ++------------------------------+----------+-----------+------------------------+----------------------------------------------+ +| id | estRows | task | access object | operator info | ++------------------------------+----------+-----------+------------------------+----------------------------------------------+ +| HashJoin_8 | 4166.67 | root | | inner join, equal:[eq(test.t1.x, test.t2.x)] | +| ├─TableReader_15(Build) | 3333.33 | root | | data:Selection_14 | +| │ └─Selection_14 | 3333.33 | cop[tikv] | | gt(test.t2.x, 5), not(isnull(test.t2.x)) | +| │ └─TableFullScan_13 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | +| └─TableReader_12(Probe) | 3333.33 | root | | data:Selection_11 | +| └─Selection_11 | 3333.33 | cop[tikv] | | gt(test.t1.x, 5), not(isnull(test.t1.x)) | +| └─TableFullScan_10 | 10000.00 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | 
++------------------------------+----------+-----------+------------------------+----------------------------------------------+ +7 rows in set (0.01 sec) +``` +在这个查询中,外连接可以简化成内连接,然后由 `t1.x = t2.x` 和 `t2.x > 5` 可以推出条件 `t1.x > 5`,于是可以分区裁剪并且只使用 `p1` 分区。 + +```sql +explain select * from t1 left join t2 on t1.x = t2.x and t2.x > 5; ++------------------------------+----------+-----------+------------------------+---------------------------------------------------+ +| id | estRows | task | access object | operator info | ++------------------------------+----------+-----------+------------------------+---------------------------------------------------+ +| HashJoin_9 | 20000.00 | root | | left outer join, equal:[eq(test.t1.x, test.t2.x)] | +| ├─TableReader_18(Build) | 3333.33 | root | | data:Selection_17 | +| │ └─Selection_17 | 3333.33 | cop[tikv] | | gt(test.t2.x, 5), not(isnull(test.t2.x)) | +| │ └─TableFullScan_16 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | +| └─Union_11(Probe) | 20000.00 | root | | | +| ├─TableReader_13 | 10000.00 | root | | data:TableFullScan_12 | +| │ └─TableFullScan_12 | 10000.00 | cop[tikv] | table:t1, partition:p0 | keep order:false, stats:pseudo | +| └─TableReader_15 | 10000.00 | root | | data:TableFullScan_14 | +| └─TableFullScan_14 | 10000.00 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | ++------------------------------+----------+-----------+------------------------+---------------------------------------------------+ +9 rows in set (0.00 sec) +``` + +这个查询中的 `t2.x > 5` 条件不能下推到 `t1` 分区表上面,因此 `t1` 无法分区裁剪。 + +### 场景二 + +由于分区裁剪的规则优化是在查询计划的生成阶段,对于执行阶段才能获取到过滤条件的场景,无法利用分区裁剪的优化。 + +```sql +create table t1 (x int) partition by range (x) ( + partition p0 values less than (5), + partition p1 values less than (10)); +create table t2 (x int); +explain select * from t2 where x < (select * from t1 where t2.x < t1.x and t2.x < 2); 
++--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ +| id | estRows | task | access object | operator info | ++--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ +| Projection_13 | 9990.00 | root | | test.t2.x | +| └─Apply_15 | 9990.00 | root | | CARTESIAN inner join, other cond:lt(test.t2.x, test.t1.x) | +| ├─TableReader_18(Build) | 9990.00 | root | | data:Selection_17 | +| │ └─Selection_17 | 9990.00 | cop[tikv] | | not(isnull(test.t2.x)) | +| │ └─TableFullScan_16 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | +| └─Selection_19(Probe) | 0.80 | root | | not(isnull(test.t1.x)) | +| └─MaxOneRow_20 | 1.00 | root | | | +| └─Union_21 | 2.00 | root | | | +| ├─TableReader_24 | 2.00 | root | | data:Selection_23 | +| │ └─Selection_23 | 2.00 | cop[tikv] | | lt(test.t2.x, 2), lt(test.t2.x, test.t1.x) | +| │ └─TableFullScan_22 | 2.50 | cop[tikv] | table:t1, partition:p0 | keep order:false, stats:pseudo | +| └─TableReader_27 | 2.00 | root | | data:Selection_26 | +| └─Selection_26 | 2.00 | cop[tikv] | | lt(test.t2.x, 2), lt(test.t2.x, test.t1.x) | +| └─TableFullScan_25 | 2.50 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | ++--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ +14 rows in set (0.00 sec) +``` +这个查询每从 `t2` 读取一行,都会去分区表 `t1` 上进行查询,理论上这时会满足 `t1.x > val` 的过滤条件,但实际上由于分区裁剪只作用于查询计划生成阶段,而不是执行阶段,因而不会做裁剪。 + +### 场景三 + +对于 `hash` 分区类型,只有等值比较的查询条件能够支持分区裁剪。 +```sql +create table t (x int) partition by hash(x) partitions 4; +explain select * from t where x = 1; ++-------------------------+----------+-----------+-----------------------+--------------------------------+ +| id | estRows | task | access object | operator info | 
++-------------------------+----------+-----------+-----------------------+--------------------------------+ +| TableReader_8 | 10.00 | root | | data:Selection_7 | +| └─Selection_7 | 10.00 | cop[tikv] | | eq(test.t.x, 1) | +| └─TableFullScan_6 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | ++-------------------------+----------+-----------+-----------------------+--------------------------------+ +3 rows in set (0.01 sec) +explain select * from t where x > 2; ++------------------------------+----------+-----------+-----------------------+--------------------------------+ +| id | estRows | task | access object | operator info | ++------------------------------+----------+-----------+-----------------------+--------------------------------+ +| Union_10 | 13333.33 | root | | | +| ├─TableReader_13 | 3333.33 | root | | data:Selection_12 | +| │ └─Selection_12 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| │ └─TableFullScan_11 | 10000.00 | cop[tikv] | table:t, partition:p0 | keep order:false, stats:pseudo | +| ├─TableReader_16 | 3333.33 | root | | data:Selection_15 | +| │ └─Selection_15 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| │ └─TableFullScan_14 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | +| ├─TableReader_19 | 3333.33 | root | | data:Selection_18 | +| │ └─Selection_18 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| │ └─TableFullScan_17 | 10000.00 | cop[tikv] | table:t, partition:p2 | keep order:false, stats:pseudo | +| └─TableReader_22 | 3333.33 | root | | data:Selection_21 | +| └─Selection_21 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| └─TableFullScan_20 | 10000.00 | cop[tikv] | table:t, partition:p3 | keep order:false, stats:pseudo | ++------------------------------+----------+-----------+-----------------------+--------------------------------+ +13 rows in set (0.00 sec) +``` +### 场景四 + +对于 `range` 分区类型,分区表达式必须是 `col` 或者 `fn(col)` 的简单形式,查询条件是 `> < = >= <=` 时才能支持分区裁剪。如果分区表达式是 `fn(col)` 
形式,还要求 `fn` 必须是单调函数,才有可能分区裁剪。 + +理论上所有满足单调条件(严格或者非严格)的函数都是可以支持分区裁剪。实际上,目前 TiDB 已经支持的单调函数只有: +```sql +unix_timestamp +to_days +``` +例如,分区表达式是 `fn(col)` 形式,`fn` 为我们支持的单调函数 `to_days`: +```sql +create table t (id datetime) partition by range (to_days(id)) ( + partition p0 values less than (to_days('2020-04-01')), + partition p1 values less than (to_days('2020-05-01'))); +explain select * from t where id > '2020-04-18'; ++-------------------------+----------+-----------+-----------------------+-------------------------------------------+ +| id | estRows | task | access object | operator info | ++-------------------------+----------+-----------+-----------------------+-------------------------------------------+ +| TableReader_8 | 3333.33 | root | | data:Selection_7 | +| └─Selection_7 | 3333.33 | cop[tikv] | | gt(test.t.id, 2020-04-18 00:00:00.000000) | +| └─TableFullScan_6 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | ++-------------------------+----------+-----------+-----------------------+-------------------------------------------+ +3 rows in set (0.00 sec) +``` \ No newline at end of file From d24269302bba5304507d03a7bad11806356c83c8 Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Thu, 21 May 2020 10:20:46 +0800 Subject: [PATCH 02/10] make lint happy --- partition-pruning.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/partition-pruning.md b/partition-pruning.md index 496821e15179..c9d630344e78 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -37,6 +37,7 @@ explain select * from t1 left join t2 on t1.x = t2.x where t2.x > 5; +------------------------------+----------+-----------+------------------------+----------------------------------------------+ 7 rows in set (0.01 sec) ``` + 在这个查询中,外连接可以简化成内连接,然后由 `t1.x = t2.x` 和 `t2.x > 5` 可以推出条件 `t1.x > 5`,于是可以分区裁剪并且只使用 `p1` 分区。 ```sql @@ -89,11 +90,13 @@ explain select * from t2 where x < (select * from t1 where t2.x < t1.x and t2.x 
+--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ 14 rows in set (0.00 sec) ``` + 这个查询每从 `t2` 读取一行,都会去分区表 `t1` 上进行查询,理论上这时会满足 `t1.x > val` 的过滤条件,但实际上由于分区裁剪只作用于查询计划生成阶段,而不是执行阶段,因而不会做裁剪。 ### 场景三 对于 `hash` 分区类型,只有等值比较的查询条件能够支持分区裁剪。 + ```sql create table t (x int) partition by hash(x) partitions 4; explain select * from t where x = 1; @@ -125,16 +128,20 @@ explain select * from t where x > 2; +------------------------------+----------+-----------+-----------------------+--------------------------------+ 13 rows in set (0.00 sec) ``` + ### 场景四 对于 `range` 分区类型,分区表达式必须是 `col` 或者 `fn(col)` 的简单形式,查询条件是 `> < = >= <=` 时才能支持分区裁剪。如果分区表达式是 `fn(col)` 形式,还要求 `fn` 必须是单调函数,才有可能分区裁剪。 理论上所有满足单调条件(严格或者非严格)的函数都是可以支持分区裁剪。实际上,目前 TiDB 已经支持的单调函数只有: + ```sql unix_timestamp to_days ``` + 例如,分区表达式是 `fn(col)` 形式,`fn` 为我们支持的单调函数 `to_days`: + ```sql create table t (id datetime) partition by range (to_days(id)) ( partition p0 values less than (to_days('2020-04-01')), From 679eef13fbc1bde672d0de79e31d123a88d0c19f Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Thu, 21 May 2020 10:25:23 +0800 Subject: [PATCH 03/10] adjust link --- partition-pruning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/partition-pruning.md b/partition-pruning.md index c9d630344e78..0b0b1fd70b3d 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -9,7 +9,7 @@ category: performance ## 分区表 -分区裁剪是只有当目标表为分区表时,才可以进行的一种优化方式。[分区表](/partitioned-table.md) 是 TiDB 中常见的性能优化的手段,Range 分区可以用于解决业务中大量删除带来的性能问题,Hash 分区则可以用于大量写入场景下的数据打散。 +分区裁剪是只有当目标表为分区表时,才可以进行的一种优化方式。[分区表](/partitioned-table.md#分区表) 是 TiDB 中常见的性能优化的手段,Range 分区可以用于解决业务中大量删除带来的性能问题,Hash 分区则可以用于大量写入场景下的数据打散。 ## 分区裁剪的使用场景 From 3c4e1add353261536ef6f66b8fc14c998f4f5c35 Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Sun, 24 May 2020 16:58:28 +0800 Subject: [PATCH 04/10] address comments --- partition-pruning.md | 299 
++++++++++++++++++++++++++++--------------- 1 file changed, 199 insertions(+), 100 deletions(-) diff --git a/partition-pruning.md b/partition-pruning.md index 0b0b1fd70b3d..8ad74b5b4249 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -5,101 +5,26 @@ category: performance # 分区裁剪 -分区裁剪通过分析查询语句中的过滤条件,只选择可能满足条件的分区,不扫描匹配不上的分区,进而显著地减少计算的数据量。 - -## 分区表 - -分区裁剪是只有当目标表为分区表时,才可以进行的一种优化方式。[分区表](/partitioned-table.md#分区表) 是 TiDB 中常见的性能优化的手段,Range 分区可以用于解决业务中大量删除带来的性能问题,Hash 分区则可以用于大量写入场景下的数据打散。 +分区裁剪是只有当目标表为分区表时,才可以进行的一种优化方式。分区裁剪通过分析查询语句中的过滤条件,只选择可能满足条件的分区,不扫描匹配不上的分区,进而显著地减少计算的数据量。 ## 分区裁剪的使用场景 -### 场景一 +分区表有 Range 分区和 hash 分区两种形式,分区裁剪对两种分区表也有不同的使用场景。 -分区裁剪需要使用分区表上面的查询条件,如果查询条件不能下推到分区表,则相应的查询语句无法使用分区裁剪。 +### 分区裁剪在 Hash 分区表上的应用 -```sql -create table t1 (x int) partition by range (x) ( - partition p0 values less than (5), - partition p1 values less than (10)); -create table t2 (x int); +#### Hash 分区表上可以使用分区裁剪的场景 -explain select * from t1 left join t2 on t1.x = t2.x where t2.x > 5; -+------------------------------+----------+-----------+------------------------+----------------------------------------------+ -| id | estRows | task | access object | operator info | -+------------------------------+----------+-----------+------------------------+----------------------------------------------+ -| HashJoin_8 | 4166.67 | root | | inner join, equal:[eq(test.t1.x, test.t2.x)] | -| ├─TableReader_15(Build) | 3333.33 | root | | data:Selection_14 | -| │ └─Selection_14 | 3333.33 | cop[tikv] | | gt(test.t2.x, 5), not(isnull(test.t2.x)) | -| │ └─TableFullScan_13 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | -| └─TableReader_12(Probe) | 3333.33 | root | | data:Selection_11 | -| └─Selection_11 | 3333.33 | cop[tikv] | | gt(test.t1.x, 5), not(isnull(test.t1.x)) | -| └─TableFullScan_10 | 10000.00 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | 
-+------------------------------+----------+-----------+------------------------+----------------------------------------------+ -7 rows in set (0.01 sec) -``` - -在这个查询中,外连接可以简化成内连接,然后由 `t1.x = t2.x` 和 `t2.x > 5` 可以推出条件 `t1.x > 5`,于是可以分区裁剪并且只使用 `p1` 分区。 - -```sql -explain select * from t1 left join t2 on t1.x = t2.x and t2.x > 5; -+------------------------------+----------+-----------+------------------------+---------------------------------------------------+ -| id | estRows | task | access object | operator info | -+------------------------------+----------+-----------+------------------------+---------------------------------------------------+ -| HashJoin_9 | 20000.00 | root | | left outer join, equal:[eq(test.t1.x, test.t2.x)] | -| ├─TableReader_18(Build) | 3333.33 | root | | data:Selection_17 | -| │ └─Selection_17 | 3333.33 | cop[tikv] | | gt(test.t2.x, 5), not(isnull(test.t2.x)) | -| │ └─TableFullScan_16 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | -| └─Union_11(Probe) | 20000.00 | root | | | -| ├─TableReader_13 | 10000.00 | root | | data:TableFullScan_12 | -| │ └─TableFullScan_12 | 10000.00 | cop[tikv] | table:t1, partition:p0 | keep order:false, stats:pseudo | -| └─TableReader_15 | 10000.00 | root | | data:TableFullScan_14 | -| └─TableFullScan_14 | 10000.00 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | -+------------------------------+----------+-----------+------------------------+---------------------------------------------------+ -9 rows in set (0.00 sec) -``` - -这个查询中的 `t2.x > 5` 条件不能下推到 `t1` 分区表上面,因此 `t1` 无法分区裁剪。 - -### 场景二 +只有等值比较的查询条件能够支持 Hash 分区表的裁剪。 -由于分区裁剪的规则优化是在查询计划的生成阶段,对于执行阶段才能获取到过滤条件的场景,无法利用分区裁剪的优化。 +{{< copyable "sql" >}} ```sql -create table t1 (x int) partition by range (x) ( - partition p0 values less than (5), - partition p1 values less than (10)); -create table t2 (x int); -explain select * from t2 where x < (select * from t1 where t2.x < t1.x and t2.x < 2); 
-+--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ -| id | estRows | task | access object | operator info | -+--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ -| Projection_13 | 9990.00 | root | | test.t2.x | -| └─Apply_15 | 9990.00 | root | | CARTESIAN inner join, other cond:lt(test.t2.x, test.t1.x) | -| ├─TableReader_18(Build) | 9990.00 | root | | data:Selection_17 | -| │ └─Selection_17 | 9990.00 | cop[tikv] | | not(isnull(test.t2.x)) | -| │ └─TableFullScan_16 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | -| └─Selection_19(Probe) | 0.80 | root | | not(isnull(test.t1.x)) | -| └─MaxOneRow_20 | 1.00 | root | | | -| └─Union_21 | 2.00 | root | | | -| ├─TableReader_24 | 2.00 | root | | data:Selection_23 | -| │ └─Selection_23 | 2.00 | cop[tikv] | | lt(test.t2.x, 2), lt(test.t2.x, test.t1.x) | -| │ └─TableFullScan_22 | 2.50 | cop[tikv] | table:t1, partition:p0 | keep order:false, stats:pseudo | -| └─TableReader_27 | 2.00 | root | | data:Selection_26 | -| └─Selection_26 | 2.00 | cop[tikv] | | lt(test.t2.x, 2), lt(test.t2.x, test.t1.x) | -| └─TableFullScan_25 | 2.50 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | -+--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ -14 rows in set (0.00 sec) +create table t (x int) partition by hash(x) partitions 4; +explain select * from t where x = 1; ``` -这个查询每从 `t2` 读取一行,都会去分区表 `t1` 上进行查询,理论上这时会满足 `t1.x > val` 的过滤条件,但实际上由于分区裁剪只作用于查询计划生成阶段,而不是执行阶段,因而不会做裁剪。 - -### 场景三 - -对于 `hash` 分区类型,只有等值比较的查询条件能够支持分区裁剪。 - ```sql -create table t (x int) partition by hash(x) partitions 4; -explain select * from t where x = 1; 
+-------------------------+----------+-----------+-----------------------+--------------------------------+ | id | estRows | task | access object | operator info | +-------------------------+----------+-----------+-----------------------+--------------------------------+ @@ -107,31 +32,163 @@ explain select * from t where x = 1; | └─Selection_7 | 10.00 | cop[tikv] | | eq(test.t.x, 1) | | └─TableFullScan_6 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | +-------------------------+----------+-----------+-----------------------+--------------------------------+ -3 rows in set (0.01 sec) -explain select * from t where x > 2; +``` + +#### Hash 分区表上不能使用分区裁剪的场景 + +##### 场景一 + +不能确定查询结果只在一个分区上的条件:如 `in`, `between`, `> < >= <=` 等查询条件,不能使用分区裁剪的优化。 + +{{< copyable "sql" >}} + +```sql +create table t (x int) partition by hash(x) partitions 4; +explain select * from t where x in (1,2); +``` + +```sql +------------------------------+----------+-----------+-----------------------+--------------------------------+ | id | estRows | task | access object | operator info | +------------------------------+----------+-----------+-----------------------+--------------------------------+ -| Union_10 | 13333.33 | root | | | -| ├─TableReader_13 | 3333.33 | root | | data:Selection_12 | -| │ └─Selection_12 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| Union_10 | 80.00 | root | | | +| ├─TableReader_13 | 20.00 | root | | data:Selection_12 | +| │ └─Selection_12 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | | │ └─TableFullScan_11 | 10000.00 | cop[tikv] | table:t, partition:p0 | keep order:false, stats:pseudo | -| ├─TableReader_16 | 3333.33 | root | | data:Selection_15 | -| │ └─Selection_15 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| ├─TableReader_16 | 20.00 | root | | data:Selection_15 | +| │ └─Selection_15 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | | │ └─TableFullScan_14 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | -| 
├─TableReader_19 | 3333.33 | root | | data:Selection_18 | -| │ └─Selection_18 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| ├─TableReader_19 | 20.00 | root | | data:Selection_18 | +| │ └─Selection_18 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | | │ └─TableFullScan_17 | 10000.00 | cop[tikv] | table:t, partition:p2 | keep order:false, stats:pseudo | -| └─TableReader_22 | 3333.33 | root | | data:Selection_21 | -| └─Selection_21 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | +| └─TableReader_22 | 20.00 | root | | data:Selection_21 | +| └─Selection_21 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | | └─TableFullScan_20 | 10000.00 | cop[tikv] | table:t, partition:p3 | keep order:false, stats:pseudo | +------------------------------+----------+-----------+-----------------------+--------------------------------+ -13 rows in set (0.00 sec) ``` -### 场景四 +##### 场景二 + +由于分区裁剪的规则优化是在查询计划的生成阶段,对于执行阶段才能获取到过滤条件的场景,无法利用分区裁剪的优化。 + +{{< copyable "sql" >}} + +```sql +create table t (x int) partition by hash(x) partitions 4; +explain select * from t2 where x = (select * from t1 where t2.x = t1.x and t2.x < 2); +``` + +```sql ++--------------------------------------+----------+-----------+------------------------+----------------------------------------------+ +| id | estRows | task | access object | operator info | ++--------------------------------------+----------+-----------+------------------------+----------------------------------------------+ +| Projection_13 | 9990.00 | root | | test.t2.x | +| └─Apply_15 | 9990.00 | root | | inner join, equal:[eq(test.t2.x, test.t1.x)] | +| ├─TableReader_18(Build) | 9990.00 | root | | data:Selection_17 | +| │ └─Selection_17 | 9990.00 | cop[tikv] | | not(isnull(test.t2.x)) | +| │ └─TableFullScan_16 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | +| └─Selection_19(Probe) | 0.80 | root | | not(isnull(test.t1.x)) | +| └─MaxOneRow_20 | 1.00 | root | | | +| └─Union_21 | 2.00 | root | | | +| ├─TableReader_24 | 2.00 | root | | 
data:Selection_23 | +| │ └─Selection_23 | 2.00 | cop[tikv] | | eq(test.t2.x, test.t1.x), lt(test.t2.x, 2) | +| │ └─TableFullScan_22 | 2500.00 | cop[tikv] | table:t1, partition:p0 | keep order:false, stats:pseudo | +| └─TableReader_27 | 2.00 | root | | data:Selection_26 | +| └─Selection_26 | 2.00 | cop[tikv] | | eq(test.t2.x, test.t1.x), lt(test.t2.x, 2) | +| └─TableFullScan_25 | 2500.00 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | ++--------------------------------------+----------+-----------+------------------------+----------------------------------------------+ +``` + +这个查询每从 `t2` 读取一行,都会去分区表 `t1` 上进行查询,理论上这时会满足 `t1.x = val` 的过滤条件,但实际上由于分区裁剪只作用于查询计划生成阶段,而不是执行阶段,因而不会做裁剪。 + +### 分区裁剪在 Range 分区表上的应用 + +#### Range 分区表上可以使用分区裁剪的场景 + +##### 场景一 + +等值比较的查询条件可以使用分区裁剪。 + +{{< copyable "sql" >}} + +```sql +create table t (x int) partition by range (x) ( + partition p0 values less than (5), + partition p1 values less than (10), + partition p2 values less than (15) + ); +explain select * from t where x = 3; +``` + +```sql ++-------------------------+----------+-----------+-----------------------+--------------------------------+ +| id | estRows | task | access object | operator info | ++-------------------------+----------+-----------+-----------------------+--------------------------------+ +| TableReader_8 | 10.00 | root | | data:Selection_7 | +| └─Selection_7 | 10.00 | cop[tikv] | | eq(test.t.x, 3) | +| └─TableFullScan_6 | 10000.00 | cop[tikv] | table:t, partition:p0 | keep order:false, stats:pseudo | ++-------------------------+----------+-----------+-----------------------+--------------------------------+ +``` + +使用 `in` 条件的等值比较查询条件也可以使用分区裁剪。 + +{{< copyable "sql" >}} + +```sql +create table t (x int) partition by range (x) ( + partition p0 values less than (5), + partition p1 values less than (10), + partition p2 values less than (15) + ); +explain select * from t where x in(1,13); +``` + +```sql 
++-----------------------------+----------+-----------+-----------------------+--------------------------------+ +| id | estRows | task | access object | operator info | ++-----------------------------+----------+-----------+-----------------------+--------------------------------+ +| Union_8 | 40.00 | root | | | +| ├─TableReader_11 | 20.00 | root | | data:Selection_10 | +| │ └─Selection_10 | 20.00 | cop[tikv] | | in(test.t.x, 1, 13) | +| │ └─TableFullScan_9 | 10000.00 | cop[tikv] | table:t, partition:p0 | keep order:false, stats:pseudo | +| └─TableReader_14 | 20.00 | root | | data:Selection_13 | +| └─Selection_13 | 20.00 | cop[tikv] | | in(test.t.x, 1, 13) | +| └─TableFullScan_12 | 10000.00 | cop[tikv] | table:t, partition:p2 | keep order:false, stats:pseudo | ++-----------------------------+----------+-----------+-----------------------+--------------------------------+ +``` -对于 `range` 分区类型,分区表达式必须是 `col` 或者 `fn(col)` 的简单形式,查询条件是 `> < = >= <=` 时才能支持分区裁剪。如果分区表达式是 `fn(col)` 形式,还要求 `fn` 必须是单调函数,才有可能分区裁剪。 +##### 场景二 + +区间比较的查询条件如 `between`, `> < = >= <=` 可以使用分区裁剪。 + +{{< copyable "sql" >}} + +```sql +create table t (x int) partition by range (x) ( + partition p0 values less than (5), + partition p1 values less than (10), + partition p2 values less than (15) + ); +explain select * from t where x between 7 and 14; +``` + +```sql ++-----------------------------+----------+-----------+-----------------------+-----------------------------------+ +| id | estRows | task | access object | operator info | ++-----------------------------+----------+-----------+-----------------------+-----------------------------------+ +| Union_8 | 500.00 | root | | | +| ├─TableReader_11 | 250.00 | root | | data:Selection_10 | +| │ └─Selection_10 | 250.00 | cop[tikv] | | ge(test.t.x, 7), le(test.t.x, 14) | +| │ └─TableFullScan_9 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | +| └─TableReader_14 | 250.00 | root | | data:Selection_13 | +| └─Selection_13 | 
250.00 | cop[tikv] | | ge(test.t.x, 7), le(test.t.x, 14) | +| └─TableFullScan_12 | 10000.00 | cop[tikv] | table:t, partition:p2 | keep order:false, stats:pseudo | ++-----------------------------+----------+-----------+-----------------------+-----------------------------------+ +``` + +##### 场景三 + +分区表达式为 `fn(col)` 的简单形式,查询条件是 `> < = >= <=` ,且 `fn` 是单调函数,可以使用分区裁剪。 理论上所有满足单调条件(严格或者非严格)的函数都是可以支持分区裁剪。实际上,目前 TiDB 已经支持的单调函数只有: @@ -140,13 +197,18 @@ unix_timestamp to_days ``` -例如,分区表达式是 `fn(col)` 形式,`fn` 为我们支持的单调函数 `to_days`: +例如,分区表达式是 `fn(col)` 形式,`fn` 为我们支持的单调函数 `to_days`,就可以使用分区裁剪: + +{{< copyable "sql" >}} ```sql create table t (id datetime) partition by range (to_days(id)) ( partition p0 values less than (to_days('2020-04-01')), partition p1 values less than (to_days('2020-05-01'))); explain select * from t where id > '2020-04-18'; +``` + +```sql +-------------------------+----------+-----------+-----------------------+-------------------------------------------+ | id | estRows | task | access object | operator info | +-------------------------+----------+-----------+-----------------------+-------------------------------------------+ @@ -154,5 +216,42 @@ explain select * from t where id > '2020-04-18'; | └─Selection_7 | 3333.33 | cop[tikv] | | gt(test.t.id, 2020-04-18 00:00:00.000000) | | └─TableFullScan_6 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | +-------------------------+----------+-----------+-----------------------+-------------------------------------------+ -3 rows in set (0.00 sec) -``` \ No newline at end of file +``` + +#### Range 分区表上不能使用分区裁剪的场景 + +由于分区裁剪的规则优化是在查询计划的生成阶段,对于执行阶段才能获取到过滤条件的场景,无法利用分区裁剪的优化。 + +{{< copyable "sql" >}} + +```sql +create table t1 (x int) partition by range (x) ( + partition p0 values less than (5), + partition p1 values less than (10)); +create table t2 (x int); +explain select * from t2 where x < (select * from t1 where t2.x < t1.x and t2.x < 2); +``` + +```sql 
++--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ +| id | estRows | task | access object | operator info | ++--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ +| Projection_13 | 9990.00 | root | | test.t2.x | +| └─Apply_15 | 9990.00 | root | | CARTESIAN inner join, other cond:lt(test.t2.x, test.t1.x) | +| ├─TableReader_18(Build) | 9990.00 | root | | data:Selection_17 | +| │ └─Selection_17 | 9990.00 | cop[tikv] | | not(isnull(test.t2.x)) | +| │ └─TableFullScan_16 | 10000.00 | cop[tikv] | table:t2 | keep order:false, stats:pseudo | +| └─Selection_19(Probe) | 0.80 | root | | not(isnull(test.t1.x)) | +| └─MaxOneRow_20 | 1.00 | root | | | +| └─Union_21 | 2.00 | root | | | +| ├─TableReader_24 | 2.00 | root | | data:Selection_23 | +| │ └─Selection_23 | 2.00 | cop[tikv] | | lt(test.t2.x, 2), lt(test.t2.x, test.t1.x) | +| │ └─TableFullScan_22 | 2.50 | cop[tikv] | table:t1, partition:p0 | keep order:false, stats:pseudo | +| └─TableReader_27 | 2.00 | root | | data:Selection_26 | +| └─Selection_26 | 2.00 | cop[tikv] | | lt(test.t2.x, 2), lt(test.t2.x, test.t1.x) | +| └─TableFullScan_25 | 2.50 | cop[tikv] | table:t1, partition:p1 | keep order:false, stats:pseudo | ++--------------------------------------+----------+-----------+------------------------+-----------------------------------------------------------+ +14 rows in set (0.00 sec) +``` + +这个查询每从 `t2` 读取一行,都会去分区表 `t1` 上进行查询,理论上这时会满足 `t1.x > val` 的过滤条件,但实际上由于分区裁剪只作用于查询计划生成阶段,而不是执行阶段,因而不会做裁剪。 \ No newline at end of file From b4dd22597119e15f2d573646b2b0e6c733e4b02d Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Sun, 24 May 2020 17:01:57 +0800 Subject: [PATCH 05/10] address comnet --- partition-pruning.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/partition-pruning.md 
b/partition-pruning.md index 8ad74b5b4249..51f98ea57560 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -44,25 +44,25 @@ explain select * from t where x = 1; ```sql create table t (x int) partition by hash(x) partitions 4; -explain select * from t where x in (1,2); +explain select * from t where x > 2; ``` ```sql +------------------------------+----------+-----------+-----------------------+--------------------------------+ | id | estRows | task | access object | operator info | +------------------------------+----------+-----------+-----------------------+--------------------------------+ -| Union_10 | 80.00 | root | | | -| ├─TableReader_13 | 20.00 | root | | data:Selection_12 | -| │ └─Selection_12 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | +| Union_10 | 13333.33 | root | | | +| ├─TableReader_13 | 3333.33 | root | | data:Selection_12 | +| │ └─Selection_12 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | | │ └─TableFullScan_11 | 10000.00 | cop[tikv] | table:t, partition:p0 | keep order:false, stats:pseudo | -| ├─TableReader_16 | 20.00 | root | | data:Selection_15 | -| │ └─Selection_15 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | +| ├─TableReader_16 | 3333.33 | root | | data:Selection_15 | +| │ └─Selection_15 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | | │ └─TableFullScan_14 | 10000.00 | cop[tikv] | table:t, partition:p1 | keep order:false, stats:pseudo | -| ├─TableReader_19 | 20.00 | root | | data:Selection_18 | -| │ └─Selection_18 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | +| ├─TableReader_19 | 3333.33 | root | | data:Selection_18 | +| │ └─Selection_18 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | | │ └─TableFullScan_17 | 10000.00 | cop[tikv] | table:t, partition:p2 | keep order:false, stats:pseudo | -| └─TableReader_22 | 20.00 | root | | data:Selection_21 | -| └─Selection_21 | 20.00 | cop[tikv] | | in(test.t.x, 1, 2) | +| └─TableReader_22 | 3333.33 | root | | data:Selection_21 | +| └─Selection_21 | 3333.33 | cop[tikv] | | gt(test.t.x, 2) | | 
└─TableFullScan_20 | 10000.00 | cop[tikv] | table:t, partition:p3 | keep order:false, stats:pseudo | +------------------------------+----------+-----------+-----------------------+--------------------------------+ ``` From c59052edd7c02f063aa27d5731c96e77a2e9016b Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Mon, 25 May 2020 10:20:20 +0800 Subject: [PATCH 06/10] address commnet --- partition-pruning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/partition-pruning.md b/partition-pruning.md index 51f98ea57560..a10124568879 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -197,7 +197,7 @@ unix_timestamp to_days ``` -例如,分区表达式是 `fn(col)` 形式,`fn` 为我们支持的单调函数 `to_days`,就可以使用分区裁剪: +例如,分区表达式是 `fn(col)` 形式,`fn` 为我们支持的单调函数 `to_days`,就可以使用分区裁剪: {{< copyable "sql" >}} From 2283c39f9219627bdb87c0fc591cc82baed8215e Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Mon, 25 May 2020 10:23:27 +0800 Subject: [PATCH 07/10] address commnet --- partition-pruning.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/partition-pruning.md b/partition-pruning.md index a10124568879..e69c98047135 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -67,6 +67,8 @@ explain select * from t where x > 2; +------------------------------+----------+-----------+-----------------------+--------------------------------+ ``` +在这条 SQL 中,`x > 2` 条件无法确定对应的 Hash Partition,所以不能使用分区裁剪。 + ##### 场景二 由于分区裁剪的规则优化是在查询计划的生成阶段,对于执行阶段才能获取到过滤条件的场景,无法利用分区裁剪的优化。 From 130a50d99177e15396ee1dc2934eee3e533447ef Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Mon, 25 May 2020 11:56:47 +0800 Subject: [PATCH 08/10] address commnet --- partition-pruning.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/partition-pruning.md b/partition-pruning.md index e69c98047135..b17015c549c7 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -34,6 +34,8 @@ explain select * from t where x = 1; 
+-------------------------+----------+-----------+-----------------------+--------------------------------+ ``` +在这条 SQL 中,由条件 `x = 1` 可以知道所有结果均在一个分区上。数值 `1` 在经过 Hash 后,可以确定其在分区 `p1` 中。因此只需要扫描分区 `p1`,而无需访问一定不会出现相关结果的 `p0`、`p2`、`p3` 分区。 + #### Hash 分区表上不能使用分区裁剪的场景 ##### 场景一 From 1e00a91248c274435c2fe816c7aaa803f7501f45 Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Mon, 25 May 2020 12:04:28 +0800 Subject: [PATCH 09/10] address commnet --- partition-pruning.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/partition-pruning.md b/partition-pruning.md index b17015c549c7..a438f730a853 100644 --- a/partition-pruning.md +++ b/partition-pruning.md @@ -34,7 +34,7 @@ explain select * from t where x = 1; +-------------------------+----------+-----------+-----------------------+--------------------------------+ ``` -在这条 SQL 中,由条件 `x = 1` 可以知道所有结果均在一个分区上。数值 `1` 在经过 Hash 后,可以确定其在分区 `p1` 中。因此只需要扫描分区 `p1`,而无需访问一定不会出现相关结果的 `p0`、`p2`、`p3` 分区。 +在这条 SQL 中,由条件 `x = 1` 可以知道所有结果均在一个分区上。数值 `1` 在经过 Hash 后,可以确定其在分区 `p1` 中。因此只需要扫描分区 `p1`,而无需访问一定不会出现相关结果的 `p0`、`p2`、`p3` 分区。从执行计划来看,其中只出现了一个 `TableFullScan` 算子,且在 `access object` 中指定了 `p1` 分区,确认 `partition pruning` 生效了。 #### Hash 分区表上不能使用分区裁剪的场景 @@ -161,6 +161,8 @@ explain select * from t where x in(1,13); +-----------------------------+----------+-----------+-----------------------+--------------------------------+ ``` +在这条 SQL 中,由条件 `x in(1,13)` 可以知道所有结果只会分布在几个分区上。经过分析,发现所有 `x = 1` 的记录都在分区 `p0` 上, 所有 `x = 13` 的记录都在分区 `p2` 上,因此只需要访问这 `p0`、`p2` 分区, + ##### 场景二 区间比较的查询条件如 `between`, `> < = >= <=` 可以使用分区裁剪。 From af927e240752f81aeefb87de4058c5224ba221ad Mon Sep 17 00:00:00 2001 From: wshwsh12 <793703860@qq.com> Date: Mon, 25 May 2020 12:04:57 +0800 Subject: [PATCH 10/10] address commnet --- partition-pruning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/partition-pruning.md b/partition-pruning.md index a438f730a853..42cf11d1e21f 100644 --- a/partition-pruning.md +++ 
b/partition-pruning.md @@ -161,7 +161,7 @@ explain select * from t where x in(1,13); +-----------------------------+----------+-----------+-----------------------+--------------------------------+ ``` -在这条 SQL 中,由条件 `x in(1,13)` 可以知道所有结果只会分布在几个分区上。经过分析,发现所有 `x = 1` 的记录都在分区 `p0` 上, 所有 `x = 13` 的记录都在分区 `p2` 上,因此只需要访问这 `p0`、`p2` 分区, +在这条 SQL 中,由条件 `x in(1,13)` 可以知道所有结果只会分布在几个分区上。经过分析,发现所有 `x = 1` 的记录都在分区 `p0` 上,所有 `x = 13` 的记录都在分区 `p2` 上,因此只需要访问 `p0`、`p2` 这两个分区。 ##### 场景二