From c842c798e4a9e31dce06b4836b2bdcbafe1155d6 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Tue, 3 Mar 2015 19:44:42 -0800
Subject: [PATCH] Support INSERT ... ON CONFLICT IGNORE

This non-standard INSERT clause allows DML statement authors to specify
that, in the event of any tuple proposed for insertion duplicating an
existing tuple in terms of a value or set of values constrained by a
unique index, an alternative IGNORE path may be taken (the tuple slot
proposed for insertion is skipped without raising an error).  The
implementation loops until either an insert occurs, or a conclusively
committed conflicting tuple is determined to exist.

This is implemented using new infrastructure called "speculative
insertion", an optimistic variant of regular insertion in which sessions
attempt an insert following a pre-check, and handle uncommon conflicts
(would-be unique/exclusion violations) in a special manner (this
necessitates "super-deletion", and a new iteration).

Speculative heap insertions are WAL-logged in two steps: one record
records the initial intent to insert, while a second, minimal record
simply confirms that the attempt was ultimately successful (i.e. no
conflicts were detected when inserting into constraint-related indexes).
Logical decoding does not rely on the presence of this second record to
affirm that a speculative insertion succeeded, though; it relies on the
*absence* of an (internal) "super-deletion" record.

Optionally, INSERT ... ON CONFLICT IGNORE commands may restrict the
implementation from considering all would-be duplicate violations as
reason to take the IGNORE path.  A "unique index inference"
clause/specification can be provided, which must have a set of
user-supplied column names (or expressions), and optionally, a predicate
(for partial indexes).  This is used to infer some existing unique index
(or, in corner cases, multiple unique indexes).  Failure to infer at
least one unique index is an error.  Arbiter unique indexes are
displayed in EXPLAIN output.

Peter Geoghegan, with review and contributions from numerous others,
including Heikki Linnakangas, Andres Freund, and Robert Haas.  External
testing infrastructure used during review developed by Jeff Janes.
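For readers reviewing the patch, a minimal SQL sketch of the new syntax follows.  The "distributors" table is hypothetical (modelled on the example this patch adds to insert.sgml) and is not part of the patch itself:

    -- A unique index on "did" is assumed to exist (here via PRIMARY KEY)
    CREATE TABLE distributors (did integer PRIMARY KEY, dname text);

    -- No inference specification: any would-be unique violation causes the
    -- row to be skipped rather than raising an error
    INSERT INTO distributors VALUES (7, 'Redline GmbH') ON CONFLICT IGNORE;

    -- With an inference specification: only conflicts arbitrated by a unique
    -- index on (did) take the IGNORE path; failing to infer such an index
    -- raises an error
    INSERT INTO distributors VALUES (7, 'Redline GmbH') ON CONFLICT (did) IGNORE;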
--- .../pg_stat_statements/pg_stat_statements.c | 13 + contrib/postgres_fdw/deparse.c | 7 +- .../postgres_fdw/expected/postgres_fdw.out | 3 + contrib/postgres_fdw/postgres_fdw.c | 12 +- contrib/postgres_fdw/postgres_fdw.h | 2 +- contrib/postgres_fdw/sql/postgres_fdw.sql | 2 + doc/src/sgml/ddl.sgml | 4 + doc/src/sgml/fdwhandler.sgml | 7 + doc/src/sgml/keywords.sgml | 7 + doc/src/sgml/postgres-fdw.sgml | 6 + doc/src/sgml/ref/create_rule.sgml | 7 +- doc/src/sgml/ref/create_table.sgml | 5 +- doc/src/sgml/ref/create_view.sgml | 11 +- doc/src/sgml/ref/insert.sgml | 220 ++++++++- doc/src/sgml/ref/set_constraints.sgml | 5 +- src/backend/access/heap/heapam.c | 392 +++++++++++++-- src/backend/access/heap/hio.c | 27 +- src/backend/access/nbtree/nbtinsert.c | 32 +- src/backend/access/rmgrdesc/heapdesc.c | 9 + src/backend/catalog/index.c | 53 +- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/commands/copy.c | 7 +- src/backend/commands/explain.c | 22 + src/backend/executor/README | 72 +++ src/backend/executor/execIndexing.c | 456 ++++++++++++++++-- src/backend/executor/execMain.c | 8 +- src/backend/executor/nodeModifyTable.c | 171 ++++++- src/backend/nodes/copyfuncs.c | 55 +++ src/backend/nodes/equalfuncs.c | 44 ++ src/backend/nodes/nodeFuncs.c | 58 +++ src/backend/nodes/outfuncs.c | 19 + src/backend/nodes/readfuncs.c | 21 + src/backend/optimizer/plan/createplan.c | 12 +- src/backend/optimizer/plan/planner.c | 2 + src/backend/optimizer/util/plancat.c | 300 ++++++++++++ src/backend/parser/analyze.c | 28 +- src/backend/parser/gram.y | 82 +++- src/backend/parser/parse_clause.c | 164 +++++++ src/backend/parser/parse_collate.c | 1 + src/backend/replication/logical/decode.c | 29 +- .../replication/logical/reorderbuffer.c | 167 +++++-- src/backend/rewrite/rewriteHandler.c | 24 +- src/backend/storage/lmgr/lmgr.c | 92 ++++ src/backend/utils/adt/lockfuncs.c | 1 + src/backend/utils/time/tqual.c | 29 +- src/include/access/heapam.h | 3 + src/include/access/heapam_xlog.h | 54 ++- src/include/access/hio.h | 2 +- src/include/access/htup_details.h | 36 +- src/include/catalog/index.h | 2 + src/include/executor/executor.h | 11 +- src/include/nodes/execnodes.h | 8 + src/include/nodes/nodes.h | 15 + src/include/nodes/parsenodes.h | 35 +- src/include/nodes/plannodes.h | 2 + src/include/nodes/primnodes.h | 16 + src/include/optimizer/plancat.h | 2 + src/include/optimizer/planmain.h | 2 +- src/include/parser/kwlist.h | 2 + src/include/parser/parse_clause.h | 2 + src/include/replication/reorderbuffer.h | 13 +- src/include/storage/lmgr.h | 5 + src/include/storage/lock.h | 10 + src/include/utils/snapshot.h | 22 +- .../expected/insert-conflict-ignore.out | 23 + src/test/isolation/isolation_schedule | 1 + .../specs/insert-conflict-ignore.spec | 41 ++ src/test/regress/expected/errors.out | 4 +- src/test/regress/expected/insert_conflict.out | 178 +++++++ src/test/regress/expected/rules.out | 9 + src/test/regress/expected/updatable_views.out | 2 + src/test/regress/parallel_schedule | 1 + src/test/regress/serial_schedule | 1 + src/test/regress/sql/insert_conflict.sql | 115 +++++ src/test/regress/sql/rules.sql | 6 + src/test/regress/sql/updatable_views.sql | 2 + 77 files changed, 3089 insertions(+), 228 deletions(-) create mode 100644 src/test/isolation/expected/insert-conflict-ignore.out create mode 100644 src/test/isolation/specs/insert-conflict-ignore.spec create mode 100644 src/test/regress/expected/insert_conflict.out create mode 100644 src/test/regress/sql/insert_conflict.sql diff --git 
a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 76d9e0a5ec613..609b4d433b823 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -2264,6 +2264,9 @@ JumbleQuery(pgssJumbleState *jstate, Query *query) JumbleRangeTable(jstate, query->rtable); JumbleExpr(jstate, (Node *) query->jointree); JumbleExpr(jstate, (Node *) query->targetList); + APP_JUMB(query->specClause); + JumbleExpr(jstate, (Node *) query->arbiterElems); + JumbleExpr(jstate, query->arbiterWhere); JumbleExpr(jstate, (Node *) query->returningList); JumbleExpr(jstate, (Node *) query->groupClause); JumbleExpr(jstate, query->havingQual); @@ -2631,6 +2634,16 @@ JumbleExpr(pgssJumbleState *jstate, Node *node) APP_JUMB(ce->cursor_param); } break; + case T_InferenceElem: + { + InferenceElem *ie = (InferenceElem *) node; + + APP_JUMB(ie->infercollid); + APP_JUMB(ie->inferopfamily); + APP_JUMB(ie->inferopcinputtype); + JumbleExpr(jstate, ie->expr); + } + break; case T_TargetEntry: { TargetEntry *tle = (TargetEntry *) node; diff --git a/contrib/postgres_fdw/deparse.c b/contrib/postgres_fdw/deparse.c index 94fab18c42504..cb2b098932635 100644 --- a/contrib/postgres_fdw/deparse.c +++ b/contrib/postgres_fdw/deparse.c @@ -847,8 +847,8 @@ appendWhereClause(StringInfo buf, void deparseInsertSql(StringInfo buf, PlannerInfo *root, Index rtindex, Relation rel, - List *targetAttrs, List *returningList, - List **retrieved_attrs) + List *targetAttrs, bool ignore, + List *returningList, List **retrieved_attrs) { AttrNumber pindex; bool first; @@ -892,6 +892,9 @@ deparseInsertSql(StringInfo buf, PlannerInfo *root, else appendStringInfoString(buf, " DEFAULT VALUES"); + if (ignore) + appendStringInfoString(buf, " ON CONFLICT IGNORE"); + deparseReturningList(buf, root, rtindex, rel, rel->trigdesc && rel->trigdesc->trig_insert_after_row, returningList, retrieved_attrs); diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 93e9836cf0d1c..c9cf0b0274991 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -2327,6 +2327,9 @@ INSERT INTO ft1(c1, c2) VALUES(11, 12); -- duplicate key ERROR: duplicate key value violates unique constraint "t1_pkey" DETAIL: Key ("C 1")=(11) already exists. CONTEXT: Remote SQL command: INSERT INTO "S 1"."T 1"("C 1", c2, c3, c4, c5, c6, c7, c8) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) +INSERT INTO ft1(c1, c2) VALUES(11, 12) ON CONFLICT IGNORE; -- works +INSERT INTO ft1(c1, c2) VALUES(11, 12) ON CONFLICT (c1, c2) IGNORE; -- unsupported +ERROR: postgres_fdw does not support ON CONFLICT unique index inference INSERT INTO ft1(c1, c2) VALUES(1111, -2); -- c2positive ERROR: new row for relation "T 1" violates check constraint "c2positive" DETAIL: Failing row contains (1111, -2, null, null, null, null, ft1 , null). 
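To make the postgres_fdw deparse change concrete, the sketch below pairs the local statement from the regression test above with the remote SQL that deparseInsertSql() would be expected to build for it.  The column list and parameter numbering are taken from the existing expected output; the exact remote text is illustrative and is not asserted by the test:

    -- Local statement against the foreign table:
    INSERT INTO ft1(c1, c2) VALUES(11, 12) ON CONFLICT IGNORE;

    -- Remote SQL built by deparseInsertSql(), with the clause appended after
    -- the VALUES list and before any RETURNING list:
    INSERT INTO "S 1"."T 1"("C 1", c2, c3, c4, c5, c6, c7, c8)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT IGNORE;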
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 478e12484b940..b4a58d5cfcdba 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -1171,6 +1171,7 @@ postgresPlanForeignModify(PlannerInfo *root, List *targetAttrs = NIL; List *returningList = NIL; List *retrieved_attrs = NIL; + bool ignore = false; initStringInfo(&sql); @@ -1222,6 +1223,15 @@ postgresPlanForeignModify(PlannerInfo *root, if (plan->returningLists) returningList = (List *) list_nth(plan->returningLists, subplan_index); + if (root->parse->arbiterElems) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("postgres_fdw does not support ON CONFLICT unique index inference"))); + else if (plan->spec == SPEC_IGNORE) + ignore = true; + else if (plan->spec != SPEC_NONE) + elog(ERROR, "unexpected speculative specification: %d", (int) plan->spec); + /* * Construct the SQL command string. */ @@ -1229,7 +1239,7 @@ postgresPlanForeignModify(PlannerInfo *root, { case CMD_INSERT: deparseInsertSql(&sql, root, resultRelation, rel, - targetAttrs, returningList, + targetAttrs, ignore, returningList, &retrieved_attrs); break; case CMD_UPDATE: diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h index 950c6f79a22a7..3763a5797e356 100644 --- a/contrib/postgres_fdw/postgres_fdw.h +++ b/contrib/postgres_fdw/postgres_fdw.h @@ -60,7 +60,7 @@ extern void appendWhereClause(StringInfo buf, List **params); extern void deparseInsertSql(StringInfo buf, PlannerInfo *root, Index rtindex, Relation rel, - List *targetAttrs, List *returningList, + List *targetAttrs, bool ignore, List *returningList, List **retrieved_attrs); extern void deparseUpdateSql(StringInfo buf, PlannerInfo *root, Index rtindex, Relation rel, diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 4a23457e79653..aba373e817eed 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -372,6 +372,8 @@ UPDATE ft2 SET c2 = c2 + 600 WHERE c1 % 10 = 8 AND c1 < 1200 RETURNING *; ALTER TABLE "S 1"."T 1" ADD CONSTRAINT c2positive CHECK (c2 >= 0); INSERT INTO ft1(c1, c2) VALUES(11, 12); -- duplicate key +INSERT INTO ft1(c1, c2) VALUES(11, 12) ON CONFLICT IGNORE; -- works +INSERT INTO ft1(c1, c2) VALUES(11, 12) ON CONFLICT (c1, c2) IGNORE; -- unsupported INSERT INTO ft1(c1, c2) VALUES(1111, -2); -- c2positive UPDATE ft1 SET c2 = -c2 WHERE c1 = 1; -- c2positive diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 0aa0c13c5c811..055c78d03df8c 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -2442,6 +2442,10 @@ VALUES ('Albany', NULL, NULL, 'NY'); All check constraints and not-null constraints on a parent table are automatically inherited by its children. Other types of constraints (unique, primary key, and foreign key constraints) are not inherited. + Therefore, INSERT with ON CONFLICT + unique index inference considers only unique constraints/indexes + directly associated with the table inserted into (which can be an + inheritance parent or child). diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index 5af41318e5c17..ff061b49b6057 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -1014,6 +1014,13 @@ GetForeignServerByName(const char *name, bool missing_ok); source provides. 
+ + INSERT with an ON CONFLICT clause is not + supported with a unique index inference specification, since a + conflict arbitrating unique index cannot meaningfully be inferred + on a foreign table. + + diff --git a/doc/src/sgml/keywords.sgml b/doc/src/sgml/keywords.sgml index b0dfd5ff75bce..ea582116ab9dc 100644 --- a/doc/src/sgml/keywords.sgml +++ b/doc/src/sgml/keywords.sgml @@ -853,6 +853,13 @@ + + CONFLICT + non-reserved + + + + CONNECT diff --git a/doc/src/sgml/postgres-fdw.sgml b/doc/src/sgml/postgres-fdw.sgml index 43adb61455d91..81d4441d49a9c 100644 --- a/doc/src/sgml/postgres-fdw.sgml +++ b/doc/src/sgml/postgres-fdw.sgml @@ -68,6 +68,12 @@ in your user mapping must have privileges to do these things.) + + postgres_fdw supports INSERT + statements with an ON CONFLICT IGNORE clause, provided a + unique index inference specification is omitted. + + It is generally recommended that the columns of a foreign table be declared with exactly the same data types, and collations if applicable, as the diff --git a/doc/src/sgml/ref/create_rule.sgml b/doc/src/sgml/ref/create_rule.sgml index 677766a2d5e36..2aabf62f67ba7 100644 --- a/doc/src/sgml/ref/create_rule.sgml +++ b/doc/src/sgml/ref/create_rule.sgml @@ -136,7 +136,12 @@ CREATE [ OR REPLACE ] RULE name AS The event is one of SELECT, INSERT, UPDATE, or - DELETE. + DELETE. Note that an + INSERT containing an ON CONFLICT + IGNORE clause cannot be used on tables that have + either INSERT or UPDATE + rules. Consider using an updatable view instead, which are + supported with ON CONFLICT IGNORE. diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index be7ebd5f54f74..05396e6c8fe1c 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -717,7 +717,10 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI EXCLUDE, and REFERENCES (foreign key) constraints accept this clause. NOT NULL and CHECK constraints are not - deferrable. + deferrable. Note that constraints that were created with this + clause cannot be used as arbiters of whether or not to take the + alternative path with an INSERT statement + that includes an ON CONFLICT clause. diff --git a/doc/src/sgml/ref/create_view.sgml b/doc/src/sgml/ref/create_view.sgml index 5dadab1dee9d7..d8efd30448a24 100644 --- a/doc/src/sgml/ref/create_view.sgml +++ b/doc/src/sgml/ref/create_view.sgml @@ -286,8 +286,9 @@ CREATE VIEW vista AS SELECT text 'Hello World' AS hello; Simple views are automatically updatable: the system will allow INSERT, UPDATE and DELETE statements - to be used on the view in the same way as on a regular table. A view is - automatically updatable if it satisfies all of the following conditions: + to be used on the view in the same way as on a regular table (aside from + the limitations on ON CONFLICT noted below). A view is automatically + updatable if it satisfies all of the following conditions: @@ -383,6 +384,12 @@ CREATE VIEW vista AS SELECT text 'Hello World' AS hello; not need any permissions on the underlying base relations (see ). + + INSERT with an ON CONFLICT IGNORE + clause is supported on updatable views (if an inference + specification is provided, it must infer a unique index on the + underlying base relation). 
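A short SQL sketch of the updatable-view behavior documented above (the table and view names are hypothetical):

    CREATE TABLE events (id integer PRIMARY KEY, payload text);
    CREATE VIEW recent_events AS SELECT id, payload FROM events;

    -- Allowed: the view is automatically updatable, and the inference
    -- specification resolves to a unique index on the underlying base
    -- relation (the primary key of "events")
    INSERT INTO recent_events VALUES (1, 'x') ON CONFLICT (id) IGNORE;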
+ diff --git a/doc/src/sgml/ref/insert.sgml b/doc/src/sgml/ref/insert.sgml index a3cccb9f7c79a..75de9a6ed808b 100644 --- a/doc/src/sgml/ref/insert.sgml +++ b/doc/src/sgml/ref/insert.sgml @@ -24,6 +24,7 @@ PostgreSQL documentation [ WITH [ RECURSIVE ] with_query [, ...] ] INSERT INTO table_name [ ( column_name [, ...] ) ] { DEFAULT VALUES | VALUES ( { expression | DEFAULT } [, ...] ) [, ...] | query } + [ ON CONFLICT [ ( { column_name_index | ( expression_index ) } [ COLLATE collation ] [ opclass ] [, ...] [ WHERE index_predicate ] ) ] IGNORE] [ RETURNING * | output_expression [ [ AS ] output_name ] [, ...] ] @@ -58,6 +59,108 @@ INSERT INTO table_name [ ( + + The optional ON CONFLICT clause specifies a path to + take as an alternative to raising a conflict related error. The + alternative path is considered individually for each row proposed + for insertion; it is taken (or not taken) once per row. + ON CONFLICT IGNORE simply avoids inserting any + individual row when it is determined that a conflict related error + would otherwise need to be raised. + + + + ON CONFLICT IGNORE optionally accepts a + unique index inference specification, which + consists of one or more column_name_index columns and/or + expression_index + expressions on columns, appearing between parenthesis. These are + used to infer a unique index to limit pre-checking for conflicts to + (if no appropriate index is available, an error is raised). A + subset of the table to limit the check for conflicts to can + optionally also be specified using index_predicate (this allows the + implementation to use an expression index only covering at least + the subset). Note that omitting a unique index inference + specification indicates a total indifference to where any conflict + could occur, which isn't always appropriate. At times, it may be + desirable for ON CONFLICT IGNORE to + not suppress a conflict related error + associated with an index where that isn't explicitly anticipated. + + + + Columns and/or expressions appearing in a unique index inference + specification must match all the columns/expressions of some + existing unique index on table_name - there can be no + columns/expressions from the unique index that do not appear in the + inference specification, nor can there be any columns/expressions + appearing in the inference specification that do not appear in the + unique index definition. However, the order of the + columns/expressions in the index definition, or whether or not the + index definition specified NULLS FIRST or + NULLS LAST, or the internal sort order of each column + (whether DESC or ASC were specified) are + all irrelevant. Deferred unique constraints are not supported as + arbiters of whether an alternative ON CONFLICT path + should be taken. + + + + The definition of a conflict for the purposes of the ON + CONFLICT clause is somewhat subtle, although the exact + definition is seldom of great interest. A conflict is a condition + that ordinarily necessitates raising either a unique violation from + a unique constraint (or unique index), or an exclusion violation + from an exclusion constraint, occurring in an index/constraint that + arbitrates the ON CONFLICT path (there may be more than + one). Only unique indexes (or unique constraints) can be inferred + with a unique index inference specification. 
In contrast to the + rules around certain other SQL clauses, like the + DISTINCT clause, the definition of a duplicate + (a conflict) is based on whatever unique indexes happen to be + defined on columns on the table. In particular, the default + operator class for the type of each indexed column is not + considered. The inference clause can require a particular named + operator class be used per column/expression indexed if that's a + concern. Similarly, the inference specification can limit its + consideration of arbiter unique indexes on the basis of collations + on column/expression covered by available indexes. + + + + The optional index_predicate can be used to + allow the inference specification to infer that a partial unique + index can be used. Any unique index that otherwise satisfies the + inference specification, while also covering at least all the rows + in the table covered by index_predicate may be used. It is + recommended that the partial index predicate of the unique index + intended to be used as the arbiter of taking the alternative path + be matched exactly, but this is not required. Note that an error + will be raised if an arbiter unique index is chosen that does not + cover the tuple or tuples proposed for insertion. However, an + overly specific index_predicate does not imply that + arbitrating conflicts will be limited to the subset of rows covered + by the inferred unique index corresponding to index_predicate. + + + + Multiple unique indexes/constraints may be inferred where multiple + indexes exist that satisfy the inference specification, although + typically this does not occur (this behavior only exists to + smoothly cover certain corner cases). Note that the ordering of + multiple column_name_index columns and/or + expression_index + within the inference specification is not significant. + + The optional RETURNING clause causes INSERT to compute and return value(s) based on each row actually inserted. @@ -126,6 +229,85 @@ INSERT INTO table_name [ ( + + column_name_index + + + The name of a table_name column. Part of a + unique inference specification. Follows CREATE + INDEX format. + + + + + + expression_index + + + Similar to column_name_index, but used to + infer expressions on table_name columns appearing + within index definitions (not simple columns). Part of unique + index inference clause. Follows CREATE INDEX + format. + + + + + + collation + + + When specified, mandates that corresponding column_name_index or + expression_index + use particular collation in order to be inferred as arbitrating + the ON CONFLICT path taken. Typically this is + omitted even when it is intended that ON CONFLICT + infer a particular unique index or unique constraint with a + non-default collation, since the use of a non-default collation + does not usually change the semantics of arbitration (because + the equality semantics are often equivalent + anyway). Follows CREATE INDEX format. + + + + + + opclass + + + When specified, mandates that corresponding column_name_index or + expression_index + use particular operator class in order to be inferred as + arbitrating the ON CONFLICT path taken. Sometimes + this is omitted even when it is intended that ON + CONFLICT infer a particular unique index or unique constraint + with a non-default operator class (because the + equality semantics are often equivalent + across a type's operator classes anyway, or because it's + sufficient to trust that the defined unique index has the + pertinent definition of equality). 
Follows CREATE + INDEX format. + + + + + + index_predicate + + + Used to allow inference of partial unique indexes. Any indexes + that satisfy the predicate (which need not actually be partial + indexes) will be used in conflict arbitration. Follows + CREATE INDEX format. + + + + DEFAULT VALUES @@ -171,8 +353,9 @@ INSERT INTO table_name [ ( An expression to be computed and returned by the INSERT - command after each row is inserted. The expression can use any - column names of the table named by table_name. + command after each row is inserted (not updated). The + expression can use any column names of the table named by + table_name. Write * to return all columns of the inserted row(s). @@ -311,7 +494,35 @@ WITH upd AS ( RETURNING * ) INSERT INTO employees_log SELECT *, current_timestamp FROM upd; - + + + + Insert a distributor, or do nothing for rows proposed for insertion + when an existing, excluded row (a row with a matching constrained + column or columns after before row insert triggers fire) exists. + Example assumes a unique index has been defined that constrains + values appearing in the did column: + + INSERT INTO distributors (did, dname) VALUES (7, 'Redline GmbH') + ON CONFLICT (did) IGNORE; + + + + Insert new distributor if possible; otherwise + IGNORE. Example assumes a unique index has been + defined that constrains values appearing in the + did column on a subset of rows where the + is_active boolean column evaluates to + true: + + -- This statement could infer a partial unique index on did + -- with a predicate of WHERE is_active, but it could also + -- just use a regular unique constraint on did if that was + -- all that was available. + INSERT INTO distributors (did, dname) VALUES (9, 'Antwerp Design') + ON CONFLICT (did WHERE is_active) IGNORE; + + @@ -321,7 +532,8 @@ INSERT INTO employees_log SELECT *, current_timestamp FROM upd; INSERT conforms to the SQL standard, except that the RETURNING clause is a PostgreSQL extension, as is the ability - to use WITH with INSERT. + to use WITH with INSERT, and the ability to + specify an alternative path with ON CONFLICT. Also, the case in which a column name list is omitted, but not all the columns are filled from the VALUES clause or query, diff --git a/doc/src/sgml/ref/set_constraints.sgml b/doc/src/sgml/ref/set_constraints.sgml index 7c31871b0bd2b..ba2b5badbd26f 100644 --- a/doc/src/sgml/ref/set_constraints.sgml +++ b/doc/src/sgml/ref/set_constraints.sgml @@ -69,7 +69,10 @@ SET CONSTRAINTS { ALL | name [, ... Currently, only UNIQUE, PRIMARY KEY, REFERENCES (foreign key), and EXCLUDE - constraints are affected by this setting. + constraints are affected by this setting. Note that constraints + that are DEFERRED cannot be used as arbiters by + the ON CONFLICT clause that INSERT + supports. NOT NULL and CHECK constraints are always checked immediately when a row is inserted or modified (not at the end of the statement). 
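A short SQL sketch of the deferrable-constraint restriction documented above (names are hypothetical; per the documentation changes, a deferrable unique constraint cannot act as an ON CONFLICT arbiter, so the inference below is expected to fail with an error):

    CREATE TABLE items (
        sku   text UNIQUE DEFERRABLE INITIALLY DEFERRED,
        descr text
    );

    -- Expected to raise an error: the only unique constraint on (sku) is
    -- deferrable, so no arbiter index can be inferred
    INSERT INTO items VALUES ('A-1', 'widget') ON CONFLICT (sku) IGNORE;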
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 457cd708fd3b5..e19c73868c135 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -104,6 +104,7 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, TransactionId xid, LockTupleMode mode); +static void heap_confirm_insert(Relation relation, HeapTuple tuple); static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2); static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, @@ -2102,7 +2103,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); - RelationPutHeapTuple(relation, buffer, heaptup); + RelationPutHeapTuple(relation, buffer, heaptup, + (options & HEAP_INSERT_SPECULATIVE) != 0); + if (PageIsAllVisible(BufferGetPage(buffer))) { @@ -2156,7 +2159,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); - xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; + xlrec.flags = 0; + if (all_visible_cleared) + xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; + if (options & HEAP_INSERT_SPECULATIVE) + xlrec.flags |= XLH_INSERT_IS_SPECULATIVE; Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer)); /* @@ -2166,7 +2173,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, */ if (RelationIsLogicallyLogged(relation)) { - xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; + xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; bufflags |= REGBUF_KEEP_DATA; } @@ -2208,6 +2215,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, */ CacheInvalidateHeapTuple(relation, heaptup, NULL); + /* Always count deletion, even of speculative insertions */ pgstat_count_heap_insert(relation, 1); /* @@ -2368,7 +2376,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, * RelationGetBufferForTuple has ensured that the first tuple fits. * Put that on the page, and then as many other tuples as fit. */ - RelationPutHeapTuple(relation, buffer, heaptuples[ndone]); + RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) { HeapTuple heaptup = heaptuples[ndone + nthispage]; @@ -2376,7 +2384,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) break; - RelationPutHeapTuple(relation, buffer, heaptup); + RelationPutHeapTuple(relation, buffer, heaptup, false); /* * We don't use heap_multi_insert for catalog tuples yet, but @@ -2436,7 +2444,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, /* the rest of the scratch space is used for tuple data */ tupledata = scratchptr; - xlrec->flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; + xlrec->flags = all_visible_cleared ? 
XLH_INSERT_ALL_VISIBLE_CLEARED : 0; xlrec->ntuples = nthispage; /* @@ -2471,7 +2479,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, Assert((scratchptr - scratch) < BLCKSZ); if (need_tuple_data) - xlrec->flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; + xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; /* * Signal that this is the last xl_heap_multi_insert record @@ -2479,7 +2487,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, * decoding so it knows when to cleanup temporary data. */ if (ndone + nthispage == ntuples) - xlrec->flags |= XLOG_HEAP_LAST_MULTI_INSERT; + xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; if (init) { @@ -2883,7 +2891,7 @@ heap_delete(Relation relation, ItemPointer tid, if (RelationIsAccessibleInLogicalDecoding(relation)) log_heap_new_cid(relation, &tp); - xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; + xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0; xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); @@ -2892,9 +2900,9 @@ heap_delete(Relation relation, ItemPointer tid, if (old_key_tuple != NULL) { if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_TUPLE; + xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE; else - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY; + xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY; } XLogBeginInsert(); @@ -2971,6 +2979,209 @@ heap_delete(Relation relation, ItemPointer tid, return HeapTupleMayBeUpdated; } +/* + * heap_super_delete - super delete a tuple + * + * Caller requires that we "super-delete" a tuple just inserted in the same + * command. Instead of setting its xmax, we set xmin to InvalidTransactionId, + * making it immediately appear as dead to everyone. In particular, we want + * HeapTupleSatisfiesDirty() to regard the tuple as dead, so that another + * backend inserting a duplicate key value won't unnecessarily wait for our + * transaction to finish (it'll just wait for our speculative insertion to + * finish). + * + * This is somewhat redundant with heap_delete, but we prefer to handle super + * deletion in a dedicated routine with stripped down requirements. + * + * This routine does not consider replica identity, since logical decoding only + * ever needs to treat a super deletion as nullifying a pending speculative + * insertion. + * + * The WAL records generated here match heap_delete(). The same recovery + * routines are used. + */ +static void +heap_super_delete(Relation relation, ItemPointer tid) +{ + TransactionId xid = GetCurrentTransactionId(); + ItemId lp; + HeapTupleData tp; + Page page; + BlockNumber block; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + bool all_visible_cleared = false; + + Assert(ItemPointerIsValid(tid)); + + block = ItemPointerGetBlockNumber(tid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. 
+ */ + if (PageIsAllVisible(page)) + visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, we'll have to unlock and + * re-lock, to avoid holding the buffer lock across an I/O. That's a bit + * unfortunate, but hopefully shouldn't happen often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tp.t_tableOid = RelationGetRelid(relation); + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_self = *tid; + + if (tp.t_data->t_choice.t_heap.t_xmin != xid) + elog(ERROR, "attempted to super-delete a tuple from other transaction"); + + /* + * No need to check for serializable conflicts here. There is never a need + * for a combocid, either. No need to extract replica identity, or do + * anything special with infomask bits. + */ + START_CRIT_SECTION(); + + /* + * The tuple will become DEAD immediately. Set flag that this page is a + * candidate for pruning once our xid falls below the OldestXmin horizon. + */ + PageSetPrunable(page, xid); + + if (PageIsAllVisible(page)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + visibilitymap_clear(relation, BufferGetBlockNumber(buffer), + vmbuffer); + } + + /* store transaction information of xact deleting the tuple */ + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + + /* + * Super deleteion sets tuple header xmin to InvalidTransactionId, in order + * to immediately make the tuple invisible to speculative token waiters + * that will be woken momentarily. + */ + HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); + + /* Make sure there is no forward chain link in t_ctid */ + Assert(HeapTupleHeaderIsSpeculative(tp.t_data)); + tp.t_data->t_ctid = tp.t_self; + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_heap_delete xlrec; + XLogRecPtr recptr; + + xlrec.flags = XLH_DELETE_IS_SUPER; + if (all_visible_cleared) + xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = xid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + /* No replica identity logged */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + if (HeapTupleHasExternal(&tp)) + toast_delete(relation, &tp); + + /* + * Never need to mark tuple for invalidation, since catalogs don't support + * speculative insertion + */ + + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + + /* Always count deletion, even of speculative insertions */ + pgstat_count_heap_delete(relation); +} + +/* + * heap_finish_speculative - set speculative tuple as permanent (or + * unsuccessful). 
+ * + * This routine may be used to mark a tuple originally only inserted + * speculatively as a bonafide, permanent tuple. The t_ctid field (which will + * contain a speculative token value) is modified in place to point to the + * tuple itself, which is characteristic of a newly inserted ordinary tuple. + * Alternatively, the tuple is "super deleted" when a speculative insertion was + * unsuccessful. + */ +void +heap_finish_speculative(Relation relation, HeapTuple tuple, bool conflict) +{ + if (!conflict) + { + /* + * Confirm successful speculative insertion, making tuple physically + * indistinguishable from a tuple that was inserted in the conventional + * manner. + */ + heap_confirm_insert(relation, tuple); + } + else + { + /* + * "Super delete" tuple due to conflict from concurrent insert. + * + * This is occasionally necessary so that "unprincipled deadlocks" are + * avoided; now that a conflict was found, other sessions should not + * wait on our speculative token, and they certainly shouldn't treat + * our speculatively-inserted heap tuple as an ordinary tuple that it + * must wait on the outcome of our xact to UPDATE/DELETE. This makes + * heap tuples behave as conceptual "value locks" of short duration, + * distinct from ordinary tuples that other xacts must wait on + * xmin-xact-end of in the event of a possible unique/exclusion + * violation (the violation that arbitrates taking the alternative + * path). + */ + heap_super_delete(relation, &(tuple->t_self)); + } +} + /* * simple_heap_delete - delete a tuple * @@ -3688,7 +3899,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, HeapTupleClearHeapOnly(newtup); } - RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */ + RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */ if (!already_marked) { @@ -5361,6 +5572,81 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, return HeapTupleMayBeUpdated; } +/* + * heap_confirm_insert - clear speculative token from tuple + * + * It would not be okay for a transaction to commit without confirming outcome + * of speculative insertion. Occasionally, super deletion is necessary (if an + * attempt at speculative insertion failed). More often, this routine confirms + * a speculative insertion was successful. + * + * The need to WAL-log this action may not be obvious (logical decoding does + * not require it). Doing so allows the implementation to not have to consider + * speculative tuples with an in-doubt status. Either a transaction is in + * progress and its tuples may be speculative, or it committed and they cannot + * be, or it aborted and it doesn't matter either way. 
+ */ +static void +heap_confirm_insert(Relation relation, HeapTuple tuple) +{ + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(&(tuple->t_self)); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(ERROR, "heap_confirm_insert: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* SpecTokenOffsetNumber should be distinguishable from any real offset */ + StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber, + "invalid speculative token constant"); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + Assert(HeapTupleHeaderIsSpeculative(tuple->t_data)); + + MarkBufferDirty(buffer); + + /* + * Make sure there is no apparent forward chain link in t_ctid. + * Speculative inserters rely on this (in fact, the forward link is a + * speculative token value). + */ + htup->t_ctid = tuple->t_self; + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_heap_confirm xlrec; + XLogRecPtr recptr; + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); +} /* * heap_inplace_update - update a tuple "in place" (ie, overwrite it) @@ -6663,22 +6949,22 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare main WAL data chain */ xlrec.flags = 0; if (all_visible_cleared) - xlrec.flags |= XLOG_HEAP_ALL_VISIBLE_CLEARED; + xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; if (new_all_visible_cleared) - xlrec.flags |= XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED; + xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED; if (prefixlen > 0) - xlrec.flags |= XLOG_HEAP_PREFIX_FROM_OLD; + xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD; if (suffixlen > 0) - xlrec.flags |= XLOG_HEAP_SUFFIX_FROM_OLD; + xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD; if (need_tuple_data) { - xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; + xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE; if (old_key_tuple) { if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_TUPLE; + xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE; else - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY; + xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; } } @@ -7304,7 +7590,7 @@ heap_xlog_delete(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(target_node); Buffer vmbuffer = InvalidBuffer; @@ -7332,13 +7618,16 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderClearHotUpdated(htup); fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2); - HeapTupleHeaderSetXmax(htup, xlrec->xmax); + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + else + HeapTupleHeaderSetXmin(htup, InvalidTransactionId); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); /* Make sure there is no forward chain link in t_ctid */ @@ -7379,7 +7668,7 @@ heap_xlog_insert(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(target_node); Buffer vmbuffer = InvalidBuffer; @@ -7442,7 +7731,7 @@ heap_xlog_insert(XLogReaderState *record) PageSetLSN(page, lsn); - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); MarkBufferDirty(buffer); @@ -7499,7 +7788,7 @@ heap_xlog_multi_insert(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rnode); Buffer vmbuffer = InvalidBuffer; @@ -7581,7 +7870,7 @@ heap_xlog_multi_insert(XLogReaderState *record) PageSetLSN(page, lsn); - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); MarkBufferDirty(buffer); @@ -7654,7 +7943,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rnode); Buffer vmbuffer = InvalidBuffer; @@ -7709,7 +7998,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); - if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); PageSetLSN(page, lsn); @@ -7738,7 +8027,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ - if (xlrec->flags & XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) { Relation reln = CreateFakeRelcacheEntry(rnode); Buffer vmbuffer = InvalidBuffer; @@ -7766,13 +8055,13 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "heap_update_redo: invalid max offset number"); - if (xlrec->flags & XLOG_HEAP_PREFIX_FROM_OLD) + if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) { Assert(newblk == oldblk); memcpy(&prefixlen, recdata, sizeof(uint16)); recdata += sizeof(uint16); } - if (xlrec->flags & XLOG_HEAP_SUFFIX_FROM_OLD) + if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) { Assert(newblk == oldblk); memcpy(&suffixlen, recdata, sizeof(uint16)); @@ -7844,7 +8133,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); - if (xlrec->flags & XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ @@ -7877,6 +8166,42 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) XLogRecordPageWithFreeSpace(rnode, newblk, freespace); } +static void +heap_xlog_confirm(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "heap_confirm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Confirm tuple as actually inserted + */ + ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + static void heap_xlog_lock(XLogReaderState *record) { @@ -8027,6 +8352,9 @@ heap_redo(XLogReaderState *record) case XLOG_HEAP_HOT_UPDATE: heap_xlog_update(record, true); break; + case XLOG_HEAP_CONFIRM: + heap_xlog_confirm(record); + break; case XLOG_HEAP_LOCK: heap_xlog_lock(record); break; diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 6d091f63af0df..a9f0ca35e49e2 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -35,12 +35,17 @@ void RelationPutHeapTuple(Relation relation, Buffer buffer, - HeapTuple tuple) + HeapTuple tuple, + bool token) { Page pageHeader; OffsetNumber offnum; - ItemId itemId; - Item item; + + /* + * A tuple that's being inserted speculatively should already have its + * token set. 
+ */ + Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data)); /* Add the tuple to the page */ pageHeader = BufferGetPage(buffer); @@ -54,10 +59,18 @@ RelationPutHeapTuple(Relation relation, /* Update tuple->t_self to the actual position where it was stored */ ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum); - /* Insert the correct position into CTID of the stored tuple, too */ - itemId = PageGetItemId(pageHeader, offnum); - item = PageGetItem(pageHeader, itemId); - ((HeapTupleHeader) item)->t_ctid = tuple->t_self; + /* + * Insert the correct position into CTID of the stored tuple, too + * (unless this is a speculative insertion, in which case the token is + * held in CTID field instead) + */ + if (!token) + { + ItemId itemId = PageGetItemId(pageHeader, offnum); + Item item = PageGetItem(pageHeader, itemId); + + ((HeapTupleHeader) item)->t_ctid = tuple->t_self; + } } /* diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index ef68a7145fce5..c57ae1a40c772 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -51,7 +51,8 @@ static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, Buffer buf, OffsetNumber offset, ScanKey itup_scankey, - IndexUniqueCheck checkUnique, bool *is_unique); + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken); static void _bt_findinsertloc(Relation rel, Buffer *bufptr, OffsetNumber *offsetptr, @@ -159,17 +160,27 @@ _bt_doinsert(Relation rel, IndexTuple itup, */ if (checkUnique != UNIQUE_CHECK_NO) { - TransactionId xwait; + TransactionId xwait; + uint32 speculativeToken; offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey, - checkUnique, &is_unique); + checkUnique, &is_unique, &speculativeToken); if (TransactionIdIsValid(xwait)) { /* Have to wait for the other guy ... */ _bt_relbuf(rel, buf); - XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + /* + * If it's a speculative insertion, wait for it to finish (ie. + * to go ahead with the insertion, or kill the tuple). Otherwise + * wait for the transaction to finish as usual. + */ + if (speculativeToken) + SpeculativeInsertionWait(xwait, speculativeToken); + else + XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + /* start over... */ _bt_freestack(stack); goto top; @@ -211,9 +222,12 @@ _bt_doinsert(Relation rel, IndexTuple itup, * also point to end-of-page, which means that the first tuple to check * is the first tuple on the next page. * - * Returns InvalidTransactionId if there is no conflict, else an xact ID - * we must wait for to see if it commits a conflicting tuple. If an actual - * conflict is detected, no return --- just ereport(). + * Returns InvalidTransactionId if there is no conflict, else an xact ID we + * must wait for to see if it commits a conflicting tuple. If an actual + * conflict is detected, no return --- just ereport(). If an xact ID is + * returned, and the conflicting tuple still has a speculative insertion in + * progress, *speculativeToken is set to non-zero, and the caller can wait for + * the verdict on the insertion using SpeculativeInsertionWait(). * * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return * InvalidTransactionId because we don't want to wait. 
In this case we @@ -223,7 +237,8 @@ _bt_doinsert(Relation rel, IndexTuple itup, static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, Buffer buf, OffsetNumber offset, ScanKey itup_scankey, - IndexUniqueCheck checkUnique, bool *is_unique) + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken) { TupleDesc itupdesc = RelationGetDescr(rel); int natts = rel->rd_rel->relnatts; @@ -340,6 +355,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, if (nbuf != InvalidBuffer) _bt_relbuf(rel, nbuf); /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; return xwait; } diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 4f06a2637aec4..f4a1b002cf1f2 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -75,6 +75,12 @@ heap_desc(StringInfo buf, XLogReaderState *record) xlrec->new_offnum, xlrec->new_xmax); } + else if (info == XLOG_HEAP_CONFIRM) + { + xl_heap_confirm *xlrec = (xl_heap_confirm *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + } else if (info == XLOG_HEAP_LOCK) { xl_heap_lock *xlrec = (xl_heap_lock *) rec; @@ -177,6 +183,9 @@ heap_identify(uint8 info) case XLOG_HEAP_HOT_UPDATE | XLOG_HEAP_INIT_PAGE: id = "HOT_UPDATE+INIT"; break; + case XLOG_HEAP_CONFIRM: + id = "HEAP_CONFIRM"; + break; case XLOG_HEAP_LOCK: id = "LOCK"; break; diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index ac3b785b5a723..8a63c5963ba9c 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1665,6 +1665,10 @@ BuildIndexInfo(Relation index) /* other info */ ii->ii_Unique = indexStruct->indisunique; ii->ii_ReadyForInserts = IndexIsReady(indexStruct); + /* assume not doing speculative insertion for now */ + ii->ii_UniqueOps = NULL; + ii->ii_UniqueProcs = NULL; + ii->ii_UniqueStrats = NULL; /* initialize index-build state to default */ ii->ii_Concurrent = false; @@ -1673,6 +1677,53 @@ BuildIndexInfo(Relation index) return ii; } +/* ---------------- + * IndexInfoSpeculative + * Append extra state to IndexInfo record + * + * For unique indexes, we usually don't want to add info to the IndexInfo for + * checking uniqueness, since the B-Tree AM handles that directly. However, in + * the case of speculative insertion, external support is required. + * + * Do this processing here rather than in BuildIndexInfo() to save the common + * non-speculative cases the overhead they'd otherwise incur. + * ---------------- + */ +void +IndexInfoSpeculative(Relation index, IndexInfo *ii) +{ + int ncols = index->rd_rel->relnatts; + int i; + + /* + * fetch info for checking unique indexes + */ + Assert(ii->ii_Unique); + + if (index->rd_rel->relam != BTREE_AM_OID) + elog(ERROR, "unexpected non-btree speculative unique index"); + + ii->ii_UniqueOps = (Oid *) palloc(sizeof(Oid) * ncols); + ii->ii_UniqueProcs = (Oid *) palloc(sizeof(Oid) * ncols); + ii->ii_UniqueStrats = (uint16 *) palloc(sizeof(uint16) * ncols); + + /* + * We have to look up the operator's strategy number. This + * provides a cross-check that the operator does match the index. 
+ */ + /* We need the func OIDs and strategy numbers too */ + for (i = 0; i < ncols; i++) + { + ii->ii_UniqueStrats[i] = BTEqualStrategyNumber; + ii->ii_UniqueOps[i] = + get_opfamily_member(index->rd_opfamily[i], + index->rd_opcintype[i], + index->rd_opcintype[i], + ii->ii_UniqueStrats[i]); + ii->ii_UniqueProcs[i] = get_opcode(ii->ii_UniqueOps[i]); + } +} + /* ---------------- * FormIndexDatum * Construct values[] and isnull[] arrays for a new index tuple. @@ -2612,7 +2663,7 @@ IndexCheckExclusion(Relation heapRelation, check_exclusion_constraint(heapRelation, indexRelation, indexInfo, &(heapTuple->t_self), values, isnull, - estate, true, false); + estate, true); } heap_endscan(scan); diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index fe123addac0ce..0231084c7c922 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -46,7 +46,7 @@ CatalogOpenIndexes(Relation heapRel) resultRelInfo->ri_RelationDesc = heapRel; resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */ - ExecOpenIndices(resultRelInfo); + ExecOpenIndices(resultRelInfo, false); return resultRelInfo; } diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 561d8fae574e6..e49affba9ee10 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -172,7 +172,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ check_exclusion_constraint(trigdata->tg_relation, indexRel, indexInfo, &(new_row->t_self), values, isnull, - estate, false, false); + estate, false); } /* diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 92ff632e124e6..669bca406e5a3 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2283,7 +2283,7 @@ CopyFrom(CopyState cstate) 1, /* dummy rangetable index */ 0); - ExecOpenIndices(resultRelInfo); + ExecOpenIndices(resultRelInfo, false); estate->es_result_relations = resultRelInfo; estate->es_num_result_relations = 1; @@ -2438,7 +2438,8 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), - estate); + estate, false, + NIL); /* AFTER ROW INSERT Triggers */ ExecARInsertTriggers(estate, resultRelInfo, tuple, @@ -2552,7 +2553,7 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, ExecStoreTuple(bufferedTuples[i], myslot, InvalidBuffer, false); recheckIndexes = ExecInsertIndexTuples(myslot, &(bufferedTuples[i]->t_self), - estate); + estate, false, NIL); ExecARInsertTriggers(estate, resultRelInfo, bufferedTuples[i], recheckIndexes); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 315a52849c949..aec1e9fb80a98 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2320,6 +2320,8 @@ show_modifytable_info(ModifyTableState *mtstate, ExplainState *es) const char *foperation; bool labeltargets; int j; + List *idxNames = NIL; + ListCell *lst; switch (node->operation) { @@ -2405,6 +2407,26 @@ show_modifytable_info(ModifyTableState *mtstate, ExplainState *es) } } + foreach(lst, node->arbiterIndexes) + { + char *indexname = get_rel_name(lfirst_oid(lst)); + + idxNames = lappend(idxNames, indexname); + } + + /* + * Make sure that there is still an arbiter property list when ON CONFLICT + * IGNORE is used, and an inference specification is omitted (Non-text + * format explains will show an empty array, which seems appropriate + * there). 
+ */ + if (node->spec == SPEC_IGNORE && idxNames == NIL && + es->format == EXPLAIN_FORMAT_TEXT) + idxNames = lappend(idxNames, "(All)"); + + if (node->spec == SPEC_IGNORE) + ExplainPropertyList("Conflict Arbiter Indexes", idxNames, es); + if (labeltargets) ExplainCloseGroup("Target Tables", "Target Tables", false, es); } diff --git a/src/backend/executor/README b/src/backend/executor/README index 8afa1e3e4a759..892f9fe035bfb 100644 --- a/src/backend/executor/README +++ b/src/backend/executor/README @@ -200,3 +200,75 @@ is no explicit prohibition on SRFs in UPDATE, but the net effect will be that only the first result row of an SRF counts, because all subsequent rows will result in attempts to re-update an already updated target row. This is historical behavior and seems not worth changing.) + +Speculative insertion +--------------------- + +Speculative insertion is a process that the executor manages for the benefit of +INSERT...ON CONFLICT IGNORE. Supported indexes include nbtree unique +indexes (nbtree is currently the only amcanunique index access method), or +exclusion constraint indexes (exclusion constraints are considered a +generalization of unique constraints). + +The primary user-visible goal for INSERT ... ON CONFLICT is to guarantee either +an insert, or a conclusive determination that an insert cannot go ahead (due to +a conclusively committed/visible conflict). A would-be conflict (and the +associated index) are the arbiters of whether or not the alternative (IGNORE) +path is taken. The implementation more or less tries to insert until one or +the other of those two outcomes is reached. There are some non-obvious hazards +involved that are carefully avoided. These hazards relate to concurrent +activity causing conflicts for the implementation, which must be handled. + +The index is the authoritative source of truth for whether there is or is not a +conflict, for unique index enforcement in general, and for speculative +insertion in particular. The heap must still be considered, though, not least +since it alone has authoritative visibility information. Through looping, we +hope to overcome the disconnect between the heap and the arbiter index. +Theoretically, some individual session could loop forever, although under high +concurrency one session always proceeds. + +The first step in the loop is to perform a pre-check. The indexes are scanned +for existing conflicting values. At this point, we may have to wait until the +end of another xact (or xact's promise token -- more on that later), iff it +isn't immediately conclusive that there is or is not a conflict (when we finish +the pre-check, there is a conclusion about there either being or +not being a conflict). + +The second step (skipped when a conflict is found) is to insert a heap tuple +and related index tuples opportunistically. This uses the same mechanism as +deferred unique indexes, and so we never wait for a possibly conflicting xact +to commit or abort (unlike with conventional unique index insertion) -- we +simply detect a possible conflict. + +When opportunistically inserting during the second step, we are not logically +inserting a tuple as such. Rather, the process is somewhat similar to the +conventional unique index insertion steps taken within the nbtree AM, where we +must briefly lock the *value* being inserted: in that codepath, the value +proposed for insertion is for an instant locked *in the abstract*, by way of a +buffer lock on "the first leaf page the value could be on". 
Then, having
+established the right to physically insert, do so (or throw an error). For
+speculative insertion, if no conflict occurs during the insertion (which is
+usually the case, since it was just determined in the first step that there was
+no conflict), then we're done. Otherwise, we must restart (and likely find the
+same conflict tuple during the first step of the new iteration). But a
+counter-intuitive step must be taken first (which is what makes this whole
+dance similar to conventional nbtree "value locking").
+
+We must "super delete" the tuple when the opportunistic insertion finds a
+conflict. This means that it immediately becomes invisible to all snapshot
+types, and immediately becomes reclaimable by VACUUM. Other backends
+(speculative inserters or ordinary inserters) know to not wait on our
+transaction end when they encounter an optimistically inserted "promise tuple".
+Rather, they wait on a corresponding promise token lock, which we hold only for
+as long as opportunistically inserting. We release the lock when done
+opportunistically inserting (and after "super deleting", if that proved
+necessary), releasing our waiters (who will ordinarily re-find our promise
+tuple as a bona fide tuple, or occasionally will find that they can insert
+after all). It's important that other xacts not wait on the end of our xact
+until we've established that we've successfully and conclusively inserted
+logically (or established that there was an insertion conflict, and cleaned up
+after it by "super deleting"). Otherwise, concurrent speculative inserters
+could be involved in "unprincipled deadlocks": deadlocks where there is no
+user-visible mutual dependency, and yet an implementation-related mutual
+dependency is unexpectedly introduced. The user might be left with no
+reasonable way of avoiding these deadlocks, which would not be okay.
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index a697682b20e8c..44840c0f33346 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -50,6 +50,55 @@
  * to the caller. The caller must re-check them later by calling
  * check_exclusion_constraint().
  *
+ * Speculative insertion
+ * ---------------------
+ *
+ * Speculative insertion is a process that the executor manages for the benefit
+ * of INSERT...ON CONFLICT IGNORE. Supported indexes include nbtree unique
+ * indexes (nbtree is currently the only amcanunique index access method), or
+ * exclusion constraint indexes (exclusion constraints are considered a
+ * generalization of unique constraints). Speculative insertion is a
+ * two-phase mechanism, used to implement INSERT ... ON CONFLICT IGNORE. We
+ * first insert the tuple to the heap and update the indexes as usual, but if a
+ * constraint is violated, we can still back out the insertion without aborting
+ * the whole transaction. In an INSERT ... ON CONFLICT statement, the tuple is
+ * first inserted as usual, but if a conflict is detected, it's backed out and
+ * the ON CONFLICT action is executed instead.
+ *
+ * Insertion to a unique index works as usual: the index AM checks for duplicate
+ * keys atomically with the insertion. But instead of throwing an error on a
+ * conflict, the speculatively inserted heap tuple is backed out.
+ *
+ * Exclusion constraints are slightly more complicated. As mentioned earlier,
+ * there is a risk of deadlock when two backends insert the same key concurrently.
+ * That was not a problem for regular insertions, when one of the transactions has + * to be aborted anyway, but with a speculative insertion we cannot let a deadlock + * happen, because we only want to back out the speculatively inserted tuple on + * conflict, not abort the whole transaction. + * + * When a backend detects that the speculative insertion conflicts with another + * in-progress tuple, it has two options: + * + * 1. back out the speculatively inserted tuple, then wait for the other + * transaction, and retry. + * 2. wait for the other transaction, with the speculatively inserted tuple still + * in place. + * + * If two backends insert at the same time, and both try to wait for each other, + * they will deadlock. So option 2 is not acceptable. Option 1 avoids the deadlock, + * but it is prone to a livelock instead. Both transactions will wake up immediately + * as the other transaction backs out. Then they both retry, and conflict with each + * other again, lather, rinse, repeat. + * + * To avoid the livelock, one of the backends must back out first, and then wait, + * while the other one waits without backing out. It doesn't matter which one + * backs out, so we employ an arbitrary rule that the transaction with the higher + * XID backs out. That avoids livelock, and almost completely avoids deadlock. + * There may still be deadlocks with exclusion constraints, but very + * infrequently, and only in the presence of super deletions. It's difficult + * to demonstrate this even with a synthetic stress-test, and (undetectable) + * livelocks are still impossible, so this is deemed acceptable. + * * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -63,12 +112,30 @@ #include "postgres.h" #include "access/relscan.h" +#include "access/xact.h" #include "catalog/index.h" #include "executor/executor.h" #include "nodes/nodeFuncs.h" #include "storage/lmgr.h" #include "utils/tqual.h" +/* waitMode argument to check_exclusion_or_unique_constraint() */ +typedef enum +{ + CEOUC_WAIT, + CEOUC_NOWAIT, + CEOUC_LIVELOCK_PREVENTING_WAIT, +} CEOUC_WAIT_MODE; + +static bool check_exclusion_or_unique_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex, + CEOUC_WAIT_MODE waitMode, + bool errorOK, + ItemPointer conflictTid); + static bool index_recheck_constraint(Relation index, Oid *constr_procs, Datum *existing_values, bool *existing_isnull, Datum *new_values); @@ -84,7 +151,7 @@ static bool index_recheck_constraint(Relation index, Oid *constr_procs, * ---------------------------------------------------------------- */ void -ExecOpenIndices(ResultRelInfo *resultRelInfo) +ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative) { Relation resultRelation = resultRelInfo->ri_RelationDesc; List *indexoidlist; @@ -137,6 +204,13 @@ ExecOpenIndices(ResultRelInfo *resultRelInfo) /* extract index key information from the index's pg_index info */ ii = BuildIndexInfo(indexDesc); + /* + * If the indexes are to be used for speculative insertion, add extra + * information required by unique index entries. + */ + if (speculative && ii->ii_Unique) + IndexInfoSpeculative(indexDesc, ii); + relationDescs[i] = indexDesc; indexInfoArray[i] = ii; i++; @@ -186,7 +260,8 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * Unique and exclusion constraints are enforced at the same * time. 
This returns a list of index OIDs for any unique or * exclusion constraints that are deferred and that had - * potential (unconfirmed) conflicts. + * potential (unconfirmed) conflicts. (if noDupErr == true, + * the same is done for non-deferred constraints) * * CAUTION: this must not be called for a HOT update. * We can't defend against that here for lack of info. @@ -196,7 +271,9 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) List * ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, - EState *estate) + EState *estate, + bool noDupErr, + List *arbiterIndexes) { List *result = NIL; ResultRelInfo *resultRelInfo; @@ -236,12 +313,17 @@ ExecInsertIndexTuples(TupleTableSlot *slot, IndexInfo *indexInfo; IndexUniqueCheck checkUnique; bool satisfiesConstraint; + bool arbiter; if (indexRelation == NULL) continue; indexInfo = indexInfoArray[i]; + /* Record if speculative insertion arbiter */ + arbiter = list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid); + /* If the index is marked as read-only, ignore it */ if (!indexInfo->ii_ReadyForInserts) continue; @@ -266,7 +348,17 @@ ExecInsertIndexTuples(TupleTableSlot *slot, /* Skip this index-update if the predicate isn't satisfied */ if (!ExecQual(predicate, econtext, false)) + { + if (arbiter) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION), + errmsg("inferred arbiter partial unique index's predicate does not cover tuple proposed for insertion"), + errdetail("ON CONFLICT inference clause implies that the tuple proposed for insertion must be covered by the predicate of partial index \"%s\".", + RelationGetRelationName(indexRelation)), + errtableconstraint(heapRelation, + RelationGetRelationName(indexRelation)))); continue; + } } /* @@ -288,9 +380,14 @@ ExecInsertIndexTuples(TupleTableSlot *slot, * For a deferrable unique index, we tell the index AM to just detect * possible non-uniqueness, and we add the index OID to the result * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. */ if (!indexRelation->rd_index->indisunique) checkUnique = UNIQUE_CHECK_NO; + else if (noDupErr && (arbiterIndexes == NIL || arbiter)) + checkUnique = UNIQUE_CHECK_PARTIAL; else if (indexRelation->rd_index->indimmediate) checkUnique = UNIQUE_CHECK_YES; else @@ -308,8 +405,11 @@ ExecInsertIndexTuples(TupleTableSlot *slot, * If the index has an associated exclusion constraint, check that. * This is simpler than the process for uniqueness checks since we * always insert first and then check. If the constraint is deferred, - * we check now anyway, but don't throw error on violation; instead - * we'll queue a recheck event. + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. 
* * An index for an exclusion constraint can't also be UNIQUE (not an * essential property, we just don't allow it in the grammar), so no @@ -317,13 +417,31 @@ ExecInsertIndexTuples(TupleTableSlot *slot, */ if (indexInfo->ii_ExclusionOps != NULL) { - bool errorOK = !indexRelation->rd_index->indimmediate; + bool violationOK; + bool waitMode; + + if (noDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } satisfiesConstraint = - check_exclusion_constraint(heapRelation, - indexRelation, indexInfo, - tupleid, values, isnull, - estate, false, errorOK); + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); } if ((checkUnique == UNIQUE_CHECK_PARTIAL || @@ -342,37 +460,201 @@ ExecInsertIndexTuples(TupleTableSlot *slot, return result; } +/* ---------------------------------------------------------------- + * ExecCheckIndexConstraints + * + * This routine checks if a tuple violates any unique or + * exclusion constraints. If no conflict, returns true. + * Otherwise returns false, and the TID of the conflicting + * tuple is returned in *conflictTid. + * + * If 'arbiterIndexes' is given, only those indexes are checked. + * NIL means all indexes. + * + * Note that this doesn't lock the values in any way, so it's + * possible that a conflicting tuple is inserted immediately + * after this returns. But this can be used for a pre-check + * before insertion. + * ---------------------------------------------------------------- + */ +bool +ExecCheckIndexConstraints(TupleTableSlot *slot, + EState *estate, ItemPointer conflictTid, + List *arbiterIndexes) +{ + ResultRelInfo *resultRelInfo; + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ItemPointerData invalidItemPtr; + bool checkedIndex = false; + + ItemPointerSetInvalid(conflictTid); + ItemPointerSetInvalid(&invalidItemPtr); + + /* + * Get information from the result relation info structure. + */ + resultRelInfo = estate->es_result_relation_info; + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * For each index, form index tuple and check if it satisfies the + * constraint. 
+ */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool satisfiesConstraint; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + if (!indexInfo->ii_Unique && !indexInfo->ii_ExclusionOps) + continue; + + if (!indexRelation->rd_index->indimmediate) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ON CONFLICT is not supported on relations with deferred unique constraints/exclusion constraints"), + errtableconstraint(heapRelation, + RelationGetRelationName(indexRelation)))); + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* When specific arbiter indexes requested, only examine them */ + if (arbiterIndexes != NIL && + !list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)) + continue; + + checkedIndex = true; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + List *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NIL) + { + predicate = (List *) + ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, + estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext, false)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, indexRelation, + indexInfo, &invalidItemPtr, + values, isnull, estate, false, + CEOUC_WAIT, true, + conflictTid); + if (!satisfiesConstraint) + return false; + } + + if (arbiterIndexes != NIL && !checkedIndex) + elog(ERROR, "unexpected failure to find arbiter unique index"); + + return true; +} + /* - * Check for violation of an exclusion constraint + * Check for violation of an exclusion or unique constraint * * heap: the table containing the new tuple - * index: the index supporting the exclusion constraint + * index: the index supporting the constraint * indexInfo: info about the index, including the exclusion properties - * tupleid: heap TID of the new tuple we have just inserted + * tupleid: heap TID of the new tuple we have just inserted (invalid if we + * haven't inserted a new tuple yet) * values, isnull: the *index* column values computed for the new tuple * estate: an EState we can do evaluation in * newIndex: if true, we are trying to build a new index (this affects * only the wording of error messages) - * errorOK: if true, don't throw error for violation + * waitMode: whether to wait for concurrent inserters/deleters + * violationOK: if true, don't throw error for violation + * conflictTid: if not-NULL, the TID of the conflicting tuple is returned here * * Returns true if OK, false if actual or potential violation * - * When errorOK is true, we report violation without waiting to see if any - * concurrent transaction has committed or not; so the violation is only - * potential, and the caller must recheck sometime later. This behavior - * is convenient for deferred exclusion checks; we need not bother queuing - * a deferred event if there is definitely no conflict at insertion time. 
+ * 'waitMode' determines what happens if a conflict is detected with a tuple + * that was inserted or deleted by a transaction that's still running. + * CEOUC_WAIT means that we wait for the transaction to commit, before + * throwing an error or returning. CEOUC_NOWAIT means that we report the + * violation immediately; so the violation is only potential, and the caller + * must recheck sometime later. This behavior is convenient for deferred + * exclusion checks; we need not bother queuing a deferred event if there is + * definitely no conflict at insertion time. * - * When errorOK is false, we'll throw error on violation, so a false result - * is impossible. + * CEOUC_LIVELOCK_PREVENTING_WAIT is like CEOUC_NOWAIT, but we will sometimes + * wait anyway, to prevent livelocking if two transactions try inserting at + * the same time. This is used with speculative insertions, for INSERT ON + * CONFLICT statements. + * + * If violationOK is true, we just report the potential or actual violation to + * the caller by returning 'false'. Otherwise we throw a descriptive error + * message here. When violationOK is false, a false result is impossible. + * + * Note: The indexam is normally responsible for checking unique constraints, + * so this normally only needs to be used for exclusion constraints. But this + * function is also called when doing a "pre-check" for conflicts on a unique + * constraint, when doing speculative insertion. Caller may use the returned + * conflict TID to take further steps. */ -bool -check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, - ItemPointer tupleid, Datum *values, bool *isnull, - EState *estate, bool newIndex, bool errorOK) +static bool +check_exclusion_or_unique_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex, + CEOUC_WAIT_MODE waitMode, + bool violationOK, + ItemPointer conflictTid) { - Oid *constr_procs = indexInfo->ii_ExclusionProcs; - uint16 *constr_strats = indexInfo->ii_ExclusionStrats; + Oid *constr_procs; + uint16 *constr_strats; Oid *index_collations = index->rd_indcollation; int index_natts = index->rd_index->indnatts; IndexScanDesc index_scan; @@ -386,6 +668,17 @@ check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, TupleTableSlot *existing_slot; TupleTableSlot *save_scantuple; + if (indexInfo->ii_ExclusionOps) + { + constr_procs = indexInfo->ii_ExclusionProcs; + constr_strats = indexInfo->ii_ExclusionStrats; + } + else + { + constr_procs = indexInfo->ii_UniqueProcs; + constr_strats = indexInfo->ii_UniqueStrats; + } + /* * If any of the input values are NULL, the constraint check is assumed to * pass (i.e., we assume the operators are strict). @@ -450,7 +743,8 @@ check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, /* * Ignore the entry for the tuple we're trying to check. */ - if (ItemPointerEquals(tupleid, &tup->t_self)) + if (ItemPointerIsValid(tupleid) && + ItemPointerEquals(tupleid, &tup->t_self)) { if (found_self) /* should not happen */ elog(ERROR, "found self tuple multiple times in index \"%s\"", @@ -480,39 +774,78 @@ check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, } /* - * At this point we have either a conflict or a potential conflict. If - * we're not supposed to raise error, just return the fact of the - * potential conflict without waiting to see if it's real. 
-         */
-        if (errorOK)
-        {
-            conflict = true;
-            break;
-        }
-
-        /*
+         * At this point we have either a conflict or a potential conflict.
+         *
          * If an in-progress transaction is affecting the visibility of this
-         * tuple, we need to wait for it to complete and then recheck. For
-         * simplicity we do rechecking by just restarting the whole scan ---
-         * this case probably doesn't happen often enough to be worth trying
-         * harder, and anyway we don't want to hold any index internal locks
-         * while waiting.
+         * tuple, we need to wait for it to complete and then recheck (unless
+         * the caller requested not to). For simplicity we do rechecking by
+         * just restarting the whole scan --- this case probably doesn't
+         * happen often enough to be worth trying harder, and anyway we don't
+         * want to hold any index internal locks while waiting.
+         *
+         * About livelock insurance:
+         *
+         * When doing a speculative insertion pre-check, we cannot have an
+         * "unprincipled deadlock" with another session, fundamentally
+         * because there is no possible mutual dependency, since we only hold
+         * a lock on our token, without attempting to lock anything else
+         * (maybe this is not the first iteration, but no matter; we'll have
+         * super deleted and released insertion token lock if so, and all
+         * locks needed are already held. Also, our XID lock is irrelevant.)
+         *
+         * In the second phase, where there is a re-check for conflicts, we
+         * can't deadlock either (we never lock another thing, since we don't
+         * wait in that phase). However, a theoretical livelock hazard
+         * exists: Two sessions could each see each other's conflicting
+         * tuple, and each could go and delete, retrying forever.
+         *
+         * To break the mutual dependency, we may wait on the other xact here
+         * over our caller's request to not do so (in the second phase). This
+         * does not imply the risk of unprincipled deadlocks either, because
+         * if we end up unexpectedly waiting, the other session will super
+         * delete its own tuple *before* releasing its token lock and freeing
+         * us, and without attempting to wait on us to release our token lock.
+         * We'll take another iteration here, after waiting on the other
+         * session's token, not find a conflict this time, and then proceed
+         * (assuming we're the oldest XID).
+         *
+         * N.B.: Unprincipled deadlocks are still theoretically possible with
+         * non-speculative insertion with exclusion constraints, but this
+         * seems inconsequential, since an error was inevitable for one of the
+         * sessions anyway. We only worry about speculative insertion's
+         * problems, since they're likely with idiomatic usage.
          */
         xwait = TransactionIdIsValid(DirtySnapshot.xmin) ?
             DirtySnapshot.xmin : DirtySnapshot.xmax;

-        if (TransactionIdIsValid(xwait))
+        if (TransactionIdIsValid(xwait) &&
+            (waitMode == CEOUC_WAIT ||
+             (waitMode == CEOUC_LIVELOCK_PREVENTING_WAIT &&
+              TransactionIdPrecedes(GetCurrentTransactionId(), xwait))))
         {
             ctid_wait = tup->t_data->t_ctid;
             index_endscan(index_scan);
-            XactLockTableWait(xwait, heap, &ctid_wait,
-                              XLTW_RecheckExclusionConstr);
+            if (DirtySnapshot.speculativeToken)
+                SpeculativeInsertionWait(DirtySnapshot.xmin,
+                                         DirtySnapshot.speculativeToken);
+            else
+                XactLockTableWait(xwait, heap, &ctid_wait,
+                                  XLTW_RecheckExclusionConstr);
             goto retry;
         }

         /*
-         * We have a definite conflict. Report it.
+         * We have a definite conflict (or a potential one, but the caller
+         * didn't want to wait). Return it to caller, or report it.
*/ + if (violationOK) + { + conflict = true; + if (conflictTid) + *conflictTid = tup->t_self; + break; + } + error_new = BuildIndexValueDescription(index, values, isnull); error_existing = BuildIndexValueDescription(index, existing_values, existing_isnull); @@ -544,10 +877,10 @@ check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, /* * Ordinarily, at this point the search should have found the originally - * inserted tuple, unless we exited the loop early because of conflict. - * However, it is possible to define exclusion constraints for which that - * wouldn't be true --- for instance, if the operator is <>. So we no - * longer complain if found_self is still false. + * inserted tuple (if any), unless we exited the loop early because of + * conflict. However, it is possible to define exclusion constraints for + * which that wouldn't be true --- for instance, if the operator is <>. + * So we no longer complain if found_self is still false. */ econtext->ecxt_scantuple = save_scantuple; @@ -557,6 +890,25 @@ check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, return !conflict; } +/* + * Check for violation of an exclusion constraint + * + * This is a dumbed down version of check_exclusion_or_unique_constraint + * for external callers. They don't need all the special modes. + */ +void +check_exclusion_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex) +{ + (void) check_exclusion_or_unique_constraint(heap, index, indexInfo, tupleid, + values, isnull, + estate, newIndex, + CEOUC_WAIT, false, NULL); +} + /* * Check existing tuple's index values to see if it really matches the * exclusion condition against the new_values. Returns true if conflict. diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index df4da3faa974c..6f592585bb8b7 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2132,8 +2132,9 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * recycled and reused for an unrelated tuple. This implies that * the latest version of the row was deleted, so we need do * nothing. (Should be safe to examine xmin without getting - * buffer's content lock, since xmin never changes in an existing - * tuple.) + * buffer's content lock. We assume reading a TransactionId to be + * atomic, and Xmin never changes in an existing tuple, except to + * invalid or frozen, and neither of those can match priorXmax.) 
*/ if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), priorXmax)) @@ -2233,6 +2234,9 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + + /* Should not encounter speculative tuple on recheck */ + Assert(!HeapTupleHeaderIsSpeculative(tuple.t_data)); if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self)) { /* it was updated, so look at the updated version */ diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 06ec82e246185..218010fc85c03 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -46,6 +46,7 @@ #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "storage/bufmgr.h" +#include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -151,6 +152,35 @@ ExecProcessReturning(ProjectionInfo *projectReturning, return ExecProject(projectReturning, NULL); } +/* + * ExecCheckHeapTupleVisible -- verify heap tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. + */ +static void +ExecCheckHeapTupleVisible(EState *estate, + ResultRelInfo *relinfo, + ItemPointer tid) +{ + Relation rel = relinfo->ri_RelationDesc; + Buffer buffer; + HeapTupleData tuple; + + if (!IsolationUsesXactSnapshot()) + return; + + tuple.t_self = *tid; + if (!heap_fetch(rel, estate->es_snapshot, &tuple, &buffer, false, NULL)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent insert or update directing alternative ON CONFLICT path"))); + + ReleaseBuffer(buffer); +} + /* ---------------------------------------------------------------- * ExecInsert * @@ -163,6 +193,8 @@ ExecProcessReturning(ProjectionInfo *projectReturning, static TupleTableSlot * ExecInsert(TupleTableSlot *slot, TupleTableSlot *planSlot, + List *arbiterIndexes, + SpecCmd spec, EState *estate, bool canSetTag) { @@ -199,7 +231,15 @@ ExecInsert(TupleTableSlot *slot, if (resultRelationDesc->rd_rel->relhasoids) HeapTupleSetOid(tuple, InvalidOid); - /* BEFORE ROW INSERT Triggers */ + /* + * BEFORE ROW INSERT Triggers. + * + * Note: We fire BEFORE ROW TRIGGERS for every attempted insertion in an + * INSERT ... ON CONFLICT statement. We cannot check for constraint + * violations before firing these triggers, because they can change the + * values to insert. Also, they can run arbitrary user-defined code with + * side-effects that we can't cancel by just not inserting the tuple. + */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_insert_before_row) { @@ -268,21 +308,115 @@ ExecInsert(TupleTableSlot *slot, if (resultRelationDesc->rd_att->constr) ExecConstraints(resultRelInfo, slot, estate); - /* - * insert the tuple - * - * Note: heap_insert returns the tid (location) of the new tuple in - * the t_self field. - */ - newId = heap_insert(resultRelationDesc, tuple, - estate->es_output_cid, 0, NULL); + if (spec != SPEC_NONE && resultRelInfo->ri_NumIndices > 0) + { + /* Perform a speculative insertion. 
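+         *
+         * In short (summarizing the code below and the executor README):
+         * pre-check the arbiter indexes for an existing conflict, then
+         * insert the heap tuple carrying a speculative insertion token,
+         * insert index entries in a detect-only fashion, and finally either
+         * confirm the tuple or super delete it and retry from the pre-check.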
*/ + uint32 specToken; + ItemPointerData conflictTid; + bool conflict; - /* - * insert index entries for tuple - */ - if (resultRelInfo->ri_NumIndices > 0) + /* + * Do a non-conclusive check for conflicts first. + * + * We're not holding any locks yet, so this doesn't guarantee that + * the later insert won't conflict. But it avoids leaving behind + * a lot of cancelled speculative insertions, if you run a lot of + * INSERT ON CONFLICT statements that do conflict. + * + * We loop back here if we find a conflict below, either during + * the pre-check, or when we re-check after inserting the tuple + * speculatively. See the executor README for a full discussion + * of speculative insertion. + */ +vlock: + if (!ExecCheckIndexConstraints(slot, estate, &conflictTid, + arbiterIndexes)) + { + /* + * Committed conflict tuple found. + * + * In case of ON CONFLICT IGNORE, do nothing. However, verify + * that the tuple is visible to the executor's MVCC snapshot at + * higher isolation levels. + */ + if (spec == SPEC_IGNORE) + { + ExecCheckHeapTupleVisible(estate, resultRelInfo, + &conflictTid); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. + */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); + + /* insert the tuple, with the speculative token */ + newId = heap_insert(resultRelationDesc, tuple, + estate->es_output_cid, + HEAP_INSERT_SPECULATIVE, + NULL); + + /* insert index entries for tuple */ recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), - estate); + estate, true, + arbiterIndexes); + + /* + * Speculative insertion does not support deferred constraints, so + * the specific index involved in the violation doesn't matter. + */ + conflict = recheckIndexes != NIL; + list_free(recheckIndexes); + + /* + * Update the tuple header to indicate that we really inserted + * it. Or if there was a conflict, kill it. + */ + heap_finish_speculative(resultRelationDesc, tuple, conflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). + */ + if (conflict) + goto vlock; + + /* Since there was no insertion conflict, we're done */ + } + else + { + /* + * insert the tuple normally. + * + * Note: heap_insert returns the tid (location) of the new tuple + * in the t_self field. 
+ */ + newId = heap_insert(resultRelationDesc, tuple, + estate->es_output_cid, + 0, NULL); + + /* insert index entries for tuple */ + if (resultRelInfo->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + estate, false, + arbiterIndexes); + } } if (canSetTag) @@ -800,7 +934,7 @@ lreplace:; */ if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), - estate); + estate, false, NIL); } if (canSetTag) @@ -1062,7 +1196,8 @@ ExecModifyTable(ModifyTableState *node) switch (operation) { case CMD_INSERT: - slot = ExecInsert(slot, planSlot, estate, node->canSetTag); + slot = ExecInsert(slot, planSlot, node->arbiterIndexes, + node->spec, estate, node->canSetTag); break; case CMD_UPDATE: slot = ExecUpdate(tupleid, oldtuple, slot, planSlot, @@ -1137,6 +1272,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->resultRelInfo = estate->es_result_relations + node->resultRelIndex; mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans); mtstate->mt_nplans = nplans; + mtstate->spec = node->spec; + mtstate->arbiterIndexes = node->arbiterIndexes; /* set up epqstate with dummy subplan data for the moment */ EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam); @@ -1175,7 +1312,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) if (resultRelInfo->ri_RelationDesc->rd_rel->relhasindex && operation != CMD_DELETE && resultRelInfo->ri_IndexRelationDescs == NULL) - ExecOpenIndices(resultRelInfo); + ExecOpenIndices(resultRelInfo, mtstate->spec != SPEC_NONE); /* Now init the plan for this result rel */ estate->es_result_relation_info = resultRelInfo; diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 59c755d7a5952..6ca0245aa3816 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -180,6 +180,8 @@ _copyModifyTable(const ModifyTable *from) COPY_NODE_FIELD(resultRelations); COPY_SCALAR_FIELD(resultRelIndex); COPY_NODE_FIELD(plans); + COPY_SCALAR_FIELD(spec); + COPY_NODE_FIELD(arbiterIndexes); COPY_NODE_FIELD(withCheckOptionLists); COPY_NODE_FIELD(returningLists); COPY_NODE_FIELD(fdwPrivLists); @@ -1781,6 +1783,22 @@ _copyCurrentOfExpr(const CurrentOfExpr *from) return newnode; } +/* + * _copyInferenceElem + */ +static InferenceElem * +_copyInferenceElem(const InferenceElem *from) +{ + InferenceElem *newnode = makeNode(InferenceElem); + + COPY_NODE_FIELD(expr); + COPY_SCALAR_FIELD(infercollid); + COPY_SCALAR_FIELD(inferopfamily); + COPY_SCALAR_FIELD(inferopcinputtype); + + return newnode; +} + /* * _copyTargetEntry */ @@ -2129,6 +2147,30 @@ _copyWithClause(const WithClause *from) return newnode; } +static InferClause * +_copyInferClause(const InferClause *from) +{ + InferClause *newnode = makeNode(InferClause); + + COPY_NODE_FIELD(indexElems); + COPY_NODE_FIELD(whereClause); + COPY_LOCATION_FIELD(location); + + return newnode; +} + +static ConflictClause * +_copyConflictClause(const ConflictClause *from) +{ + ConflictClause *newnode = makeNode(ConflictClause); + + COPY_SCALAR_FIELD(specclause); + COPY_NODE_FIELD(infer); + COPY_LOCATION_FIELD(location); + + return newnode; +} + static CommonTableExpr * _copyCommonTableExpr(const CommonTableExpr *from) { @@ -2546,6 +2588,9 @@ _copyQuery(const Query *from) COPY_NODE_FIELD(jointree); COPY_NODE_FIELD(targetList); COPY_NODE_FIELD(withCheckOptions); + COPY_SCALAR_FIELD(specClause); + COPY_NODE_FIELD(arbiterElems); + 
COPY_NODE_FIELD(arbiterWhere); COPY_NODE_FIELD(returningList); COPY_NODE_FIELD(groupClause); COPY_NODE_FIELD(havingQual); @@ -2569,6 +2614,7 @@ _copyInsertStmt(const InsertStmt *from) COPY_NODE_FIELD(relation); COPY_NODE_FIELD(cols); COPY_NODE_FIELD(selectStmt); + COPY_NODE_FIELD(confClause); COPY_NODE_FIELD(returningList); COPY_NODE_FIELD(withClause); @@ -4263,6 +4309,9 @@ copyObject(const void *from) case T_CurrentOfExpr: retval = _copyCurrentOfExpr(from); break; + case T_InferenceElem: + retval = _copyInferenceElem(from); + break; case T_TargetEntry: retval = _copyTargetEntry(from); break; @@ -4730,6 +4779,12 @@ copyObject(const void *from) case T_WithClause: retval = _copyWithClause(from); break; + case T_InferClause: + retval = _copyInferClause(from); + break; + case T_ConflictClause: + retval = _copyConflictClause(from); + break; case T_CommonTableExpr: retval = _copyCommonTableExpr(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 3bc81762af5fc..8d31ecc5cff76 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -682,6 +682,17 @@ _equalCurrentOfExpr(const CurrentOfExpr *a, const CurrentOfExpr *b) return true; } +static bool +_equalInferenceElem(const InferenceElem *a, const InferenceElem *b) +{ + COMPARE_NODE_FIELD(expr); + COMPARE_SCALAR_FIELD(infercollid); + COMPARE_SCALAR_FIELD(inferopfamily); + COMPARE_SCALAR_FIELD(inferopcinputtype); + + return true; +} + static bool _equalTargetEntry(const TargetEntry *a, const TargetEntry *b) { @@ -868,6 +879,9 @@ _equalQuery(const Query *a, const Query *b) COMPARE_NODE_FIELD(jointree); COMPARE_NODE_FIELD(targetList); COMPARE_NODE_FIELD(withCheckOptions); + COMPARE_SCALAR_FIELD(specClause); + COMPARE_NODE_FIELD(arbiterElems); + COMPARE_NODE_FIELD(arbiterWhere); COMPARE_NODE_FIELD(returningList); COMPARE_NODE_FIELD(groupClause); COMPARE_NODE_FIELD(havingQual); @@ -889,6 +903,7 @@ _equalInsertStmt(const InsertStmt *a, const InsertStmt *b) COMPARE_NODE_FIELD(relation); COMPARE_NODE_FIELD(cols); COMPARE_NODE_FIELD(selectStmt); + COMPARE_NODE_FIELD(confClause); COMPARE_NODE_FIELD(returningList); COMPARE_NODE_FIELD(withClause); @@ -2420,6 +2435,26 @@ _equalWithClause(const WithClause *a, const WithClause *b) return true; } +static bool +_equalInferClause(const InferClause *a, const InferClause *b) +{ + COMPARE_NODE_FIELD(indexElems); + COMPARE_NODE_FIELD(whereClause); + COMPARE_LOCATION_FIELD(location); + + return true; +} + +static bool +_equalConflictClause(const ConflictClause *a, const ConflictClause *b) +{ + COMPARE_SCALAR_FIELD(specclause); + COMPARE_NODE_FIELD(infer); + COMPARE_LOCATION_FIELD(location); + + return true; +} + static bool _equalCommonTableExpr(const CommonTableExpr *a, const CommonTableExpr *b) { @@ -2699,6 +2734,9 @@ equal(const void *a, const void *b) case T_CurrentOfExpr: retval = _equalCurrentOfExpr(a, b); break; + case T_InferenceElem: + retval = _equalInferenceElem(a, b); + break; case T_TargetEntry: retval = _equalTargetEntry(a, b); break; @@ -3153,6 +3191,12 @@ equal(const void *a, const void *b) case T_WithClause: retval = _equalWithClause(a, b); break; + case T_InferClause: + retval = _equalInferClause(a, b); + break; + case T_ConflictClause: + retval = _equalConflictClause(a, b); + break; case T_CommonTableExpr: retval = _equalCommonTableExpr(a, b); break; diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index d6f1f5bb6d7d3..d815022b83b92 100644 --- a/src/backend/nodes/nodeFuncs.c +++ 
b/src/backend/nodes/nodeFuncs.c @@ -235,6 +235,13 @@ exprType(const Node *expr) case T_CurrentOfExpr: type = BOOLOID; break; + case T_InferenceElem: + { + const InferenceElem *n = (const InferenceElem *) expr; + + type = exprType((Node *) n->expr); + } + break; case T_PlaceHolderVar: type = exprType((Node *) ((const PlaceHolderVar *) expr)->phexpr); break; @@ -894,6 +901,9 @@ exprCollation(const Node *expr) case T_CurrentOfExpr: coll = InvalidOid; /* result is always boolean */ break; + case T_InferenceElem: + coll = exprCollation((Node *) ((const InferenceElem *) expr)->expr); + break; case T_PlaceHolderVar: coll = exprCollation((Node *) ((const PlaceHolderVar *) expr)->phexpr); break; @@ -1484,6 +1494,12 @@ exprLocation(const Node *expr) case T_WithClause: loc = ((const WithClause *) expr)->location; break; + case T_InferClause: + loc = ((const InferClause *) expr)->location; + break; + case T_ConflictClause: + loc = ((const ConflictClause *) expr)->location; + break; case T_CommonTableExpr: loc = ((const CommonTableExpr *) expr)->location; break; @@ -1491,6 +1507,10 @@ exprLocation(const Node *expr) /* just use argument's location */ loc = exprLocation((Node *) ((const PlaceHolderVar *) expr)->phexpr); break; + case T_InferenceElem: + /* just use nested expr's location */ + loc = exprLocation((Node *) ((const InferenceElem *) expr)->expr); + break; default: /* for any other node type it's just unknown... */ loc = -1; @@ -1920,6 +1940,8 @@ expression_tree_walker(Node *node, break; case T_PlaceHolderVar: return walker(((PlaceHolderVar *) node)->phexpr, context); + case T_InferenceElem: + return walker(((InferenceElem *) node)->expr, context); case T_AppendRelInfo: { AppendRelInfo *appinfo = (AppendRelInfo *) node; @@ -1968,6 +1990,10 @@ query_tree_walker(Query *query, return true; if (walker((Node *) query->withCheckOptions, context)) return true; + if (walker((Node *) query->arbiterElems, context)) + return true; + if (walker(query->arbiterWhere, context)) + return true; if (walker((Node *) query->returningList, context)) return true; if (walker((Node *) query->jointree, context)) @@ -2630,6 +2656,16 @@ expression_tree_mutator(Node *node, return (Node *) newnode; } break; + case T_InferenceElem: + { + InferenceElem *inferenceelemdexpr = (InferenceElem *) node; + InferenceElem *newnode; + + FLATCOPY(newnode, inferenceelemdexpr, InferenceElem); + MUTATE(newnode->expr, newnode->expr, Node *); + return (Node *) newnode; + } + break; case T_AppendRelInfo: { AppendRelInfo *appinfo = (AppendRelInfo *) node; @@ -2709,6 +2745,8 @@ query_tree_mutator(Query *query, MUTATE(query->targetList, query->targetList, List *); MUTATE(query->withCheckOptions, query->withCheckOptions, List *); + MUTATE(query->arbiterElems, query->arbiterElems, List *); + MUTATE(query->arbiterWhere, query->arbiterWhere, Node *); MUTATE(query->returningList, query->returningList, List *); MUTATE(query->jointree, query->jointree, FromExpr *); MUTATE(query->setOperations, query->setOperations, Node *); @@ -2978,6 +3016,8 @@ raw_expression_tree_walker(Node *node, return true; if (walker(stmt->selectStmt, context)) return true; + if (walker(stmt->confClause, context)) + return true; if (walker(stmt->returningList, context)) return true; if (walker(stmt->withClause, context)) @@ -3217,6 +3257,24 @@ raw_expression_tree_walker(Node *node, break; case T_WithClause: return walker(((WithClause *) node)->ctes, context); + case T_InferClause: + { + InferClause *stmt = (InferClause *) node; + + if (walker(stmt->indexElems, context)) + 
return true; + if (walker(stmt->whereClause, context)) + return true; + } + break; + case T_ConflictClause: + { + ConflictClause *stmt = (ConflictClause *) node; + + if (walker(stmt->infer, context)) + return true; + } + break; case T_CommonTableExpr: return walker(((CommonTableExpr *) node)->ctequery, context); default: diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index e0dca56ea6cfa..c911ea0e54524 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -332,6 +332,8 @@ _outModifyTable(StringInfo str, const ModifyTable *node) WRITE_NODE_FIELD(resultRelations); WRITE_INT_FIELD(resultRelIndex); WRITE_NODE_FIELD(plans); + WRITE_ENUM_FIELD(spec, SpecType); + WRITE_NODE_FIELD(arbiterIndexes); WRITE_NODE_FIELD(withCheckOptionLists); WRITE_NODE_FIELD(returningLists); WRITE_NODE_FIELD(fdwPrivLists); @@ -1431,6 +1433,17 @@ _outCurrentOfExpr(StringInfo str, const CurrentOfExpr *node) WRITE_INT_FIELD(cursor_param); } +static void +_outInferenceElem(StringInfo str, const InferenceElem *node) +{ + WRITE_NODE_TYPE("INFERENCEELEM"); + + WRITE_NODE_FIELD(expr); + WRITE_OID_FIELD(infercollid); + WRITE_OID_FIELD(inferopfamily); + WRITE_OID_FIELD(inferopcinputtype); +} + static void _outTargetEntry(StringInfo str, const TargetEntry *node) { @@ -2314,6 +2327,9 @@ _outQuery(StringInfo str, const Query *node) WRITE_NODE_FIELD(jointree); WRITE_NODE_FIELD(targetList); WRITE_NODE_FIELD(withCheckOptions); + WRITE_ENUM_FIELD(specClause, SpecType); + WRITE_NODE_FIELD(arbiterElems); + WRITE_NODE_FIELD(arbiterWhere); WRITE_NODE_FIELD(returningList); WRITE_NODE_FIELD(groupClause); WRITE_NODE_FIELD(havingQual); @@ -3106,6 +3122,9 @@ _outNode(StringInfo str, const void *obj) case T_CurrentOfExpr: _outCurrentOfExpr(str, obj); break; + case T_InferenceElem: + _outInferenceElem(str, obj); + break; case T_TargetEntry: _outTargetEntry(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index b0cd95da063bf..5a70176be1b52 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -214,6 +214,9 @@ _readQuery(void) READ_NODE_FIELD(jointree); READ_NODE_FIELD(targetList); READ_NODE_FIELD(withCheckOptions); + READ_ENUM_FIELD(specClause, SpecCmd); + READ_NODE_FIELD(arbiterElems); + READ_NODE_FIELD(arbiterWhere); READ_NODE_FIELD(returningList); READ_NODE_FIELD(groupClause); READ_NODE_FIELD(havingQual); @@ -1130,6 +1133,22 @@ _readCurrentOfExpr(void) READ_DONE(); } +/* + * _readInferenceElem + */ +static InferenceElem * +_readInferenceElem(void) +{ + READ_LOCALS(InferenceElem); + + READ_NODE_FIELD(expr); + READ_OID_FIELD(infercollid); + READ_OID_FIELD(inferopfamily); + READ_OID_FIELD(inferopcinputtype); + + READ_DONE(); +} + /* * _readTargetEntry */ @@ -1394,6 +1413,8 @@ parseNodeString(void) return_value = _readSetToDefault(); else if (MATCH("CURRENTOFEXPR", 13)) return_value = _readCurrentOfExpr(); + else if (MATCH("INFERENCEELEM", 13)) + return_value = _readInferenceElem(); else if (MATCH("TARGETENTRY", 11)) return_value = _readTargetEntry(); else if (MATCH("RANGETBLREF", 11)) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index cb69c03df0008..4677ae9c9b68c 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -4815,7 +4815,7 @@ make_modifytable(PlannerInfo *root, Index nominalRelation, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, - List *rowMarks, int epqParam) + List *rowMarks, 
SpecCmd spec, int epqParam) { ModifyTable *node = makeNode(ModifyTable); Plan *plan = &node->plan; @@ -4865,6 +4865,8 @@ make_modifytable(PlannerInfo *root, node->resultRelations = resultRelations; node->resultRelIndex = -1; /* will be set correctly in setrefs.c */ node->plans = subplans; + node->spec = spec; + node->arbiterIndexes = NIL; node->withCheckOptionLists = withCheckOptionLists; node->returningLists = returningLists; node->rowMarks = rowMarks; @@ -4917,6 +4919,14 @@ make_modifytable(PlannerInfo *root, } node->fdwPrivLists = fdw_private_list; + /* + * If a set of unique index inference elements was provided (an INSERT...ON + * CONFLICT "inference specification"), then infer appropriate unique + * indexes (or throw an error if none are available). + */ + if (root->parse->arbiterElems) + node->arbiterIndexes = infer_arbiter_indexes(root); + return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index ea4d4c55cbd2c..db578835dc21d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -612,6 +612,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, withCheckOptionLists, returningLists, rowMarks, + parse->specClause, SS_assign_special_param(root)); } } @@ -1095,6 +1096,7 @@ inheritance_planner(PlannerInfo *root) withCheckOptionLists, returningLists, rowMarks, + parse->specClause, SS_assign_special_param(root)); } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 8abed2ae0dada..ac8c74287617f 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -50,6 +50,8 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; get_relation_info_hook_type get_relation_info_hook = NULL; +static bool infer_collation_opclass_match(InferenceElem *elem, Relation idxRel, + Bitmapset *inferAttrs, List *idxExprs); static int32 get_rel_data_width(Relation rel, int32 *attr_widths); static List *get_relation_constraints(PlannerInfo *root, Oid relationObjectId, RelOptInfo *rel, @@ -394,6 +396,304 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, (*get_relation_info_hook) (root, relationObjectId, inhparent, rel); } +/* + * infer_arbiter_indexes - + * Retrieves unique indexes to arbitrate speculative insertion. + * + * Uses user-supplied inference clause expressions and predicate to match a + * unique index from those defined and ready on the heap relation (target). An + * exact match is required on columns/expressions (although they can appear in + * any order). However, the predicate given by the user need only restrict + * insertion to a subset of some part of the table covered by some particular + * unique index (in particular, a partial unique index) in order to be + * inferred. + * + * The implementation does not consider which B-Tree operator class any + * particular available unique index attribute use, unless one appeared in the + * user-supplied inference specification (the same is true of collations). In + * particular, there is no system dependency on the default operator class for + * the purposes of inference. If no opclass (or collation) is specified, then + * all matching indexes (that may or may not match the default in terms of each + * attribute opclass/collation) are used for inference. + * + * This logic somewhat mirrors get_relation_info(). This process is not + * deferred to a get_relation_info() call while planning because there may not + * be any such call. 
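+ *
+ * As an illustrative sketch (the table, column, and index names here are
+ * hypothetical, not taken from this patch's tests): given
+ *
+ *     CREATE TABLE tab (key text, active bool, val int);
+ *     CREATE UNIQUE INDEX tab_lower_key ON tab (lower(key)) WHERE active;
+ *
+ * an inference specification naming lower(key) together with the predicate
+ * WHERE active would infer tab_lower_key as the arbiter index, whereas a
+ * specification naming only key, or one omitting the predicate altogether,
+ * would fail with the "could not infer which unique index to use" error
+ * raised below (assuming no other unique index matches).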
+ */ +List * +infer_arbiter_indexes(PlannerInfo *root) +{ + Query *parse = root->parse; + + /* Iteration state */ + Relation relation; + Oid relationObjectId; + List *indexList; + ListCell *l; + + /* Normalized inference attributes and inference expressions: */ + Bitmapset *inferAttrs = NULL; + List *inferElems = NIL; + + /* Result */ + List *candidates = NIL; + + Assert(parse->specClause == SPEC_IGNORE); + + /* + * We need not lock the relation since it was already locked, either by + * the rewriter or when expand_inherited_rtentry() added it to the query's + * rangetable. + */ + relationObjectId = rt_fetch(parse->resultRelation, parse->rtable)->relid; + + relation = heap_open(relationObjectId, NoLock); + + /* + * Build normalized/BMS representation of plain indexed attributes, as well + * as direct list of inference elements. This is required for matching the + * cataloged definition of indexes. + */ + foreach(l, parse->arbiterElems) + { + InferenceElem *elem; + Var *var; + int attno; + + elem = (InferenceElem *) lfirst(l); + + /* + * Parse analysis of inference elements performs full parse analysis + * of Vars, even for non-expression indexes (in contrast with utility + * command related use of IndexElem). However, indexes are cataloged + * with simple attribute numbers for non-expression indexes. Those are + * handled later. + */ + if (!IsA(elem->expr, Var)) + { + inferElems = lappend(inferElems, elem->expr); + continue; + } + + var = (Var *) elem->expr; + attno = var->varattno; + + if (attno < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("system columns may not appear in unique index inference specification"))); + else if (attno == 0) + elog(ERROR, "whole row unique index inference specifications are not valid"); + + inferAttrs = bms_add_member(inferAttrs, attno); + } + + indexList = RelationGetIndexList(relation); + + /* + * Using that representation, iterate through the list of indexes on the + * target relation to try and find a match + */ + foreach(l, indexList) + { + Oid indexoid = lfirst_oid(l); + Relation idxRel; + Form_pg_index idxForm; + Bitmapset *indexedAttrs = NULL; + List *idxExprs; + List *predExprs; + List *whereExplicit; + AttrNumber natt; + ListCell *el; + + /* + * Extract info from the relation descriptor for the index. We know + * that this is a target, so get lock type it is known will ultimately + * be required by the executor. + * + * Let executor complain about !indimmediate case directly. + */ + idxRel = index_open(indexoid, RowExclusiveLock); + idxForm = idxRel->rd_index; + + if (!idxForm->indisunique || + !IndexIsValid(idxForm)) + goto next; + + /* + * If the index is valid, but cannot yet be used, ignore it. See + * src/backend/access/heap/README.HOT for discussion. 
+ */ + if (idxForm->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(idxRel->rd_indextuple->t_data), + TransactionXmin)) + goto next; + + /* Build BMS representation of cataloged index attributes */ + for (natt = 0; natt < idxForm->indnatts; natt++) + { + int attno = idxRel->rd_index->indkey.values[natt]; + + if (attno < 0) + elog(ERROR, "system column in index"); + + if (attno != 0) + indexedAttrs = bms_add_member(indexedAttrs, attno); + } + + /* Non-expression attributes (if any) must match */ + if (!bms_equal(indexedAttrs, inferAttrs)) + goto next; + + /* Expression attributes (if any) must match */ + idxExprs = RelationGetIndexExpressions(idxRel); + foreach(el, parse->arbiterElems) + { + InferenceElem *elem = (InferenceElem *) lfirst(el); + + /* + * Ensure that collation/opclass aspects of inference expression + * element match. Even though this loop is primarily concerned + * with matching expressions, it is a convenient point to check + * this for both expressions and ordinary (non-expression) + * attributes appearing as inference elements. + */ + if (!infer_collation_opclass_match(elem, idxRel, inferAttrs, + idxExprs)) + goto next; + + /* + * Plain Vars don't factor into count of expression elements, and + * the question of whether or not they satisfy the index definition + * has already been considered (they must) + */ + if (IsA(elem->expr, Var)) + continue; + + /* + * Might as well avoid redundant check in the rare cases where + * infer_collation_opclass_match() is required to do real work. + * Otherwise, check that element expression appears in cataloged + * index definition. + */ + if (elem->infercollid != InvalidOid || + elem->inferopfamily != InvalidOid || + list_member(idxExprs, elem->expr)) + continue; + + goto next; + } + + /* + * Now that all inference elements were matched, ensure that the + * expression elements from inference clause are not missing any + * cataloged expressions. This does the right thing when unique + * indexes redundantly repeat the same attribute, or if attributes + * redundantly appear multiple times within an inference clause. + */ + if (list_difference(idxExprs, inferElems) != NIL) + goto next; + + /* + * Any user-supplied ON CONFLICT unique index inference WHERE clause + * need only be implied by the cataloged index definitions predicate + */ + predExprs = RelationGetIndexPredicate(idxRel); + whereExplicit = make_ands_implicit((Expr *) parse->arbiterWhere); + + if (!predicate_implied_by(predExprs, whereExplicit)) + goto next; + + candidates = lappend_oid(candidates, idxForm->indexrelid); +next: + index_close(idxRel, NoLock); + } + + list_free(indexList); + heap_close(relation, NoLock); + + if (candidates == NIL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("could not infer which unique index to use from expressions/columns and predicate provided for ON CONFLICT"))); + + return candidates; +} + +/* + * infer_collation_opclass_match - ensure infer element opclass/collation match + * + * Given unique index inference element from inference specification, if + * collation was specified, or if opclass (represented here as opfamily + + * opcintype) was specified, verify that there is at least one matching indexed + * attribute (occasionally, there may be more). Skip this in the common case + * where inference specification does not include collation or opclass (instead + * matching everything, regardless of cataloged collation/opclass of indexed + * attribute). 
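+ *
+ * As an illustrative example (the opclass and collation named here are only
+ * examples): an inference element that explicitly names the text_pattern_ops
+ * operator class, or an explicit COLLATE "C" collation, is only considered
+ * satisfied if at least one indexed attribute of the candidate unique index
+ * was cataloged with a matching operator family and input type (or matching
+ * collation); an element that specifies neither matches without this extra
+ * test.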
+ * + * At least historically, Postgres has not offered collations or opclasses with + * alternative-to-default notions of equality, so these additional criteria + * should only actually be matched on infrequently. + * + * Don't give up immediately when an inference element matches some attribute + * cataloged as indexed but not matching additional opclass/collation criteria. + * This is done so that the implementation is as forgiving as possible of + * redundancy within cataloged index attributes (or, less usefully, within + * inference specification elements). If collations actually differ between + * apparently redundantly indexed attributes (redundant within or across + * indexes), then there really is no redundancy as such. + * + * Note that if an inference element specifies an opclass and a collation at + * once, both must match in at least one particular attribute within index + * catalog definition in order for that inference element to be considered + * inferred/satisfied. + */ +static bool +infer_collation_opclass_match(InferenceElem *elem, Relation idxRel, + Bitmapset *inferAttrs, List *idxExprs) +{ + AttrNumber natt; + + /* + * If inference specification element lacks collation/opclass, then no need + * to check for exact match + */ + if (elem->infercollid == InvalidOid && elem->inferopfamily == InvalidOid) + return true; + + for (natt = 1; natt <= idxRel->rd_att->natts; natt++) + { + Oid opfamily = idxRel->rd_opfamily[natt - 1]; + Oid opcinputtype = idxRel->rd_opcintype[natt - 1]; + Oid collation = idxRel->rd_indcollation[natt - 1]; + + if (elem->inferopfamily != InvalidOid && + (elem->inferopfamily != opfamily || + elem->inferopcinputtype != opcinputtype)) + { + /* Attribute needed to match opclass, but didn't */ + continue; + } + + if (elem->infercollid != InvalidOid && + elem->infercollid != collation) + { + /* Attribute needed to match collation, but didn't */ + continue; + } + + if ((IsA(elem->expr, Var) && + bms_is_member(((Var *) elem->expr)->varattno, inferAttrs)) || + list_member(idxExprs, elem->expr)) + { + /* Found one match - good enough */ + return true; + } + } + + return false; +} + /* * estimate_rel_size - estimate # pages and # tuples in a table or index * diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 4a5a5205391b9..642e4e0e2d1b9 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -387,6 +387,7 @@ transformDeleteStmt(ParseState *pstate, DeleteStmt *stmt) /* done building the range table and jointree */ qry->rtable = pstate->p_rtable; qry->jointree = makeFromExpr(pstate->p_joinlist, qual); + qry->specClause = SPEC_NONE; qry->hasSubLinks = pstate->p_hasSubLinks; qry->hasWindowFuncs = pstate->p_hasWindowFuncs; @@ -408,6 +409,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) { Query *qry = makeNode(Query); SelectStmt *selectStmt = (SelectStmt *) stmt->selectStmt; + SpecCmd spec = stmt->confClause ? stmt->confClause->specclause : SPEC_NONE; List *exprList = NIL; bool isGeneralSelect; List *sub_rtable; @@ -741,12 +743,13 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } /* - * If we have a RETURNING clause, we need to add the target relation to - * the query namespace before processing it, so that Var references in - * RETURNING will work. Also, remove any namespace entries added in a - * sub-SELECT or VALUES list. 
+ * If we have a RETURNING clause, or there are inference elements used as + * for ON CONFLICT, we need to add the target relation to the query + * namespace before processing it, so that Var references in RETURNING + * and/or the inference specification will work. Also, remove any + * namespace entries added in a sub-SELECT or VALUES list. */ - if (stmt->returningList) + if (stmt->returningList || stmt->confClause) { pstate->p_namespace = NIL; addRTEtoQuery(pstate, pstate->p_target_rangetblentry, @@ -759,8 +762,22 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->rtable = pstate->p_rtable; qry->jointree = makeFromExpr(pstate->p_joinlist, NULL); + qry->specClause = spec; qry->hasSubLinks = pstate->p_hasSubLinks; + if (stmt->confClause) + { + /* + * Perform parse analysis of arbiter columns/expressions. These are + * later used to infer a unique index which arbitrates whether or not + * to take the alternative ON CONFLICT path (i.e. whether or not to + * INSERT or take the alternative path in respect of each slot proposed + * for insertion). + */ + transformConflictClause(pstate, stmt->confClause, &qry->arbiterElems, + &qry->arbiterWhere); + } + assign_query_collations(pstate, qry); return qry; @@ -1006,6 +1023,7 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt) qry->rtable = pstate->p_rtable; qry->jointree = makeFromExpr(pstate->p_joinlist, qual); + qry->specClause = SPEC_NONE; qry->hasSubLinks = pstate->p_hasSubLinks; qry->hasWindowFuncs = pstate->p_hasWindowFuncs; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 5818858a295e6..15f91298eea22 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -217,6 +217,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); RangeVar *range; IntoClause *into; WithClause *with; + InferClause *infer; + ConflictClause *conf; A_Indices *aind; ResTarget *target; struct PrivTarget *privtarget; @@ -343,7 +345,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); OptTableElementList TableElementList OptInherit definition OptTypedTableElementList TypedTableElementList reloptions opt_reloptions - OptWith opt_distinct opt_definition func_args func_args_list + OptWith distinct_clause opt_all_clause opt_definition func_args func_args_list func_args_with_defaults func_args_with_defaults_list aggr_args aggr_args_list func_as createfunc_opt_list alterfunc_opt_list @@ -387,7 +389,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type for_locking_item %type for_locking_clause opt_for_locking_clause for_locking_items %type locked_rels_list -%type opt_all +%type all_or_distinct %type join_outer join_qual %type join_type @@ -416,6 +418,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type SeqOptElem %type insert_rest +%type opt_conf_expr +%type opt_on_conflict %type generic_set set_rest set_rest_more generic_reset reset_rest SetResetClause FunctionSetResetClause @@ -555,8 +559,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); CACHE CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE CLUSTER COALESCE COLLATE COLLATION COLUMN COMMENT COMMENTS COMMIT - COMMITTED CONCURRENTLY CONFIGURATION CONNECTION CONSTRAINT CONSTRAINTS - CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE + COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT + CONSTRAINTS CONTENT_P CONTINUE_P 
CONVERSION_P COPY COST CREATE CROSS CSV CURRENT_P CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE @@ -576,7 +580,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); HANDLER HAVING HEADER_P HOLD HOUR_P - IDENTITY_P IF_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IMPORT_P IN_P + IDENTITY_P IF_P IGNORE_P ILIKE IMMEDIATE IMMUTABLE IMPLICIT_P IMPORT_P IN_P INCLUDING INCREMENT INDEX INDEXES INHERIT INHERITS INITIALLY INLINE_P INNER_P INOUT INPUT_P INSENSITIVE INSERT INSTEAD INT_P INTEGER INTERSECT INTERVAL INTO INVOKER IS ISNULL ISOLATION @@ -9354,10 +9358,12 @@ DeallocateStmt: DEALLOCATE name *****************************************************************************/ InsertStmt: - opt_with_clause INSERT INTO qualified_name insert_rest returning_clause + opt_with_clause INSERT INTO qualified_name insert_rest + opt_on_conflict returning_clause { $5->relation = $4; - $5->returningList = $6; + $5->confClause = $6; + $5->returningList = $7; $5->withClause = $1; $$ = (Node *) $5; } @@ -9402,6 +9408,34 @@ insert_column_item: } ; +opt_on_conflict: + ON CONFLICT opt_conf_expr IGNORE_P + { + $$ = makeNode(ConflictClause); + $$->specclause = SPEC_IGNORE; + $$->infer = $3; + $$->location = @1; + } + | /*EMPTY*/ + { + $$ = NULL; + } + ; + +opt_conf_expr: + '(' index_params where_clause ')' + { + $$ = makeNode(InferClause); + $$->indexElems = $2; + $$->whereClause = $3; + $$->location = @1; + } + | /*EMPTY*/ + { + $$ = NULL; + } + ; + returning_clause: RETURNING target_list { $$ = $2; } | /* EMPTY */ { $$ = NIL; } @@ -9788,7 +9822,21 @@ select_clause: * However, this is not checked by the grammar; parse analysis must check it. */ simple_select: - SELECT opt_distinct opt_target_list + SELECT opt_all_clause opt_target_list + into_clause from_clause where_clause + group_clause having_clause window_clause + { + SelectStmt *n = makeNode(SelectStmt); + n->targetList = $3; + n->intoClause = $4; + n->fromClause = $5; + n->whereClause = $6; + n->groupClause = $7; + n->havingClause = $8; + n->windowClause = $9; + $$ = (Node *)n; + } + | SELECT distinct_clause target_list into_clause from_clause where_clause group_clause having_clause window_clause { @@ -9823,15 +9871,15 @@ simple_select: n->fromClause = list_make1($2); $$ = (Node *)n; } - | select_clause UNION opt_all select_clause + | select_clause UNION all_or_distinct select_clause { $$ = makeSetOp(SETOP_UNION, $3, $1, $4); } - | select_clause INTERSECT opt_all select_clause + | select_clause INTERSECT all_or_distinct select_clause { $$ = makeSetOp(SETOP_INTERSECT, $3, $1, $4); } - | select_clause EXCEPT opt_all select_clause + | select_clause EXCEPT all_or_distinct select_clause { $$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4); } @@ -9970,7 +10018,8 @@ opt_table: TABLE {} | /*EMPTY*/ {} ; -opt_all: ALL { $$ = TRUE; } +all_or_distinct: + ALL { $$ = TRUE; } | DISTINCT { $$ = FALSE; } | /*EMPTY*/ { $$ = FALSE; } ; @@ -9978,10 +10027,13 @@ opt_all: ALL { $$ = TRUE; } /* We use (NIL) as a placeholder to indicate that all target expressions * should be placed in the DISTINCT list during parsetree analysis. 
*/ -opt_distinct: +distinct_clause: DISTINCT { $$ = list_make1(NIL); } | DISTINCT ON '(' expr_list ')' { $$ = $4; } - | ALL { $$ = NIL; } + ; + +opt_all_clause: + ALL { $$ = NIL;} | /*EMPTY*/ { $$ = NIL; } ; @@ -13285,6 +13337,7 @@ unreserved_keyword: | COMMIT | COMMITTED | CONFIGURATION + | CONFLICT | CONNECTION | CONSTRAINTS | CONTENT_P @@ -13344,6 +13397,7 @@ unreserved_keyword: | HOUR_P | IDENTITY_P | IF_P + | IGNORE_P | IMMEDIATE | IMMUTABLE | IMPLICIT_P diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index 8d90b5098a13f..ed9dc566c08f9 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/heapam.h" +#include "catalog/catalog.h" #include "catalog/heap.h" #include "catalog/pg_type.h" #include "commands/defrem.h" @@ -32,6 +33,7 @@ #include "parser/parse_oper.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" +#include "parser/parse_type.h" #include "rewrite/rewriteManip.h" #include "utils/guc.h" #include "utils/lsyscache.h" @@ -75,6 +77,8 @@ static TargetEntry *findTargetlistEntrySQL99(ParseState *pstate, Node *node, List **tlist, ParseExprKind exprKind); static int get_matching_location(int sortgroupref, List *sortgrouprefs, List *exprs); +static List *resolve_unique_index_expr(ParseState *pstate, InferClause * infer, + Relation heapRel); static List *addTargetToGroupList(ParseState *pstate, TargetEntry *tle, List *grouplist, List *targetlist, int location, bool resolveUnknown); @@ -2166,6 +2170,166 @@ get_matching_location(int sortgroupref, List *sortgrouprefs, List *exprs) return -1; /* keep compiler quiet */ } +/* + * resolve_unique_index_expr + * Infer a unique index from a list of indexElems, for ON + * CONFLICT clause + * + * Perform parse analysis of expressions and columns appearing within ON + * CONFLICT clause. During planning, the returned list of expressions is used + * to infer which unique index to use. + */ +static List * +resolve_unique_index_expr(ParseState *pstate, InferClause *infer, + Relation heapRel) +{ + List *result = NIL; + ListCell *l; + + foreach(l, infer->indexElems) + { + IndexElem *ielem = (IndexElem *) lfirst(l); + InferenceElem *pInfer = makeNode(InferenceElem); + Node *parse; + + /* + * Raw grammar re-uses CREATE INDEX infrastructure for unique index + * inference clause, and so will accept opclasses by name and so on. + * + * Make no attempt to match ASC or DESC ordering or NULLS FIRST/NULLS + * LAST ordering, since those are not significant for inference + * purposes (any unique index matching the inference specification in + * other regards is accepted indifferently). Actively reject this as + * wrong-headed. + */ + if (ielem->ordering != SORTBY_DEFAULT || + ielem->nulls_ordering != SORTBY_NULLS_DEFAULT) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("ON CONFLICT does not accept ordering or NULLS FIRST/LAST specifications"), + errhint("These factors do not affect uniqueness of indexed datums."), + parser_errposition(pstate, + exprLocation((Node *) infer)))); + + if (!ielem->expr) + { + /* Simple index attribute */ + ColumnRef *n; + + /* + * Grammar won't have built raw expression for us in event of plain + * column reference. Create one directly, and perform expression + * transformation. Planner expects this, and performs its own + * normalization for the purposes of matching against pg_index. 
+ */ + n = makeNode(ColumnRef); + n->fields = list_make1(makeString(ielem->name)); + /* Location is approximately that of inference specification */ + n->location = infer->location; + parse = (Node *) n; + } + else + { + /* Do parse transformation of the raw expression */ + parse = (Node *) ielem->expr; + } + + /* + * transformExpr() should have already rejected subqueries, + * aggregates, and window functions, based on the EXPR_KIND_ for an + * index expression. Expressions returning sets won't have been + * rejected, but don't bother doing so here; there should be no + * available expression unique index to match any such expression + * against anyway. + */ + pInfer->expr = transformExpr(pstate, parse, EXPR_KIND_INDEX_EXPRESSION); + + /* Perform lookup of collation and operator class as required */ + if (!ielem->collation) + pInfer->infercollid = InvalidOid; + else + pInfer->infercollid = LookupCollation(pstate, ielem->collation, + exprLocation(pInfer->expr)); + + if (!ielem->opclass) + { + pInfer->inferopfamily = InvalidOid; + pInfer->inferopcinputtype = InvalidOid; + } + else + { + Oid opclass = get_opclass_oid(BTREE_AM_OID, ielem->opclass, + false); + + pInfer->inferopfamily = get_opclass_family(opclass); + pInfer->inferopcinputtype = get_opclass_input_type(opclass); + } + + result = lappend(result, pInfer); + } + + return result; +} + +/* + * transformConflictClauseExpr - + * transform expressions of ON CONFLICT. + * + * Transformed expressions used to infer one unique index relation to serve as + * an ON CONFLICT arbiter. Partial unique indexes may be inferred using WHERE + * clause from inference specification clause. + */ +void +transformConflictClause(ParseState *pstate, ConflictClause *confClause, + List **arbiterExpr, Node **arbiterWhere) +{ + InferClause *infer = confClause->infer; + + /* + * To simplify certain aspects of its design, speculative insertion into + * system catalogs is disallowed + */ + if (IsCatalogRelation(pstate->p_target_relation)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ON CONFLICT not supported with catalog relations"), + parser_errposition(pstate, + exprLocation((Node *) confClause)))); + + /* Same applies to table used by logical decoding as catalog table */ + if (RelationIsUsedAsCatalogTable(pstate->p_target_relation)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ON CONFLICT not supported on table \"%s\" used as a catalog table", + RelationGetRelationName(pstate->p_target_relation)), + parser_errposition(pstate, + exprLocation((Node *) confClause)))); + + /* ON CONFLICT IGNORE does not require an inference clause */ + if (infer) + { + *arbiterExpr = resolve_unique_index_expr(pstate, infer, + pstate->p_target_relation); + + /* + * Handling inference WHERE clause (for partial unique index + * inference) + */ + if (infer->whereClause) + *arbiterWhere = transformExpr(pstate, infer->whereClause, + EXPR_KIND_INDEX_PREDICATE); + } + + /* + * It's convenient to form a list of expressions based on the + * representation used by CREATE INDEX, since the same restrictions are + * appropriate (e.g. on subqueries). However, from here on, a dedicated + * primnode representation is used for inference elements, and so + * assign_query_collations() can be trusted to do the right thing with the + * post parse analysis query tree inference clause representation. 
+ */ +} + /* * addTargetToSortList * If the given targetlist entry isn't already in the SortGroupClause diff --git a/src/backend/parser/parse_collate.c b/src/backend/parser/parse_collate.c index 7c6a11c7575e0..ed266f4d3099c 100644 --- a/src/backend/parser/parse_collate.c +++ b/src/backend/parser/parse_collate.c @@ -483,6 +483,7 @@ assign_collations_walker(Node *node, assign_collations_context *context) case T_JoinExpr: case T_FromExpr: case T_SortGroupClause: + case T_InferenceElem: (void) expression_tree_walker(node, assign_collations_walker, (void *) &loccontext); diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index eb7293f2f33cc..7b51bc5dd5a6d 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -412,6 +412,12 @@ DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); break; + case XLOG_HEAP_CONFIRM: + /* + * Speculative assertion is actually confirmed by the absence of + * super deletion; do nothing with this + */ + break; case XLOG_HEAP_LOCK: /* we don't care about row level locks for now */ break; @@ -538,10 +544,14 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) return; change = ReorderBufferGetChange(ctx->reorder); - change->action = REORDER_BUFFER_CHANGE_INSERT; + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_SPEC_INSERT; + memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); - if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + if (xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE) { Size tuplelen; char *tupledata = XLogRecGetBlockData(r, 0, &tuplelen); @@ -583,7 +593,7 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->action = REORDER_BUFFER_CHANGE_UPDATE; memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); - if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) { data = XLogRecGetBlockData(r, 0, &datalen); @@ -592,7 +602,7 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) DecodeXLogTuple(data, datalen, change->data.tp.newtuple); } - if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) { /* caution, remaining data in record is not aligned */ data = XLogRecGetData(r) + SizeOfHeapUpdate; @@ -629,12 +639,15 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) return; change = ReorderBufferGetChange(ctx->reorder); - change->action = REORDER_BUFFER_CHANGE_DELETE; + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + change->action = REORDER_BUFFER_CHANGE_DELETE; + else + change->action = REORDER_BUFFER_CHANGE_SUPER_DELETE; memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); /* old primary key stored */ - if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) { Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader)); @@ -694,7 +707,7 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * We decode the tuple in pretty much the same way as DecodeXLogTuple, * but since the layout is slightly different, we can't use it here. 
*/ - if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + if (xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE) { change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); @@ -732,7 +745,7 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * xl_multi_insert_tuple record emitted by one heap_multi_insert() * call. */ - if (xlrec->flags & XLOG_HEAP_LAST_MULTI_INSERT && + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && (i + 1) == xlrec->ntuples) change->data.tp.clear_toast_afterwards = true; else diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index dc855830c4e46..21d7933684ef5 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -401,6 +401,8 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) case REORDER_BUFFER_CHANGE_INSERT: case REORDER_BUFFER_CHANGE_UPDATE: case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_SPEC_INSERT: + case REORDER_BUFFER_CHANGE_SUPER_DELETE: if (change->data.tp.newtuple) { ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple); @@ -1314,6 +1316,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, PG_TRY(); { ReorderBufferChange *change; + ReorderBufferChange *specinsert = NULL; if (using_subtxn) BeginInternalSubTransaction("replay"); @@ -1333,6 +1336,8 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, case REORDER_BUFFER_CHANGE_INSERT: case REORDER_BUFFER_CHANGE_UPDATE: case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_SPEC_INSERT: + case REORDER_BUFFER_CHANGE_SUPER_DELETE: Assert(snapshot_now); reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode, @@ -1345,7 +1350,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (reloid == InvalidOid && change->data.tp.newtuple == NULL && change->data.tp.oldtuple == NULL) - continue; + goto change_done; else if (reloid == InvalidOid) elog(ERROR, "could not map filenode \"%s\" to relation OID", relpathperm(change->data.tp.relnode, @@ -1359,49 +1364,117 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, relpathperm(change->data.tp.relnode, MAIN_FORKNUM)); - if (RelationIsLogicallyLogged(relation)) + if (!RelationIsLogicallyLogged(relation)) + goto change_done; + + /* + * For now ignore sequence changes entirely. Most of + * the time they don't log changes using records we + * understand, so it doesn't make sense to handle the + * few cases we do. + */ + if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + goto change_done; + + /* user-triggered change */ + if (!IsToastRelation(relation)) { /* - * For now ignore sequence changes entirely. Most of - * the time they don't log changes using records we - * understand, so it doesn't make sense to handle the - * few cases we do. 
+ * Previous speculative insertion's success + * confirm by a new (non-superdelete) DML change */ - if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + if (specinsert && + change->action != REORDER_BUFFER_CHANGE_SUPER_DELETE) { + /* Report as proper insert to client */ + specinsert->action = REORDER_BUFFER_CHANGE_INSERT; + rb->apply_change(rb, txn, relation, specinsert); + + /* Free memory from pending tuple */ + Assert(specinsert->data.tp.oldtuple == NULL); + ReorderBufferReturnTupleBuf(rb, specinsert->data.tp.newtuple); + specinsert = NULL; } - /* user-triggered change */ - else if (!IsToastRelation(relation)) + + ReorderBufferToastReplace(rb, txn, relation, change); + + /* + * Kludge: Speculative insertion occasionally makes + * use of "super deletion" -- an implementation + * defined delete of a speculatively inserted tuple. + * Neither the super deletion, nor the insertion + * (which must be the prior record type) are included + * in the final assembly when the tuple was + * super-deleted. Otherwise, an ordinary insertion is + * assembled. + */ + if (change->action == REORDER_BUFFER_CHANGE_SPEC_INSERT) { - ReorderBufferToastReplace(rb, txn, relation, change); - rb->apply_change(rb, txn, relation, change); + /* + * Need to ensure the memory used by speculatively + * inserted tuple isn't freed till we're done + * verifying that there is no super deletion that + * immediately follows. Otherwise it could get + * freed/reused while restoring spooled data from + * disk. + */ + dlist_delete(&change->node); + specinsert = change; + /* Don't clear reassembled toast chunks */ + continue; + } + else if (change->action == REORDER_BUFFER_CHANGE_SUPER_DELETE) + { + Assert(RelFileNodeEquals(change->data.tp.relnode, + specinsert->data.tp.relnode)); /* - * Only clear reassembled toast chunks if we're - * sure they're not required anymore. The creator - * of the tuple tells us. + * Free memory from pending tuple. Do not + * report as logical delete to encoding plugin. */ - if (change->data.tp.clear_toast_afterwards) - ReorderBufferToastReset(rb, txn); + Assert(specinsert->data.tp.oldtuple == NULL); + ReorderBufferReturnTupleBuf(rb, specinsert->data.tp.newtuple); + specinsert = NULL; } - /* we're not interested in toast deletions */ - else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + else { /* - * Need to reassemble the full toasted Datum in - * memory, to ensure the chunks don't get reused - * till we're done remove it from the list of this - * transaction's changes. Otherwise it will get - * freed/reused while restoring spooled data from - * disk. + * Handle non-speculative insertion related + * changes */ - dlist_delete(&change->node); - ReorderBufferToastAppendChunk(rb, txn, relation, - change); + rb->apply_change(rb, txn, relation, change); } + /* + * Only clear reassembled toast chunks if we're + * sure they're not required anymore. The creator + * of the tuple tells us. + */ + if (change->data.tp.clear_toast_afterwards) + ReorderBufferToastReset(rb, txn); + } + /* we're not interested in toast deletions */ + else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + { + /* + * Need to reassemble the full toasted Datum in + * memory, to ensure the chunks don't get reused till + * we're done remove it from the list of this + * transaction's changes. Otherwise it will get + * freed/reused while restoring spooled data from + * disk. 
+ */ + dlist_delete(&change->node); + ReorderBufferToastAppendChunk(rb, txn, relation, + change); + } + + change_done: + if (relation != NULL) + { + RelationClose(relation); + relation = NULL; } - RelationClose(relation); break; case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: /* get rid of the old */ @@ -1471,6 +1544,36 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, } } + /* + * Previous speculative insertion's success confirm by reaching end of + * xact's changes + */ + if (specinsert) + { + Relation relation; + Oid reloid; + + reloid = RelidByRelfilenode(specinsert->data.tp.relnode.spcNode, + specinsert->data.tp.relnode.relNode); + + /* + * Catalog tuple without data, emitted while catalog was + * in the process of being rewritten. + */ + if (reloid == InvalidOid) + elog(ERROR, "could not map filenode \"%s\" to relation OID", + relpathperm(specinsert->data.tp.relnode, + MAIN_FORKNUM)); + + relation = RelationIdGetRelation(reloid); + /* Report as proper insert to client */ + specinsert->action = REORDER_BUFFER_CHANGE_INSERT; + rb->apply_change(rb, txn, relation, specinsert); + /* Free memory from pending tuple */ + Assert(specinsert->data.tp.oldtuple == NULL); + ReorderBufferReturnTupleBuf(rb, specinsert->data.tp.newtuple); + } + /* clean up the iterator */ ReorderBufferIterTXNFinish(rb, iterstate); iterstate = NULL; @@ -2003,6 +2106,10 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, case REORDER_BUFFER_CHANGE_UPDATE: /* fall through */ case REORDER_BUFFER_CHANGE_DELETE: + /* fall through */ + case REORDER_BUFFER_CHANGE_SPEC_INSERT: + /* fall through */ + case REORDER_BUFFER_CHANGE_SUPER_DELETE: { char *data; ReorderBufferTupleBuf *oldtup, @@ -2258,6 +2365,10 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, case REORDER_BUFFER_CHANGE_UPDATE: /* fall through */ case REORDER_BUFFER_CHANGE_DELETE: + /* fall through */ + case REORDER_BUFFER_CHANGE_SPEC_INSERT: + /* fall through */ + case REORDER_BUFFER_CHANGE_SUPER_DELETE: if (change->data.tp.newtuple) { Size len = offsetof(ReorderBufferTupleBuf, t_data) + diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 60c60caf3964f..88ba29b05db28 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -66,7 +66,7 @@ static void markQueryForLocking(Query *qry, Node *jtnode, LockClauseStrength strength, LockWaitPolicy waitPolicy, bool pushedDown); static List *matchLocks(CmdType event, RuleLock *rulelocks, - int varno, Query *parsetree); + int varno, Query *parsetree, bool *hasUpdate); static Query *fireRIRrules(Query *parsetree, List *activeRIRs, bool forUpdatePushedDown); static bool view_has_instead_trigger(Relation view, CmdType event); @@ -1288,7 +1288,8 @@ static List * matchLocks(CmdType event, RuleLock *rulelocks, int varno, - Query *parsetree) + Query *parsetree, + bool *hasUpdate) { List *matching_locks = NIL; int nlocks; @@ -1309,6 +1310,9 @@ matchLocks(CmdType event, { RewriteRule *oneLock = rulelocks->rules[i]; + if (oneLock->event == CMD_UPDATE) + *hasUpdate = true; + /* * Suppress ON INSERT/UPDATE/DELETE rules that are disabled or * configured to not fire during the current sessions replication @@ -2991,6 +2995,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events) CmdType event = parsetree->commandType; bool instead = false; bool returning = false; + bool updatableview = false; Query *qual_product = NULL; List *rewritten = NIL; ListCell *lc1; @@ -3073,6 +3078,7 @@ RewriteQuery(Query *parsetree, 
List *rewrite_events) Relation rt_entry_relation; List *locks; List *product_queries; + bool hasUpdate = false; result_relation = parsetree->resultRelation; Assert(result_relation != 0); @@ -3141,7 +3147,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events) * Collect and apply the appropriate rules. */ locks = matchLocks(event, rt_entry_relation->rd_rules, - result_relation, parsetree); + result_relation, parsetree, &hasUpdate); product_queries = fireRules(parsetree, result_relation, @@ -3190,6 +3196,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events) */ instead = true; returning = true; + updatableview = true; } /* @@ -3270,6 +3277,17 @@ RewriteQuery(Query *parsetree, List *rewrite_events) } } + /* + * Updatable views are supported by ON CONFLICT IGNORE, so don't + * prevent that case from proceeding + */ + if (parsetree->specClause != SPEC_NONE && + (product_queries != NIL || hasUpdate) && + !updatableview) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("INSERT with ON CONFLICT clause may not target relation with INSERT or UPDATE rules"))); + heap_close(rt_entry_relation, NoLock); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index d13a1673344b3..6decb11b178d8 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -25,6 +25,24 @@ #include "utils/inval.h" +/* + * Per-backend counter for generating speculative insertion tokens. + * + * This may wrap around, but that's OK as it's only used for the short + * duration between inserting a tuple and checking that there are no (unique) + * constraint violations. It's theoretically possible that a backend sees a + * tuple that was speculatively inserted by another backend, but before it has + * started waiting on the token, the other backend completes its insertion, + * and then then performs 2^32 unrelated insertions. And after all that, the + * first backend finally calls SpeculativeInsertionLockAcquire(), with the + * intention of waiting for the first insertion to complete, but ends up + * waiting for the latest unrelated insertion instead. Even then, nothing + * particularly bad happens: in the worst case they deadlock, causing one of + * the transactions to abort. + */ +static uint32 speculativeInsertionToken = 0; + + /* * Struct to hold context info for transaction lock waits. * @@ -40,6 +58,7 @@ typedef struct XactLockTableWaitInfo static void XactLockTableWaitErrorCb(void *arg); + /* * RelationInitLockInfo * Initializes the lock information in a relation descriptor. @@ -575,6 +594,73 @@ ConditionalXactLockTableWait(TransactionId xid) return true; } +/* + * SpeculativeInsertionLockAcquire + * + * Insert a lock showing that the given transaction ID is inserting a tuple, + * but hasn't yet decided whether it's going to keep it. The lock can then be + * used to wait for the decision to go ahead with the insertion, or aborting + * it. + * + * The token is used to distinguish multiple insertions by the same + * transaction. It is returned to caller. + */ +uint32 +SpeculativeInsertionLockAcquire(TransactionId xid) +{ + LOCKTAG tag; + + speculativeInsertionToken++; + + /* + * Check for wrap-around. Zero means no token is held, so don't use that. 
+ */ + if (speculativeInsertionToken == 0) + speculativeInsertionToken = 1; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken); + + (void) LockAcquire(&tag, ExclusiveLock, false, false); + + return speculativeInsertionToken; +} + +/* + * SpeculativeInsertionLockRelease + * + * Delete the lock showing that the given transaction is speculatively + * inserting a tuple. + */ +void +SpeculativeInsertionLockRelease(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken); + + LockRelease(&tag, ExclusiveLock, false); +} + +/* + * SpeculativeInsertionWait + * + * Wait for the specified transaction to finish or abort the insertion of a + * tuple. + */ +void +SpeculativeInsertionWait(TransactionId xid, uint32 token) +{ + LOCKTAG tag; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, token); + + Assert(TransactionIdIsValid(xid)); + Assert(token != 0); + + (void) LockAcquire(&tag, ShareLock, false, false); + LockRelease(&tag, ShareLock, false); +} + /* * XactLockTableWaitErrorContextCb * Error context callback for transaction lock waits. @@ -873,6 +959,12 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field1, tag->locktag_field2); break; + case LOCKTAG_SPECULATIVE_TOKEN: + appendStringInfo(buf, + _("speculative token %u of transaction %u"), + tag->locktag_field2, + tag->locktag_field1); + break; case LOCKTAG_OBJECT: appendStringInfo(buf, _("object %u of class %u of database %u"), diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index a1967b69632f9..2ab66ce328514 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -28,6 +28,7 @@ static const char *const LockTagTypeNames[] = { "tuple", "transactionid", "virtualxid", + "speculative token", "object", "userlock", "advisory" diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index a4a478d1142d8..b4284d6d94f03 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -405,6 +405,13 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, } } } + /* + * An invalid Xmin can be left behind by a speculative insertion that + * is cancelled by super-deleting the tuple. We shouldn't see any of + * those in TOAST tables, but better safe than sorry. + */ + else if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) + return false; } /* otherwise assume the tuple is valid for TOAST. */ @@ -714,8 +721,11 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, * output argument to return the xids of concurrent xacts that affected the * tuple. snapshot->xmin is set to the tuple's xmin if that is another * transaction that's still in progress; or to InvalidTransactionId if the - * tuple's xmin is committed good, committed dead, or my own xact. Similarly - * for snapshot->xmax and the tuple's xmax. + * tuple's xmin is committed good, committed dead, or my own xact. + * Similarly for snapshot->xmax and the tuple's xmax. If the tuple was + * inserted speculatively, meaning that the inserter might still back down + * on the insertion without aborting the whole transaction, the associated + * token is also returned in snapshot->speculativeToken. 
*/ bool HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, @@ -727,6 +737,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, Assert(htup->t_tableOid != InvalidOid); snapshot->xmin = snapshot->xmax = InvalidTransactionId; + snapshot->speculativeToken = 0; if (!HeapTupleHeaderXminCommitted(tuple)) { @@ -808,6 +819,20 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, } else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) { + /* + * Return the speculative token to caller. Caller can worry + * about xmax, since it requires a conclusively locked row + * version, and a concurrent update to this tuple is a conflict + * of its purposes. + */ + if (HeapTupleHeaderIsSpeculative(tuple)) + { + snapshot->speculativeToken = + HeapTupleHeaderGetSpeculativeToken(tuple); + + Assert(snapshot->speculativeToken != 0); + } + snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); /* XXX shouldn't we fall through to look at xmax? */ return true; /* in insertion by other */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 888cce7a2d841..0c5c5c3fe1c3c 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -28,6 +28,7 @@ #define HEAP_INSERT_SKIP_WAL 0x0001 #define HEAP_INSERT_SKIP_FSM 0x0002 #define HEAP_INSERT_FROZEN 0x0004 +#define HEAP_INSERT_SPECULATIVE 0x0008 typedef struct BulkInsertStateData *BulkInsertState; @@ -142,6 +143,8 @@ extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, CommandId cid, Snapshot crosscheck, bool wait, HeapUpdateFailureData *hufd); +extern void heap_finish_speculative(Relation relation, HeapTuple tuple, + bool conflict); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index f0f89dec0f229..caa0f14f4bf61 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -34,7 +34,7 @@ #define XLOG_HEAP_UPDATE 0x20 /* 0x030 is free, was XLOG_HEAP_MOVE */ #define XLOG_HEAP_HOT_UPDATE 0x40 -/* 0x050 is free, was XLOG_HEAP_NEWPAGE */ +#define XLOG_HEAP_CONFIRM 0x50 #define XLOG_HEAP_LOCK 0x60 #define XLOG_HEAP_INPLACE 0x70 @@ -60,23 +60,43 @@ #define XLOG_HEAP2_NEW_CID 0x70 /* - * xl_heap_* ->flag values, 8 bits are available. + * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available. */ /* PD_ALL_VISIBLE was cleared */ -#define XLOG_HEAP_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_INSERT_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_INSERT_LAST_IN_MULTI (1<<1) +#define XLH_INSERT_IS_SPECULATIVE (1<<2) +#define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3) + +/* + * xl_heap_update flag values, 8 bits are available. 
+ */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED (1<<0) /* PD_ALL_VISIBLE was cleared in the 2nd page */ -#define XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED (1<<1) -#define XLOG_HEAP_CONTAINS_OLD_TUPLE (1<<2) -#define XLOG_HEAP_CONTAINS_OLD_KEY (1<<3) -#define XLOG_HEAP_CONTAINS_NEW_TUPLE (1<<4) -#define XLOG_HEAP_PREFIX_FROM_OLD (1<<5) -#define XLOG_HEAP_SUFFIX_FROM_OLD (1<<6) -/* last xl_heap_multi_insert record for one heap_multi_insert() call */ -#define XLOG_HEAP_LAST_MULTI_INSERT (1<<7) +#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED (1<<1) +#define XLH_UPDATE_CONTAINS_OLD_TUPLE (1<<2) +#define XLH_UPDATE_CONTAINS_OLD_KEY (1<<3) +#define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) +#define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) +#define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) /* convenience macro for checking whether any form of old tuple was logged */ -#define XLOG_HEAP_CONTAINS_OLD \ - (XLOG_HEAP_CONTAINS_OLD_TUPLE | XLOG_HEAP_CONTAINS_OLD_KEY) +#define XLH_UPDATE_CONTAINS_OLD \ + (XLH_UPDATE_CONTAINS_OLD_TUPLE | XLH_UPDATE_CONTAINS_OLD_KEY) + +/* + * xl_heap_delete flag values, 8 bits are available. + */ +/* PD_ALL_VISIBLE was cleared */ +#define XLH_DELETE_ALL_VISIBLE_CLEARED (1<<0) +#define XLH_DELETE_CONTAINS_OLD_TUPLE (1<<1) +#define XLH_DELETE_CONTAINS_OLD_KEY (1<<2) +#define XLH_DELETE_IS_SUPER (1<<3) + +/* convenience macro for checking whether any form of old tuple was logged */ +#define XLH_DELETE_CONTAINS_OLD \ + (XLH_DELETE_CONTAINS_OLD_TUPLE | XLH_DELETE_CONTAINS_OLD_KEY) /* This is what we need to know about delete */ typedef struct xl_heap_delete @@ -243,6 +263,14 @@ typedef struct xl_heap_lock_updated #define SizeOfHeapLockUpdated (offsetof(xl_heap_lock_updated, infobits_set) + sizeof(uint8)) +/* This is what we need to know about confirmation of speculative insertion */ +typedef struct xl_heap_confirm +{ + OffsetNumber offnum; /* confirmed tuple's offset on page */ +} xl_heap_confirm; + +#define SizeOfHeapConfirm (offsetof(xl_heap_confirm, offnum) + sizeof(OffsetNumber)) + /* This is what we need to know about in-place update */ typedef struct xl_heap_inplace { diff --git a/src/include/access/hio.h b/src/include/access/hio.h index b0140298b1fd0..b3b91e70d50f5 100644 --- a/src/include/access/hio.h +++ b/src/include/access/hio.h @@ -36,7 +36,7 @@ typedef struct BulkInsertStateData extern void RelationPutHeapTuple(Relation relation, Buffer buffer, - HeapTuple tuple); + HeapTuple tuple, bool token); extern Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertState bistate, diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 0a673cd52679d..80285acc3b620 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -96,6 +96,15 @@ * unrelated tuple stored into a slot recently freed by VACUUM. If either * check fails, one may assume that there is no live descendant version. * + * t_ctid is sometimes used to store a speculative insertion token, instead + * of a real TID. A speculative token is set on a tuple that's being + * inserted, until the inserter is sure that it wants to go ahead with the + * insertion. Hence a token should only be seen on a tuple with an XMAX + * that's still in-progress, or invalid/aborted. The token is replaced with + * the tuple's real TID when the insertion is confirmed. One should never + * see a speculative insertion token while following a chain of t_ctid links, + * because they are not used on updates, only insertions. 
+ * * Following the fixed header fields, the nulls bitmap is stored (beginning * at t_bits). The bitmap is *not* stored if t_infomask shows that there * are no nulls in the tuple. If an OID field is present (as indicated by @@ -138,7 +147,8 @@ struct HeapTupleHeaderData DatumTupleFields t_datum; } t_choice; - ItemPointerData t_ctid; /* current TID of this or newer tuple */ + ItemPointerData t_ctid; /* current TID of this or newer tuple (or a + * speculative insertion token) */ /* Fields below here must match MinimalTupleData! */ @@ -241,6 +251,14 @@ struct HeapTupleHeaderData */ #define HEAP_TUPLE_HAS_MATCH HEAP_ONLY_TUPLE /* tuple has a join match */ +/* + * Special value used in t_ctid.ip_posid, to indicate that it holds a + * speculative insertion token rather than a real TID. This must be higher + * than MaxOffsetNumber, so that it can be distinguished from a valid + * offset number in a regular item pointer. + */ +#define SpecTokenOffsetNumber 0xfffe + /* * HeapTupleHeader accessor macros * @@ -377,6 +395,22 @@ do { \ (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ } while (0) +#define HeapTupleHeaderIsSpeculative(tup) \ +( \ + (tup)->t_ctid.ip_posid == SpecTokenOffsetNumber \ +) + +#define HeapTupleHeaderGetSpeculativeToken(tup) \ +( \ + AssertMacro(HeapTupleHeaderIsSpeculative(tup)), \ + ItemPointerGetBlockNumber(&(tup)->t_ctid) \ +) + +#define HeapTupleHeaderSetSpeculativeToken(tup, token) \ +( \ + ItemPointerSet(&(tup)->t_ctid, token, SpecTokenOffsetNumber) \ +) + #define HeapTupleHeaderGetDatumLength(tup) \ VARSIZE(tup) diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index a04def96e4cad..d091e0b7bcda9 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -81,6 +81,8 @@ extern void index_drop(Oid indexId, bool concurrent); extern IndexInfo *BuildIndexInfo(Relation index); +extern void IndexInfoSpeculative(Relation index, IndexInfo *ii); + extern void FormIndexDatum(IndexInfo *indexInfo, TupleTableSlot *slot, EState *estate, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 33c8fad844c66..db0d8f126b170 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -361,16 +361,17 @@ extern void UnregisterExprContextCallback(ExprContext *econtext, /* * prototypes from functions in execIndexing.c */ -extern void ExecOpenIndices(ResultRelInfo *resultRelInfo); +extern void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative); extern void ExecCloseIndices(ResultRelInfo *resultRelInfo); extern List *ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, - EState *estate); -extern bool check_exclusion_constraint(Relation heap, Relation index, + EState *estate, bool noDupErr, List *arbiterIndexes); +extern bool ExecCheckIndexConstraints(TupleTableSlot *slot, EState *estate, + ItemPointer conflictTid, List *arbiterIndexes); +extern void check_exclusion_constraint(Relation heap, Relation index, IndexInfo *indexInfo, ItemPointer tupleid, Datum *values, bool *isnull, - EState *estate, - bool newIndex, bool errorOK); + EState *estate, bool newIndex); #endif /* EXECUTOR_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index dac542fbc14ab..9ffc1504e2430 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -41,6 +41,9 @@ * ExclusionOps Per-column exclusion operators, or NULL if none * ExclusionProcs Underlying function OIDs for ExclusionOps * ExclusionStrats Opclass strategy numbers for ExclusionOps + * UniqueOps 
Theses are like Exclusion*, but for unique indexes + * UniqueProcs + * UniqueStrats * Unique is it a unique index? * ReadyForInserts is it valid for inserts? * Concurrent are we doing a concurrent index build? @@ -62,6 +65,9 @@ typedef struct IndexInfo Oid *ii_ExclusionOps; /* array with one entry per column */ Oid *ii_ExclusionProcs; /* array with one entry per column */ uint16 *ii_ExclusionStrats; /* array with one entry per column */ + Oid *ii_UniqueOps; /* array with one entry per column */ + Oid *ii_UniqueProcs; /* array with one entry per column */ + uint16 *ii_UniqueStrats; /* array with one entry per column */ bool ii_Unique; bool ii_ReadyForInserts; bool ii_Concurrent; @@ -1092,6 +1098,8 @@ typedef struct ModifyTableState int mt_whichplan; /* which one is being executed (0..n-1) */ ResultRelInfo *resultRelInfo; /* per-subplan target relations */ List **mt_arowmarks; /* per-subplan ExecAuxRowMark lists */ + SpecCmd spec; /* reason for speculative insertion */ + List *arbiterIndexes; /* unique index OIDs to arbitrate taking alt path */ EPQState mt_epqstate; /* for evaluating EvalPlanQual rechecks */ bool fireBSTriggers; /* do we need to fire stmt triggers? */ } ModifyTableState; diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 38469ef4d1af4..f1b50297c60ad 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -168,6 +168,7 @@ typedef enum NodeTag T_CoerceToDomainValue, T_SetToDefault, T_CurrentOfExpr, + T_InferenceElem, T_TargetEntry, T_RangeTblRef, T_JoinExpr, @@ -412,6 +413,8 @@ typedef enum NodeTag T_RowMarkClause, T_XmlSerialize, T_WithClause, + T_InferClause, + T_ConflictClause, T_CommonTableExpr, T_RoleSpec, @@ -625,4 +628,16 @@ typedef enum JoinType (1 << JOIN_RIGHT) | \ (1 << JOIN_ANTI))) != 0) +/* + * SpecCmd - + * "Speculative insertion" clause + * + * This is needed in both parsenodes.h and plannodes.h, so put it here... + */ +typedef enum +{ + SPEC_NONE, /* Not involved in speculative insertion */ + SPEC_IGNORE /* INSERT of "ON CONFLICT IGNORE" */ +} SpecCmd; + #endif /* NODES_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 36e36d5631628..c4cb157e857a8 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -132,6 +132,10 @@ typedef struct Query List *withCheckOptions; /* a list of WithCheckOption's */ + SpecCmd specClause; /* speculative insertion clause */ + List *arbiterElems; /* unique index arbiter list (of InferenceElem's) */ + Node *arbiterWhere; /* unique index arbiter WHERE clause */ + List *returningList; /* return-values list (of TargetEntry) */ List *groupClause; /* a list of SortGroupClause's */ @@ -591,7 +595,7 @@ typedef enum TableLikeOption } TableLikeOption; /* - * IndexElem - index parameters (used in CREATE INDEX) + * IndexElem - index parameters (used in CREATE INDEX, and in ON CONFLICT) * * For a plain index attribute, 'name' is the name of the table column to * index, and 'expr' is NULL. For an index expression, 'name' is NULL and @@ -1023,6 +1027,34 @@ typedef struct WithClause int location; /* token location, or -1 if unknown */ } WithClause; +/* + * InferClause - + * ON CONFLICT unique index inference clause + * + * Note: InferClause does not propagate into the Query representation. 
+ */ +typedef struct InferClause +{ + NodeTag type; + List *indexElems; /* IndexElems to infer unique index */ + Node *whereClause; /* qualification (partial-index predicate) */ + int location; /* token location, or -1 if unknown */ +} InferClause; + +/* + * ConflictClause - + * representation of ON CONFLICT clause + * + * Note: ConflictClause does not propagate into the Query representation. + */ +typedef struct ConflictClause +{ + NodeTag type; + SpecCmd specclause; /* Variant specified */ + InferClause *infer; /* Optional index inference clause */ + int location; /* token location, or -1 if unknown */ +} ConflictClause; + /* * CommonTableExpr - * representation of WITH list element @@ -1073,6 +1105,7 @@ typedef struct InsertStmt RangeVar *relation; /* relation to insert into */ List *cols; /* optional: names of the target columns */ Node *selectStmt; /* the source SELECT/VALUES, or NULL */ + ConflictClause *confClause; /* ON CONFLICT clause */ List *returningList; /* list of expressions to return */ WithClause *withClause; /* WITH clause */ } InsertStmt; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 21cbfa8cf0feb..0558b7388e19f 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -178,6 +178,8 @@ typedef struct ModifyTable List *resultRelations; /* integer list of RT indexes */ int resultRelIndex; /* index of first resultRel in plan's list */ List *plans; /* plan(s) producing source data */ + SpecCmd spec; /* speculative insertion specification */ + List *arbiterIndexes; /* List of ON CONFLICT arbiter index OIDs */ List *withCheckOptionLists; /* per-target-table WCO lists */ List *returningLists; /* per-target-table RETURNING tlists */ List *fdwPrivLists; /* per-target-table FDW private data lists */ diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 4f1d234d30777..249cead65b9f2 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -1143,6 +1143,22 @@ typedef struct CurrentOfExpr int cursor_param; /* refcursor parameter number, or 0 */ } CurrentOfExpr; +/* + * InferenceElem - an element of a unique index inference specification + * + * This mostly matches the structure of IndexElems, but having a dedicated + * primnode allows for a clean separation between the use of index parameters + * by utility commands, and this node. 
+ */ +typedef struct InferenceElem +{ + Expr xpr; + Node *expr; /* expression to infer from, or NULL */ + Oid infercollid; /* OID of collation, or InvalidOid */ + Oid inferopfamily; /* OID of att opfamily, or InvalidOid */ + Oid inferopcinputtype; /* OID of att input type, or InvalidOid */ +} InferenceElem; + /*-------------------- * TargetEntry - * a target entry (used in query target lists) diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 8eb2e57d7b7be..11e7d4d26bbc7 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -28,6 +28,8 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); +extern List *infer_arbiter_indexes(PlannerInfo *root); + extern void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples, double *allvisfrac); diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index fa72918d1bbad..c3a0634d39489 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -85,7 +85,7 @@ extern ModifyTable *make_modifytable(PlannerInfo *root, Index nominalRelation, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, - List *rowMarks, int epqParam); + List *rowMarks, SpecCmd spec, int epqParam); extern bool is_projection_capable_plan(Plan *plan); /* diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 7c243ecc06056..cf501e601874b 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -87,6 +87,7 @@ PG_KEYWORD("commit", COMMIT, UNRESERVED_KEYWORD) PG_KEYWORD("committed", COMMITTED, UNRESERVED_KEYWORD) PG_KEYWORD("concurrently", CONCURRENTLY, TYPE_FUNC_NAME_KEYWORD) PG_KEYWORD("configuration", CONFIGURATION, UNRESERVED_KEYWORD) +PG_KEYWORD("conflict", CONFLICT, UNRESERVED_KEYWORD) PG_KEYWORD("connection", CONNECTION, UNRESERVED_KEYWORD) PG_KEYWORD("constraint", CONSTRAINT, RESERVED_KEYWORD) PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD) @@ -180,6 +181,7 @@ PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD) PG_KEYWORD("hour", HOUR_P, UNRESERVED_KEYWORD) PG_KEYWORD("identity", IDENTITY_P, UNRESERVED_KEYWORD) PG_KEYWORD("if", IF_P, UNRESERVED_KEYWORD) +PG_KEYWORD("ignore", IGNORE_P, UNRESERVED_KEYWORD) PG_KEYWORD("ilike", ILIKE, TYPE_FUNC_NAME_KEYWORD) PG_KEYWORD("immediate", IMMEDIATE, UNRESERVED_KEYWORD) PG_KEYWORD("immutable", IMMUTABLE, UNRESERVED_KEYWORD) diff --git a/src/include/parser/parse_clause.h b/src/include/parser/parse_clause.h index 6a4438f556658..d1d0d1261b3ee 100644 --- a/src/include/parser/parse_clause.h +++ b/src/include/parser/parse_clause.h @@ -41,6 +41,8 @@ extern List *transformDistinctClause(ParseState *pstate, List **targetlist, List *sortClause, bool is_agg); extern List *transformDistinctOnClause(ParseState *pstate, List *distinctlist, List **targetlist, List *sortClause); +extern void transformConflictClause(ParseState *pstate, ConflictClause *confClause, + List **arbiterExpr, Node **arbiterWhere); extern List *addTargetToSortList(ParseState *pstate, TargetEntry *tle, List *sortlist, List *targetlist, SortBy *sortby, diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index f1e0f57e7c2a5..717bdab565c5b 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -43,6 +43,15 @@ typedef struct ReorderBufferTupleBuf * 
and ComboCids in the same list with the user visible INSERT/UPDATE/DELETE * changes. Users of the decoding facilities will never see changes with * *_INTERNAL_* actions. + * + * The REORDER_BUFFER_CHANGE_SPEC_INSERT and REORDER_BUFFER_CHANGE_SUPER_DELETE + * changes concern "speculative insertions", and their "super deletion" + * respectively. Super deletion is a mechanism that speculative insertion + * makes use of to handle conflicts. + * + * At transaction reassembly these will be consolidated, and so decoding + * plugins will only ever handle REORDER_BUFFER_CHANGE_INSERT changes here too + * (in the common case where speculative insertion works out). */ enum ReorderBufferChangeType { @@ -51,7 +60,9 @@ enum ReorderBufferChangeType REORDER_BUFFER_CHANGE_DELETE, REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT, REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID, - REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID + REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID, + REORDER_BUFFER_CHANGE_SPEC_INSERT, + REORDER_BUFFER_CHANGE_SUPER_DELETE }; /* diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index f5d70e5141eff..7cc75fc10654a 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -76,6 +76,11 @@ extern bool ConditionalXactLockTableWait(TransactionId xid); extern void WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode); extern void WaitForLockersMultiple(List *locktags, LOCKMODE lockmode); +/* Lock an XID for tuple insertion (used to wait for an insertion to finish) */ +extern uint32 SpeculativeInsertionLockAcquire(TransactionId xid); +extern void SpeculativeInsertionLockRelease(TransactionId xid); +extern void SpeculativeInsertionWait(TransactionId xid, uint32 token); + /* Lock a general object (other than a relation) of the current database */ extern void LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index dae517f3fe001..b4eb1b4a9e309 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -176,6 +176,8 @@ typedef enum LockTagType /* ID info for a transaction is its TransactionId */ LOCKTAG_VIRTUALTRANSACTION, /* virtual transaction (ditto) */ /* ID info for a virtual transaction is its VirtualTransactionId */ + LOCKTAG_SPECULATIVE_TOKEN, /* speculative insertion Xid and token */ + /* ID info for a transaction is its TransactionId */ LOCKTAG_OBJECT, /* non-relation database object */ /* ID info for an object is DB OID + CLASS OID + OBJECT OID + SUBID */ @@ -261,6 +263,14 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) +#define SET_LOCKTAG_SPECULATIVE_INSERTION(locktag,xid,token) \ + ((locktag).locktag_field1 = (xid), \ + (locktag).locktag_field2 = (token), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_SPECULATIVE_TOKEN, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + #define SET_LOCKTAG_OBJECT(locktag,dboid,classoid,objoid,objsubid) \ ((locktag).locktag_field1 = (dboid), \ (locktag).locktag_field2 = (classoid), \ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 26fb2573c7103..a734bf00752f6 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -69,31 +69,41 @@ typedef struct SnapshotData * progress, unless the snapshot was taken during recovery in which case * it's empty. For historic MVCC snapshots, the meaning is inverted, i.e. 
* it contains *committed* transactions between xmin and xmax. + * + * note: all ids in xip[] satisfy xmin <= xip[i] < xmax */ TransactionId *xip; uint32 xcnt; /* # of xact ids in xip[] */ - /* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */ - int32 subxcnt; /* # of xact ids in subxip[] */ /* * For non-historic MVCC snapshots, this contains subxact IDs that are in * progress (and other transactions that are in progress if taken during * recovery). For historic snapshot it contains *all* xids assigned to the * replayed transaction, including the toplevel xid. + * + * note: all ids in subxip[] are >= xmin, but we don't bother filtering + * out any that are >= xmax */ TransactionId *subxip; + int32 subxcnt; /* # of xact ids in subxip[] */ bool suboverflowed; /* has the subxip array overflowed? */ + bool takenDuringRecovery; /* recovery-shaped snapshot? */ bool copied; /* false if it's a static snapshot */ + CommandId curcid; /* in my xact, CID < curcid are visible */ + /* - * note: all ids in subxip[] are >= xmin, but we don't bother filtering - * out any that are >= xmax + * An extra return value for HeapTupleSatisfiesDirty, not used in MVCC + * snapshots. + */ + uint32 speculativeToken; + + /* + * Book-keeping information, used by the snapshot manager */ - CommandId curcid; /* in my xact, CID < curcid are visible */ uint32 active_count; /* refcount on ActiveSnapshot stack */ uint32 regd_count; /* refcount on RegisteredSnapshots */ - pairingheap_node ph_node; /* link in the RegisteredSnapshots heap */ } SnapshotData; diff --git a/src/test/isolation/expected/insert-conflict-ignore.out b/src/test/isolation/expected/insert-conflict-ignore.out new file mode 100644 index 0000000000000..e6cc2a1a09086 --- /dev/null +++ b/src/test/isolation/expected/insert-conflict-ignore.out @@ -0,0 +1,23 @@ +Parsed test spec with 2 sessions + +starting permutation: ignore1 ignore2 c1 select2 c2 +step ignore1: INSERT INTO ints(key, val) VALUES(1, 'ignore1') ON CONFLICT IGNORE; +step ignore2: INSERT INTO ints(key, val) VALUES(1, 'ignore2') ON CONFLICT IGNORE; +step c1: COMMIT; +step ignore2: <... completed> +step select2: SELECT * FROM ints; +key val + +1 ignore1 +step c2: COMMIT; + +starting permutation: ignore1 ignore2 a1 select2 c2 +step ignore1: INSERT INTO ints(key, val) VALUES(1, 'ignore1') ON CONFLICT IGNORE; +step ignore2: INSERT INTO ints(key, val) VALUES(1, 'ignore2') ON CONFLICT IGNORE; +step a1: ABORT; +step ignore2: <... completed> +step select2: SELECT * FROM ints; +key val + +1 ignore2 +step c2: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 3e2614ecacdb3..fc357b4653534 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -16,6 +16,7 @@ test: fk-deadlock2 test: eval-plan-qual test: lock-update-delete test: lock-update-traversal +test: insert-conflict-ignore test: delete-abort-savept test: delete-abort-savept-2 test: aborted-keyrevoke diff --git a/src/test/isolation/specs/insert-conflict-ignore.spec b/src/test/isolation/specs/insert-conflict-ignore.spec new file mode 100644 index 0000000000000..fde43b3c1ddfb --- /dev/null +++ b/src/test/isolation/specs/insert-conflict-ignore.spec @@ -0,0 +1,41 @@ +# INSERT...ON CONFLICT IGNORE test +# +# This test tries to expose problems with the interaction between concurrent +# sessions during INSERT...ON CONFLICT IGNORE. +# +# The convention here is that session 1 always ends up inserting, and session 2 +# always ends up ignoring. 
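+#
+# For orientation, a minimal single-session sketch (not itself one of the
+# steps below) of the behaviour these permutations stress under concurrency;
+# the table matches the setup section and the values are illustrative only:
+#
+#   CREATE TABLE ints (key int primary key, val text);
+#   INSERT INTO ints(key, val) VALUES(1, 'first') ON CONFLICT IGNORE;   -- row inserted
+#   INSERT INTO ints(key, val) VALUES(1, 'second') ON CONFLICT IGNORE;  -- duplicate key, silently skipped
+#   SELECT * FROM ints;                                                 -- returns only (1, 'first')
+#   DROP TABLE ints;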
+ +setup +{ + CREATE TABLE ints (key int primary key, val text); +} + +teardown +{ + DROP TABLE ints; +} + +session "s1" +setup +{ + BEGIN ISOLATION LEVEL READ COMMITTED; +} +step "ignore1" { INSERT INTO ints(key, val) VALUES(1, 'ignore1') ON CONFLICT IGNORE; } +step "c1" { COMMIT; } +step "a1" { ABORT; } + +session "s2" +setup +{ + BEGIN ISOLATION LEVEL READ COMMITTED; +} +step "ignore2" { INSERT INTO ints(key, val) VALUES(1, 'ignore2') ON CONFLICT IGNORE; } +step "select2" { SELECT * FROM ints; } +step "c2" { COMMIT; } +step "a2" { ABORT; } + +# Regular case where one session block-waits on another to determine if it +# should proceed with an insert or ignore. +permutation "ignore1" "ignore2" "c1" "select2" "c2" +permutation "ignore1" "ignore2" "a1" "select2" "c2" diff --git a/src/test/regress/expected/errors.out b/src/test/regress/expected/errors.out index 5f8868da26ed3..210e5ff39cbca 100644 --- a/src/test/regress/expected/errors.out +++ b/src/test/regress/expected/errors.out @@ -32,7 +32,9 @@ LINE 1: select nonesuch from pg_database; ^ -- empty distinct list isn't OK select distinct from pg_database; -ERROR: SELECT DISTINCT must have at least one column +ERROR: syntax error at or near "from" +LINE 1: select distinct from pg_database; + ^ -- bad attribute name on lhs of operator select * from pg_database where nonesuch = pg_database.datname; ERROR: column "nonesuch" does not exist diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out new file mode 100644 index 0000000000000..02600dd0e188f --- /dev/null +++ b/src/test/regress/expected/insert_conflict.out @@ -0,0 +1,178 @@ +-- +-- insert...on conflict unique index inference +-- +create table insertconflicttest(key int4, fruit text); +-- +-- Test unique index inference with operator class specifications and +-- named collations +-- +create unique index op_index_key on insertconflicttest(key, fruit text_pattern_ops); +create unique index collation_index_key on insertconflicttest(key, fruit collate "C"); +create unique index both_index_key on insertconflicttest(key, fruit collate "C" text_pattern_ops); +create unique index both_index_expr_key on insertconflicttest(key, lower(fruit) collate "C" text_pattern_ops); +-- fails +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key) ignore; +ERROR: could not infer which unique index to use from expressions/columns and predicate provided for ON CONFLICT +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit) ignore; +ERROR: could not infer which unique index to use from expressions/columns and predicate provided for ON CONFLICT +-- succeeds +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit) ignore; + QUERY PLAN +------------------------------------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: op_index_key, collation_index_key, both_index_key + -> Result +(3 rows) + +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit, key, fruit, key) ignore; + QUERY PLAN +------------------------------------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: op_index_key, collation_index_key, both_index_key + -> Result +(3 rows) + +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (lower(fruit), key, lower(fruit), key) ignore; + QUERY PLAN 
+------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: both_index_expr_key + -> Result +(3 rows) + +-- Neither collation nor operator class specifications are required -- +-- supplying them merely *limits* matches to indexes with matching opclasses +-- used for relevant indexes +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit text_pattern_ops) ignore; + QUERY PLAN +---------------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: op_index_key, both_index_key + -> Result +(3 rows) + +-- Okay, arbitrates using both index where text_pattern_ops opclass does and +-- does not appear. +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit collate "C") ignore; + QUERY PLAN +----------------------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: collation_index_key, both_index_key + -> Result +(3 rows) + +-- Okay, but only accepts the single index where both opclass and collation are +-- specified +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit collate "C" text_pattern_ops, key) ignore; + QUERY PLAN +-------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: both_index_key + -> Result +(3 rows) + +-- Okay, but only accepts the single index where both opclass and collation are +-- specified (plus expression variant) +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (lower(fruit) collate "C", key, key) ignore; + QUERY PLAN +------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: both_index_expr_key + -> Result +(3 rows) + +-- Attribute appears twice, while not all attributes/expressions on attributes +-- appearing within index definition match in terms of both opclass and +-- collation. +-- +-- Works because every attribute in inference specification needs to be +-- satisfied once or more by cataloged index attribute, and as always when an +-- attribute in the cataloged definition has a non-default opclass/collation, +-- it still satisfied some inference attribute lacking any particular +-- opclass/collation specification. +-- +-- The implementation is liberal in accepting inference specifications on the +-- assumption that multiple inferred unique indexes will prevent problematic +-- cases. It rolls with unique indexes where attributes redundantly appear +-- multiple times, too (which is not tested here). 
+explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit, key, fruit text_pattern_ops, key) ignore; + QUERY PLAN +---------------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: op_index_key, both_index_key + -> Result +(3 rows) + +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (lower(fruit) collate "C" text_pattern_ops, key, key) ignore; + QUERY PLAN +------------------------------------------------- + Insert on insertconflicttest + Conflict Arbiter Indexes: both_index_expr_key + -> Result +(3 rows) + +drop index op_index_key; +drop index collation_index_key; +drop index both_index_key; +drop index both_index_expr_key; +-- +-- Test partial unique index inference +-- +create unique index partial_key_index on insertconflicttest(key) where fruit like '%berry'; +-- Succeeds +insert into insertconflicttest values (23, 'Blackberry') on conflict (key where fruit like '%berry' and fruit = 'inconsequential') ignore; +-- fails +insert into insertconflicttest values (23, 'Blackberry') on conflict (key where fruit like '%berry' or fruit = 'consequential') ignore; +ERROR: could not infer which unique index to use from expressions/columns and predicate provided for ON CONFLICT +insert into insertconflicttest values (23, 'Uncovered by Index') on conflict (key where fruit like '%berry') ignore; +ERROR: inferred arbiter partial unique index's predicate does not cover tuple proposed for insertion +DETAIL: ON CONFLICT inference clause implies that the tuple proposed for insertion must be covered by the predicate of partial index "partial_key_index". +drop index partial_key_index; +-- Cleanup +drop table insertconflicttest; +-- ****************************************************************** +-- * * +-- * Test inheritance (example taken from tutorial) * +-- * * +-- ****************************************************************** +create table cities ( + name text, + population float8, + altitude int -- (in ft) +); +create table capitals ( + state char(2) +) inherits (cities); +-- Create unique indexes. Due to a general limitation of inheritance, +-- uniqueness is only enforced per-relation. Unique index inference +-- specification will do the right thing, though. +create unique index cities_names_unique on cities (name); +create unique index capitals_names_unique on capitals (name); +-- prepopulate the tables. 
+insert into cities values ('San Francisco', 7.24E+5, 63); +insert into cities values ('Las Vegas', 2.583E+5, 2174); +insert into cities values ('Mariposa', 1200, 1953); +insert into capitals values ('Sacramento', 3.694E+5, 30, 'CA'); +insert into capitals values ('Madison', 1.913E+5, 845, 'WI'); +-- Tests proper for inheritance: +select * from capitals; + name | population | altitude | state +------------+------------+----------+------- + Sacramento | 369400 | 30 | CA + Madison | 191300 | 845 | WI +(2 rows) + +-- Succeeds: +insert into cities values ('Las Vegas', 2.583E+5, 2174) on conflict ignore; +-- Wrong "Sacramento", ignored: +insert into capitals values ('Sacramento', 50, 2267, 'NE') on conflict (name) ignore; +select * from capitals; + name | population | altitude | state +------------+------------+----------+------- + Sacramento | 369400 | 30 | CA + Madison | 191300 | 845 | WI +(2 rows) + +-- clean up +drop table capitals; +drop table cities; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 25095e5b700ac..f5f4b60d22286 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1123,6 +1123,10 @@ SELECT * FROM shoelace_log ORDER BY sl_name; SELECT * FROM shoelace_obsolete WHERE sl_avail = 0; insert into shoelace values ('sl9', 0, 'pink', 35.0, 'inch', 0.0); insert into shoelace values ('sl10', 1000, 'magenta', 40.0, 'inch', 0.0); +-- Unsupported (even though a similar updatable view construct is) +insert into shoelace values ('sl10', 1000, 'magenta', 40.0, 'inch', 0.0) + on conflict ignore; +ERROR: INSERT with ON CONFLICT clause may not target relation with INSERT or UPDATE rules SELECT * FROM shoelace_obsolete ORDER BY sl_len_cm; sl_name | sl_avail | sl_color | sl_len | sl_unit | sl_len_cm ------------+----------+------------+--------+----------+----------- @@ -2357,6 +2361,11 @@ DETAIL: Key (id3a, id3c)=(1, 13) is not present in table "rule_and_refint_t2". insert into rule_and_refint_t3 values (1, 13, 11, 'row6'); ERROR: insert or update on table "rule_and_refint_t3" violates foreign key constraint "rule_and_refint_t3_id3a_fkey" DETAIL: Key (id3a, id3b)=(1, 13) is not present in table "rule_and_refint_t1". +-- Ordinary table +insert into rule_and_refint_t3 values (1, 13, 11, 'row6') + on conflict ignore; +ERROR: insert or update on table "rule_and_refint_t3" violates foreign key constraint "rule_and_refint_t3_id3a_fkey" +DETAIL: Key (id3a, id3b)=(1, 13) is not present in table "rule_and_refint_t1". create rule rule_and_refint_t3_ins as on insert to rule_and_refint_t3 where (exists (select 1 from rule_and_refint_t3 where (((rule_and_refint_t3.id3a = new.id3a) diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out index ccabe9e3dcb9a..dd3e444fd5c6d 100644 --- a/src/test/regress/expected/updatable_views.out +++ b/src/test/regress/expected/updatable_views.out @@ -215,6 +215,8 @@ INSERT INTO rw_view15 VALUES (3, 'ROW 3'); -- should fail ERROR: cannot insert into column "upper" of view "rw_view15" DETAIL: View columns that are not columns of their base relation are not updatable. 
INSERT INTO rw_view15 (a) VALUES (3); -- should be OK +INSERT INTO rw_view15 (a) VALUES (3) ON CONFLICT IGNORE; -- succeeds +INSERT INTO rw_view15 (a) VALUES (3) ON CONFLICT (a) IGNORE; -- succeeds ALTER VIEW rw_view15 ALTER COLUMN upper SET DEFAULT 'NOT SET'; INSERT INTO rw_view15 (a) VALUES (4); -- should fail ERROR: cannot insert into column "upper" of view "rw_view15" diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 6d3b865351d37..b0ebb6b3f4c65 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -36,6 +36,7 @@ test: geometry horology regex oidjoins type_sanity opr_sanity # These four each depend on the previous one # ---------- test: insert +test: insert_conflict test: create_function_1 test: create_type test: create_table diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 8326894ed9d9c..8409c0f3ef205 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -50,6 +50,7 @@ test: oidjoins test: type_sanity test: opr_sanity test: insert +test: insert_conflict test: create_function_1 test: create_type test: create_table diff --git a/src/test/regress/sql/insert_conflict.sql b/src/test/regress/sql/insert_conflict.sql new file mode 100644 index 0000000000000..551b4cff4e134 --- /dev/null +++ b/src/test/regress/sql/insert_conflict.sql @@ -0,0 +1,115 @@ +-- +-- insert...on conflict unique index inference +-- +create table insertconflicttest(key int4, fruit text); + +-- +-- Test unique index inference with operator class specifications and +-- named collations +-- +create unique index op_index_key on insertconflicttest(key, fruit text_pattern_ops); +create unique index collation_index_key on insertconflicttest(key, fruit collate "C"); +create unique index both_index_key on insertconflicttest(key, fruit collate "C" text_pattern_ops); +create unique index both_index_expr_key on insertconflicttest(key, lower(fruit) collate "C" text_pattern_ops); + +-- fails +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key) ignore; +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit) ignore; + +-- succeeds +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit) ignore; +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit, key, fruit, key) ignore; +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (lower(fruit), key, lower(fruit), key) ignore; +-- Neither collation nor operator class specifications are required -- +-- supplying them merely *limits* matches to indexes with matching opclasses +-- used for relevant indexes +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit text_pattern_ops) ignore; +-- Okay, arbitrates using both index where text_pattern_ops opclass does and +-- does not appear. 
+explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit collate "C") ignore; +-- Okay, but only accepts the single index where both opclass and collation are +-- specified +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit collate "C" text_pattern_ops, key) ignore; +-- Okay, but only accepts the single index where both opclass and collation are +-- specified (plus expression variant) +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (lower(fruit) collate "C", key, key) ignore; +-- Attribute appears twice, while not all attributes/expressions on attributes +-- appearing within index definition match in terms of both opclass and +-- collation. +-- +-- Works because every attribute in inference specification needs to be +-- satisfied once or more by cataloged index attribute, and as always when an +-- attribute in the cataloged definition has a non-default opclass/collation, +-- it still satisfied some inference attribute lacking any particular +-- opclass/collation specification. +-- +-- The implementation is liberal in accepting inference specifications on the +-- assumption that multiple inferred unique indexes will prevent problematic +-- cases. It rolls with unique indexes where attributes redundantly appear +-- multiple times, too (which is not tested here). +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (fruit, key, fruit text_pattern_ops, key) ignore; +explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (lower(fruit) collate "C" text_pattern_ops, key, key) ignore; + +drop index op_index_key; +drop index collation_index_key; +drop index both_index_key; +drop index both_index_expr_key; + +-- +-- Test partial unique index inference +-- +create unique index partial_key_index on insertconflicttest(key) where fruit like '%berry'; + +-- Succeeds +insert into insertconflicttest values (23, 'Blackberry') on conflict (key where fruit like '%berry' and fruit = 'inconsequential') ignore; + +-- fails +insert into insertconflicttest values (23, 'Blackberry') on conflict (key where fruit like '%berry' or fruit = 'consequential') ignore; +insert into insertconflicttest values (23, 'Uncovered by Index') on conflict (key where fruit like '%berry') ignore; + +drop index partial_key_index; + +-- Cleanup +drop table insertconflicttest; + +-- ****************************************************************** +-- * * +-- * Test inheritance (example taken from tutorial) * +-- * * +-- ****************************************************************** +create table cities ( + name text, + population float8, + altitude int -- (in ft) +); + +create table capitals ( + state char(2) +) inherits (cities); + +-- Create unique indexes. Due to a general limitation of inheritance, +-- uniqueness is only enforced per-relation. Unique index inference +-- specification will do the right thing, though. +create unique index cities_names_unique on cities (name); +create unique index capitals_names_unique on capitals (name); + +-- prepopulate the tables. 
+insert into cities values ('San Francisco', 7.24E+5, 63); +insert into cities values ('Las Vegas', 2.583E+5, 2174); +insert into cities values ('Mariposa', 1200, 1953); + +insert into capitals values ('Sacramento', 3.694E+5, 30, 'CA'); +insert into capitals values ('Madison', 1.913E+5, 845, 'WI'); + +-- Tests proper for inheritance: +select * from capitals; + +-- Succeeds: +insert into cities values ('Las Vegas', 2.583E+5, 2174) on conflict ignore; +-- Wrong "Sacramento", ignored: +insert into capitals values ('Sacramento', 50, 2267, 'NE') on conflict (name) ignore; +select * from capitals; + +-- clean up +drop table capitals; +drop table cities; diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index c385e414578fa..58073318828e4 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -680,6 +680,9 @@ SELECT * FROM shoelace_log ORDER BY sl_name; insert into shoelace values ('sl9', 0, 'pink', 35.0, 'inch', 0.0); insert into shoelace values ('sl10', 1000, 'magenta', 40.0, 'inch', 0.0); +-- Unsupported (even though a similar updatable view construct is) +insert into shoelace values ('sl10', 1000, 'magenta', 40.0, 'inch', 0.0) + on conflict ignore; SELECT * FROM shoelace_obsolete ORDER BY sl_len_cm; SELECT * FROM shoelace_candelete; @@ -844,6 +847,9 @@ insert into rule_and_refint_t3 values (1, 12, 11, 'row3'); insert into rule_and_refint_t3 values (1, 12, 12, 'row4'); insert into rule_and_refint_t3 values (1, 11, 13, 'row5'); insert into rule_and_refint_t3 values (1, 13, 11, 'row6'); +-- Ordinary table +insert into rule_and_refint_t3 values (1, 13, 11, 'row6') + on conflict ignore; create rule rule_and_refint_t3_ins as on insert to rule_and_refint_t3 where (exists (select 1 from rule_and_refint_t3 diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index 697363665c1cd..94a8b40cf31f7 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -69,6 +69,8 @@ DELETE FROM rw_view14 WHERE a=3; -- should be OK -- Partially updatable view INSERT INTO rw_view15 VALUES (3, 'ROW 3'); -- should fail INSERT INTO rw_view15 (a) VALUES (3); -- should be OK +INSERT INTO rw_view15 (a) VALUES (3) ON CONFLICT IGNORE; -- succeeds +INSERT INTO rw_view15 (a) VALUES (3) ON CONFLICT (a) IGNORE; -- succeeds ALTER VIEW rw_view15 ALTER COLUMN upper SET DEFAULT 'NOT SET'; INSERT INTO rw_view15 (a) VALUES (4); -- should fail UPDATE rw_view15 SET upper='ROW 3' WHERE a=3; -- should fail
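To close out the logical decoding side of the patch: the reorderbuffer.h comment earlier states that REORDER_BUFFER_CHANGE_SPEC_INSERT and REORDER_BUFFER_CHANGE_SUPER_DELETE are consolidated during transaction reassembly, so output plugins only ever see the existing user-visible actions. The fragment below is a minimal sketch of a plugin change callback written against that guarantee; the callback name is made up, and real plugins (such as contrib/test_decoding) do considerably more per change.

#include "postgres.h"

#include "replication/logical.h"
#include "replication/reorderbuffer.h"
#include "utils/rel.h"

/*
 * Sketch of an output plugin change callback.  A speculative insertion that
 * worked out arrives here as a plain REORDER_BUFFER_CHANGE_INSERT; one that
 * was super-deleted never arrives at all, so no new cases are needed.
 */
static void
sketch_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
                 Relation relation, ReorderBufferChange *change)
{
    switch (change->action)
    {
        case REORDER_BUFFER_CHANGE_INSERT:
            /* ordinary and confirmed speculative insertions alike */
            break;
        case REORDER_BUFFER_CHANGE_UPDATE:
        case REORDER_BUFFER_CHANGE_DELETE:
            break;
        default:
            /* internal actions are filtered out before the plugin runs */
            break;
    }
}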