-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
row0sel.cc
6447 lines (5248 loc) · 212 KB
/
row0sel.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*****************************************************************************
Copyright (c) 1997, 2023, Oracle and/or its affiliates.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License, version 2.0, as published by the
Free Software Foundation.
This program is also distributed with certain software (including but not
limited to OpenSSL) that is licensed under separate terms, as designated in a
particular file or component or in included license documentation. The authors
of MySQL hereby grant you an additional permission to link the program and
your derivative works with the separately licensed software that they have
included with MySQL.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/
/** @file row/row0sel.cc
Select
Created 12/19/1997 Heikki Tuuri
*******************************************************/
#include "row0sel.h"
#include <sys/types.h>
#include "btr0btr.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "buf0lru.h"
#include "dict0boot.h"
#include "dict0dd.h"
#include "dict0dict.h"
#include "eval0eval.h"
#include "gis0rtree.h"
#include "ha_innodb.h"
#include "ha_prototypes.h"
#include "handler.h"
#include "lob0lob.h"
#include "lob0undo.h"
#include "lock0lock.h"
#include "mach0data.h"
#include "pars0pars.h"
#include "pars0sym.h"
#include "que0que.h"
#include "read0read.h"
#include "record_buffer.h"
#include "rem0cmp.h"
#include "row0mysql.h"
#include "row0row.h"
#include "row0upd.h"
#include "row0vers.h"
#include "srv0mon.h"
#include "trx0trx.h"
#include "trx0undo.h"
#include "ut0new.h"
#include "my_dbug.h"
/** Maximum number of rows to prefetch; MySQL interface has another parameter */
constexpr uint32_t SEL_MAX_N_PREFETCH = 16;

/** Number of rows fetched, after which to start prefetching; MySQL interface
has another parameter */
constexpr uint32_t SEL_PREFETCH_LIMIT = 1;

/** When a select has accessed about this many pages, it returns control back
to que_run_threads: this is to allow canceling runaway queries */
constexpr uint32_t SEL_COST_LIMIT = 100;

/** Flags for search shortcut: outcome codes of a shortcut search attempt */
constexpr uint32_t SEL_FOUND = 0;     /*!< a matching record was found */
constexpr uint32_t SEL_EXHAUSTED = 1; /*!< the search range was exhausted */
constexpr uint32_t SEL_RETRY = 2;     /*!< shortcut not applicable; retry with
                                      the normal search path */
/** Returns true if the user-defined column in a secondary index record
is alphabetically the same as the corresponding BLOB column in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return true if the columns are equal */
static bool row_sel_sec_rec_is_for_blob(
    trx_t *trx,              /*!< in: the operating transaction */
    ulint mtype,             /*!< in: main type */
    ulint prtype,            /*!< in: precise type */
    ulint mbminmaxlen,       /*!< in: minimum and maximum length of
                             a multi-byte character */
    const byte *clust_field, /*!< in: the locally stored part of
                             the clustered index column, including
                             the BLOB pointer; the clustered
                             index record must be covered by
                             a lock or a page latch to protect it
                             against deletion (rollback or purge) */
    ulint clust_len,         /*!< in: length of clust_field */
    const byte *sec_field,   /*!< in: column in secondary index */
    ulint sec_len,           /*!< in: length of sec_field */
    ulint prefix_len,        /*!< in: index column prefix length
                             in bytes */
    dict_table_t *table)     /*!< in: table */
{
  ulint len;
  byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN];

  /* This function should never be invoked on tables in
  ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they
  should always contain enough prefix in the clustered index record. */
  ut_ad(dict_table_has_atomic_blobs(table));
  ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
  ut_ad(prefix_len >= sec_len);
  ut_ad(prefix_len > 0);
  ut_a(prefix_len <= sizeof buf);

  /* A field reference of all-zero bytes (field_ref_zero) marks an
  external field whose BLOB pages have not been written. */
  if (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
              field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) {
    /* The externally stored field was not written yet.
    This record should only be seen by
    trx_rollback_or_clean_all_recovered() or any
    TRX_ISO_READ_UNCOMMITTED transactions. */
    return false;
  }

  /* Copy at most prefix_len bytes of the externally stored column
  into the local buffer for comparison. */
  len = lob::btr_copy_externally_stored_field_prefix_func(
      trx, table->first_index(), buf, prefix_len,
      dict_tf_get_page_size(table->flags), clust_field,
      IF_DEBUG(dict_table_is_sdi(table->id), ) clust_len);

  if (len == 0) {
    /* The BLOB was being deleted as the server crashed.
    There should not be any secondary index records
    referring to this clustered index record, because
    btr_free_externally_stored_field() is called after all
    secondary index entries of the row have been purged. */
    return false;
  }

  /* Trim the copied prefix to a whole number of characters, since
  the secondary index stores a character-based prefix. */
  len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen, prefix_len, len,
                                    (const char *)buf);

  /* We are testing for equality; ASC/DESC does not matter. */
  return (!cmp_data_data(mtype, prtype, true, buf, len, sec_field, sec_len));
}
/** Returns true if the user-defined column values in a secondary index record
are alphabetically the same as the corresponding columns in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@param[in] sec_rec secondary index record
@param[in] sec_index secondary index
@param[in] clust_rec clustered index record;
must be protected by a page s-latch
@param[in] clust_index clustered index
@param[in] thr query thread
@param[out] is_equal set to true if the secondary record is equal
to the corresponding fields in the clustered record, when compared with
collation; false if not equal or if the
clustered record has been marked for deletion; only valid if DB_SUCCESS was
returned
@return DB_SUCCESS or error code */
static dberr_t row_sel_sec_rec_is_for_clust_rec(
    const rec_t *sec_rec, dict_index_t *sec_index, const rec_t *clust_rec,
    dict_index_t *clust_index, que_thr_t *thr, bool &is_equal) {
  const byte *sec_field;
  ulint sec_len;
  const byte *clust_field;
  ulint n;
  ulint i;
  mem_heap_t *heap = nullptr;
  ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
  ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
  ulint *clust_offs = clust_offsets_;
  ulint *sec_offs = sec_offsets_;
  trx_t *trx = thr_get_trx(thr);
  dberr_t err = DB_SUCCESS;

  is_equal = true;

  rec_offs_init(clust_offsets_);
  rec_offs_init(sec_offsets_);

  if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(clust_index->table))) {
    /* The clustered index record is delete-marked;
    it is not visible in the read view. Besides,
    if there are any externally stored columns,
    some of them may have already been purged. */
    is_equal = false;
    return (DB_SUCCESS);
  }

  heap = mem_heap_create(256, UT_LOCATION_HERE);

  clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
                               ULINT_UNDEFINED, UT_LOCATION_HERE, &heap);
  sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, ULINT_UNDEFINED,
                             UT_LOCATION_HERE, &heap);

  /* Compare only the fields of the secondary index ordering that the
  user defined. */
  n = dict_index_get_n_ordering_defined_by_user(sec_index);

  for (i = 0; i < n; i++) {
    const dict_field_t *ifield;
    const dict_col_t *col;
    ulint clust_pos = 0;
    ulint clust_len;
    ulint len;
    row_ext_t *ext;

    ifield = sec_index->get_field(i);
    col = ifield->col;

    /* For virtual column, its value will need to be
    reconstructed from base column in cluster index */
    if (col->is_virtual()) {
      const dict_v_col_t *v_col;
      const dtuple_t *row;
      dfield_t *vfield;

      v_col = reinterpret_cast<const dict_v_col_t *>(col);

      row = row_build(ROW_COPY_POINTERS, clust_index, clust_rec, clust_offs,
                      nullptr, nullptr, nullptr, &ext, heap);

      vfield = innobase_get_computed_value(row, v_col, clust_index, &heap, heap,
                                           nullptr, thr_get_trx(thr)->mysql_thd,
                                           thr->prebuilt->m_mysql_table,
                                           nullptr, nullptr, nullptr);

      if (vfield == nullptr) {
        /* This may happen e.g. when this statement is executed in
        read-uncommited isolation and value (like json function)
        depends on an externally stored lob (like json) which
        was not written yet. */
        err = DB_COMPUTE_VALUE_FAILED;
        goto func_exit;
      }
      clust_len = vfield->len;
      clust_field = static_cast<byte *>(vfield->data);
    } else {
      clust_pos = dict_col_get_clust_pos(col, clust_index);

      clust_field = rec_get_nth_field_instant(clust_rec, clust_offs, clust_pos,
                                              clust_index, &clust_len);
    }

    sec_field = rec_get_nth_field(nullptr, sec_rec, sec_offs, i, &sec_len);

    len = clust_len;

    if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL &&
        sec_len != UNIV_SQL_NULL && !col->is_virtual()) {
      if (rec_offs_nth_extern(clust_index, clust_offs, clust_pos)) {
        /* The locally stored part of an external field ends with
        a BLOB pointer; exclude it from the comparable length. */
        len -= BTR_EXTERN_FIELD_REF_SIZE;
      }

      len = dtype_get_at_most_n_mbchars(col->prtype, col->mbminmaxlen,
                                        ifield->prefix_len, len,
                                        (char *)clust_field);

      /* Check sec index field matches that of cluster index
      in the case of for table with ATOMIC BLOB, note
      we also need to check if sec_len is 0 */
      if (rec_offs_nth_extern(clust_index, clust_offs, clust_pos) &&
          (len < sec_len ||
           (dict_table_has_atomic_blobs(sec_index->table) && sec_len == 0))) {
        /* The locally stored prefix is too short to decide;
        fetch the BLOB prefix and compare against that. */
        if (!row_sel_sec_rec_is_for_blob(
                trx, col->mtype, col->prtype, col->mbminmaxlen, clust_field,
                clust_len, sec_field, sec_len, ifield->prefix_len,
                clust_index->table)) {
          is_equal = false;
          goto func_exit;
        }

        continue;
      }
    }

    /* For spatial index, the first field is MBR, we check
    if the MBR is equal or not. */
    if (dict_index_is_spatial(sec_index) && i == 0) {
      rtr_mbr_t tmp_mbr;
      rtr_mbr_t sec_mbr;
      byte *dptr = const_cast<byte *>(clust_field);

      ut_ad(clust_len != UNIV_SQL_NULL);

      /* For externally stored field, we need to get full
      geo data to generate the MBR for comparing. */
      if (rec_offs_nth_extern(clust_index, clust_offs, clust_pos)) {
        dptr = lob::btr_copy_externally_stored_field(
            trx, clust_index, &clust_len, nullptr, dptr,
            dict_tf_get_page_size(sec_index->table->flags), len,
            dict_index_is_sdi(sec_index), heap);
      }

      get_mbr_from_store(sec_index->rtr_srs.get(), dptr,
                         static_cast<uint>(clust_len), SPDIMS,
                         reinterpret_cast<double *>(&tmp_mbr), nullptr);
      rtr_read_mbr(sec_field, &sec_mbr);

      if (!mbr_equal_cmp(sec_index->rtr_srs.get(), &sec_mbr, &tmp_mbr)) {
        is_equal = false;
        goto func_exit;
      }
    } else if (col->is_multi_value()) {
      /* Multi-value columns are compared with a dedicated helper. */
      if (!is_multi_value_clust_and_sec_equal(clust_field, clust_len, sec_field,
                                              sec_len, col)) {
        is_equal = false;
        goto func_exit;
      }
    } else {
      /* We are testing for equality; ASC/DESC does not
      matter */
      if (0 != cmp_data_data(col->mtype, col->prtype, true, clust_field, len,
                             sec_field, sec_len)) {
        is_equal = false;
        goto func_exit;
      }
    }
  }

func_exit:
  if (UNIV_LIKELY_NULL(heap)) {
    mem_heap_free(heap);
  }
  return (err);
}
/** Creates a select node struct.
@return own: select node struct */
sel_node_t *sel_node_create(
    mem_heap_t *heap) /*!< in: memory heap where created */
{
  /* The node lives in the supplied heap; it is released together
  with the heap, so no explicit destructor call is needed here. */
  auto *const node =
      static_cast<sel_node_t *>(mem_heap_alloc(heap, sizeof(sel_node_t)));

  node->common.type = QUE_NODE_SELECT;
  node->state = SEL_NODE_OPEN;
  node->plans = nullptr;

  return node;
}
/** Frees the memory private to a select node when a query graph is freed,
does not free the heap where the node was originally created. */
void sel_node_free_private(sel_node_t *node) /*!< in: select node struct */
{
  /* Nothing to release if the plan array was never allocated. */
  if (node->plans == nullptr) {
    return;
  }

  for (ulint tbl = 0; tbl < node->n_tables; ++tbl) {
    plan_t *const plan = sel_node_get_nth_plan(node, tbl);

    /* Close both cursors of the plan, and free the heap that held
    old record versions, if one was ever created. */
    plan->pcur.close();
    plan->clust_pcur.close();

    if (plan->old_vers_heap != nullptr) {
      mem_heap_free(plan->old_vers_heap);
    }
  }
}
/** Evaluates the values in a select list. If there are aggregate functions,
their argument value is added to the aggregate total. */
static inline void sel_eval_select_list(
    sel_node_t *node) /*!< in: select node */
{
  /* Evaluate each expression of the select list in turn; aggregate
  nodes accumulate their argument value into the running total. */
  for (que_node_t *item = node->select_list; item != nullptr;
       item = que_node_get_next(item)) {
    eval_exp(item);
  }
}
/** Assigns the values in the select list to the possible into-variables in
SELECT ... INTO ... */
static inline void sel_assign_into_var_values(
    sym_node_t *var,  /*!< in: first variable in a
                      list of variables */
    sel_node_t *node) /*!< in: select node */
{
  /* No INTO list: nothing to assign. */
  if (var == nullptr) {
    return;
  }

  /* The INTO variables and the select-list expressions form parallel
  lists; copy each expression value into its variable. */
  que_node_t *exp = node->select_list;

  while (var != nullptr) {
    ut_ad(exp);

    eval_node_copy_val(var->alias, exp);

    exp = que_node_get_next(exp);
    var = static_cast<sym_node_t *>(que_node_get_next(var));
  }
}
/** Resets the aggregate value totals in the select list of an aggregate type
query. */
static inline void sel_reset_aggregate_vals(
    sel_node_t *node) /*!< in: select node */
{
  ut_ad(node->is_aggregate);

  /* In an aggregate query every select-list item is a function node;
  zero each running total. */
  auto *func_node = static_cast<func_node_t *>(node->select_list);

  while (func_node != nullptr) {
    eval_node_set_int_val(func_node, 0);

    func_node = static_cast<func_node_t *>(que_node_get_next(func_node));
  }

  node->aggregate_already_fetched = false;
}
/** Copies the input variable values when an explicit cursor is opened. */
static inline void row_sel_copy_input_variable_vals(
    sel_node_t *node) /*!< in: select node */
{
  /* Snapshot the value of every input variable at cursor-open time
  and clear any indirection, so the copied value is used afterwards. */
  for (auto sym : node->copy_variables) {
    eval_node_copy_val(sym, sym->alias);

    sym->indirection = nullptr;
  }
}
/** Fetches the column values from a record.
@param[in] trx the current transaction or nullptr
@param[in] index record index
@param[in] rec record in a clustered or non-clustered index;
must be protected by a page latch
@param[in] offsets rec_get_offsets(rec, index)
@param[in] column first column in a column list, or NULL
@param[in] allow_null_lob allow null lob if true. default is false. */
static void row_sel_fetch_columns(trx_t *trx, dict_index_t *index,
                                  const rec_t *rec, const ulint *offsets,
                                  sym_node_t *column,
                                  bool allow_null_lob = false) {
  dfield_t *val;
  ulint index_type;
  ulint field_no;
  const byte *data;
  ulint len;

  ut_ad(rec_offs_validate(rec, index, offsets));

  /* A column symbol stores its field position separately for the
  clustered index and for a secondary index; select the applicable
  slot of field_nos[]. */
  if (index->is_clustered()) {
    index_type = SYM_CLUST_FIELD_NO;
  } else {
    index_type = SYM_SEC_FIELD_NO;
  }

  while (column) {
    mem_heap_t *heap = nullptr;
    bool needs_copy;

    field_no = column->field_nos[index_type];

    /* ULINT_UNDEFINED: the column is not present in this index;
    skip it. */
    if (field_no != ULINT_UNDEFINED) {
      if (UNIV_UNLIKELY(rec_offs_nth_extern(index, offsets, field_no))) {
        /* Copy an externally stored field to the
        temporary heap, if possible. */
        heap = mem_heap_create(1, UT_LOCATION_HERE);

        data = lob::btr_rec_copy_externally_stored_field(
            trx, index, rec, offsets, dict_table_page_size(index->table),
            field_no, &len, nullptr, dict_index_is_sdi(index), heap);

        if (data == nullptr) {
          /* This means that the externally stored field was not written yet.
          This record should only be seen by following situations:
          - Read uncommitted transactions (TRX_ISO_READ_UNCOMMITTED)
          - During crash recovery [trx_rollback_or_clean_all_recovered().]
          - During lock-less consistent read, when the trx reads LOB even
          though the clust_rec is not to be seen. */
          ut_ad(allow_null_lob);
          len = UNIV_SQL_NULL;
          needs_copy = false;
        } else {
          needs_copy = true;
        }
      } else {
        data = rec_get_nth_field_instant(rec, offsets, field_no, index, &len);
        needs_copy = column->copy_val;
      }

      if (needs_copy) {
        eval_node_copy_and_alloc_val(column, data, len);
      } else {
        /* Point the column value directly at the record data;
        per the function contract, the record must be protected
        by a page latch while these pointers are used. */
        val = que_node_get_val(column);
        dfield_set_data(val, data, len);
      }

      if (UNIV_LIKELY_NULL(heap)) {
        mem_heap_free(heap);
      }
    }

    column = UT_LIST_GET_NEXT(col_var_list, column);
  }
}
/** Allocates a prefetch buffer for a column when prefetch is first time done.
*/
static void sel_col_prefetch_buf_alloc(
    sym_node_t *column) /*!< in: symbol table node for a column */
{
  ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);

  /* One sel_buf_t slot per prefetchable row. */
  column->prefetch_buf = static_cast<sel_buf_t *>(ut::malloc_withkey(
      UT_NEW_THIS_FILE_PSI_KEY, SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));

  /* Initialize every slot to the empty state. */
  for (ulint slot_no = 0; slot_no < SEL_MAX_N_PREFETCH; ++slot_no) {
    sel_buf_t *const slot = &column->prefetch_buf[slot_no];

    slot->data = nullptr;
    slot->len = 0;
    slot->val_buf_size = 0;
  }
}
/** Frees a prefetch buffer for a column, including the dynamically allocated
memory for data stored there. */
void sel_col_prefetch_buf_free(
    sel_buf_t *prefetch_buf) /*!< in, own: prefetch buffer */
{
  /* Free the dynamically allocated value buffer of each slot before
  releasing the slot array itself. A zero val_buf_size means the slot
  never owned a buffer. */
  for (ulint slot_no = 0; slot_no < SEL_MAX_N_PREFETCH; ++slot_no) {
    sel_buf_t *const slot = &prefetch_buf[slot_no];

    if (slot->val_buf_size > 0) {
      ut::free(slot->data);
    }
  }

  ut::free(prefetch_buf);
}
/** Pops the column values for a prefetched, cached row from the column prefetch
buffers and places them to the val fields in the column nodes. */
static void sel_dequeue_prefetched_row(
    plan_t *plan) /*!< in: plan node for a table */
{
  sel_buf_t *sel_buf;
  dfield_t *val;
  byte *data;
  ulint len;
  ulint val_buf_size;

  ut_ad(plan->n_rows_prefetched > 0);

  for (auto column : plan->columns) {
    val = que_node_get_val(column);

    if (!column->copy_val) {
      /* We did not really push any value for the
      column */

      ut_ad(!column->prefetch_buf);
      ut_ad(que_node_get_val_buf_size(column) == 0);
      ut_d(dfield_set_null(val));

      continue;
    }

    ut_ad(column->prefetch_buf);
    ut_ad(!dfield_is_ext(val));

    /* Take the cached value from the oldest prefetched slot. */
    sel_buf = column->prefetch_buf + plan->first_prefetched;

    data = sel_buf->data;
    len = sel_buf->len;
    val_buf_size = sel_buf->val_buf_size;

    /* We must keep track of the allocated memory for
    column values to be able to free it later: therefore
    we swap the values for sel_buf and val */

    sel_buf->data = static_cast<byte *>(dfield_get_data(val));
    sel_buf->len = dfield_get_len(val);
    sel_buf->val_buf_size = que_node_get_val_buf_size(column);

    dfield_set_data(val, data, len);
    que_node_set_val_buf_size(column, val_buf_size);
  }

  /* Consume the row from the front of the prefetch queue. */
  plan->n_rows_prefetched--;

  plan->first_prefetched++;
}
/** Pushes the column values for a prefetched, cached row to the column prefetch
buffers from the val fields in the column nodes. */
static inline void sel_enqueue_prefetched_row(
    plan_t *plan) /*!< in: plan node for a table */
{
  sel_buf_t *sel_buf;
  dfield_t *val;
  byte *data;
  ulint len;
  ulint pos;
  ulint val_buf_size;

  if (plan->n_rows_prefetched == 0) {
    /* First push after the queue was drained: restart from slot 0. */
    pos = 0;
    plan->first_prefetched = 0;
  } else {
    pos = plan->n_rows_prefetched;

    /* We have the convention that pushing new rows starts only
    after the prefetch stack has been emptied: */

    ut_ad(plan->first_prefetched == 0);
  }

  plan->n_rows_prefetched++;

  ut_ad(pos < SEL_MAX_N_PREFETCH);

  for (auto column : plan->columns) {
    if (!column->copy_val) {
      /* There is no sense to push pointers to database
      page fields when we do not keep latch on the page! */

      continue;
    }

    if (!column->prefetch_buf) {
      /* Allocate a new prefetch buffer */

      sel_col_prefetch_buf_alloc(column);
    }

    sel_buf = column->prefetch_buf + pos;

    val = que_node_get_val(column);

    data = static_cast<byte *>(dfield_get_data(val));
    len = dfield_get_len(val);
    val_buf_size = que_node_get_val_buf_size(column);

    /* We must keep track of the allocated memory for
    column values to be able to free it later: therefore
    we swap the values for sel_buf and val */

    dfield_set_data(val, sel_buf->data, sel_buf->len);
    que_node_set_val_buf_size(column, sel_buf->val_buf_size);

    /* The slot now owns the buffer that previously belonged to the
    column node, so it can be freed or reused later. */
    sel_buf->data = data;
    sel_buf->len = len;
    sel_buf->val_buf_size = val_buf_size;
  }
}
/** Builds a previous version of a clustered index record for a consistent read
@return DB_SUCCESS or error code */
[[nodiscard]] static dberr_t row_sel_build_prev_vers(
    ReadView *read_view,        /*!< in: read view */
    dict_index_t *index,        /*!< in: plan node for table */
    rec_t *rec,                 /*!< in: record in a clustered index */
    ulint **offsets,            /*!< in/out: offsets returned by
                                rec_get_offsets(rec, plan->index) */
    mem_heap_t **offset_heap,   /*!< in/out: memory heap from which
                                the offsets are allocated */
    mem_heap_t **old_vers_heap, /*!< out: old version heap to use */
    rec_t **old_vers,           /*!< out: old version, or NULL if the
                                record does not exist in the view:
                                i.e., it was freshly inserted
                                afterwards */
    mtr_t *mtr)                 /*!< in: mtr */
{
  /* Reuse the caller's old-version heap when one already exists;
  otherwise create a fresh one. */
  if (*old_vers_heap != nullptr) {
    mem_heap_empty(*old_vers_heap);
  } else {
    *old_vers_heap = mem_heap_create(512, UT_LOCATION_HERE);
  }

  /* Build the version of the record visible in the read view. */
  return row_vers_build_for_consistent_read(rec, mtr, index, offsets,
                                            read_view, offset_heap,
                                            *old_vers_heap, old_vers, nullptr,
                                            nullptr);
}
/** Builds the last committed version of a clustered index record for a
semi-consistent read. */
static void row_sel_build_committed_vers_for_mysql(
    dict_index_t *clust_index, /*!< in: clustered index */
    row_prebuilt_t *prebuilt,  /*!< in: prebuilt struct */
    const rec_t *rec,          /*!< in: record in a clustered index */
    ulint **offsets,           /*!< in/out: offsets returned by
                               rec_get_offsets(rec, clust_index) */
    mem_heap_t **offset_heap,  /*!< in/out: memory heap from which
                               the offsets are allocated */
    const rec_t **old_vers,    /*!< out: old version, or NULL if the
                               record does not exist in the view:
                               i.e., it was freshly inserted
                               afterwards */
    const dtuple_t **vrow,     /*!< out: to be filled with old virtual
                               column version if any */
    mtr_t *mtr)                /*!< in: mtr */
{
  /* Create the prebuilt old-version heap on first use, sized after
  the current record offsets; otherwise just empty it for reuse. */
  if (prebuilt->old_vers_heap == nullptr) {
    prebuilt->old_vers_heap =
        mem_heap_create(rec_offs_size(*offsets), UT_LOCATION_HERE);
  } else {
    mem_heap_empty(prebuilt->old_vers_heap);
  }

  row_vers_build_for_semi_consistent_read(rec, mtr, clust_index, offsets,
                                          offset_heap, prebuilt->old_vers_heap,
                                          old_vers, vrow);
}
/** Tests the conditions which determine when the index segment we are searching
through has been exhausted.
@return true if row passed the tests */
static inline bool row_sel_test_end_conds(
    plan_t *plan) /*!< in: plan for the table; the column values must
                  already have been retrieved and the right sides of
                  comparisons evaluated */
{
  /* Every end condition compares a column to an expression whose
  right-hand side the caller has already evaluated. */
  for (auto cond : plan->end_conds) {
    /* Resolve the column value on the left-hand side (this follows
    a possible indirection), then do the comparison itself. */
    eval_sym(static_cast<sym_node_t *>(cond->args));

    if (!eval_cmp(cond)) {
      /* A failed comparison means the index segment is exhausted. */
      return false;
    }
  }

  return true;
}
/** Tests the other conditions.
@return true if row passed the tests */
static inline bool row_sel_test_other_conds(
    plan_t *plan) /*!< in: plan for the table; the column values must
                  already have been retrieved */
{
  /* The row qualifies only if every remaining condition evaluates
  to true. */
  for (auto cond : plan->other_conds) {
    eval_exp(cond);

    if (!eval_node_get_bool_val(cond)) {
      return false;
    }
  }

  return true;
}
/** Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking.
@return DB_SUCCESS or error code */
[[nodiscard]] static dberr_t row_sel_get_clust_rec(
    sel_node_t *node, /*!< in: select_node */
    plan_t *plan,     /*!< in: plan node for table */
    rec_t *rec,       /*!< in: record in a non-clustered index */
    que_thr_t *thr,   /*!< in: query thread */
    rec_t **out_rec,  /*!< out: clustered record or an old version of
                      it, NULL if the old version did not exist
                      in the read view, i.e., it was a fresh
                      inserted version */
    mtr_t *mtr)       /*!< in: mtr used to get access to the
                      non-clustered record; the same mtr is used to
                      access the clustered index */
{
  dict_index_t *index;
  rec_t *clust_rec;
  rec_t *old_vers;
  dberr_t err;
  mem_heap_t *heap = nullptr;
  ulint offsets_[REC_OFFS_NORMAL_SIZE];
  ulint *offsets = offsets_;
  rec_offs_init(offsets_);

  *out_rec = nullptr;

  offsets = rec_get_offsets(rec, plan->pcur.get_btr_cur()->index, offsets,
                            ULINT_UNDEFINED, UT_LOCATION_HERE, &heap);

  /* Build a row reference from the secondary index record and use it
  to position the clustered index cursor. */
  row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);

  index = plan->table->first_index();

  plan->clust_pcur.open_no_init(index, plan->clust_ref, PAGE_CUR_LE,
                                BTR_SEARCH_LEAF, 0, mtr, UT_LOCATION_HERE);

  clust_rec = plan->clust_pcur.get_rec();

  /* Note: only if the search ends up on a non-infimum record is the
  low_match value the real match to the search tuple */

  if (!page_rec_is_user_rec(clust_rec) ||
      plan->clust_pcur.get_low_match() < dict_index_get_n_unique(index)) {
    ut_a(rec_get_deleted_flag(rec, dict_table_is_comp(plan->table)));
    ut_a(node->read_view);

    /* In a rare case it is possible that no clust rec is found
    for a delete-marked secondary index record: if in row0umod.cc
    in row_undo_mod_remove_clust_low() we have already removed
    the clust rec, while purge is still cleaning and removing
    secondary index records associated with earlier versions of
    the clustered index record. In that case we know that the
    clustered index record did not exist in the read view of
    trx. */

    goto func_exit;
  }

  offsets = rec_get_offsets(clust_rec, index, offsets, ULINT_UNDEFINED,
                            UT_LOCATION_HERE, &heap);

  if (!node->read_view) {
    /* Try to place a lock on the index record */
    ulint lock_type;
    trx_t *trx;

    trx = thr_get_trx(thr);

    /* Lock only the record itself, not the gap before it, when the
    transaction skips gap locks; otherwise take a next-key lock. */
    lock_type = trx->skip_gap_locks() ? LOCK_REC_NOT_GAP : LOCK_ORDINARY;

    err = lock_clust_rec_read_check_and_lock(
        lock_duration_t::REGULAR, plan->clust_pcur.get_block(), clust_rec,
        index, offsets, SELECT_ORDINARY,
        static_cast<lock_mode>(node->row_lock_mode), lock_type, thr);

    switch (err) {
      case DB_SUCCESS:
      case DB_SUCCESS_LOCKED_REC:
        /* Declare the variable uninitialized in Valgrind.
        It should be set to DB_SUCCESS at func_exit. */
        UNIV_MEM_INVALID(&err, sizeof err);
        break;
      default:
        goto err_exit;
    }
  } else {
    /* This is a non-locking consistent read: if necessary, fetch
    a previous version of the record */

    old_vers = nullptr;

    if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
                                       node->read_view)) {
      err =
          row_sel_build_prev_vers(node->read_view, index, clust_rec, &offsets,
                                  &heap, &plan->old_vers_heap, &old_vers, mtr);

      if (err != DB_SUCCESS) {
        goto err_exit;
      }

      clust_rec = old_vers;

      if (clust_rec == nullptr) {
        goto func_exit;
      }
    }

    /* If we had to go to an earlier version of row or the
    secondary index record is delete marked, then it may be that
    the secondary index record corresponding to clust_rec
    (or old_vers) is not rec; in that case we must ignore
    such row because in our snapshot rec would not have existed.
    Remember that from rec we cannot see directly which transaction
    id corresponds to it: we have to go to the clustered index
    record. A query where we want to fetch all rows where
    the secondary index value is in some interval would return
    a wrong result if we would not drop rows which we come to
    visit through secondary index records that would not really
    exist in our snapshot. */

    if (old_vers ||
        rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
      bool rec_equal;

      err = row_sel_sec_rec_is_for_clust_rec(rec, plan->index, clust_rec, index,
                                             thr, rec_equal);
      if (err != DB_SUCCESS) {
        goto err_exit;
      } else if (!rec_equal) {
        goto func_exit;
      }
    }
  }

  /* Fetch the columns needed in test conditions. The clustered
  index record is protected by a page latch that was acquired
  when plan->clust_pcur was positioned. The latch will not be
  released until mtr_commit(mtr). */

  ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));

  row_sel_fetch_columns(thr_get_trx(thr), index, clust_rec, offsets,
                        UT_LIST_GET_FIRST(plan->columns));
  *out_rec = clust_rec;

func_exit:
  /* Every path that reaches this label reports success; error paths
  jump straight to err_exit with err already set. */
  err = DB_SUCCESS;
err_exit:
  if (UNIV_LIKELY_NULL(heap)) {
    mem_heap_free(heap);
  }
  return (err);
}
/** Sets a lock on a page of R-Tree record. This is all or none action,
mostly due to we cannot reposition a record in R-Tree (with the
nature of splitting)
@param[in] pcur cursor
@param[in] first_rec record
@param[in] index index
@param[in] offsets rec_get_offsets(rec, index)
@param[in] sel_mode select mode: SELECT_ORDINARY,
SELECT_SKIP_LOKCED, or SELECT_NO_WAIT
@param[in] mode lock mode
@param[in] type LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP
@param[in] thr query thread
@param[in] mtr mtr
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
static inline dberr_t sel_set_rtr_rec_lock(
btr_pcur_t *pcur, const rec_t *first_rec, dict_index_t *index,
const ulint *offsets, select_mode sel_mode, ulint mode, ulint type,
que_thr_t *thr, mtr_t *mtr) {
matched_rec_t *match = pcur->m_btr_cur.rtr_info->matches;
mem_heap_t *heap = nullptr;
dberr_t err = DB_SUCCESS;
trx_t *trx = thr_get_trx(thr);
buf_block_t *cur_block = pcur->get_block();
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint *my_offsets = const_cast<ulint *>(offsets);
rec_t *rec = const_cast<rec_t *>(first_rec);
rtr_rec_vector *match_rec;
rtr_rec_vector::iterator end;
rec_offs_init(offsets_);
if (match->locked || page_rec_is_supremum(first_rec)) {
return (DB_SUCCESS_LOCKED_REC);
}
ut_ad(page_align(first_rec) == cur_block->frame);
ut_ad(match->valid);
rw_lock_x_lock(&(match->block.lock), UT_LOCATION_HERE);
retry:
cur_block = pcur->get_block();
ut_ad(rw_lock_own(&(match->block.lock), RW_LOCK_X) ||
rw_lock_own(&(match->block.lock), RW_LOCK_S));
ut_ad(page_is_leaf(buf_block_get_frame(cur_block)));
err = lock_sec_rec_read_check_and_lock(
lock_duration_t::REGULAR, cur_block, rec, index, my_offsets, sel_mode,
static_cast<lock_mode>(mode), type, thr);
switch (err) {
case DB_SUCCESS:
case DB_SUCCESS_LOCKED_REC:
case DB_SKIP_LOCKED:
goto lock_match;