Skip to content

Commit a6b6a6b

Browse files
author
Andrzej Jarzabek
committed
Bug#36879147 [InnoDB] FULLTEXT index limits FTS_DOC_ID to max unsigned 32-bit value
Issue: FTS_DOC_ID is a 64-bit field and can have values 2^32 and higher. However current implementation only supports 32-bit value range. This limitation takes the form of: - Assertions - Use of unsigned long type which resolves to 32-bit on some platforms - VLC (variable length coding) implementation supporting up to 35 bits Fix: Support 64-bit doc IDs: - Remove assertions - Replace use of unsigned long for doc ID deltas with uint64_t - Extend VLC functions to support full unsigned 64-bit range Change-Id: Id96ae8b5d906bda8587bc99f5dd9e9d4fe9375dc
1 parent ead5d54 commit a6b6a6b

File tree

9 files changed

+467
-106
lines changed

9 files changed

+467
-106
lines changed

storage/innobase/ddl/ddl0builder.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -871,8 +871,6 @@ dberr_t Builder::copy_fts_column(Copy_ctx &ctx, dfield_t *field) noexcept {
871871
doc_id = fts.m_doc_id->current();
872872
}
873873

874-
ut_a(doc_id <= 4294967295u);
875-
876874
if (unlikely(!dfield_is_null(field))) {
877875
auto ptr = ut::malloc_withkey(UT_NEW_THIS_FILE_PSI_KEY,
878876
sizeof(FTS::Doc_item) + field->len);

storage/innobase/fts/fts0fts.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,7 +1066,6 @@ void fts_cache_node_add_positions(
10661066
ulint enc_len;
10671067
ulint last_pos;
10681068
byte *ptr_start;
1069-
ulint doc_id_delta;
10701069

10711070
#ifdef UNIV_DEBUG
10721071
if (cache) {
@@ -1077,7 +1076,7 @@ void fts_cache_node_add_positions(
10771076
ut_ad(doc_id >= node->last_doc_id);
10781077

10791078
/* Calculate the space required to store the ilist. */
1080-
doc_id_delta = (ulint)(doc_id - node->last_doc_id);
1079+
const uint64_t doc_id_delta = doc_id - node->last_doc_id;
10811080
enc_len = fts_get_encoded_len(doc_id_delta);
10821081

10831082
last_pos = 0;

storage/innobase/fts/fts0opt.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,7 @@ static dberr_t fts_optimize_encode_node(
11141114
/* Calculate the space required to store the ilist. */
11151115
ut_ad(doc_id > node->last_doc_id);
11161116
doc_id_delta = doc_id - node->last_doc_id;
1117-
enc_len = fts_get_encoded_len(static_cast<ulint>(doc_id_delta));
1117+
enc_len = fts_get_encoded_len(doc_id_delta);
11181118

11191119
/* Calculate the size of the encoded pos array. */
11201120
while (*src) {
@@ -1159,9 +1159,8 @@ static dberr_t fts_optimize_encode_node(
11591159
src = enc->src_ilist_ptr;
11601160
dst = node->ilist + node->ilist_size;
11611161

1162-
/* Encode the doc id. Cast to ulint, the delta should be small and
1163-
therefore no loss of precision. */
1164-
dst += fts_encode_int((ulint)doc_id_delta, dst);
1162+
/* Encode the doc id. */
1163+
dst += fts_encode_int(doc_id_delta, dst);
11651164

11661165
/* Copy the encoded pos array. */
11671166
memcpy(dst, src, pos_enc_len);
@@ -1200,10 +1199,9 @@ static dberr_t fts_optimize_node(
12001199
into in the destination node. */
12011200
while (copied < src_node->ilist_size &&
12021201
dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
1203-
doc_id_t delta;
12041202
doc_id_t del_doc_id = FTS_NULL_DOC_ID;
12051203

1206-
delta = fts_decode_vlc(&enc->src_ilist_ptr);
1204+
doc_id_t delta = fts_decode_vlc(&enc->src_ilist_ptr);
12071205

12081206
test_again:
12091207
/* Check whether the doc id is in the delete list, if

storage/innobase/fts/fts0que.cc

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
3535
#include <math.h>
3636
#include <sys/types.h>
3737
#include <iomanip>
38+
#include <limits>
3839
#include <vector>
3940

4041
#include "dict0dict.h"
@@ -2893,14 +2894,17 @@ fts_query_find_doc_id(
28932894
ulint freq = 0;
28942895
ulint min_pos = 0;
28952896
ulint last_pos = 0;
2896-
ulint pos = fts_decode_vlc(&ptr);
2897+
const uint64_t delta = fts_decode_vlc(&ptr);
28972898

28982899
/* Add the delta. */
2899-
doc_id += pos;
2900+
doc_id += delta;
29002901

29012902
while (*ptr) {
29022903
++freq;
2903-
last_pos += fts_decode_vlc(&ptr);
2904+
const uint64_t decoded_pos = fts_decode_vlc(&ptr);
2905+
ut_ad(uint64_t(last_pos) + decoded_pos
2906+
<= std::numeric_limits<ulint>::max());
2907+
last_pos += static_cast<ulint>(decoded_pos);
29042908

29052909
/* Only if min_pos is not set and the current
29062910
term exists in a position greater than the
@@ -2968,15 +2972,15 @@ static dberr_t fts_query_filter_doc_ids(
29682972
fts_doc_freq_t *doc_freq;
29692973
fts_match_t *match = nullptr;
29702974
ulint last_pos = 0;
2971-
ulint pos = fts_decode_vlc(&ptr);
2975+
const uint64_t delta = fts_decode_vlc(&ptr);
29722976

29732977
/* Some sanity checks. */
29742978
if (doc_id == 0) {
2975-
ut_a(pos == node->first_doc_id);
2979+
ut_a(delta == node->first_doc_id);
29762980
}
29772981

29782982
/* Add the delta. */
2979-
doc_id += pos;
2983+
doc_id += delta;
29802984

29812985
if (calc_doc_count) {
29822986
word_freq->doc_count++;
@@ -3004,7 +3008,10 @@ static dberr_t fts_query_filter_doc_ids(
30043008

30053009
/* Unpack the positions within the document. */
30063010
while (*ptr) {
3007-
last_pos += fts_decode_vlc(&ptr);
3011+
const uint64_t pos_delta = fts_decode_vlc(&ptr);
3012+
ut_ad(uint64_t(last_pos) + pos_delta <=
3013+
std::numeric_limits<ulint>::max());
3014+
last_pos += static_cast<ulint>(pos_delta);
30083015

30093016
/* Collect the matching word positions, for phrase
30103017
matching later. */

storage/innobase/handler/i_s.cc

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2642,13 +2642,15 @@ static int i_s_fts_index_cache_fill_one_index(
26422642
ptr = node->ilist;
26432643

26442644
while (decoded < node->ilist_size) {
2645-
ulint pos = fts_decode_vlc(&ptr);
2645+
const uint64_t delta = fts_decode_vlc(&ptr);
26462646

2647-
doc_id += pos;
2647+
doc_id += delta;
26482648

26492649
/* Get position info */
26502650
while (*ptr) {
2651-
pos = fts_decode_vlc(&ptr);
2651+
const auto decoded_pos = fts_decode_vlc(&ptr);
2652+
ut_ad(decoded_pos <= std::numeric_limits<ulint>::max());
2653+
const auto pos = static_cast<ulint>(decoded_pos);
26522654

26532655
OK(field_store_string(fields[I_S_FTS_WORD], word_str));
26542656

@@ -2986,13 +2988,15 @@ static int i_s_fts_index_table_fill_one_fetch(
29862988
ptr = node->ilist;
29872989

29882990
while (decoded < node->ilist_size) {
2989-
ulint pos = fts_decode_vlc(&ptr);
2991+
const uint64_t delta = fts_decode_vlc(&ptr);
29902992

2991-
doc_id += pos;
2993+
doc_id += delta;
29922994

29932995
/* Get position info */
29942996
while (*ptr) {
2995-
pos = fts_decode_vlc(&ptr);
2997+
const auto decoded_pos = fts_decode_vlc(&ptr);
2998+
ut_ad(decoded_pos <= std::numeric_limits<ulint>::max());
2999+
const auto pos = static_cast<ulint>(decoded_pos);
29963000

29973001
OK(field_store_string(fields[I_S_FTS_WORD], word_str));
29983002

storage/innobase/include/fts0types.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
3434
#ifndef INNOBASE_FTS0TYPES_H
3535
#define INNOBASE_FTS0TYPES_H
3636

37+
#include <cstdint>
3738
#include "fts0fts.h"
3839
#include "fut0fut.h"
3940
#include "pars0pars.h"
@@ -309,11 +310,11 @@ extern const fts_index_selector_t fts_index_selector[];
309310
/** It's defined in fts/fts0fts.c */
310311
extern const fts_index_selector_t fts_index_selector_5_7[];
311312

312-
/** Decode and return the integer that was encoded using our VLC scheme.*/
313-
inline ulint fts_decode_vlc(
314-
/*!< out: value decoded */
315-
byte **ptr); /*!< in: ptr to decode from, this ptr is
316-
incremented by the number of bytes decoded */
313+
/** Decode and return the integer that was encoded using our VLC scheme.
314+
@param[in,out] ptr ptr to decode from, this ptr is incremented
315+
by the number of bytes decoded
316+
@return value decoded */
317+
inline uint64_t fts_decode_vlc(byte **ptr);
317318

318319
/** Duplicate a string.
319320
@param[in] dst dup to here
@@ -323,17 +324,16 @@ inline ulint fts_decode_vlc(
323324
inline void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
324325
mem_heap_t *heap);
325326

326-
/** Return length of val if it were encoded using our VLC scheme. */
327-
inline ulint fts_get_encoded_len(
328-
/*!< out: length of value
329-
encoded, in bytes */
330-
ulint val); /*!< in: value to encode */
327+
/** Return length of val if it were encoded using our VLC scheme.
328+
@param[in] val value to encode
329+
@return length of value encoded, in bytes */
330+
inline unsigned int fts_get_encoded_len(uint64_t val);
331331

332332
/** Encode an integer using our VLC scheme and return the length in bytes.
333333
@param[in] val value to encode
334334
@param[in] buf buffer, must have enough space
335335
@return length of value encoded, in bytes */
336-
inline ulint fts_encode_int(ulint val, byte *buf);
336+
inline unsigned int fts_encode_int(uint64_t val, byte *buf);
337337

338338
/** Get the selected FTS aux INDEX suffix. */
339339
inline const char *fts_get_suffix(ulint selected); /*!< in: selected index */

storage/innobase/include/fts0vlc.ic

Lines changed: 47 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -34,87 +34,62 @@ this program; if not, write to the Free Software Foundation, Inc.,
3434
#ifndef INNOBASE_FTS0VLC_IC
3535
#define INNOBASE_FTS0VLC_IC
3636

37+
#include <cstddef>
38+
#include <cstdint>
3739
#include "fts0types.h"
3840

39-
/** Return length of val if it were encoded using our VLC scheme.
40-
FIXME: We will need to be able encode 8 bytes value
41-
@return length of value encoded, in bytes */
42-
inline ulint fts_get_encoded_len(ulint val) /* in: value to encode */
43-
{
44-
if (val <= 127) {
45-
return (1);
46-
} else if (val <= 16383) {
47-
return (2);
48-
} else if (val <= 2097151) {
49-
return (3);
50-
} else if (val <= 268435455) {
51-
return (4);
52-
} else {
53-
/* Possibly we should care that on 64-bit machines ulint can
54-
contain values that we can't encode in 5 bytes, but
55-
fts_encode_int doesn't handle them either so it doesn't much
56-
matter. */
57-
58-
return (5);
41+
inline unsigned int fts_get_encoded_len(uint64_t val) {
42+
unsigned int length = 1;
43+
for (;;) {
44+
val >>= 7;
45+
if (val != 0) {
46+
++length;
47+
} else {
48+
break;
49+
}
5950
}
51+
return length;
6052
}
6153

62-
/** Encode an integer using our VLC scheme and return the length in bytes.
63-
@return length of value encoded, in bytes */
64-
inline ulint fts_encode_int(ulint val, /* in: value to encode */
65-
byte *buf) /* in: buffer, must have enough space */
66-
{
67-
ulint len;
68-
69-
if (val <= 127) {
70-
*buf = (byte)val;
71-
72-
len = 1;
73-
} else if (val <= 16383) {
74-
*buf++ = (byte)(val >> 7);
75-
*buf = (byte)(val & 0x7F);
76-
77-
len = 2;
78-
} else if (val <= 2097151) {
79-
*buf++ = (byte)(val >> 14);
80-
*buf++ = (byte)((val >> 7) & 0x7F);
81-
*buf = (byte)(val & 0x7F);
82-
83-
len = 3;
84-
} else if (val <= 268435455) {
85-
*buf++ = (byte)(val >> 21);
86-
*buf++ = (byte)((val >> 14) & 0x7F);
87-
*buf++ = (byte)((val >> 7) & 0x7F);
88-
*buf = (byte)(val & 0x7F);
89-
90-
len = 4;
91-
} else {
92-
/* Best to keep the limitations of the 32/64 bit versions
93-
identical, at least for the time being. */
94-
ut_ad(val <= 4294967295u);
95-
96-
*buf++ = (byte)(val >> 28);
97-
*buf++ = (byte)((val >> 21) & 0x7F);
98-
*buf++ = (byte)((val >> 14) & 0x7F);
99-
*buf++ = (byte)((val >> 7) & 0x7F);
100-
*buf = (byte)(val & 0x7F);
101-
102-
len = 5;
54+
inline unsigned int fts_encode_int(uint64_t val, byte *buf) {
55+
constexpr unsigned int max_length = 10;
56+
57+
/* skip leading zeros */
58+
unsigned int count = max_length - 1;
59+
while (count > 0) {
60+
/* We split the value into 7-bit batches, so val >= 2^63 needs 10 bytes,
61+
2^63 > val >= 2^56 needs 9 bytes, 2^56 > val >= 2^49 needs 8 bytes etc.
62+
*/
63+
if (val >= uint64_t(1) << (count * 7)) {
64+
break;
65+
}
66+
--count;
67+
}
68+
69+
unsigned int length = count + 1;
70+
71+
byte *bufptr{buf};
72+
73+
for (;;) {
74+
*bufptr = (byte)((val >> (7 * count)) & 0x7f);
75+
if (count == 0) {
76+
/* High-bit on means "last byte in the encoded integer". */
77+
*bufptr |= 0x80;
78+
break;
79+
} else {
80+
--count;
81+
++bufptr;
82+
}
10383
}
10484

105-
/* High-bit on means "last byte in the encoded integer". */
106-
*buf |= 0x80;
85+
ut_ad(length <= max_length);
86+
ut_a(bufptr - buf == std::ptrdiff_t(length) - 1);
10787

108-
return (len);
88+
return length;
10989
}
11090

111-
/** Decode and return the integer that was encoded using our VLC scheme.
112-
@return value decoded */
113-
inline ulint fts_decode_vlc(
114-
byte **ptr) /* in: ptr to decode from, this ptr is
115-
incremented by the number of bytes decoded */
116-
{
117-
ulint val = 0;
91+
inline uint64_t fts_decode_vlc(byte **ptr) {
92+
uint64_t val = 0;
11893

11994
for (;;) {
12095
byte b = **ptr;
@@ -130,7 +105,7 @@ inline ulint fts_decode_vlc(
130105
}
131106
}
132107

133-
return (val);
108+
return val;
134109
}
135110

136111
#endif

unittest/gunit/innodb/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ ENDIF()
3838
SET(TESTS
3939
#example
4040
fil_path
41+
fts0vlc
4142
ha_innodb
4243
log0log
4344
mem0mem

0 commit comments

Comments
 (0)