Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ranger: fix prefix index when charset is UTF-8 #7194

Merged
merged 9 commits into from Jul 31, 2018
16 changes: 16 additions & 0 deletions expression/integration_test.go
Expand Up @@ -3428,4 +3428,20 @@ func (s *testIntegrationSuite) TestPrefixIndex(c *C) {
tk.MustExec("insert into t1 values('借款策略集_网页');")
res := tk.MustQuery("select * from t1 where name = '借款策略集_网页';")
res.Check(testkit.Rows("借款策略集_网页"))

tk.MustExec(`CREATE TABLE prefix (
a int(11) NOT NULL,
b varchar(55) DEFAULT NULL,
c int(11) DEFAULT NULL,
PRIMARY KEY (a),
KEY prefix_index (b(2)),
KEY prefix_complex (a,b(2))
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;`)

tk.MustExec("INSERT INTO prefix VALUES(0, 'b', 2), (1, 'bbb', 3), (2, 'bbc', 4), (3, 'bbb', 5), (4, 'abc', 6), (5, 'abc', 7), (6, 'abc', 7), (7, 'ÿÿ', 8), (8, 'ÿÿ0', 9), (9, 'ÿÿÿ', 10);")
res = tk.MustQuery("select c, b from prefix where b > 'ÿ' and b < 'ÿÿc'")
res.Check(testkit.Rows("8 ÿÿ", "9 ÿÿ0"))

res = tk.MustQuery("select a, b from prefix where b LIKE 'ÿÿ%'")
res.Check(testkit.Rows("7 ÿÿ", "8 ÿÿ0", "9 ÿÿÿ"))
}
2 changes: 1 addition & 1 deletion plan/physical_plan_test.go
Expand Up @@ -178,7 +178,7 @@ func (s *testPlanSuite) TestDAGPlanBuilderSimpleCase(c *C) {
// Test index filter condition push down.
{
sql: "select * from t use index(e_d_c_str_prefix) where t.c_str = 'abcdefghijk' and t.d_str = 'd' and t.e_str = 'e'",
best: "IndexLookUp(Index(t.e_d_c_str_prefix)[[\"e\" \"d\" \"[97 98 99 100 101 102 103 104 105 106]\",\"e\" \"d\" \"[97 98 99 100 101 102 103 104 105 106]\"]], Table(t)->Sel([eq(test.t.c_str, abcdefghijk)]))",
best: "IndexLookUp(Index(t.e_d_c_str_prefix)[[\"e\" \"d\" \"abcdefghij\",\"e\" \"d\" \"abcdefghij\"]], Table(t)->Sel([eq(test.t.c_str, abcdefghijk)]))",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this changed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you can see, when the charset is UTF-8 the prefix index value is now truncated by characters and stored as a string, rather than being truncated by bytes.

},
{
sql: "select * from t use index(e_d_c_str_prefix) where t.e_str = b'1110000'",
Expand Down
17 changes: 8 additions & 9 deletions table/tables/index.go
Expand Up @@ -136,19 +136,18 @@ func (c *index) truncateIndexValuesIfNeeded(indexedValues []types.Datum) []types
if v.Kind() == types.KindString || v.Kind() == types.KindBytes {
ic := c.idxInfo.Columns[i]
colCharset := c.tblInfo.Columns[ic.Offset].Charset
if colCharset == charset.CharsetUTF8 || colCharset == charset.CharsetUTF8MB4 {
val := v.GetBytes()
if ic.Length != types.UnspecifiedLength && utf8.RuneCount(val) > ic.Length {
rs := bytes.Runes(val)
colValue := v.GetBytes()
isUTF8Charset := colCharset == charset.CharsetUTF8 || colCharset == charset.CharsetUTF8MB4
if isUTF8Charset {
if ic.Length != types.UnspecifiedLength && utf8.RuneCount(colValue) > ic.Length {
rs := bytes.Runes(colValue)
truncateStr := string(rs[:ic.Length])
// truncate value and limit its length
v.SetString(truncateStr)
}
} else {
if ic.Length != types.UnspecifiedLength && len(v.GetBytes()) > ic.Length {
// truncate value and limit its length
v.SetBytes(v.GetBytes()[:ic.Length])
}
} else if ic.Length != types.UnspecifiedLength && len(colValue) > ic.Length {
// truncate value and limit its length
v.SetBytes(colValue[:ic.Length])
}
}
}
Expand Down
35 changes: 27 additions & 8 deletions util/ranger/ranger.go
Expand Up @@ -17,6 +17,7 @@ import (
"bytes"
"math"
"sort"
"unicode/utf8"

"github.com/juju/errors"
"github.com/pingcap/tidb/ast"
Expand All @@ -25,6 +26,7 @@ import (
"github.com/pingcap/tidb/mysql"
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/charset"
"github.com/pingcap/tidb/util/codec"
)

Expand Down Expand Up @@ -327,7 +329,7 @@ func buildCNFIndexRange(sc *stmtctx.StatementContext, cols []*expression.Column,

// Take prefix index into consideration.
if hasPrefix(lengths) {
fixPrefixColRange(ranges, lengths)
fixPrefixColRange(ranges, lengths, newTp)
}

if len(ranges) > 0 && len(ranges[0].LowVal) < len(cols) {
Expand Down Expand Up @@ -410,23 +412,37 @@ func hasPrefix(lengths []int) bool {
return false
}

func fixPrefixColRange(ranges []*Range, lengths []int) {
func fixPrefixColRange(ranges []*Range, lengths []int, tp []*types.FieldType) {
for _, ran := range ranges {
for i := 0; i < len(ran.LowVal); i++ {
fixRangeDatum(&ran.LowVal[i], lengths[i])
fixRangeDatum(&ran.LowVal[i], lengths[i], tp[i])
}
ran.LowExclude = false
for i := 0; i < len(ran.HighVal); i++ {
fixRangeDatum(&ran.HighVal[i], lengths[i])
fixRangeDatum(&ran.HighVal[i], lengths[i], tp[i])
}
ran.HighExclude = false
}
}

func fixRangeDatum(v *types.Datum, length int) {
func fixRangeDatum(v *types.Datum, length int, tp *types.FieldType) {
// If this column is prefix and the prefix length is smaller than the range, cut it.
if length != types.UnspecifiedLength && length < len(v.GetBytes()) {
v.SetBytes(v.GetBytes()[:length])
// In case of UTF8, prefix should be cut by characters rather than bytes
if v.Kind() == types.KindString || v.Kind() == types.KindBytes {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For other types, should we consider length?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt it is possible to have a prefix index on other types...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the MySQL documentation on CREATE INDEX: For string columns, indexes can be created that use only the leading part of column values, using col_name(length) syntax to specify an index prefix length:

  • Prefixes can be specified for CHAR, VARCHAR, BINARY, and VARBINARY key parts.

  • Prefixes must be specified for BLOB and TEXT key parts. Additionally, BLOB and TEXT columns can be indexed only for InnoDB, MyISAM, and BLACKHOLE tables.

colCharset := tp.Charset
colValue := v.GetBytes()
isUTF8Charset := colCharset == charset.CharsetUTF8 || colCharset == charset.CharsetUTF8MB4
if isUTF8Charset {
if length != types.UnspecifiedLength && utf8.RuneCount(colValue) > length {
rs := bytes.Runes(colValue)
truncateStr := string(rs[:length])
// truncate value and limit its length
v.SetString(truncateStr)
}
} else if length != types.UnspecifiedLength && len(colValue) > length {
// truncate value and limit its length
v.SetBytes(colValue[:length])
}
}
}

Expand All @@ -438,11 +454,14 @@ func newFieldType(tp *types.FieldType) *types.FieldType {
case mysql.TypeTiny, mysql.TypeShort, mysql.TypeInt24, mysql.TypeLong, mysql.TypeLonglong:
newTp := types.NewFieldType(mysql.TypeLonglong)
newTp.Flag = tp.Flag
newTp.Charset = tp.Charset
return newTp
// To avoid data truncate error.
case mysql.TypeFloat, mysql.TypeDouble, mysql.TypeBlob, mysql.TypeTinyBlob, mysql.TypeMediumBlob, mysql.TypeLongBlob,
mysql.TypeString, mysql.TypeVarchar, mysql.TypeVarString:
return types.NewFieldType(tp.Tp)
newTp := types.NewFieldType(tp.Tp)
newTp.Charset = tp.Charset
return newTp
default:
return tp
}
Expand Down
16 changes: 15 additions & 1 deletion util/ranger/ranger_test.go
Expand Up @@ -332,7 +332,7 @@ func (s *testRangerSuite) TestIndexRange(c *C) {
testKit := testkit.NewTestKit(c, store)
testKit.MustExec("use test")
testKit.MustExec("drop table if exists t")
testKit.MustExec("create table t(a varchar(50), b int, c double, index idx_ab(a(50), b), index idx_cb(c, a))")
testKit.MustExec("create table t(a varchar(50), b int, c double, d varchar(10), e binary(10), index idx_ab(a(50), b), index idx_cb(c, a), index idx_d(d(2)), index idx_e(e(2)))")

tests := []struct {
indexPos int
Expand Down Expand Up @@ -516,6 +516,20 @@ func (s *testRangerSuite) TestIndexRange(c *C) {
filterConds: "[or(gt(test.t.a, a), gt(test.t.c, 1))]",
resultStr: "[[NULL,+inf]]",
},
{
indexPos: 2,
exprStr: `d = "你好啊"`,
accessConds: "[eq(test.t.d, 你好啊)]",
filterConds: "[eq(test.t.d, 你好啊)]",
resultStr: "[[\"你好\",\"你好\"]]",
},
{
indexPos: 3,
exprStr: `e = "你好啊"`,
accessConds: "[eq(test.t.e, 你好啊)]",
filterConds: "[eq(test.t.e, 你好啊)]",
resultStr: "[[\"[228 189]\",\"[228 189]\"]]",
},
}

for _, tt := range tests {
Expand Down