Skip to content

Commit

Permalink
charset: support gb18030 on TiDB (pingcap#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
CbcWestwolf committed Feb 27, 2023
1 parent dfbef74 commit b0f7d83
Show file tree
Hide file tree
Showing 21 changed files with 602 additions and 8 deletions.
2 changes: 2 additions & 0 deletions cmd/explaintest/r/collation_misc_disabled.result
Expand Up @@ -89,6 +89,7 @@ SELECT default_collate_name, maxlen FROM information_schema.character_sets ORDER
default_collate_name maxlen
ascii_bin 1
binary 1
gb18030_bin 4
gbk_bin 2
latin1_bin 1
utf8_bin 3
Expand All @@ -108,6 +109,7 @@ show charset;
Charset Description Default collation Maxlen
ascii US ASCII ascii_bin 1
binary binary binary 1
gb18030 China National Standard GB18030 gb18030_bin 4
gbk Chinese Internal Code Specification gbk_bin 2
latin1 Latin1 latin1_bin 1
utf8 UTF-8 Unicode utf8_bin 3
Expand Down
6 changes: 6 additions & 0 deletions cmd/explaintest/r/collation_misc_enabled.result
Expand Up @@ -92,6 +92,7 @@ SELECT default_collate_name, maxlen FROM information_schema.character_sets ORDER
default_collate_name maxlen
ascii_bin 1
binary 1
gb18030_chinese_ci 4
gbk_chinese_ci 2
latin1_bin 1
utf8_bin 3
Expand All @@ -100,6 +101,8 @@ SELECT character_set_name, id, sortlen FROM information_schema.collations ORDER
character_set_name id sortlen
ascii 65 1
binary 63 1
gb18030 249 1
gb18030 248 1
gbk 87 1
gbk 28 1
latin1 47 1
Expand All @@ -116,6 +119,7 @@ show charset;
Charset Description Default collation Maxlen
ascii US ASCII ascii_bin 1
binary binary binary 1
gb18030 China National Standard GB18030 gb18030_chinese_ci 4
gbk Chinese Internal Code Specification gbk_chinese_ci 2
latin1 Latin1 latin1_bin 1
utf8 UTF-8 Unicode utf8_bin 3
Expand All @@ -124,6 +128,8 @@ show collation;
Collation Charset Id Default Compiled Sortlen
ascii_bin ascii 65 Yes Yes 1
binary binary 63 Yes Yes 1
gb18030_bin gb18030 249 Yes 1
gb18030_chinese_ci gb18030 248 Yes Yes 1
gbk_bin gbk 87 Yes 1
gbk_chinese_ci gbk 28 Yes Yes 1
latin1_bin latin1 47 Yes Yes 1
Expand Down
4 changes: 4 additions & 0 deletions ddl/db_change_test.go
Expand Up @@ -2106,3 +2106,7 @@ func TestConcurrentSetDefaultValue(t *testing.T) {
tk.MustExec("show create table t")
tk.MustExec("insert into t value()")
}

// TestGB18030 is an empty placeholder: its body contains no statements, so it
// passes without asserting anything.
// TODO(review): presumably reserved for gb18030 DDL coverage — confirm and
// either fill it in or remove it.
func TestGB18030(t *testing.T) {

}
72 changes: 70 additions & 2 deletions executor/charset_test.go
Expand Up @@ -77,13 +77,18 @@ func TestCharsetFeatureCollation(t *testing.T) {
"(ascii_char char(10) character set ascii," +
"gbk_char char(10) character set gbk collate gbk_bin," +
"latin_char char(10) character set latin1," +
"utf8mb4_char char(10) character set utf8mb4)",
"utf8mb4_char char(10) character set utf8mb4," +
"gb18030_char char(10) character set gb18030)",
)
tk.MustExec("insert into t values ('a', 'a', 'a', 'a'), ('a', '啊', '€', 'ㅂ')")
tk.MustExec("insert into t values ('a', 'a', 'a', 'a', 'a'), ('a', '啊', '€', 'ㅂ', '🀁')")
tk.MustQuery("select collation(concat(ascii_char, gbk_char)) from t").Check(testkit.Rows("gbk_bin", "gbk_bin"))
tk.MustQuery("select collation(concat(gbk_char, ascii_char)) from t").Check(testkit.Rows("gbk_bin", "gbk_bin"))
tk.MustQuery("select collation(concat(utf8mb4_char, gbk_char)) from t").Check(testkit.Rows("utf8mb4_bin", "utf8mb4_bin"))
tk.MustQuery("select collation(concat(gbk_char, utf8mb4_char)) from t").Check(testkit.Rows("utf8mb4_bin", "utf8mb4_bin"))
tk.MustQuery("select collation(concat(utf8mb4_char, gb18030_char)) from t").Check(testkit.Rows("utf8mb4_bin", "utf8mb4_bin"))
tk.MustQuery("select collation(concat(gb18030_char, utf8mb4_char)) from t").Check(testkit.Rows("utf8mb4_bin", "utf8mb4_bin"))
tk.MustGetErrCode("select collation(concat(gbk_char, gb18030_char)) from t", mysql.ErrCantAggregate2collations)
tk.MustGetErrCode("select collation(concat(gb18030_char, gbk_char)) from t", mysql.ErrCantAggregate2collations)
tk.MustQuery("select collation(concat('啊', convert('啊' using gbk) collate gbk_bin))").Check(testkit.Rows("gbk_bin"))
tk.MustQuery("select collation(concat(_latin1 'a', convert('啊' using gbk) collate gbk_bin))").Check(testkit.Rows("gbk_bin"))

Expand All @@ -107,3 +112,66 @@ func TestCharsetWithPrefixIndex(t *testing.T) {
tk.MustExec("insert into t values ('a', '中文'), ('中文', '中文'), ('一二三', '一二三'), ('b', '一二三')")
tk.MustQuery("select * from t;").Sort().Check(testkit.Rows("a 中文", "b 一二三", "一二三 一二三", "中文 中文"))
}

func TestGB18030(t *testing.T) {
store := testkit.CreateMockStore(t)

tk := testkit.NewTestKit(t, store)
tk.MustExec("use test")

tk.MustQuery("select upper(convert('àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ' using gb18030))").
Check(testkit.Rows("ÀÁÈÉÊÌÍÒÓÙÚÜĀĒĚĪŃŇŌŪǍǏǑǓǕǗǙǛⅪⅫ"))
tk.MustQuery("select lower(convert('àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ' using gb18030))").
Check(testkit.Rows("àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅺⅻ"))
tk.MustQuery("select convert(0x1e2 using gb18030)").
Check(testkit.Rows("<nil>"))
tk.MustQuery("select char(0x1234 using gb18030)").
Check(testkit.Rows("\x124"))
tk.MustQuery("select char(0xd2 using gb18030)").
Check(testkit.Rows("<nil>"))

tk.MustExec("create table t (a char(20) charset gb18030)")
tk.MustExec("create table t1 (a binary(20))")
tk.MustExec("create table t2 (a char(20) charset gb18030, b char(20) charset gb18030)")

tk.MustExec("insert into t values ('a'), ('一二三')")
tk.MustQuery("select hex(a) from t").Check(testkit.Rows("61", "D2BBB6FEC8FD"))
tk.MustQuery("select hex('ㅂ')").Check(testkit.Rows("E38582"))
tk.MustQuery("select ascii(a) from t").Check(testkit.Rows("97", "210"))
tk.MustQuery("select ascii('ㅂ')").Check(testkit.Rows("227"))
tk.MustQuery("select concat(a, 0x3f) from t").Check(testkit.Rows("a?", "一二三?"))
tk.MustQuery(`select concat_ws("你", a, a) from t`).Check(testkit.Rows("a你a", "一二三你一二三"))
tk.MustQuery("select length(a), octet_length(a), bit_length(a) from t").Check(testkit.Rows("1 1 8", "6 6 48"))
tk.MustQuery("select to_base64(a) from t").Check(testkit.Rows("YQ==", "0ru2/sj9"))
tk.MustQuery("select lower(a), upper(a) from t").Check(testkit.Rows("a A", "一二三 一二三"))
tk.MustQuery(`select upper("abcABC一二三abcABC"), lower("abcABC一二三abcABC")`).Check(testkit.Rows("ABCABC一二三ABCABC abcabc一二三abcabc"))
tk.MustQuery("select ord(a) from t").Check(testkit.Rows("97", "53947"))
tk.MustQuery("select aes_encrypt(a, 'key') from t").
Check(testkit.Rows("U)\xfai\xbe:\x14\xb5\xbd\x89R\x00LO\xceZ", "l\x1d*\xb7$\x92\xd2\xf9\xc8*\xe3\x8d\xf3J\\\x13"))
tk.MustQuery("select aes_decrypt(aes_encrypt(a, 'key'), 'key'), hex(a) from t").Check(testkit.Rows("a 61", \xb6\xfe\xc8\xfd D2BBB6FEC8FD"))
tk.MustQuery(`select encode(a, "key") from t`).Check(testkit.Rows("\xa2", "\x89\xb1a\xee}\x8c"))
tk.MustQuery(`select decode(encode(a, "key"), "key"), hex(a) from t`).Check(testkit.Rows("a 61", \xb6\xfe\xc8\xfd D2BBB6FEC8FD"))
tk.MustQuery(`select md5(a) from t`).Check(testkit.Rows("0cc175b9c0f1b6a831c399e269772661", "a45d4af7b243e7f393fa09bed72ac73e"))
tk.MustQuery(`select password(a) from t`).Check(testkit.Rows("*667F407DE7C6AD07358FA38DAED7828A72014B4E", "*A669F2B2DD49E2463FE62D8F72DDF4F858687EA5"))
tk.MustQuery(`select compress(a) from t`).Check(testkit.Rows("\x01\x00\x00\x00x\x9cJ\x04\x04\x00\x00\xff\xff\x00b\x00b", "\x06\x00\x00\x00x\x9c\xba\xb4{ۿ\x13\x7f\x01\x01\x00\x00\xff\xff\x10\xf8\x05\a"))
tk.MustQuery(`select uncompress(compress(a)), a from t`).Check(testkit.Rows("a a", \xb6\xfe\xc8\xfd 一二三"))
tk.MustQuery(`select sha1(a), sha2(a, "key") from t`).
Check(testkit.Rows("86f7e437faa5a7fce15d1ddcb9eaeaea377667b8 ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb",
"30cda4eed59a2ff592f2881f39d42fed6e10cad8 b6c1ae1f8d8a07426ddb13fca5124fb0b9f1f0ef1cca6730615099cf198ca8af"))
tk.MustQuery(`select hex(a) from t where hex(a) = "D2BBB6FEC8FD"`).Check(testkit.Rows("D2BBB6FEC8FD"))
tk.MustQuery(`select length(a) from t where length(a) = 6`).Check(testkit.Rows("6"))
tk.MustQuery(`select bit_length(a) from t where bit_length(a) = 48`).Check(testkit.Rows("48"))
tk.MustQuery(`select ascii(a) from t where ascii(a) = 210`).Check(testkit.Rows("210"))
tk.MustQuery(`select concat(a, 0x3f) from t where concat(a, 0x3f) = "一二三?"`).Check(testkit.Rows("一二三?"))
tk.MustQuery(`select md5(a) from t where md5(a) = "a45d4af7b243e7f393fa09bed72ac73e"`).Check(testkit.Rows("a45d4af7b243e7f393fa09bed72ac73e"))
tk.MustQuery(`select sha1(a) from t where sha1(a) = "30cda4eed59a2ff592f2881f39d42fed6e10cad8"`).Check(testkit.Rows("30cda4eed59a2ff592f2881f39d42fed6e10cad8"))

tk.MustExec("insert into t1 values (0xe2e2)")
tk.MustQuery("select convert(a using gb18030) from t1").Check(testkit.Rows("忖\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"))

tk.MustExec(`insert into t2 values ("abc", "abc"), ("abc", "xyz"), ("abc", "qwe"), ("abc","234")`)
tk.MustExec(`insert into t2 values ("一二三", "一"), ("一二三", "二"), ("一二三", "三"), ("一二三","四")`)
tk.MustQuery(`select a, b, rank() over (partition by a order by b) as x from t2`).
Check(testkit.Rows("abc 234 1", "abc abc 2", "abc qwe 3", "abc xyz 4",
"一二三 二 1", "一二三 三 2", "一二三 四 3", "一二三 一 4"))
}
2 changes: 2 additions & 0 deletions executor/seqtest/seq_executor_test.go
Expand Up @@ -1196,6 +1196,8 @@ func TestShowForNewCollations(t *testing.T) {
expectRows := testkit.Rows(
"ascii_bin ascii 65 Yes Yes 1",
"binary binary 63 Yes Yes 1",
"gb18030_bin gb18030 249 Yes 1",
"gb18030_chinese_ci gb18030 248 Yes Yes 1",
"gbk_bin gbk 87 Yes 1",
"gbk_chinese_ci gbk 28 Yes Yes 1",
"latin1_bin latin1 47 Yes Yes 1",
Expand Down
5 changes: 5 additions & 0 deletions expression/builtin_string_test.go
Expand Up @@ -151,6 +151,9 @@ func TestASCII(t *testing.T) {
{"你好", "", 228},
{"世界", "gbk", 202},
{"世界", "", 228},
{"abc", "gb18030", 97},
{"你好", "gb18030", 196},
{"世界", "gb18030", 202},
}

for _, c := range tbl {
Expand Down Expand Up @@ -1279,6 +1282,7 @@ func TestHexFunc(t *testing.T) {
{0x12, false, false, "12"},
{nil, true, false, ""},
{errors.New("must err"), false, true, ""},
{"🀁", false, false, "F09F8081"},
}
for _, c := range cases {
f, err := newFunctionForTest(ctx, ast.Hex, primitiveValsToConstants(ctx, []interface{}{c.arg})...)
Expand Down Expand Up @@ -1306,6 +1310,7 @@ func TestHexFunc(t *testing.T) {
{"你好", "gbk", "C4E3BAC3", 0},
{"一忒(๑•ㅂ•)و✧", "", "E4B880E5BF9228E0B991E280A2E38582E280A229D988E29CA7", 0},
{"一忒(๑•ㅂ•)و✧", "gbk", "", errno.ErrInvalidCharacterString},
{"🀁", "gb18030", "9438E131", 0},
}
for _, c := range strCases {
err := ctx.GetSessionVars().SetSystemVarWithoutValidation(variable.CharacterSetConnection, c.chs)
Expand Down
4 changes: 3 additions & 1 deletion expression/collation.go
Expand Up @@ -485,7 +485,7 @@ func isUnicodeCollation(ch string) bool {
// isBinCollation reports whether collate equals one of the charset collation
// constants compared in the return expression below.
// NOTE(review): this hunk is a flattened diff view. The line ending in
// charset.CollationGBKBin with no trailing operator appears to be the
// pre-change line, superseded by the line after it that also accepts
// charset.CollationGB18030Bin — confirm against the repository.
func isBinCollation(collate string) bool {
return collate == charset.CollationASCII || collate == charset.CollationLatin1 ||
collate == charset.CollationUTF8 || collate == charset.CollationUTF8MB4 ||
collate == charset.CollationGBKBin
collate == charset.CollationGBKBin || collate == charset.CollationGB18030Bin
}

// getBinCollation gets the binary collation for the given charset.
Expand All @@ -497,6 +497,8 @@ func getBinCollation(cs string) string {
return charset.CollationUTF8MB4
case charset.CharsetGBK:
return charset.CollationGBKBin
case charset.CharsetGB18030:
return charset.CollationGB18030Bin
}

logutil.BgLogger().Error("unexpected charset " + cs)
Expand Down
8 changes: 7 additions & 1 deletion expression/expression.go
Expand Up @@ -1065,7 +1065,7 @@ func scalarExprSupportedByTiKV(sf *ScalarFunction) bool {
// ast.FindInSet, ast.Repeat,
ast.Length, ast.BitLength, ast.Concat, ast.ConcatWS, ast.Replace, ast.ASCII, ast.Hex,
ast.Reverse, ast.LTrim, ast.RTrim, ast.Strcmp, ast.Space, ast.Elt, ast.Field,
InternalFuncFromBinary, InternalFuncToBinary, ast.Mid, ast.Substring, ast.Substr, ast.CharLength,
ast.Mid, ast.Substring, ast.Substr, ast.CharLength,
ast.Right, /* ast.Left */

// json functions.
Expand Down Expand Up @@ -1096,6 +1096,12 @@ func scalarExprSupportedByTiKV(sf *ScalarFunction) bool {
/*ast.InetNtoa, ast.InetAton, ast.Inet6Ntoa, ast.Inet6Aton, ast.IsIPv4, ast.IsIPv4Compat, ast.IsIPv4Mapped, ast.IsIPv6,*/
ast.UUID:

return true
case InternalFuncFromBinary, InternalFuncToBinary:
arg0 := sf.Function.getArgs()[0]
if arg0.GetType().EvalType() == types.ETString && arg0.GetType().GetCharset() == charset.CharsetGB18030 {
return false
}
return true
case ast.Round:
switch sf.Function.PbCode() {
Expand Down
1 change: 1 addition & 0 deletions parser/charset/BUILD.bazel
Expand Up @@ -8,6 +8,7 @@ go_library(
"encoding_ascii.go",
"encoding_base.go",
"encoding_bin.go",
"encoding_gb18030.go",
"encoding_gbk.go",
"encoding_latin1.go",
"encoding_table.go",
Expand Down
8 changes: 7 additions & 1 deletion parser/charset/charset.go
Expand Up @@ -62,6 +62,7 @@ var CharacterSetInfos = map[string]*Charset{
CharsetLatin1: {CharsetLatin1, CollationLatin1, make(map[string]*Collation), "Latin1", 1},
CharsetBin: {CharsetBin, CollationBin, make(map[string]*Collation), "binary", 1},
CharsetGBK: {CharsetGBK, CollationGBKBin, make(map[string]*Collation), "Chinese Internal Code Specification", 2},
CharsetGB18030: {CharsetGB18030, CollationGB18030Bin, make(map[string]*Collation), "China National Standard GB18030", 4},
}

// The names of all supported collations should be in the following table.
Expand Down Expand Up @@ -214,6 +215,10 @@ const (
CollationGBKBin = "gbk_bin"
// CollationGBKChineseCI is the default collation for CharsetGBK when new collation is enabled.
CollationGBKChineseCI = "gbk_chinese_ci"
// CollationGB18030Bin is the default collation for CharsetGB18030 when new collation is disabled.
CollationGB18030Bin = "gb18030_bin"
// CollationGB18030ChineseCI is the default collation for CharsetGB18030 when new collation is enabled.
CollationGB18030ChineseCI = "gb18030_chinese_ci"
)

const (
Expand All @@ -227,6 +232,8 @@ const (
CharsetUTF8 = "utf8"
// CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go.
CharsetUTF8MB4 = "utf8mb4"
// CharsetGB18030 represents 4 bytes gb18030.
CharsetGB18030 = "gb18030"
//revive:disable:exported
CharsetARMSCII8 = "armscii8"
CharsetBig5 = "big5"
Expand All @@ -241,7 +248,6 @@ const (
CharsetDEC8 = "dec8"
CharsetEUCJPMS = "eucjpms"
CharsetEUCKR = "euckr"
CharsetGB18030 = "gb18030"
CharsetGB2312 = "gb2312"
CharsetGBK = "gbk"
CharsetGEOSTD8 = "geostd8"
Expand Down
3 changes: 3 additions & 0 deletions parser/charset/encoding.go
Expand Up @@ -23,6 +23,7 @@ var (
_ Encoding = &encodingLatin1{}
_ Encoding = &encodingBin{}
_ Encoding = &encodingGBK{}
_ Encoding = &encodingGB18030{}
)

// IsSupportedEncoding checks if the charset is fully supported.
Expand Down Expand Up @@ -60,6 +61,7 @@ var encodingMap = map[string]Encoding{
CharsetLatin1: EncodingLatin1Impl,
CharsetBin: EncodingBinImpl,
CharsetASCII: EncodingASCIIImpl,
CharsetGB18030: EncodingGB18030Impl,
}

// Encoding provide encode/decode functions for a string with a specific charset.
Expand Down Expand Up @@ -98,6 +100,7 @@ const (
EncodingTpLatin1
EncodingTpBin
EncodingTpGBK
EncodingTpGB18030
)

//revive:enable
Expand Down

0 comments on commit b0f7d83

Please sign in to comment.