Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lightning: support latin1 source file encoding #44435

Merged
merged 10 commits into from
Jun 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions br/pkg/lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ type MydumperRuntime struct {
// - utf8mb4
// - GB18030
// - GBK: an extension of the GB2312 character set and is also known as Code Page 936.
// - latin1: MySQL latin1, decoded as Windows-1252 (not ISO 8859-1).
// - binary: no attempt to convert the encoding.
// Leaving DataCharacterSet empty makes it use `binary` by default.
DataCharacterSet string `toml:"data-character-set" json:"data-character-set"`
Expand Down
14 changes: 14 additions & 0 deletions br/pkg/lightning/mydump/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/pingcap/tidb/br/pkg/storage"
"github.com/spkg/bom"
"go.uber.org/zap"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/simplifiedchinese"
)

Expand Down Expand Up @@ -63,6 +64,19 @@ func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
return nil, errInvalidSchemaEncoding
}
data = decoded
case "latin1":
// use Windows1252 (not ISO 8859-1) to decode Latin1
// https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
decoded, err := charmap.Windows1252.NewDecoder().Bytes(data)
if err != nil {
return nil, errors.Trace(err)
}
// > Each byte that cannot be transcoded will be represented in the
// > output by the UTF-8 encoding of '\uFFFD'
if bytes.ContainsRune(decoded, '\ufffd') {
return nil, errInvalidSchemaEncoding
}
data = decoded
default:
return nil, errors.Errorf("Unsupported encoding %s", characterSet)
}
Expand Down
8 changes: 8 additions & 0 deletions br/tests/lightning_character_sets/latin1-only-schema.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Test config for the latin1 case in which only `character-set` is latin1;
# `data-character-set` is deliberately left unset.
[lightning]
table-concurrency = 1

[tikv-importer]
backend = "local"

[mydumper]
# NOTE(review): presumably this controls decoding of schema files only (as the
# file name suggests), leaving data files undecoded — confirm.
character-set = "latin1"
9 changes: 9 additions & 0 deletions br/tests/lightning_character_sets/latin1.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Test config for the latin1 case in which both `character-set` and
# `data-character-set` are set to latin1.
[lightning]
table-concurrency = 1

[tikv-importer]
backend = "local"

[mydumper]
# NOTE(review): presumably `character-set` covers schema files and
# `data-character-set` covers data files — confirm.
character-set = "latin1"
data-character-set = "latin1"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create database charsets;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create table `latin1` (`��` int primary key comment '��', `data` varchar(20));
3 changes: 3 additions & 0 deletions br/tests/lightning_character_sets/latin1/charsets.latin1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"ÏÐ","data"
1,"‘’“”"
2,"¡¢£¤"
13 changes: 13 additions & 0 deletions br/tests/lightning_character_sets/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,16 @@ run_sql 'TRUNCATE TABLE charsets.greek;'
run_lightning --config "tests/$TEST_NAME/greek.toml" -d "tests/$TEST_NAME/greek" --backend tidb
run_sql "SELECT count(*) FROM charsets.greek WHERE c = 'α';"
check_contains 'count(*): 1'

# latin1
# wrong encoding will have wrong column name and data
# Case 1: binary config (no conversion); the import is expected to report
# "unknown columns in header".
run_lightning --config "tests/$TEST_NAME/binary.toml" -d "tests/$TEST_NAME/latin1" 2>&1 | grep -q "unknown columns in header"
# NOTE(review): presumably drops the table left behind by the run above so the
# following cases start clean — confirm.
run_sql 'DROP TABLE charsets.latin1;'
# Case 2: utf8mb4 config; the import is expected to report "invalid schema encoding".
run_lightning --config "tests/$TEST_NAME/utf8mb4.toml" -d "tests/$TEST_NAME/latin1" 2>&1 | grep -q "invalid schema encoding"
# Case 3: latin1 set only via `character-set`; the import is still expected to
# report "unknown columns in header".
run_lightning --config "tests/$TEST_NAME/latin1-only-schema.toml" -d "tests/$TEST_NAME/latin1" 2>&1 | grep -q "unknown columns in header"
# Case 4: latin1 for both `character-set` and `data-character-set`; the import
# output is not grepped, and the imported rows are verified below.
run_lightning --config "tests/$TEST_NAME/latin1.toml" -d "tests/$TEST_NAME/latin1"
run_sql 'SELECT * FROM charsets.latin1'
# Expect both rows, with the column name and data shown as the decoded characters.
check_contains 'ÏÐ: 1'
check_contains 'data: ‘’“”'
check_contains 'ÏÐ: 2'
check_contains 'data: ¡¢£¤'
1 change: 1 addition & 0 deletions br/tidb-lightning.toml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ no-schema = false
# - utf8mb4: Indicates that the source data file uses UTF-8 encoding.
# - GB18030: Indicates that the source data file uses the GB-18030 encoding.
# - GBK: The source data file uses GBK encoding (GBK encoding is an extension of the GB-2312 character set, also known as Code Page 936).
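# - latin1: Indicates that the source data file uses the MySQL latin1 encoding, which is decoded as Windows-1252 (not ISO 8859-1).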
# - binary: Indicates that Lightning does not convert the encoding (by default).
# If left blank, the default value "binary" is used, that is to say, Lightning does not convert the encoding.
# Note that Lightning does not detect the character set of the source data file; it only converts the source file and imports the data based on this configuration.
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ require (
gopkg.in/yaml.v2 v2.4.0
honnef.co/go/tools v0.4.3
k8s.io/api v0.27.2
k8s.io/utils v0.0.0-20230209194617-a36077c30491
sourcegraph.com/sourcegraph/appdash v0.0.0-20190731080439-ebfcffb1b5c0
sourcegraph.com/sourcegraph/appdash-data v0.0.0-20151005221446-73f23eafcf67
)
Expand Down Expand Up @@ -282,7 +283,6 @@ require (
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apimachinery v0.27.2 // indirect
k8s.io/klog/v2 v2.90.1 // indirect
k8s.io/utils v0.0.0-20230209194617-a36077c30491 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
Expand Down