Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lighting/parser: align NULL and ESCAPED BY with LOAD DATA #40909

Merged
merged 11 commits into from
Feb 13, 2023
76 changes: 60 additions & 16 deletions br/pkg/lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,19 +477,47 @@ type PostRestore struct {
Compact bool `toml:"compact" json:"compact"`
}

// StringOrStringSlice can unmarshal a TOML string as string slice with one element.
type StringOrStringSlice []string

func (s *StringOrStringSlice) UnmarshalTOML(in interface{}) error {
switch v := in.(type) {
case string:
*s = []string{v}
case []interface{}:
*s = make([]string, 0, len(v))
for _, vv := range v {
vs, ok := vv.(string)
if !ok {
return errors.Errorf("invalid string slice '%v'", in)
}
*s = append(*s, vs)
}
default:
return errors.Errorf("invalid string slice '%v'", in)
}
return nil
}

type CSVConfig struct {
// Separator, Delimiter and Terminator should all be in utf8mb4 encoding.
Separator string `toml:"separator" json:"separator"`
Delimiter string `toml:"delimiter" json:"delimiter"`
Terminator string `toml:"terminator" json:"terminator"`
Null string `toml:"null" json:"null"`
Header bool `toml:"header" json:"header"`
TrimLastSep bool `toml:"trim-last-separator" json:"trim-last-separator"`
NotNull bool `toml:"not-null" json:"not-null"`
BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"`
Separator string `toml:"separator" json:"separator"`
Delimiter string `toml:"delimiter" json:"delimiter"`
Terminator string `toml:"terminator" json:"terminator"`
Null StringOrStringSlice `toml:"null" json:"null"`
Header bool `toml:"header" json:"header"`
TrimLastSep bool `toml:"trim-last-separator" json:"trim-last-separator"`
NotNull bool `toml:"not-null" json:"not-null"`
// deprecated, use `escaped-by` instead.
BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"`
// EscapedBy has higher priority than BackslashEscape, currently it must be a single character if set.
EscapedBy string `toml:"escaped-by" json:"escaped-by"`
// hide these options for lightning configuration file, they can only be used by LOAD DATA
// https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-field-line-handling
StartingBy string `toml:"-" json:"-"`
// for non-empty Delimiter (for example quotes), Null elements inside quotes are not considered as null except for
sleepymole marked this conversation as resolved.
Show resolved Hide resolved
// `\N` (when escape-by is `\`). That is to say, `\N` is special for null because it always means null.
QuotedNullIsText bool
}

type MydumperRuntime struct {
Expand Down Expand Up @@ -747,8 +775,9 @@ func NewConfig() *Config {
Delimiter: `"`,
Header: true,
NotNull: false,
Null: `\N`,
Null: []string{`\N`},
BackslashEscape: true,
EscapedBy: `\`,
TrimLastSep: false,
},
StrictFormat: false,
Expand Down Expand Up @@ -880,15 +909,30 @@ func (cfg *Config) Adjust(ctx context.Context) error {
return common.ErrInvalidConfig.GenWithStack("`mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other")
}

if csv.BackslashEscape {
if csv.Separator == `\` {
return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV separator when `mydumper.csv.backslash-escape` is true")
if len(csv.EscapedBy) > 1 {
return common.ErrInvalidConfig.GenWithStack("`mydumper.csv.escaped-by` must be empty or a single character")
}
if csv.BackslashEscape && csv.EscapedBy == "" {
csv.EscapedBy = `\`
}
if !csv.BackslashEscape && csv.EscapedBy == `\` {
csv.EscapedBy = ""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is csv.EscapedBy set to empty if it is set in the config but backslashEscape is false?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently the default value of EscapedBy is \, and BackslashEscape is hidden and has default value true. So if BackslashEscape is changed to false, it means this is an old format configuration file with false BackslashEscape, we should disable escaping

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lgtm

}

// keep compatibility with old behaviour
if !csv.NotNull && len(csv.Null) == 0 {
csv.Null = []string{""}
}

if len(csv.EscapedBy) > 0 {
if csv.Separator == csv.EscapedBy {
return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV separator and `mydumper.csv.escaped-by`", csv.EscapedBy)
}
if csv.Delimiter == `\` {
return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV delimiter when `mydumper.csv.backslash-escape` is true")
if csv.Delimiter == csv.EscapedBy {
return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV delimiter and `mydumper.csv.escaped-by`", csv.EscapedBy)
}
if csv.Terminator == `\` {
return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV terminator when `mydumper.csv.backslash-escape` is true")
if csv.Terminator == csv.EscapedBy {
return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV terminator and `mydumper.csv.escaped-by`", csv.EscapedBy)
}
}

Expand Down
27 changes: 23 additions & 4 deletions br/pkg/lightning/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,15 +477,15 @@ func TestInvalidCSV(t *testing.T) {
separator = '\'
backslash-escape = true
`,
err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' as CSV separator when `mydumper.csv.backslash-escape` is true",
err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' both as CSV separator and `mydumper.csv.escaped-by`",
},
{
input: `
[mydumper.csv]
delimiter = '\'
backslash-escape = true
escaped-by = '\'
`,
err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' as CSV delimiter when `mydumper.csv.backslash-escape` is true",
err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' both as CSV delimiter and `mydumper.csv.escaped-by`",
},
{
input: `
Expand Down Expand Up @@ -528,7 +528,7 @@ func TestInvalidCSV(t *testing.T) {
if tc.err != "" {
require.EqualError(t, err, tc.err, comment)
} else {
require.NoError(t, err)
require.NoError(t, err, tc.input)
}
}
}
Expand All @@ -543,6 +543,25 @@ func TestInvalidTOML(t *testing.T) {
require.EqualError(t, err, "toml: line 2: expected '.' or '=', but got '[' instead")
}

func TestStringOrStringSlice(t *testing.T) {
cfg := &config.Config{}
err := cfg.LoadFromTOML([]byte(`
[mydumper.csv]
null = '\N'
`))
require.NoError(t, err)
err = cfg.LoadFromTOML([]byte(`
[mydumper.csv]
null = [ '\N', 'NULL' ]
`))
require.NoError(t, err)
err = cfg.LoadFromTOML([]byte(`
[mydumper.csv]
null = [ '\N', 123 ]
`))
require.ErrorContains(t, err, "invalid string slice")
}

func TestTOMLUnusedKeys(t *testing.T) {
cfg := &config.Config{}
err := cfg.LoadFromTOML([]byte(`
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/mydump/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ go_library(
"@com_github_xitongsys_parquet_go//parquet",
"@com_github_xitongsys_parquet_go//reader",
"@com_github_xitongsys_parquet_go//source",
"@org_golang_x_exp//slices",
"@org_golang_x_text//encoding",
"@org_golang_x_text//encoding/simplifiedchinese",
"@org_uber_go_zap//:zap",
Expand Down