Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

br: fix the issue that config not paused with ttl correctly #31725

Merged
merged 13 commits into from
May 10, 2022
17 changes: 13 additions & 4 deletions br/pkg/pdutil/pd.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const (
schedulerPrefix = "pd/api/v1/schedulers"
maxMsgSize = int(128 * units.MiB) // pd.ScanRegion may return a large response
scheduleConfigPrefix = "pd/api/v1/config/schedule"
configPrefix = "pd/api/v1/config"
pauseTimeout = 5 * time.Minute

// pd request retry time when the connection fails
Expand Down Expand Up @@ -541,12 +542,20 @@ func (p *PdController) UpdatePDScheduleConfig(ctx context.Context) error {
func (p *PdController) doUpdatePDScheduleConfig(
ctx context.Context, cfg map[string]interface{}, post pdHTTPRequest, prefixs ...string,
) error {
prefix := scheduleConfigPrefix
prefix := configPrefix
if len(prefixs) != 0 {
prefix = prefixs[0]
}
newCfg := make(map[string]interface{})
for k, v := range cfg {
// To use a TTL, the request must go through the generic config prefix,
// which means each key in cfg has to be converted from e.g. "max-merge-region-keys" to "schedule.max-merge-region-keys".
sc := fmt.Sprintf("schedule.%s", k)
newCfg[sc] = v
}

for _, addr := range p.addrs {
reqData, err := json.Marshal(cfg)
reqData, err := json.Marshal(newCfg)
if err != nil {
return errors.Trace(err)
}
Expand All @@ -562,7 +571,7 @@ func (p *PdController) doUpdatePDScheduleConfig(

func (p *PdController) doPauseConfigs(ctx context.Context, cfg map[string]interface{}, post pdHTTPRequest) error {
// pause these configs with a TTL of pauseTimeout (300 seconds)
prefix := fmt.Sprintf("%s?ttlSecond=%.0f", scheduleConfigPrefix, pauseTimeout.Seconds())
prefix := fmt.Sprintf("%s?ttlSecond=%.0f", configPrefix, pauseTimeout.Seconds())
return p.doUpdatePDScheduleConfig(ctx, cfg, post, prefix)
}

Expand All @@ -584,7 +593,7 @@ func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg Cluster
prefix := make([]string, 0, 1)
if pd.isPauseConfigEnabled() {
// set the config's TTL to zero so that the temporary config becomes invalid immediately.
prefix = append(prefix, fmt.Sprintf("%s?ttlSecond=%d", scheduleConfigPrefix, 0))
prefix = append(prefix, fmt.Sprintf("%s?ttlSecond=%d", configPrefix, 0))
}
// reset config with previous value.
if err := pd.doUpdatePDScheduleConfig(ctx, mergeCfg, pdRequest, prefix...); err != nil {
Expand Down
21 changes: 17 additions & 4 deletions br/tests/br_other/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,23 @@ run_curl "https://localhost:$PPROF_PORT/debug/pprof/trace?seconds=1" &>/dev/null
echo "pprof started..."

run_curl https://$PD_ADDR/pd/api/v1/config/schedule | grep '"disable": false'
run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."enable-location-replacement"' | grep "false"
run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-pending-peer-count"' | grep "2147483647"
run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-size"' | grep -E "^0$"
run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-keys"' | grep -E "^0$"

# After applying the pause API, these configs won't change any more.
# run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."enable-location-replacement"' | grep "false"
# run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-pending-peer-count"' | grep "2147483647"
# run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-size"' | grep -E "^0$"
# run_curl https://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-keys"' | grep -E "^0$"

# After https://github.com/tikv/pd/pull/4781 was merged, we can use a hacky way
# to check whether pausing the configs succeeded:
# set a paused config again and expect the request to fail.
run_pd_ctl -u https://$PD_ADDR config set max-merge-region-size 0 | grep -q "need to clean up TTL first for schedule.max-merge-region-size"
run_pd_ctl -u https://$PD_ADDR config set max-merge-region-keys 0 | grep -q "need to clean up TTL first for schedule.max-merge-region-keys"
run_pd_ctl -u https://$PD_ADDR config set max-pending-peer-count 0 | grep -q "need to clean up TTL first for schedule.max-pending-peer-count"
run_pd_ctl -u https://$PD_ADDR config set enable-location-replacement false | grep -q "need to clean up TTL first for schedule.enable-location-replacement"
run_pd_ctl -u https://$PD_ADDR config set leader-schedule-limit 0 | grep -q "need to clean up TTL first for schedule.leader-schedule-limit"
run_pd_ctl -u https://$PD_ADDR config set region-schedule-limit 0 | grep -q "need to clean up TTL first for schedule.region-schedule-limit"
run_pd_ctl -u https://$PD_ADDR config set max-snapshot-count 0 | grep -q "need to clean up TTL first for schedule.max-snapshot-count"

backup_fail=0
# generate 1.sst to make another backup failed.
Expand Down