Skip to content

Commit

Permalink
PD failover (#74)
Browse files Browse the repository at this point in the history
* *: pd failover
  • Loading branch information
weekface committed Sep 7, 2018
1 parent 65afece commit 4885371
Show file tree
Hide file tree
Showing 26 changed files with 1,467 additions and 209 deletions.
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Environment for go invocations: vendor-directory support on, cgo disabled,
# targeting linux/amd64.
GOENV := GO15VENDOREXPERIMENT="1" CGO_ENABLED=0 GOOS=linux GOARCH=amd64
# go command run under $(GOENV).
GO := $(GOENV) go
# Unit-test command: verbose, with coverage. Note that it calls plain `go`,
# not $(GO), so tests run with the invoking environment rather than $(GOENV).
GOTEST := go test -v -cover

# -X flags that set version.BuildTS and version.GitSHA at link time.
# BuildTS is the UTC build time. %H (24-hour clock) is used because %I is the
# 12-hour clock and, with no AM/PM marker in the format, would make an
# afternoon build indistinguishable from a morning one.
LDFLAGS += -X "github.com/pingcap/tidb-operator/version.BuildTS=$(shell date -u '+%Y-%m-%d %H:%M:%S')"
# GitSHA is the commit hash that HEAD currently points to.
LDFLAGS += -X "github.com/pingcap/tidb-operator/version.GitSHA=$(shell git rev-parse HEAD)"
Expand Down Expand Up @@ -39,8 +40,7 @@ e2e-build:
$(GOENV) ginkgo build tests/e2e

# Run the unit tests under ./pkg exactly once via $(GOTEST).
# printf is used for the success message because the handling of "\n" by
# `echo` differs between shells (some print the backslash sequence literally).
test:
	@$(GOTEST) ./pkg/... && printf '\nUnit tests run successfully!\n'

check-all: lint check-static check-shadow check-gosec megacheck errcheck

Expand Down
46 changes: 37 additions & 9 deletions charts/tidb-cluster/templates/pd-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ data:
fi
source ${ANNOTATIONS} 2>/dev/null
PEER_SERVICE_DOMAIN="${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc"
SERVICE_DOMAIN="${SERVICE_NAME}.${NAMESPACE}.svc"
runmode=${runmode:-normal}
if [[ X${runmode} == Xdebug ]]
then
Expand All @@ -42,18 +45,35 @@ data:
elapseTime=0
period=1
threshold=30
while true;do
nslookup ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc 2>/dev/null
[[ $? -eq 0 ]] && break
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc failed" >&2
while true; do
sleep ${period}
elapseTime=$(( elapseTime+period ))
if [[ ${elapseTime} -ge ${threshold} ]];then
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc timeout" >&2
if [[ ${elapseTime} -ge ${threshold} ]]
then
echo "waiting for pd cluster ready timeout" >&2
exit 1
fi
source ${ANNOTATIONS} 2>/dev/null
if nslookup ${PEER_SERVICE_DOMAIN} 2>/dev/null
then
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc success"
if [[ ${ORDINAL} -eq 0 ]]
then
[[ -z ${bootstrapping:-} ]] && continue
[[ ${bootstrapping} == "true" ]] && break
fi
[[ -d /var/lib/pd/member/wal ]] && break
wget -qO- ${SERVICE_DOMAIN}:2379/pd/api/v1/members 2>/dev/null
[[ $? -eq 0 ]] && break
echo "pd cluster is not ready now: ${SERVICE_DOMAIN}"
else
echo "nslookup domain ${PEER_SERVICE_DOMAIN} failed" >&2
fi
done
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc success"
ARGS="--data-dir=/var/lib/pd \
--name=${HOSTNAME} \
Expand All @@ -64,14 +84,22 @@ data:
--config=/etc/pd/pd.toml \
"
if [[ ${ORDINAL} == "0" ]]
replicas=${replicas:-3}
if [[ ${ORDINAL} -eq 0 && ${bootstrapping:-} == "true" ]]
then
ARGS="${ARGS}--initial-cluster=${HOSTNAME}=http://${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc:2380"
else
if [[ ${ORDINAL} -eq 0 ]]
then
TOP=$((replicas-1))
else
TOP=$((ORDINAL-1))
fi
ARGS="${ARGS}--join="
TOP=$((ORDINAL-1))
for i in $(seq 0 ${TOP});
do
[[ ${i} -eq ${ORDINAL} ]] && continue
ARGS="${ARGS}http://${SET_NAME}-${i}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc:2380"
if [[ ${i} -lt ${TOP} ]]
then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ spec:
- /usr/local/bin/tidb-controller-manager
- -default-storage-class-name={{ .Values.defaultStorageClassName }}
- -cluster-scoped={{ .Values.clusterScoped }}
- -auto-failover={{ .Values.controllerManager.autoFailover | default false }}
- -pd-failover-period={{ .Values.controllerManager.pdFailoverPeriod | default "5m" }}
- -v={{ .Values.controllerManager.logLevel }}
env:
- name: NAMESPACE
Expand Down
4 changes: 2 additions & 2 deletions charts/tidb-operator/templates/controller-manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ rules:
verbs: ["create", "get", "update"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch","update"]
verbs: ["get", "list", "watch","update", "delete"]
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch", "patch","update"]
Expand Down Expand Up @@ -84,7 +84,7 @@ rules:
verbs: ["get", "list", "watch", "update", "delete"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
verbs: ["get", "list", "watch","update", "delete"]
- apiGroups: ["apps"]
resources: ["statefulsets"]
verbs: ["*"]
Expand Down
4 changes: 4 additions & 0 deletions charts/tidb-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ controllerManager:
requests:
cpu: 80m
memory: 50Mi
# autoFailover controls whether tidb-operator automatically fails over when a failure occurs
autoFailover: false
# pdFailoverPeriod is the PD failover period (default: 5m)
pdFailoverPeriod: 5m
20 changes: 12 additions & 8 deletions cmd/controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,15 @@ import (
)

var (
printVersion bool
workers int
leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second
retryPeriod = 3 * time.Second
resyncDuration = 30 * time.Second
waitDuration = 5 * time.Second
printVersion bool
workers int
pdFailoverPeriod time.Duration
autoFailover bool
leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second
retryPeriod = 3 * time.Second
resyncDuration = 30 * time.Second
waitDuration = 5 * time.Second
)

func init() {
Expand All @@ -53,6 +55,8 @@ func init() {
flag.IntVar(&workers, "workers", 5, "The number of workers that are allowed to sync concurrently. Larger number = more responsive management, but more CPU (and network) load")
flag.BoolVar(&controller.ClusterScoped, "cluster-scoped", true, "Whether tidb-operator should manage kubernetes cluster wide TiDB Clusters")
flag.StringVar(&controller.DefaultStorageClassName, "default-storage-class-name", "standard", "Default storage class name")
flag.BoolVar(&autoFailover, "auto-failover", false, "Auto failover")
flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)")

flag.Parse()
}
Expand Down Expand Up @@ -112,7 +116,7 @@ func main() {
},
}

tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory)
tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod)
stop := make(chan struct{})
defer close(stop)
go informerFactory.Start(stop)
Expand Down
19 changes: 16 additions & 3 deletions pkg/apis/pingcap.com/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package v1alpha1
import (
apps "k8s.io/api/apps/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"

corev1 "k8s.io/api/core/v1"
)
Expand Down Expand Up @@ -165,9 +166,10 @@ type ResourceRequirement struct {

// PDStatus is PD status
type PDStatus struct {
	Phase       MemberPhase             `json:"phase,omitempty"`
	StatefulSet *apps.StatefulSetStatus `json:"statefulSet,omitempty"`
	// Members maps a string key to the PDMember record for that member.
	// NOTE(review): the key is presumably the member name; confirm.
	Members map[string]PDMember `json:"members,omitempty"`
	// FailureMembers maps a string key to a PDFailureMember record.
	// NOTE(review): the key is presumably the failed pod/member name; confirm.
	FailureMembers map[string]PDFailureMember `json:"failureMembers,omitempty"`
}

// PDMember is PD member
Expand All @@ -178,6 +180,17 @@ type PDMember struct {
ID string `json:"id"`
ClientURL string `json:"clientURL"`
Health bool `json:"health"`
// Last time the health transitioned from one to another.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
}

// PDFailureMember is the pd failure member information
// Every field is serialized with omitempty, so zero values are left out of
// the JSON output.
type PDFailureMember struct {
// PodName is presumably the name of the pod backing the failed member; confirm.
PodName string `json:"podName,omitempty"`
// MemberID is presumably the PD member ID of the failed member; confirm.
MemberID string `json:"memberID,omitempty"`
// PVUID is a Kubernetes UID; presumably that of the member's persistent volume; confirm.
PVUID types.UID `json:"pvUID,omitempty"`
// NOTE(review): what Replicas counts is not visible here; confirm against the failover code.
Replicas int32 `json:"replicas,omitempty"`
// MemberDeleted presumably records whether the member has already been removed from the PD cluster; confirm.
MemberDeleted bool `json:"memberDeleted,omitempty"`
}

// TiDBStatus is TiDB status
Expand Down
24 changes: 24 additions & 0 deletions pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 4885371

Please sign in to comment.