Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PD failover #74

Merged
merged 16 commits into from
Sep 7, 2018
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
GOENV := GO15VENDOREXPERIMENT="1" CGO_ENABLED=0 GOOS=linux GOARCH=amd64
GO := $(GOENV) go
GOENV := GO15VENDOREXPERIMENT="1" CGO_ENABLED=0 GOOS=linux GOARCH=amd64
GO := $(GOENV) go
GOTEST := go test -v -cover

LDFLAGS += -X "github.com/pingcap/tidb-operator/version.BuildTS=$(shell date -u '+%Y-%m-%d %I:%M:%S')"
LDFLAGS += -X "github.com/pingcap/tidb-operator/version.GitSHA=$(shell git rev-parse HEAD)"
Expand Down Expand Up @@ -39,8 +40,7 @@ e2e-build:
$(GOENV) ginkgo build tests/e2e

test:
@echo "run unit tests"
@$(GO) test ./pkg/... -v -cover && echo success
@$(GOTEST) ./pkg/... && echo "\nUnit tests run successfully!"

check-all: lint check-static check-shadow check-gosec megacheck errcheck

Expand Down
46 changes: 37 additions & 9 deletions charts/tidb-cluster/templates/pd-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ data:
fi
source ${ANNOTATIONS} 2>/dev/null

PEER_SERVICE_DOMAIN="${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc"
SERVICE_DOMAIN="${SERVICE_NAME}.${NAMESPACE}.svc"

runmode=${runmode:-normal}
if [[ X${runmode} == Xdebug ]]
then
Expand All @@ -42,18 +45,35 @@ data:
elapseTime=0
period=1
threshold=30
while true;do
nslookup ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc 2>/dev/null
[[ $? -eq 0 ]] && break
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc failed" >&2
while true; do
sleep ${period}
elapseTime=$(( elapseTime+period ))
if [[ ${elapseTime} -ge ${threshold} ]];then
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc timeout" >&2

if [[ ${elapseTime} -ge ${threshold} ]]
then
echo "waiting for pd cluster ready timeout" >&2
exit 1
fi

source ${ANNOTATIONS} 2>/dev/null
if nslookup ${PEER_SERVICE_DOMAIN} 2>/dev/null
then
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc success"

if [[ ${ORDINAL} -eq 0 ]]
then
[[ -z ${bootstrapping:-} ]] && continue
[[ ${bootstrapping} == "true" ]] && break
fi

[[ -d /var/lib/pd/member/wal ]] && break
wget -qO- ${SERVICE_DOMAIN}:2379/pd/api/v1/members 2>/dev/null
[[ $? -eq 0 ]] && break
echo "pd cluster is not ready now: ${SERVICE_DOMAIN}"
else
echo "nslookup domain ${PEER_SERVICE_DOMAIN} failed" >&2
fi
done
echo "nslookup domain ${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc success"

ARGS="--data-dir=/var/lib/pd \
--name=${HOSTNAME} \
Expand All @@ -64,14 +84,22 @@ data:
--config=/etc/pd/pd.toml \
"

if [[ ${ORDINAL} == "0" ]]
replicas=${replicas:-3}
if [[ ${ORDINAL} -eq 0 && ${bootstrapping:-} == "true" ]]
then
ARGS="${ARGS}--initial-cluster=${HOSTNAME}=http://${HOSTNAME}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc:2380"
else
if [[ ${ORDINAL} -eq 0 ]]
then
TOP=$((replicas-1))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`replicas` is undefined; define it as follows:

replicas=${replicas:-3}

else
TOP=$((ORDINAL-1))
fi

ARGS="${ARGS}--join="
TOP=$((ORDINAL-1))
for i in $(seq 0 ${TOP});
do
[[ ${i} -eq ${ORDINAL} ]] && continue
ARGS="${ARGS}http://${SET_NAME}-${i}.${PEER_SERVICE_NAME}.${NAMESPACE}.svc:2380"
if [[ ${i} -lt ${TOP} ]]
then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ spec:
- /usr/local/bin/tidb-controller-manager
- -default-storage-class-name={{ .Values.defaultStorageClassName }}
- -cluster-scoped={{ .Values.clusterScoped }}
- -auto-failover={{ .Values.controllerManager.autoFailover | default false }}
- -pd-failover-period={{ .Values.controllerManager.pdFailoverPeriod | default "5m" }}
- -v={{ .Values.controllerManager.logLevel }}
env:
- name: NAMESPACE
Expand Down
4 changes: 2 additions & 2 deletions charts/tidb-operator/templates/controller-manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ rules:
verbs: ["create", "get", "update"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch","update"]
verbs: ["get", "list", "watch","update", "delete"]
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch", "patch","update"]
Expand Down Expand Up @@ -84,7 +84,7 @@ rules:
verbs: ["get", "list", "watch", "update", "delete"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
verbs: ["get", "list", "watch","update", "delete"]
- apiGroups: ["apps"]
resources: ["statefulsets"]
verbs: ["*"]
Expand Down
4 changes: 4 additions & 0 deletions charts/tidb-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ controllerManager:
requests:
cpu: 80m
memory: 50Mi
# autoFailover controls whether tidb-operator automatically fails over when a failure occurs
autoFailover: false
# pdFailoverPeriod is the PD failover period (default: 5m)
pdFailoverPeriod: 5m
20 changes: 12 additions & 8 deletions cmd/controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,15 @@ import (
)

var (
printVersion bool
workers int
leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second
retryPeriod = 3 * time.Second
resyncDuration = 30 * time.Second
waitDuration = 5 * time.Second
printVersion bool
workers int
pdFailoverPeriod time.Duration
autoFailover bool
leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second
retryPeriod = 3 * time.Second
resyncDuration = 30 * time.Second
waitDuration = 5 * time.Second
)

func init() {
Expand All @@ -53,6 +55,8 @@ func init() {
flag.IntVar(&workers, "workers", 5, "The number of workers that are allowed to sync concurrently. Larger number = more responsive management, but more CPU (and network) load")
flag.BoolVar(&controller.ClusterScoped, "cluster-scoped", true, "Whether tidb-operator should manage kubernetes cluster wide TiDB Clusters")
flag.StringVar(&controller.DefaultStorageClassName, "default-storage-class-name", "standard", "Default storage class name")
flag.BoolVar(&autoFailover, "auto-failover", false, "Auto failover")
flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)")

flag.Parse()
}
Expand Down Expand Up @@ -112,7 +116,7 @@ func main() {
},
}

tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory)
tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod)
stop := make(chan struct{})
defer close(stop)
go informerFactory.Start(stop)
Expand Down
19 changes: 16 additions & 3 deletions pkg/apis/pingcap.com/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package v1alpha1
import (
apps "k8s.io/api/apps/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"

corev1 "k8s.io/api/core/v1"
)
Expand Down Expand Up @@ -165,9 +166,10 @@ type ResourceRequirement struct {

// PDStatus is PD status
type PDStatus struct {
Phase MemberPhase `json:"phase,omitempty"`
StatefulSet *apps.StatefulSetStatus `json:"statefulSet,omitempty"`
Members map[string]PDMember `json:"members,omitempty"`
Phase MemberPhase `json:"phase,omitempty"`
StatefulSet *apps.StatefulSetStatus `json:"statefulSet,omitempty"`
Members map[string]PDMember `json:"members,omitempty"`
FailureMembers map[string]PDFailureMember `json:"failureMembers,omitempty"`
}

// PDMember is PD member
Expand All @@ -178,6 +180,17 @@ type PDMember struct {
ID string `json:"id"`
ClientURL string `json:"clientURL"`
Health bool `json:"health"`
// Last time the health transitioned from one to another.
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
}

// PDFailureMember holds the information recorded for a PD failure member.
// It is the value type of the PDStatus.FailureMembers map.
// Every field carries `omitempty`, so zero values are left out of the JSON encoding.
// NOTE(review): the per-field descriptions below are inferred from field names
// and types only; confirm them against the failover logic that populates this struct.
type PDFailureMember struct {
// PodName is presumably the name of the pod backing the failed member — TODO confirm.
PodName string `json:"podName,omitempty"`
// MemberID is presumably the PD member ID of the failed member — TODO confirm.
MemberID string `json:"memberID,omitempty"`
// PVUID is a Kubernetes UID (k8s.io/apimachinery/pkg/types.UID); presumably the UID
// of the persistent volume used by the failed member — TODO confirm.
PVUID types.UID `json:"pvUID,omitempty"`
// Replicas is a replica count; which object's replica count, and at what moment
// it is captured, is not visible here — TODO confirm.
Replicas int32 `json:"replicas,omitempty"`
// MemberDeleted presumably reports whether the failed member has already been
// removed from the PD cluster — TODO confirm.
MemberDeleted bool `json:"memberDeleted,omitempty"`
}

// TiDBStatus is TiDB status
Expand Down
24 changes: 24 additions & 0 deletions pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.