forked from etcd-io/etcd
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_health.go
141 lines (122 loc) · 3.15 KB
/
cluster_health.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
package command
import (
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
"sort"
"strings"
"time"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
"github.com/coreos/etcd/etcdserver/stats"
)
// NewClusterHealthCommand returns the cli.Command for the "cluster-health"
// subcommand. The command declares no flags of its own and dispatches to
// handleClusterHealth.
func NewClusterHealthCommand() cli.Command {
	cmd := cli.Command{
		Name:  "cluster-health",
		Usage: "check the health of the etcd cluster",
	}
	cmd.Flags = []cli.Flag{}
	cmd.Action = handleClusterHealth
	return cmd
}
// handleClusterHealth implements the "cluster-health" command.
//
// It syncs with the cluster, locates a member that serves leader stats,
// samples the raft term/index and the leader's follower stats twice (about one
// second apart), and prints a verdict for the cluster followed by one sorted
// line per member. The process exits with status 1 when the cluster is judged
// unhealthy or its membership changed between the two samples, and with
// status 0 otherwise (even if individual followers are listed as unhealthy).
func handleClusterHealth(c *cli.Context) {
	// Every cluster-level failure below reports the same verdict and exit code.
	exitUnhealthy := func() {
		fmt.Println("cluster is unhealthy")
		os.Exit(1)
	}

	endpoints, err := getEndpoints(c)
	if err != nil {
		handleError(ErrorFromEtcd, err)
	}
	tr, err := getTransport(c)
	if err != nil {
		handleError(ErrorFromEtcd, err)
	}

	syncClient := etcd.NewClient(endpoints)
	syncClient.SetTransport(tr)
	if c.GlobalBool("debug") {
		go dumpCURL(syncClient)
	}
	if synced := syncClient.SyncCluster(); !synced {
		handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
	}

	// Do we have a leader? Take the first stats sample while finding out.
	leaderEP, statsBefore, err := getLeaderStats(tr, syncClient.GetCluster())
	if err != nil {
		exitUnhealthy()
	}

	// Is raft stable and making progress? Read twice through the endpoint
	// that served the leader stats, one second apart.
	leaderClient := etcd.NewClient([]string{leaderEP})
	leaderClient.SetTransport(tr)
	before, err := leaderClient.Get("/", false, false)
	if err != nil {
		exitUnhealthy()
	}
	termBefore, indexBefore := before.RaftTerm, before.RaftIndex

	time.Sleep(time.Second)

	after, err := leaderClient.Get("/", false, false)
	if err != nil {
		exitUnhealthy()
	}
	// A changed term or an index that did not move both count as unhealthy.
	if after.RaftTerm != termBefore || after.RaftIndex == indexBefore {
		exitUnhealthy()
	}

	// Are all the members making progress? Take the second stats sample.
	_, statsAfter, err := getLeaderStats(tr, []string{leaderEP})
	if err != nil {
		exitUnhealthy()
	}

	fmt.Println("cluster is healthy")

	// The member reported as leader is always listed as healthy.
	report := []string{fmt.Sprintf("member %s is healthy\n", statsAfter.Leader)}
	for name, old := range statsBefore.Followers {
		cur, present := statsAfter.Followers[name]
		if !present {
			fmt.Println("Cluster configuration changed during health checking. Please retry.")
			os.Exit(1)
		}
		// A follower whose success counter did not grow between the two
		// samples is reported as unhealthy.
		format := "member %s is healthy\n"
		if cur.Counts.Success <= old.Counts.Success {
			format = "member %s is unhealthy\n"
		}
		report = append(report, fmt.Sprintf(format, name))
	}

	// Map iteration order is random; sort for stable output.
	sort.Strings(report)
	fmt.Print(strings.Join(report, ""))
	os.Exit(0)
}
// getLeaderStats requests "/v2/stats/leader" from each endpoint in order and
// returns the first endpoint that answers with HTTP 200 and a body that
// decodes as stats.LeaderStats, together with the decoded stats. Endpoints
// that fail at any step are skipped silently. If none succeeds, it returns
// an empty endpoint, nil stats and a "no leader" error.
//
// NOTE(review): presumably only the current leader serves this path with a
// 200 status, which is why a success is treated as "leader found" — confirm
// against the server handler.
func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
	// go-etcd does not support cluster stats, use http client for now
	// TODO: use new etcd client with new member/stats endpoint
	httpclient := http.Client{
		Transport: tr,
	}

	// probe queries a single endpoint. It is a separate function so that the
	// deferred Body.Close runs before the next endpoint is tried, instead of
	// piling up open response bodies until getLeaderStats itself returns.
	probe := func(ep string) (*stats.LeaderStats, bool) {
		resp, err := httpclient.Get(ep + "/v2/stats/leader")
		if err != nil {
			return nil, false
		}
		defer resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			return nil, false
		}
		ls := &stats.LeaderStats{}
		if err := json.NewDecoder(resp.Body).Decode(ls); err != nil {
			return nil, false
		}
		return ls, true
	}

	for _, ep := range endpoints {
		if ls, ok := probe(ep); ok {
			return ep, ls, nil
		}
	}
	return "", nil, errors.New("no leader")
}