-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
1,035 additions
and
582 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
language: go | ||
|
||
before_script: | ||
- go get golang.org/x/tools/cmd/cover | ||
- go get github.com/mattn/goveralls | ||
|
||
script: | ||
- go test -covermode=count -coverprofile=profile.cov ./... | ||
- goveralls -coverprofile=profile.cov -service=travis-ci |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
package healthcheck | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/m-lab/rebot/node" | ||
"github.com/m-lab/rebot/promtest" | ||
"github.com/prometheus/common/model" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
var ( | ||
// NodeQuery is the Prometheus query template used to find offline nodes. | ||
// Every %[1]d verb is filled with the look-back window in minutes via | ||
// fmt.Sprintf. It selects machines whose ssh806 probe never succeeded in the | ||
// window, unless the ssh probe for the same machine succeeded at least once. | ||
// label_replace copies the second dot-separated component of the "machine" | ||
// label into a "site" label. Results are then dropped for machines or sites | ||
// in maintenance, lame-duck nodes, machines with fewer than 14 ssh806 probe | ||
// samples in the window, and machines whose .s2c_snaplog creation rate is | ||
// above zero. | ||
NodeQuery = `(label_replace(sum_over_time(probe_success{service="ssh806", module="ssh_v4_online"}[%[1]dm]) == 0, | ||
"site", "$1", "machine", ".+?\\.(.+?)\\..+") | ||
unless on (machine) | ||
label_replace(sum_over_time(probe_success{service="ssh", module="ssh_v4_online"}[%[1]dm]) > 0, | ||
"site", "$1", "machine", ".+?\\.(.+?)\\..+")) | ||
unless on(machine) gmx_machine_maintenance == 1 | ||
unless on(site) gmx_site_maintenance == 1 | ||
unless on (machine) lame_duck_node == 1 | ||
unless on (machine) count_over_time(probe_success{service="ssh806", module="ssh_v4_online"}[%[1]dm]) < 14 | ||
unless on (machine) rate(inotify_extension_create_total{ext=".s2c_snaplog"}[%[1]dm]) > 0` | ||
|
||
// SwitchQuery is the Prometheus query used to find offline switches: | ||
// instances matching s1.* whose icmp probe never succeeded in the last 15 | ||
// minutes, excluding sites in maintenance. To determine if a switch is | ||
// offline, pings are generally more reliable than SNMP scraping. | ||
SwitchQuery = `sum_over_time(probe_success{instance=~"s1.*", module="icmp"}[15m]) == 0 unless on(site) gmx_site_maintenance == 1` | ||
) | ||
|
||
// getOfflineSites checks for offline switches in the last N minutes. | ||
// It returns a sitename -> Sample map. | ||
func getOfflineSites(prom promtest.PromClient) (map[string]*model.Sample, error) { | ||
offline := make(map[string]*model.Sample) | ||
|
||
values, err := prom.Query(context.Background(), SwitchQuery, time.Now()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
for _, s := range values.(model.Vector) { | ||
offline[string(s.Metric["site"])] = s | ||
log.WithFields(log.Fields{"site": s.Metric["site"]}).Warn("Offline switch found.") | ||
} | ||
|
||
return offline, err | ||
} | ||
|
||
// getOfflineNodes checks for offline nodes in the last N minutes. | ||
// It returns a Vector of samples. | ||
func getOfflineNodes(prom promtest.PromClient, minutes int) ([]node.Node, error) { | ||
values, err := prom.Query(context.Background(), fmt.Sprintf(NodeQuery, minutes), time.Now()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
if len(values.(model.Vector)) != 0 { | ||
log.WithFields(log.Fields{"nodes": values}).Warn("Offline nodes found.") | ||
} | ||
|
||
candidates := make([]node.Node, 0) | ||
|
||
for _, sample := range values.(model.Vector) { | ||
site := sample.Metric["site"] | ||
machine := sample.Metric["machine"] | ||
log.Info("adding " + string(machine)) | ||
candidates = append(candidates, node.Node{ | ||
Name: string(machine), | ||
Site: string(site), | ||
}) | ||
} | ||
|
||
return candidates, nil | ||
} | ||
|
||
func filterOfflineSites(sites map[string]*model.Sample, toFilter []node.Node) []node.Node { | ||
|
||
filtered := make([]node.Node, 0) | ||
|
||
for _, c := range toFilter { | ||
// Ignore machines in sites where the switch is offline. | ||
site := c.Site | ||
machine := c.Name | ||
if _, ok := sites[site]; !ok { | ||
filtered = append(filtered, c) | ||
} else { | ||
log.Info("Ignoring " + machine + " as the switch is offline.") | ||
} | ||
} | ||
|
||
return filtered | ||
} | ||
|
||
func GetRebootable(prom promtest.PromClient, minutes int) ([]node.Node, error) { | ||
// Query for offline switches | ||
sites, err := getOfflineSites(prom) | ||
if err != nil { | ||
log.Error("Unable to retrieve offline sites from Prometheus") | ||
return nil, err | ||
} | ||
|
||
// Query for offline nodes | ||
nodes, err := getOfflineNodes(prom, minutes) | ||
if err != nil { | ||
log.Error("Unable to retrieve offline nodes from Prometheus") | ||
return nil, err | ||
} | ||
|
||
offline := filterOfflineSites(sites, nodes) | ||
return offline, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
package healthcheck | ||
|
||
import ( | ||
"fmt" | ||
"reflect" | ||
"testing" | ||
"time" | ||
|
||
"github.com/m-lab/rebot/node" | ||
"github.com/m-lab/rebot/promtest" | ||
"github.com/prometheus/common/model" | ||
) | ||
|
||
var ( | ||
fakeProm *promtest.PrometheusMockClient | ||
fakePromErr *promtest.PrometheusMockClient | ||
fakeOfflineSwitch *model.Sample | ||
fakeOfflineNode *model.Sample | ||
|
||
offlineNodes model.Vector | ||
|
||
testMins = 15 | ||
) | ||
|
||
func init() { | ||
fakeProm = promtest.NewPrometheusMockClient() | ||
// This client does not have any registered query, thus it always | ||
// returns an error. | ||
fakePromErr = promtest.NewPrometheusMockClient() | ||
|
||
now := model.Time(time.Now().Unix()) | ||
|
||
fakeOfflineSwitch = promtest.CreateSample(map[string]string{ | ||
"instance": "s1.iad0t.measurement-lab.org", | ||
"job": "blackbox-targets", | ||
"module": "icmp", | ||
"site": "iad0t", | ||
}, 0, now) | ||
|
||
var offlineSwitches = model.Vector{ | ||
fakeOfflineSwitch, | ||
} | ||
|
||
fakeOfflineNode = promtest.CreateSample(map[string]string{ | ||
"instance": "mlab1.iad0t.measurement-lab.org:806", | ||
"job": "blackbox-targets", | ||
"machine": "mlab1.iad0t.measurement-lab.org", | ||
"module": "ssh_v4_online", | ||
"service": "ssh806", | ||
"site": "iad0t", | ||
}, 0, now) | ||
|
||
offlineNodes = model.Vector{ | ||
fakeOfflineNode, | ||
} | ||
|
||
fakeProm.Register(SwitchQuery, offlineSwitches, nil) | ||
fakeProm.Register(fmt.Sprintf(NodeQuery, testMins), offlineNodes, nil) | ||
} | ||
|
||
func Test_getOfflineSites(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
prom promtest.PromClient | ||
want map[string]*model.Sample | ||
wantErr bool | ||
}{ | ||
{ | ||
name: "success", | ||
want: map[string]*model.Sample{ | ||
"iad0t": fakeOfflineSwitch, | ||
}, | ||
prom: fakeProm, | ||
}, | ||
{ | ||
name: "error", | ||
prom: fakePromErr, | ||
wantErr: true, | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
got, err := getOfflineSites(tt.prom) | ||
if (err != nil) != tt.wantErr { | ||
t.Errorf("getOfflineSites() error = %v, wantErr %v", err, tt.wantErr) | ||
return | ||
} | ||
if !reflect.DeepEqual(got, tt.want) { | ||
t.Errorf("getOfflineSites() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func Test_getOfflineNodes(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
prom promtest.PromClient | ||
minutes int | ||
want []node.Node | ||
wantErr bool | ||
}{ | ||
{ | ||
name: "success", | ||
prom: fakeProm, | ||
minutes: testMins, | ||
want: []node.Node{ | ||
node.New("mlab1.iad0t.measurement-lab.org", "iad0t"), | ||
}, | ||
}, | ||
{ | ||
name: "error", | ||
prom: fakePromErr, | ||
minutes: testMins, | ||
wantErr: true, | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
got, err := getOfflineNodes(tt.prom, tt.minutes) | ||
if (err != nil) != tt.wantErr { | ||
t.Errorf("getOfflineNodes() error = %v, wantErr %v", err, tt.wantErr) | ||
return | ||
} | ||
if !reflect.DeepEqual(got, tt.want) { | ||
t.Errorf("getOfflineNodes() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func Test_filterOfflineSites(t *testing.T) { | ||
|
||
candidates := []node.Node{ | ||
node.New("mlab1.iad0t.measurement-lab.org", "iad0t"), | ||
} | ||
|
||
tests := []struct { | ||
name string | ||
sites map[string]*model.Sample | ||
candidates []node.Node | ||
want []node.Node | ||
}{ | ||
{ | ||
name: "success-filtered-node-when-site-offline", | ||
sites: map[string]*model.Sample{ | ||
"iad0t": fakeOfflineSwitch, | ||
}, | ||
candidates: candidates, | ||
want: []node.Node{}, | ||
}, | ||
{ | ||
name: "success-offline-node-returned", | ||
sites: map[string]*model.Sample{ | ||
"iad1t": fakeOfflineSwitch, | ||
}, | ||
candidates: candidates, | ||
want: []node.Node{ | ||
node.New("mlab1.iad0t.measurement-lab.org", "iad0t"), | ||
}, | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
if got := filterOfflineSites(tt.sites, tt.candidates); !(len(got) == 0 && len(tt.want) == 0) && !reflect.DeepEqual(got, tt.want) { | ||
t.Errorf("filterOfflineSites() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestGetRebootable(t *testing.T) { | ||
|
||
t.Run("success", func(t *testing.T) { | ||
got, err := GetRebootable(fakeProm, testMins) | ||
|
||
if err != nil { | ||
t.Errorf("GetRebootable() error = %v, wantErr %v", err, false) | ||
return | ||
} | ||
|
||
if !reflect.DeepEqual(got, []node.Node{}) { | ||
t.Errorf("GetRebootable() = %v, want %v", got, []node.Node{}) | ||
} | ||
}) | ||
|
||
t.Run("error-retrieving-sites", func(t *testing.T) { | ||
_, err := GetRebootable(fakePromErr, testMins) | ||
|
||
if err == nil { | ||
t.Errorf("GetRebootable() error = %v, wantErr %v", err, true) | ||
return | ||
} | ||
}) | ||
|
||
// Unregister nodes query from the fake client | ||
restore := fakeProm.Unregister(fmt.Sprintf(NodeQuery, testMins)) | ||
|
||
t.Run("error-retrieving-nodes", func(t *testing.T) { | ||
_, err := GetRebootable(fakeProm, testMins) | ||
|
||
if err == nil { | ||
t.Errorf("GetRebootable() error = %v, wantErr %v", err, true) | ||
return | ||
} | ||
}) | ||
|
||
restore() | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package history | ||
|
||
import ( | ||
"encoding/json" | ||
"io/ioutil" | ||
"time" | ||
|
||
"github.com/m-lab/rebot/node" | ||
|
||
"github.com/m-lab/go/rtx" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
// Read reads a JSON file containing a map of | ||
// string -> candidate. If the file cannot be read or deserialized, it returns | ||
// an empty map. | ||
func Read(path string) map[string]node.History { | ||
var candidateHistory map[string]node.History | ||
file, err := ioutil.ReadFile(path) | ||
|
||
if err != nil { | ||
// There is no existing candidate history file -> return empty map. | ||
return make(map[string]node.History) | ||
} | ||
|
||
err = json.Unmarshal(file, &candidateHistory) | ||
|
||
if err != nil { | ||
log.Warn("Cannot unmarshal the candidates' history file - ignoring it. ", err) | ||
return make(map[string]node.History) | ||
} | ||
|
||
return candidateHistory | ||
} | ||
|
||
// Write serializes a string -> candidate map to a JSON file. | ||
// If the map cannot be serialized or the file cannot be written, it exits. | ||
func Write(path string, candidateHistory map[string]node.History) { | ||
newCandidates, err := json.Marshal(candidateHistory) | ||
rtx.Must(err, "Cannot marshal the candidates history!") | ||
|
||
err = ioutil.WriteFile(path, newCandidates, 0644) | ||
rtx.Must(err, "Cannot write the candidates history's JSON file!") | ||
} | ||
|
||
// Update updates the LastReboot field for all the candidates named in | ||
// the nodes slice. If a candidate did not previously exist, it creates a | ||
// new one. | ||
func Update(candidates []node.Node, history map[string]node.History) { | ||
if len(candidates) == 0 { | ||
return | ||
} | ||
|
||
log.WithFields(log.Fields{"nodes": candidates}).Info("Updating history...") | ||
for _, c := range candidates { | ||
el, ok := history[c.Name] | ||
if ok { | ||
el.LastReboot = time.Now() | ||
history[c.Name] = el | ||
} else { | ||
history[c.Name] = node.NewHistory(c.Name, c.Site, time.Now()) | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.