Skip to content

Commit

Permalink
Merge fd1b859 into cd5452f
Browse files Browse the repository at this point in the history
  • Loading branch information
robertodauria committed Nov 30, 2018
2 parents cd5452f + fd1b859 commit 0f319fb
Show file tree
Hide file tree
Showing 13 changed files with 1,035 additions and 582 deletions.
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Travis CI configuration: runs the Go tests with coverage enabled and hands
# the resulting profile to goveralls.
language: go

before_script:
# Fetch the tools needed by the script phase below.
- go get golang.org/x/tools/cmd/cover
- go get github.com/mattn/goveralls

script:
# Test every package, writing a count-mode coverage profile to profile.cov.
- go test -covermode=count -coverprofile=profile.cov ./...
# Pass profile.cov to goveralls, identifying the CI service as travis-ci.
- goveralls -coverprofile=profile.cov -service=travis-ci
111 changes: 111 additions & 0 deletions healthcheck/prometheus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package healthcheck

import (
"context"
"fmt"
"time"

"github.com/m-lab/rebot/node"
"github.com/m-lab/rebot/promtest"
"github.com/prometheus/common/model"
log "github.com/sirupsen/logrus"
)

var (
// NodeQuery is a PromQL template selecting machines to be considered
// offline. It is meant to be expanded with fmt.Sprintf and a single integer
// argument: every %[1]d occurrence becomes the look-back window in minutes.
//
// A machine is selected when its "ssh806" probe never succeeded during the
// window, unless any of the following holds:
//   - its "ssh" probe succeeded at least once during the window;
//   - gmx_machine_maintenance is 1 for the machine, or gmx_site_maintenance
//     is 1 for its site;
//   - lame_duck_node is 1 for the machine;
//   - fewer than 14 "ssh806" probe samples exist in the window;
//   - the creation rate of ".s2c_snaplog" files is greater than zero.
//
// The label_replace calls derive a "site" label from the "machine" label.
NodeQuery = `(label_replace(sum_over_time(probe_success{service="ssh806", module="ssh_v4_online"}[%[1]dm]) == 0,
"site", "$1", "machine", ".+?\\.(.+?)\\..+")
unless on (machine)
label_replace(sum_over_time(probe_success{service="ssh", module="ssh_v4_online"}[%[1]dm]) > 0,
"site", "$1", "machine", ".+?\\.(.+?)\\..+"))
unless on(machine) gmx_machine_maintenance == 1
unless on(site) gmx_site_maintenance == 1
unless on (machine) lame_duck_node == 1
unless on (machine) count_over_time(probe_success{service="ssh806", module="ssh_v4_online"}[%[1]dm]) < 14
unless on (machine) rate(inotify_extension_create_total{ext=".s2c_snaplog"}[%[1]dm]) > 0`

// SwitchQuery is the PromQL query selecting switches (instances matching
// "s1.*") without a single successful ICMP probe in the last 15 minutes,
// excluding sites for which gmx_site_maintenance is 1. Unlike NodeQuery,
// the window is fixed and the string is used as-is.
//
// To determine if a switch is offline, pings are generally more reliable
// than SNMP scraping.
SwitchQuery = `sum_over_time(probe_success{instance=~"s1.*", module="icmp"}[15m]) == 0 unless on(site) gmx_site_maintenance == 1`
)

// getOfflineSites runs SwitchQuery against Prometheus to find the switches
// that are currently considered offline.
//
// It returns a map from site name (the sample's "site" label) to the sample
// itself. An error is returned when the query fails or when its result is
// not a model.Vector.
func getOfflineSites(prom promtest.PromClient) (map[string]*model.Sample, error) {
	values, err := prom.Query(context.Background(), SwitchQuery, time.Now())
	if err != nil {
		return nil, err
	}

	// Use a checked assertion so that an unexpected (or nil) result type is
	// reported as an error rather than crashing the caller with a panic.
	vector, ok := values.(model.Vector)
	if !ok {
		return nil, fmt.Errorf("switch query returned %T, expected model.Vector", values)
	}

	offline := make(map[string]*model.Sample, len(vector))
	for _, s := range vector {
		offline[string(s.Metric["site"])] = s
		log.WithFields(log.Fields{"site": s.Metric["site"]}).Warn("Offline switch found.")
	}

	return offline, nil
}

// getOfflineNodes runs NodeQuery, expanded with the given look-back window
// in minutes, to find the nodes that are currently considered offline.
//
// It returns one node.Node per sample, built from the sample's "machine" and
// "site" labels. The returned slice is never nil on success. An error is
// returned when the query fails or when its result is not a model.Vector.
func getOfflineNodes(prom promtest.PromClient, minutes int) ([]node.Node, error) {
	values, err := prom.Query(context.Background(), fmt.Sprintf(NodeQuery, minutes), time.Now())
	if err != nil {
		return nil, err
	}

	// Use a single checked assertion so that an unexpected (or nil) result
	// type is reported as an error rather than causing a panic.
	vector, ok := values.(model.Vector)
	if !ok {
		return nil, fmt.Errorf("node query returned %T, expected model.Vector", values)
	}

	if len(vector) != 0 {
		log.WithFields(log.Fields{"nodes": values}).Warn("Offline nodes found.")
	}

	// Keep the slice non-nil even when empty: callers and tests compare the
	// result against an empty (non-nil) slice.
	candidates := make([]node.Node, 0, len(vector))
	for _, sample := range vector {
		machine := string(sample.Metric["machine"])
		log.Info("adding " + machine)
		candidates = append(candidates, node.Node{
			Name: machine,
			Site: string(sample.Metric["site"]),
		})
	}

	return candidates, nil
}

// filterOfflineSites returns the nodes from toFilter whose site is not a key
// of sites, i.e. it drops every machine located at a site where the switch
// is offline. Each dropped machine is logged. The result is always a
// non-nil slice.
func filterOfflineSites(sites map[string]*model.Sample, toFilter []node.Node) []node.Node {
	kept := make([]node.Node, 0)

	for _, candidate := range toFilter {
		// A machine behind an offline switch is unreachable regardless of
		// its own state, so it is skipped.
		if _, switchDown := sites[candidate.Site]; switchDown {
			log.Info("Ignoring " + candidate.Name + " as the switch is offline.")
			continue
		}
		kept = append(kept, candidate)
	}

	return kept
}

// GetRebootable returns the nodes reported as offline over the last
// `minutes` minutes, excluding the ones located at a site whose switch is
// offline as well. If either Prometheus query fails, the failure is logged
// and the error is returned with a nil slice.
func GetRebootable(prom promtest.PromClient, minutes int) ([]node.Node, error) {
	// Offline switches first: they determine which nodes must be ignored.
	offlineSites, err := getOfflineSites(prom)
	if err != nil {
		log.Error("Unable to retrieve offline sites from Prometheus")
		return nil, err
	}

	// Then the offline nodes themselves.
	candidates, err := getOfflineNodes(prom, minutes)
	if err != nil {
		log.Error("Unable to retrieve offline nodes from Prometheus")
		return nil, err
	}

	return filterOfflineSites(offlineSites, candidates), nil
}
210 changes: 210 additions & 0 deletions healthcheck/prometheus_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
package healthcheck

import (
"fmt"
"reflect"
"testing"
"time"

"github.com/m-lab/rebot/node"
"github.com/m-lab/rebot/promtest"
"github.com/prometheus/common/model"
)

// Fixtures shared by the tests in this file; the pointers and the vector
// are populated by init.
var (
// fakeProm is the mock client on which init registers SwitchQuery and
// the expanded NodeQuery.
fakeProm *promtest.PrometheusMockClient
// fakePromErr is a mock client with no registered query, used for the
// error cases.
fakePromErr *promtest.PrometheusMockClient
// fakeOfflineSwitch and fakeOfflineNode are the samples returned for the
// switch and node queries respectively.
fakeOfflineSwitch *model.Sample
fakeOfflineNode *model.Sample

// offlineNodes is the vector registered as the NodeQuery result.
offlineNodes model.Vector

// testMins is the look-back window, in minutes, used to expand NodeQuery.
testMins = 15
)

// init builds the fixtures shared by the tests in this file: two mock
// Prometheus clients, one sample for an offline switch and one for an
// offline node, and the canned query results registered on fakeProm.
func init() {
	fakeProm = promtest.NewPrometheusMockClient()
	// This client does not have any registered query, thus it always
	// returns an error.
	fakePromErr = promtest.NewPrometheusMockClient()

	// model.Time counts milliseconds since the epoch, so the Unix timestamp
	// (in seconds) must be converted instead of being cast directly.
	now := model.TimeFromUnix(time.Now().Unix())

	fakeOfflineSwitch = promtest.CreateSample(map[string]string{
		"instance": "s1.iad0t.measurement-lab.org",
		"job":      "blackbox-targets",
		"module":   "icmp",
		"site":     "iad0t",
	}, 0, now)

	offlineSwitches := model.Vector{
		fakeOfflineSwitch,
	}

	fakeOfflineNode = promtest.CreateSample(map[string]string{
		"instance": "mlab1.iad0t.measurement-lab.org:806",
		"job":      "blackbox-targets",
		"machine":  "mlab1.iad0t.measurement-lab.org",
		"module":   "ssh_v4_online",
		"service":  "ssh806",
		"site":     "iad0t",
	}, 0, now)

	offlineNodes = model.Vector{
		fakeOfflineNode,
	}

	// Both queries succeed on fakeProm; the node query is keyed by its
	// expansion with testMins, matching what getOfflineNodes will send.
	fakeProm.Register(SwitchQuery, offlineSwitches, nil)
	fakeProm.Register(fmt.Sprintf(NodeQuery, testMins), offlineNodes, nil)
}

// Test_getOfflineSites checks that getOfflineSites maps the registered
// offline switch to its site name, and that a failing query surfaces as an
// error.
func Test_getOfflineSites(t *testing.T) {
	type testCase struct {
		name    string
		prom    promtest.PromClient
		want    map[string]*model.Sample
		wantErr bool
	}

	cases := []testCase{
		{
			name: "success",
			prom: fakeProm,
			want: map[string]*model.Sample{"iad0t": fakeOfflineSwitch},
		},
		{
			name:    "error",
			prom:    fakePromErr,
			wantErr: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := getOfflineSites(tc.prom)

			failed := err != nil
			if failed != tc.wantErr {
				t.Errorf("getOfflineSites() error = %v, wantErr %v", err, tc.wantErr)
				return
			}

			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("getOfflineSites() = %v, want %v", got, tc.want)
		})
	}
}

// Test_getOfflineNodes checks that getOfflineNodes converts the registered
// offline sample into a node.Node, and that a failing query surfaces as an
// error.
func Test_getOfflineNodes(t *testing.T) {
	type testCase struct {
		name    string
		prom    promtest.PromClient
		minutes int
		want    []node.Node
		wantErr bool
	}

	cases := []testCase{
		{
			name:    "success",
			prom:    fakeProm,
			minutes: testMins,
			want:    []node.Node{node.New("mlab1.iad0t.measurement-lab.org", "iad0t")},
		},
		{
			name:    "error",
			prom:    fakePromErr,
			minutes: testMins,
			wantErr: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := getOfflineNodes(tc.prom, tc.minutes)

			failed := err != nil
			if failed != tc.wantErr {
				t.Errorf("getOfflineNodes() error = %v, wantErr %v", err, tc.wantErr)
				return
			}

			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("getOfflineNodes() = %v, want %v", got, tc.want)
		})
	}
}

func Test_filterOfflineSites(t *testing.T) {

candidates := []node.Node{
node.New("mlab1.iad0t.measurement-lab.org", "iad0t"),
}

tests := []struct {
name string
sites map[string]*model.Sample
candidates []node.Node
want []node.Node
}{
{
name: "success-filtered-node-when-site-offline",
sites: map[string]*model.Sample{
"iad0t": fakeOfflineSwitch,
},
candidates: candidates,
want: []node.Node{},
},
{
name: "success-offline-node-returned",
sites: map[string]*model.Sample{
"iad1t": fakeOfflineSwitch,
},
candidates: candidates,
want: []node.Node{
node.New("mlab1.iad0t.measurement-lab.org", "iad0t"),
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := filterOfflineSites(tt.sites, tt.candidates); !(len(got) == 0 && len(tt.want) == 0) && !reflect.DeepEqual(got, tt.want) {
t.Errorf("filterOfflineSites() = %v, want %v", got, tt.want)
}
})
}
}

func TestGetRebootable(t *testing.T) {

t.Run("success", func(t *testing.T) {
got, err := GetRebootable(fakeProm, testMins)

if err != nil {
t.Errorf("GetRebootable() error = %v, wantErr %v", err, false)
return
}

if !reflect.DeepEqual(got, []node.Node{}) {
t.Errorf("GetRebootable() = %v, want %v", got, []node.Node{})
}
})

t.Run("error-retrieving-sites", func(t *testing.T) {
_, err := GetRebootable(fakePromErr, testMins)

if err == nil {
t.Errorf("GetRebootable() error = %v, wantErr %v", err, true)
return
}
})

// Unregister nodes query from the fake client
restore := fakeProm.Unregister(fmt.Sprintf(NodeQuery, testMins))

t.Run("error-retrieving-nodes", func(t *testing.T) {
_, err := GetRebootable(fakeProm, testMins)

if err == nil {
t.Errorf("GetRebootable() error = %v, wantErr %v", err, true)
return
}
})

restore()

}
65 changes: 65 additions & 0 deletions history/history.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package history

import (
"encoding/json"
"io/ioutil"
"time"

"github.com/m-lab/rebot/node"

"github.com/m-lab/go/rtx"
log "github.com/sirupsen/logrus"
)

// Read reads a JSON file containing a map of
// string -> candidate. If the file cannot be read or deserialized, or if it
// holds no map at all (a JSON null document), it returns an empty map. The
// returned map is never nil, so callers can always write to it.
func Read(path string) map[string]node.History {
	file, err := ioutil.ReadFile(path)
	if err != nil {
		// The history file is missing or unreadable -> return empty map.
		return make(map[string]node.History)
	}

	var candidateHistory map[string]node.History
	if err := json.Unmarshal(file, &candidateHistory); err != nil {
		log.Warn("Cannot unmarshal the candidates' history file - ignoring it. ", err)
		return make(map[string]node.History)
	}

	// A JSON "null" unmarshals without error but leaves the map nil; writing
	// to a nil map panics, so hand back an empty one instead.
	if candidateHistory == nil {
		return make(map[string]node.History)
	}

	return candidateHistory
}

// Write serializes a string -> candidate map to JSON and stores it in the
// file at path, using mode 0644. If the map cannot be serialized or the
// file cannot be written, it exits.
func Write(path string, candidateHistory map[string]node.History) {
	serialized, marshalErr := json.Marshal(candidateHistory)
	rtx.Must(marshalErr, "Cannot marshal the candidates history!")

	rtx.Must(
		ioutil.WriteFile(path, serialized, 0644),
		"Cannot write the candidates history's JSON file!",
	)
}

// Update records a reboot happening now for every node in candidates. The
// history map is keyed by node name: an existing entry gets its LastReboot
// field set to the current time, while a node without an entry gets a new
// one created from its name and site. The map is modified in place; nothing
// happens (and nothing is logged) when candidates is empty.
func Update(candidates []node.Node, history map[string]node.History) {
	if len(candidates) == 0 {
		return
	}

	log.WithFields(log.Fields{"nodes": candidates}).Info("Updating history...")

	for _, candidate := range candidates {
		name := candidate.Name

		entry, known := history[name]
		if !known {
			history[name] = node.NewHistory(name, candidate.Site, time.Now())
			continue
		}

		// Map values are copies, so the modified entry must be stored back.
		entry.LastReboot = time.Now()
		history[name] = entry
	}
}
Loading

0 comments on commit 0f319fb

Please sign in to comment.