forked from ipfs-cluster/ipfs-cluster
/
pubsubmon.go
296 lines (241 loc) · 6.87 KB
/
pubsubmon.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
// Package pubsubmon implements a PeerMonitor component for IPFS Cluster that
// uses PubSub to send and receive metrics.
package pubsubmon
import (
"bytes"
"context"
"time"
"sync"
"github.com/lubanproj/ipfs-cluster/api"
"github.com/lubanproj/ipfs-cluster/monitor/metrics"
logging "github.com/ipfs/go-log/v2"
peer "github.com/libp2p/go-libp2p-core/peer"
rpc "github.com/libp2p/go-libp2p-gorpc"
pubsub "github.com/libp2p/go-libp2p-pubsub"
gocodec "github.com/ugorji/go/codec"
"go.opencensus.io/trace"
)
var (
	// logger is the package-level logger, registered under the "monitor"
	// subsystem name.
	logger = logging.Logger("monitor")

	// PubsubTopic specifies the topic used to publish Cluster metrics.
	PubsubTopic = "monitor.metrics"

	// msgpackHandle is the codec handle shared by the encoder and the
	// decoders that serialize metrics as msgpack.
	msgpackHandle = &gocodec.MsgpackHandle{}
)
// Monitor is a component in charge of monitoring peers, logging
// metrics and detecting failures.
type Monitor struct {
	// ctx is the Monitor's own cancellable context; cancel is invoked by
	// Shutdown.
	ctx    context.Context
	cancel func()

	// rpcClient is stored by SetClient. rpcReady is signalled by SetClient
	// and closed by Shutdown.
	rpcClient *rpc.Client
	rpcReady  chan struct{}

	// PubSub instance, the joined metrics topic and the subscription from
	// which incoming metrics are read.
	pubsub       *pubsub.PubSub
	topic        *pubsub.Topic
	subscription *pubsub.Subscription

	// peers optionally provides the current peerset (it may be nil).
	peers PeersFunc

	// metrics stores the logged metrics; checker is handed that same
	// store when created in New.
	metrics *metrics.Store
	checker *metrics.Checker

	config *Config

	// shutdownLock guards shutdown.
	shutdownLock sync.Mutex
	shutdown     bool
	// wg is waited on by Shutdown.
	wg sync.WaitGroup
}
// PeersFunc allows the Monitor to filter and discard metrics
// that do not belong to a given peerset. Given a context, it returns the
// peer IDs making up that peerset, or an error. A nil PeersFunc disables
// the peerset filtering performed by LatestMetrics.
type PeersFunc func(context.Context) ([]peer.ID, error)
// New creates a new PubSub monitor, using the given config, PubSub instance
// and PeersFunc. The PeersFunc can be nil. In this case, no metric filtering
// is done based on peers (any peer is considered part of the peerset).
//
// New validates cfg, joins PubsubTopic on psub and subscribes to it. It
// returns an error if validation, joining or subscribing fails; in the error
// cases nothing acquired by New is left behind (the derived context is
// cancelled and a joined topic is closed again).
//
// The returned Monitor stays idle until SetClient is called or ctx is
// cancelled.
func New(
	ctx context.Context,
	cfg *Config,
	psub *pubsub.PubSub,
	peers PeersFunc,
) (*Monitor, error) {
	if err := cfg.Validate(); err != nil {
		return nil, err
	}

	ctx, cancel := context.WithCancel(ctx)

	mtrs := metrics.NewStore()
	checker := metrics.NewChecker(ctx, mtrs)

	topic, err := psub.Join(PubsubTopic)
	if err != nil {
		cancel()
		return nil, err
	}

	subscription, err := topic.Subscribe()
	if err != nil {
		// Do not leak the topic joined above: without closing it the
		// handle lingers on psub after this constructor has failed.
		if cerr := topic.Close(); cerr != nil {
			logger.Warnf("closing topic after failed subscription: %s", cerr)
		}
		cancel()
		return nil, err
	}

	mon := &Monitor{
		ctx:    ctx,
		cancel: cancel,
		// Buffered so that the first SetClient call never blocks.
		rpcReady: make(chan struct{}, 1),

		pubsub:       psub,
		topic:        topic,
		subscription: subscription,
		peers:        peers,

		metrics: mtrs,
		checker: checker,
		config:  cfg,
	}

	go mon.run()
	return mon, nil
}
// run blocks until an RPC client has been provided through SetClient and then
// starts the background goroutines: the pubsub reader (logFromPubsub) and the
// checker's Watch loop using the configured CheckInterval. It returns without
// starting anything if the Monitor's context is cancelled, or if rpcReady is
// closed before any client was set.
func (mon *Monitor) run() {
	select {
	case _, ok := <-mon.rpcReady:
		if !ok {
			// The channel was closed (Shutdown) rather than signalled
			// by SetClient: a receive on a closed channel succeeds, so
			// without this check the workers would be started while
			// shutting down.
			return
		}
		go mon.logFromPubsub()
		go mon.checker.Watch(mon.ctx, mon.peers, mon.config.CheckInterval)
	case <-mon.ctx.Done():
	}
}
// logFromPubsub logs metrics received in the subscribed topic.
//
// It reads messages from the subscription until reading fails, decodes each
// payload as a msgpack-encoded api.Metric and hands it to LogMetric. Payloads
// that fail to decode are retried once without the legacy multicodec header
// when they carry it; messages that still cannot be decoded are logged and
// skipped.
func (mon *Monitor) logFromPubsub() {
	ctx, span := trace.StartSpan(mon.ctx, "monitor/pubsub/logFromPubsub")
	defer span.End()

	decodeWarningPrinted := false
	// Previous versions use multicodec with the following header, which
	// we need to remove.
	multicodecPrefix := append([]byte{byte(9)}, []byte("/msgpack\n")...)

	// decode unmarshals a msgpack payload into m.
	decode := func(payload []byte, m *api.Metric) error {
		buf := bytes.NewBuffer(payload)
		return gocodec.NewDecoder(buf, msgpackHandle).Decode(m)
	}

	for {
		// Next blocks until a message arrives or ctx is done, so no
		// separate ctx.Done() check is needed around it.
		msg, err := mon.subscription.Next(ctx)
		if err != nil {
			// Retrying here would busy-loop: per the go-libp2p-pubsub
			// documentation Next fails when the context is done or the
			// subscription has been cancelled, and neither recovers.
			if ctx.Err() == nil {
				logger.Error(err)
			}
			return
		}

		data := msg.GetData()
		metric := api.Metric{}

		err = decode(data, &metric)
		if err != nil && bytes.HasPrefix(data, multicodecPrefix) {
			// Possibly a metric from an older peer: retry without the
			// multicodec header.
			err = decode(data[len(multicodecPrefix):], &metric)
			if err == nil && !decodeWarningPrinted {
				// managed to decode an older version metric. Warn about it once.
				logger.Warn("Peers in versions <= v0.13.3 detected. These peers will not receive metrics from this or other newer peers. Please upgrade them.")
				decodeWarningPrinted = true
			}
		}
		if err != nil {
			logger.Error(err)
			continue
		}

		debug("received", metric)
		if err := mon.LogMetric(ctx, metric); err != nil {
			logger.Error(err)
		}
	}
}
// SetClient saves the given rpc.Client for later use and signals that the
// Monitor may start its background work.
//
// Calling it after Shutdown is a no-op, and repeated calls never block: the
// readiness signal only needs to be delivered once.
func (mon *Monitor) SetClient(c *rpc.Client) {
	mon.shutdownLock.Lock()
	defer mon.shutdownLock.Unlock()

	if mon.shutdown {
		// Shutdown has closed rpcReady; sending on it would panic.
		return
	}

	mon.rpcClient = c

	select {
	case mon.rpcReady <- struct{}{}:
	default:
		// A signal is already pending in the channel buffer. Blocking
		// here could hang the caller forever once run() has returned.
	}
}
// Shutdown stops the peer monitor. In particular, it will
// not deliver any alerts.
//
// It closes the readiness channel, cancels the Monitor's context, cancels the
// metrics subscription and closes the metrics topic. Calling Shutdown more
// than once is safe: later calls only log a warning. It always returns nil.
func (mon *Monitor) Shutdown(ctx context.Context) error {
	_, span := trace.StartSpan(ctx, "monitor/pubsub/Shutdown")
	defer span.End()

	mon.shutdownLock.Lock()
	defer mon.shutdownLock.Unlock()

	if mon.shutdown {
		logger.Warn("Monitor already shut down")
		return nil
	}

	logger.Info("stopping Monitor")
	close(mon.rpcReady)

	mon.cancel()

	// Release what New acquired on the PubSub instance. The subscription
	// is cancelled before closing the topic, and after the context so
	// that the reader goroutine sees a cancelled context first.
	mon.subscription.Cancel()
	if err := mon.topic.Close(); err != nil {
		// Nothing else can be done about it at this point, and the
		// rest of the shutdown must still complete.
		logger.Warnf("closing metrics topic: %s", err)
	}

	mon.wg.Wait()
	mon.shutdown = true
	return nil
}
// LogMetric records a metric in the Monitor's metric store so that it can be
// queried later. When the metric is not flagged for discarding, the alerts
// tracked by the checker for that peer and metric name are reset. It always
// returns nil.
func (mon *Monitor) LogMetric(ctx context.Context, m api.Metric) error {
	_, span := trace.StartSpan(ctx, "monitor/pubsub/LogMetric")
	defer span.End()

	mon.metrics.Add(m)
	debug("logged", m)

	if m.Discard() {
		return nil
	}

	// We received a valid metric so avoid alerting.
	mon.checker.ResetAlerts(m.Peer, m.Name)
	return nil
}
// PublishMetric encodes the metric as msgpack and publishes it on the
// metrics topic, which is how it reaches the other cluster peers. Metrics
// flagged for discarding are not sent: a warning is logged and nil is
// returned. Encoding and publishing errors are logged and returned.
func (mon *Monitor) PublishMetric(ctx context.Context, m api.Metric) error {
	ctx, span := trace.StartSpan(ctx, "monitor/pubsub/PublishMetric")
	defer span.End()

	if m.Discard() {
		logger.Warnf("discarding invalid metric: %+v", m)
		return nil
	}

	var payload bytes.Buffer
	encoder := gocodec.NewEncoder(&payload, msgpackHandle)
	if encErr := encoder.Encode(m); encErr != nil {
		logger.Error(encErr)
		return encErr
	}

	debug("publish", m)

	if pubErr := mon.topic.Publish(ctx, payload.Bytes()); pubErr != nil {
		logger.Error(pubErr)
		return pubErr
	}
	return nil
}
// LatestMetrics returns last known VALID metrics of a given type. A metric
// is only valid if it has not expired and belongs to a current cluster peer.
//
// Without a PeersFunc, the store's latest valid metrics are returned
// unfiltered. With one, they are filtered by the peerset it reports; if the
// peerset cannot be obtained, the error is logged and an empty (non-nil)
// slice is returned.
func (mon *Monitor) LatestMetrics(ctx context.Context, name string) []api.Metric {
	ctx, span := trace.StartSpan(ctx, "monitor/pubsub/LatestMetrics")
	defer span.End()

	latest := mon.metrics.LatestValid(name)

	if mon.peers == nil {
		return latest
	}

	// Make sure we only return metrics in the current peerset if we have
	// a peerset provider.
	peers, err := mon.peers(ctx)
	if err != nil {
		// The signature has no way to report the failure, so at least
		// leave a trace of why no metrics are being returned.
		logger.Errorf("obtaining peerset to filter %q metrics: %s", name, err)
		return []api.Metric{}
	}

	return metrics.PeersetFilter(latest, peers)
}
// LatestForPeer returns the latest metric received for a peer (it may have
// expired), as reported by the metric store's PeerLatest. The result is an
// api.Metric value, never nil; what it holds when no metric exists for the
// peer is decided by the store. The context argument is currently unused.
func (mon *Monitor) LatestForPeer(ctx context.Context, name string, pid peer.ID) api.Metric {
	latest := mon.metrics.PeerLatest(name, pid)
	return latest
}
// Alerts exposes the checker's alerts channel: a receive-only channel on
// which alerts are sent when the monitor detects a failure.
func (mon *Monitor) Alerts() <-chan api.Alert {
	alerts := mon.checker.Alerts()
	return alerts
}
// MetricNames returns the metric names reported by the Monitor's metric
// store.
func (mon *Monitor) MetricNames(ctx context.Context) []string {
	_, span := trace.StartSpan(ctx, "monitor/pubsub/MetricNames")
	defer span.End()

	names := mon.metrics.MetricNames()
	return names
}
// debug writes a debug-level log line describing a metric: the given event
// label followed by the metric's peer, name, value and expiry. The Expire
// field is passed to time.Unix as a nanosecond count.
func debug(event string, m api.Metric) {
	const format = "%s metric: '%s' - '%s' - '%s' - '%s'"

	expiry := time.Unix(0, m.Expire)
	logger.Debugf(format, event, m.Peer, m.Name, m.Value, expiry)
}