// Copyright 2017-2021 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package server

import (
"bytes"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/hashicorp/raft"
"github.com/nats-io/nats-streaming-server/spb"
"github.com/nats-io/nats.go"
)
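
// Subjects used, when the AllowAddRemoveNode option is enabled, by the leader
// to accept requests to add or remove a cluster node. The %s placeholder is
// replaced with the cluster ID.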
const (
addClusterNodeSubj = defaultRaftPrefix + ".%s.node.add"
removeClusterNodeSubj = defaultRaftPrefix + ".%s.node.remove"
)
const (
defaultJoinRaftGroupTimeout = time.Second
defaultRaftHBTimeout = 2 * time.Second
defaultRaftElectionTimeout = 2 * time.Second
defaultRaftLeaseTimeout = time.Second
defaultRaftCommitTimeout = 100 * time.Millisecond
defaultTPortTimeout = 10 * time.Second
)
var (
runningInTests bool
joinRaftGroupTimeout = defaultJoinRaftGroupTimeout
testPauseAfterNewRaftCalled bool
tportTimeout = defaultTPortTimeout
)
const (
testLazyReplicationInterval = 250 * time.Millisecond
)
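
// clusterSetupForTest shortens the clustering timeouts so that tests run
// quickly.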
func clusterSetupForTest() {
runningInTests = true
lazyReplicationInterval = testLazyReplicationInterval
joinRaftGroupTimeout = 250 * time.Millisecond
tportTimeout = 250 * time.Millisecond
}
// ClusteringOptions contains STAN Server options related to clustering.
type ClusteringOptions struct {
Clustered bool // Run the server in a clustered configuration.
NodeID string // ID of the node within the cluster.
Bootstrap bool // Bootstrap the cluster as a seed node if there is no existing state.
Peers []string // List of cluster peer node IDs to bootstrap cluster state.
RaftLogPath string // Path to Raft log store directory.
LogCacheSize int // Number of Raft log entries to cache in memory to reduce disk IO.
LogSnapshots int // Number of Raft log snapshots to retain.
TrailingLogs int64 // Number of logs left after a snapshot.
Sync bool // Do a file sync after every write to the Raft log and message store.
RaftLogging bool // Enable logging from the Raft library (disabled by default since it is very verbose).
// Enable creation of dedicated NATS connections to communicate with other
// nodes. Normally, the server has a single NATS connection and subscribes
// to a subject where other nodes can submit requests to "connect" to it.
// When a remote connects, a new subscription on an inbox is created on
// both sides and they use their single "raft" NATS connection to communicate.
// If node "A" connects to both "B" and "C" it will have two subscriptions
// and two "outbox" subjects (one per remote node) to which to send data.
//
// With this option enabled, NATS connection(s) will be created per remote
// node. This should help with performance and reduce contention.
// The Raft transport pools connections, so there may be more than
// one connection per remote node.
NodesConnections bool
// If this is enabled, the leader of the cluster will listen to add/remove
// requests on NATS subject "_STAN.raft.<cluster ID>.node.[add|remove]".
// Admins can/should limit permissions to send to this subject to prevent
// users from inadvertently changing the cluster configuration.
AllowAddRemoveNode bool
// When a node processes a snapshot (either on startup or when falling behind)
// and its state is not in sync with the message store's state, it is required
// to reconcile its state with the current leader. If it is unable to do so,
// the node will fail to start or will exit.
// If all nodes are starting and there is no way to elect a leader at this
// point, setting this boolean to true makes the node attempt the
// reconciliation, but proceed anyway if it cannot complete it.
ProceedOnRestoreFailure bool
// These will be set to some sane defaults. Change only if experiencing raft issues.
RaftHeartbeatTimeout time.Duration
RaftElectionTimeout time.Duration
RaftLeaseTimeout time.Duration
RaftCommitTimeout time.Duration
// These options influence the RAFT store implementation which uses bolt DB.
//
// Sync the freelist to disk. This reduces database write performance but
// speeds up recovery since there is no need for a full database re-sync.
BoltFreeListSync bool
// BoltFreeListMap sets the backend freelist type to use a map instead of
// the default array type.
// The "array" type (the default) is simple but suffers dramatic performance
// degradation if database is large and framentation in freelist is common.
// The "hashmap which is faster in almost all circumstances but doesn't guarantee
// that it offers the smallest page id available. In normal case it is safe.
BoltFreeListMap bool
}
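
// Illustrative sketch (not part of the server): a minimal three-node
// clustered configuration, assuming this package's GetDefaultOptions() and
// RunServerWithOpts(); node IDs and the log path are placeholders.
//
//	opts := GetDefaultOptions()
//	opts.Clustering.Clustered = true
//	opts.Clustering.NodeID = "a"
//	opts.Clustering.Peers = []string{"b", "c"}
//	opts.Clustering.RaftLogPath = "/var/lib/stan/raft"
//	s, err := RunServerWithOpts(opts, nil)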
// raftNode is a handle to a member in a Raft consensus group.
type raftNode struct {
leader int64
sync.Mutex
closed bool
*raft.Raft
store *raftLog
transport *raft.NetworkTransport
logInput io.WriteCloser
joinSub *nats.Subscription
notifyCh <-chan bool
fsm *raftFSM
}
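
// replicatedSub is the value returned by raftFSM.Apply() for subscription
// operations: either the created subscription or the error that prevented
// its creation.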
type replicatedSub struct {
sub *subState
err error
}
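
// raftFSM implements the raft.FSM interface: Apply() is below, while
// Snapshot() and Restore() live in snapshot.go.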
type raftFSM struct {
sync.Mutex
server *StanServer
restoreFromInit bool
}
// shutdown attempts to stop the Raft node.
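// Resources are released in order: transport, Raft node, log store, then
// the log writer; the first error encountered aborts the sequence.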
func (r *raftNode) shutdown() error {
r.Lock()
if r.closed {
r.Unlock()
return nil
}
r.closed = true
r.Unlock()
if r.transport != nil {
if err := r.transport.Close(); err != nil {
return err
}
}
if r.Raft != nil {
if err := r.Raft.Shutdown().Error(); err != nil {
return err
}
}
if r.store != nil {
if err := r.store.Close(); err != nil {
return err
}
}
if r.logInput != nil {
if err := r.logInput.Close(); err != nil {
return err
}
}
return nil
}
// createServerRaftNode creates and starts this server's Raft node.
func (s *StanServer) createServerRaftNode(hasStreamingState bool) error {
var (
name = s.info.ClusterID
addr = s.getClusteringAddr(name)
existingState, err = s.createRaftNode(name)
)
if err != nil {
return err
}
if !existingState && hasStreamingState {
return fmt.Errorf("streaming state was recovered but cluster log path %q is empty", s.opts.Clustering.RaftLogPath)
}
node := s.raft
// Bootstrap if there is no previous state and we are starting this node as
// a seed or a cluster configuration is provided.
bootstrap := !existingState && (s.opts.Clustering.Bootstrap || len(s.opts.Clustering.Peers) > 0)
if bootstrap {
if err := s.bootstrapCluster(name, node.Raft); err != nil {
node.shutdown()
return err
}
} else if !existingState {
// Attempt to join the cluster if we're not bootstrapping.
req, err := (&spb.RaftJoinRequest{NodeID: s.opts.Clustering.NodeID, NodeAddr: addr}).Marshal()
if err != nil {
panic(err)
}
var (
joined = false
resp = &spb.RaftJoinResponse{}
)
s.log.Debugf("Joining Raft group %s", name)
// Attempt to join up to 5 times before giving up.
for i := 0; i < 5; i++ {
r, err := s.ncr.Request(fmt.Sprintf("%s.%s.join", defaultRaftPrefix, name), req, joinRaftGroupTimeout)
if err != nil {
waitTime := 20 * time.Millisecond
if err == nats.ErrNoResponders {
// wait the equivalent of the Request() timeout, so that our
// loop does not fail too fast.
waitTime += joinRaftGroupTimeout
}
time.Sleep(waitTime)
continue
}
if err := resp.Unmarshal(r.Data); err != nil {
time.Sleep(20 * time.Millisecond)
continue
}
if resp.Error != "" {
time.Sleep(20 * time.Millisecond)
continue
}
joined = true
break
}
if !joined {
node.shutdown()
return fmt.Errorf("failed to join Raft group %s", name)
}
}
if s.opts.Clustering.Bootstrap {
// If the node is started with bootstrap, regardless of whether state exists
// or not, try to detect (and report) other nodes in the same cluster that
// were also started with bootstrap=true.
s.wg.Add(1)
go func() {
s.detectBootstrapMisconfig(name)
s.wg.Done()
}()
}
return nil
}
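
// detectBootstrapMisconfig periodically broadcasts this server's ID on the
// bootstrap subject; if another node in the same cluster that was also
// started with -cluster_bootstrap answers (or probes us), both sides report
// a fatal error.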
func (s *StanServer) detectBootstrapMisconfig(name string) {
srvID := []byte(s.serverID)
subj := fmt.Sprintf("%s.%s.bootstrap", defaultRaftPrefix, name)
s.ncr.Subscribe(subj, func(m *nats.Msg) {
if m.Data != nil && m.Reply != "" {
// Ignore the message if it is our own
if string(m.Data) != s.serverID {
s.ncr.Publish(m.Reply, srvID)
s.log.Fatalf("Server %s was also started with -cluster_bootstrap", string(m.Data))
}
}
})
inbox := nats.NewInbox()
s.ncr.Subscribe(inbox, func(m *nats.Msg) {
s.log.Fatalf("Server %s was also started with -cluster_bootstrap", string(m.Data))
})
if err := s.ncr.Flush(); err != nil {
s.log.Errorf("Error setting up bootstrap misconfiguration detection: %v", err)
return
}
ticker := time.NewTicker(time.Second)
for {
select {
case <-s.shutdownCh:
ticker.Stop()
return
case <-ticker.C:
s.ncr.PublishRequest(subj, inbox, srvID)
}
}
}
type raftLogger struct {
*StanServer
}
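
// Write implements io.Writer and routes hashicorp/raft log lines to the
// server logger at the matching level; for example "[WARN] raft: ..." is
// logged through Warnf with the level header stripped.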
func (rl *raftLogger) Write(b []byte) (int, error) {
if !rl.raftLogging {
return len(b), nil
}
levelStart := bytes.IndexByte(b, '[')
if levelStart != -1 {
// RAFT has various "headers", sometimes it is "[xxxx] raft:",
// sometimes "[xxx] raft:" or "[xxx] raft-net:", etc..
// So look for the closing ']' and skip spaces to determine the offset.
offset := levelStart + 1 + bytes.IndexByte(b[levelStart+1:], ']')
for offset = offset + 1; offset < len(b); offset++ {
if b[offset] != ' ' {
break
}
}
if offset == len(b) {
return len(b), nil
}
switch b[levelStart+1] {
case 'D': // [DEBUG]
rl.log.Debugf("%s", b[offset:])
case 'I': // [INFO]
rl.log.Noticef("%s", b[offset:])
case 'W': // [WARN]
rl.log.Warnf("%s", b[offset:])
case 'E': // [ERROR]
rl.log.Errorf("%s", b[offset:])
default:
rl.log.Noticef("%s", b)
}
}
return len(b), nil
}
func (rl *raftLogger) Close() error { return nil }
// createRaftNode creates and starts a new Raft node with the given name.
// It returns whether existing Raft state was found, or an error.
func (s *StanServer) createRaftNode(name string) (bool, error) {
path := filepath.Join(s.opts.Clustering.RaftLogPath, name)
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, os.ModeDir+os.ModePerm); err != nil {
return false, err
}
}
// We create s.raft early because once NewRaft() is called, the raft code
// may asynchronously invoke FSM.Apply() and FSM.Restore(), so we want the
// object to exist to be able to check the leader atomic, etc.
s.raft = &raftNode{}
raftLogFileName := filepath.Join(path, raftLogFile)
store, err := newRaftLog(s.log, raftLogFileName, s.opts)
if err != nil {
return false, err
}
// Go through the list of channels that we have recovered from the streaming
// store and set their corresponding unique ID.
s.channels.Lock()
for cname, c := range s.channels.channels {
id, err := store.GetChannelID(cname)
if err != nil {
s.channels.Unlock()
return false, err
}
c.id = id
}
s.channels.Unlock()
addr := s.getClusteringAddr(name)
config := raft.DefaultConfig()
// For tests
if runningInTests {
config.ElectionTimeout = 100 * time.Millisecond
config.HeartbeatTimeout = 100 * time.Millisecond
config.LeaderLeaseTimeout = 100 * time.Millisecond
} else {
if s.opts.Clustering.RaftHeartbeatTimeout == 0 {
s.opts.Clustering.RaftHeartbeatTimeout = defaultRaftHBTimeout
}
if s.opts.Clustering.RaftElectionTimeout == 0 {
s.opts.Clustering.RaftElectionTimeout = defaultRaftElectionTimeout
}
if s.opts.Clustering.RaftLeaseTimeout == 0 {
s.opts.Clustering.RaftLeaseTimeout = defaultRaftLeaseTimeout
}
if s.opts.Clustering.RaftCommitTimeout == 0 {
s.opts.Clustering.RaftCommitTimeout = defaultRaftCommitTimeout
}
config.HeartbeatTimeout = s.opts.Clustering.RaftHeartbeatTimeout
config.ElectionTimeout = s.opts.Clustering.RaftElectionTimeout
config.LeaderLeaseTimeout = s.opts.Clustering.RaftLeaseTimeout
config.CommitTimeout = s.opts.Clustering.RaftCommitTimeout
}
config.LocalID = raft.ServerID(s.opts.Clustering.NodeID)
config.TrailingLogs = uint64(s.opts.Clustering.TrailingLogs)
logWriter := &raftLogger{s}
config.LogOutput = logWriter
snapshotStore, err := raft.NewFileSnapshotStore(path, s.opts.Clustering.LogSnapshots, logWriter)
if err != nil {
store.Close()
return false, err
}
sl, err := snapshotStore.List()
if err != nil {
store.Close()
return false, err
}
var makeConn natsRaftConnCreator
if s.opts.Clustering.NodesConnections {
makeConn = s.createNewRaftNATSConn
}
transport, err := newNATSTransport(addr, s.ncr, tportTimeout, logWriter, makeConn)
if err != nil {
store.Close()
return false, err
}
// Make the snapshot process never time out... see (s *serverSnapshot).Persist() for details.
transport.TimeoutScale = 1
// Set up a channel for reliable leader notifications.
raftNotifyCh := make(chan bool, 1)
config.NotifyCh = raftNotifyCh
fsm := &raftFSM{server: s}
// We want to know in snapshot.go:Restore() if we are called from NewRaft()
// or at runtime when catching up with the leader. To do so, we set this
// boolean if there was at least one snapshot before the call. The boolean
// is cleared in Restore() itself.
if len(sl) > 0 {
fsm.Lock()
fsm.restoreFromInit = true
fsm.Unlock()
}
s.raft.fsm = fsm
node, err := raft.NewRaft(config, fsm, store, store, snapshotStore, transport)
if err != nil {
transport.Close()
store.Close()
return false, err
}
if testPauseAfterNewRaftCalled {
time.Sleep(time.Second)
}
existingState, err := raft.HasExistingState(store, store, snapshotStore)
if err != nil {
node.Shutdown()
transport.Close()
store.Close()
return false, err
}
if existingState {
s.log.Debugf("Loaded existing state for Raft group %s", name)
}
// Handle requests to join the cluster.
sub, err := s.ncr.Subscribe(fmt.Sprintf("%s.%s.join", defaultRaftPrefix, name), func(msg *nats.Msg) {
// Drop the request if we're not the leader. There's no race condition
// after this check because even if we proceed with the cluster add, it
// will fail if the node is not the leader as cluster changes go
// through the Raft log.
if node.State() != raft.Leader {
return
}
req := &spb.RaftJoinRequest{}
if err := req.Unmarshal(msg.Data); err != nil {
s.log.Errorf("Invalid join request for Raft group %s", name)
return
}
// Add the node as a voter. This is idempotent. No-op if the request
// came from ourselves.
resp := &spb.RaftJoinResponse{}
if req.NodeID != s.opts.Clustering.NodeID {
future := node.AddVoter(
raft.ServerID(req.NodeID),
raft.ServerAddress(req.NodeAddr), 0, 0)
if err := future.Error(); err != nil {
resp.Error = err.Error()
}
}
// Send the response.
r, err := resp.Marshal()
if err != nil {
panic(err)
}
s.ncr.Publish(msg.Reply, r)
})
if err != nil {
node.Shutdown()
transport.Close()
store.Close()
return false, err
}
s.raft.Raft = node
s.raft.store = store
s.raft.transport = transport
s.raft.logInput = logWriter
s.raft.notifyCh = raftNotifyCh
s.raft.joinSub = sub
return existingState, nil
}
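
// createNewRaftNATSConn creates a dedicated NATS connection for the given
// transport stream name. For a name such as "test-cluster.a.test-cluster"
// (see getClusteringPeerAddr), this extracts the remote node ID "a" and
// names the connection "<local node ID>-to-a".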
func (s *StanServer) createNewRaftNATSConn(name string) (*nats.Conn, error) {
remoteNodeID := strings.TrimPrefix(name, s.opts.ID+".")
remoteNodeID = strings.TrimSuffix(remoteNodeID, "."+s.opts.ID)
conn, err := s.createNatsClientConn(s.opts.Clustering.NodeID + "-to-" + remoteNodeID)
return conn, err
}
// bootstrapCluster bootstraps the node for the provided Raft group either as a
// seed node or with the given peer configuration, depending on configuration
// and with the latter taking precedence.
func (s *StanServer) bootstrapCluster(name string, node *raft.Raft) error {
var (
addr = s.getClusteringAddr(name)
// Include ourself in the cluster.
servers = []raft.Server{{
ID: raft.ServerID(s.opts.Clustering.NodeID),
Address: raft.ServerAddress(addr),
}}
)
if len(s.opts.Clustering.Peers) > 0 {
// Bootstrap using provided cluster configuration.
s.log.Debugf("Bootstrapping Raft group %s using provided configuration", name)
for _, peer := range s.opts.Clustering.Peers {
servers = append(servers, raft.Server{
ID: raft.ServerID(peer),
Address: raft.ServerAddress(s.getClusteringPeerAddr(name, peer)),
})
}
} else {
// Bootstrap as a seed node.
s.log.Debugf("Bootstrapping Raft group %s as seed node", name)
}
config := raft.Configuration{Servers: servers}
return node.BootstrapCluster(config).Error()
}
// This is unfortunate because we end up with something like
// "test-cluster.a.test-cluster". We can't change it now without breaking
// backward compatibility, because new/old servers would not be able to
// connect to each other, since this is used as the subscription subject
// to accept/send requests between nodes.
func (s *StanServer) getClusteringAddr(raftName string) string {
return s.getClusteringPeerAddr(raftName, s.opts.Clustering.NodeID)
}
// See comment above...
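// For example, with cluster ID "test-cluster", node ID "a" and Raft group
// name "test-cluster", this returns "test-cluster.a.test-cluster".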
func (s *StanServer) getClusteringPeerAddr(raftName, nodeID string) string {
return fmt.Sprintf("%s.%s.%s", s.opts.ID, nodeID, raftName)
}
// Returns the message store's first and last sequence.
// In clustered mode, if both first and last are 0, returns the value of the
// last sequence that we possibly got from the last snapshot. If a node
// restores a snapshot that has, say, first=1 and last=100, but when it tries
// to get these messages from the leader the leader does not send them back
// because they have all expired, the node will not store anything.
// If we relied only on the store's first/last, this node would use and
// report 0 for the channel's first and last, while when all messages have
// expired it should report last+1/last.
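// For example, if the restored snapshot said first=1/last=100 and all
// messages have expired, c.firstSeq would have been set to 101 and this
// function returns 101/100 rather than 0/0.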
func (s *StanServer) getChannelFirstAndlLastSeq(c *channel) (uint64, uint64, error) {
first, last, err := c.store.Msgs.FirstAndLastSequence()
if !s.isClustered {
return first, last, err
}
if err != nil {
return 0, 0, err
}
if first == 0 && last == 0 {
if fseq := atomic.LoadUint64(&c.firstSeq); fseq != 0 {
first = fseq
last = fseq - 1
}
}
return first, last, nil
}
// Apply is invoked once a log entry is committed.
// It returns a value which will be made available in the
// ApplyFuture returned by Raft.Apply method if that
// method was called on the same Raft node as the FSM.
func (r *raftFSM) Apply(l *raft.Log) interface{} {
s := r.server
op := &spb.RaftOperation{}
if err := op.Unmarshal(l.Data); err != nil {
return fmt.Errorf("unable to unmarshal RaftOperation: %v", err)
}
// We don't want snapshot Persist() and Apply() to execute concurrently,
// so use common lock.
r.Lock()
defer r.Unlock()
switch op.OpType {
case spb.RaftOperation_Publish:
// Message replication.
if len(op.PublishBatch.Messages) == 0 {
return nil
}
// This is a batch for a given channel, so lookup channel once.
msg := op.PublishBatch.Messages[0]
c, err := r.lookupOrCreateChannel(msg.Subject, op.ChannelID)
if err != nil {
goto FATAL_ERROR
}
// `c` will be nil if the existing channel has an ID greater than op.ChannelID.
// This will be the case in RAFT log replay if we have several "versions"
// of the same channel. In that case, simply ignore the log replay.
if c == nil {
return nil
}
if !c.lSeqChecked {
// If msg.Sequence is > 1, then make sure we have no gap.
if msg.Sequence > 1 {
// We pass `1` for the `first` sequence. The function we call
// will do the right thing when it comes to restore possible
// missing messages.
if err = s.raft.fsm.restoreMsgsFromSnapshot(c, 1, msg.Sequence-1, true); err != nil {
goto FATAL_ERROR
}
}
c.lSeqChecked = true
}
for _, msg = range op.PublishBatch.Messages {
if _, err = c.store.Msgs.Store(msg); err != nil {
goto FATAL_ERROR
}
}
return c.store.Msgs.Flush()
FATAL_ERROR:
panic(fmt.Errorf("failed to store replicated message %d on channel %s: %v",
msg.Sequence, msg.Subject, err))
case spb.RaftOperation_Connect:
// Client connection create replication.
return s.processConnect(op.ClientConnect.Request, op.ClientConnect.Refresh)
case spb.RaftOperation_Disconnect:
// Client connection close replication.
return s.closeClient(op.ClientDisconnect.ClientID)
case spb.RaftOperation_Subscribe:
// Subscription replication.
c, err := r.lookupOrCreateChannel(op.Sub.Request.Subject, op.ChannelID)
if c == nil && err == nil {
err = fmt.Errorf("unable to process subscription on channel %q, wrong ID %v",
op.Sub.Request.Subject, op.ChannelID)
}
if err != nil {
return &replicatedSub{sub: nil, err: err}
}
sub, err := s.processSub(c, op.Sub.Request, op.Sub.AckInbox, op.Sub.ID)
return &replicatedSub{sub: sub, err: err}
case spb.RaftOperation_RemoveSubscription:
fallthrough
case spb.RaftOperation_CloseSubscription:
// Close/Unsub subscription replication.
isSubClose := op.OpType == spb.RaftOperation_CloseSubscription
s.closeMu.Lock()
err := s.unsubscribe(op.Unsub, isSubClose)
s.closeMu.Unlock()
return err
case spb.RaftOperation_SendAndAck:
if !s.isLeader() {
s.processReplicatedSendAndAck(op.SubSentAck)
}
return nil
case spb.RaftOperation_DeleteChannel:
// Delete only if the channel exists and has the same ID.
if r.lookupChannel(op.Channel, op.ChannelID) == nil {
return nil
}
s.processDeleteChannel(op.Channel)
return nil
default:
panic(fmt.Sprintf("unknown op type %s", op.OpType))
}
}
// This returns a channel object only if it is found with the proper ID.
func (r *raftFSM) lookupChannel(name string, id uint64) *channel {
s := r.server
cs := s.channels
cs.RLock()
defer cs.RUnlock()
c := cs.channels[name]
// Consider no ID (either in channel or from param) to be a match.
// See note in raftFSM.lookupOrCreateChannel() regarding id == 0
// when dealing with channels created by older versions.
if c != nil && (id == 0 || c.id == 0 || c.id == id) {
return c
}
// No channel, or wrong ID
return nil
}
// Returns the channel with this name and ID.
// If channel exists and has an ID that is greater than the given `id`, then
// this function will return `nil` to indicate that the streaming version
// is more recent than the asked version. Otherwise, if `id` is greater,
// the channel is first deleted then recreated with the given `id`.
//
// Note that to support existing streaming and RAFT stores, the given `id` may
// be empty when processing existing RAFT snapshots/logs, or the streaming
// channel may not have an ID. In any of those cases, the existing channel
// object is returned.
func (r *raftFSM) lookupOrCreateChannel(name string, id uint64) (*channel, error) {
s := r.server
cs := s.channels
cs.Lock()
defer cs.Unlock()
c := cs.channels[name]
if c != nil {
// Consider no ID (either in channel or from param) to be a match.
// See note above regarding meaning of id == 0.
if id == 0 || c.id == 0 || c.id == id {
return c, nil
}
// If this channel is a more recent version than the asked `id` return
// nil to indicate this to the caller.
if c.id > id {
return nil, nil
}
// Here the existing channel has an older ID (version) so replace.
err := cs.store.DeleteChannel(name)
if err == nil {
err = s.raft.store.DeleteChannelID(name)
}
if err != nil {
s.log.Errorf("Error deleting channel %q: %v", name, err)
if s.isLeader() && c.activity != nil {
c.activity.deleteInProgress = false
c.startDeleteTimer()
}
return nil, err
}
delete(cs.channels, name)
}
// Channel does not exist or has been deleted. Create it now with the given ID.
return cs.createChannelLocked(s, name, id)
}
func (s *StanServer) processAddNode(m *nats.Msg) {
var err error
nodeID := string(m.Data)
if nodeID != "" {
addr := s.getClusteringPeerAddr(s.opts.ID, nodeID)
err = s.raft.AddVoter(raft.ServerID(nodeID), raft.ServerAddress(addr), 0, 0).Error()
if err == nil {
s.log.Noticef("Added node %q", nodeID)
m.Respond([]byte("+OK"))
return
}
} else {
err = errors.New("invalid node ID")
}
s.log.Errorf("Error adding node %q: %v", nodeID, err)
m.Respond([]byte(fmt.Sprintf("-ERR adding node %q: %v", nodeID, err)))
}
func (s *StanServer) processRemoveNode(m *nats.Msg) {
nodeID := string(m.Data)
err := s.raft.RemoveServer(raft.ServerID(nodeID), 0, 0).Error()
if err == nil {
s.log.Noticef("Removed node %q", nodeID)
m.Respond([]byte("+OK"))
if nodeID == s.opts.Clustering.NodeID {
s.nc.Flush()
// Wait until we step down...
timeout := time.Now().Add(5 * time.Second)
for time.Now().Before(timeout) {
if atomic.LoadInt64(&(s.raft.leader)) == 0 {
break
}
// Avoid spinning hot while waiting for leadership to be relinquished.
time.Sleep(10 * time.Millisecond)
}
s.Shutdown()
if !runningInTests {
os.Exit(0)
}
}
return
}
s.log.Errorf("Error removing node %q: %v", nodeID, err)
m.Respond([]byte(fmt.Sprintf("-ERR removing node %q: %v", nodeID, err)))
}