-
-
Notifications
You must be signed in to change notification settings - Fork 474
/
searchdreplication.cpp
6042 lines (4974 loc) · 180 KB
/
searchdreplication.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//
// Copyright (c) 2017-2023, Manticore Software LTD (https://manticoresearch.com)
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//
#include "sphinxstd.h"
#include "sphinxutils.h"
#include "memio.h"
#include "sphinxpq.h"
#include "searchdreplication.h"
#include "accumulator.h"
#include "fileutils.h"
#include <math.h>
#include "digest_sha1.h"
#include "tracer.h"
#if !HAVE_WSREP
#include "replication/wsrep_api_stub.h"
// it also populates header guard, so next including of 'normal' wsrep_api will break nothing
#endif
#include "replication/wsrep_api.h"
#include "coroutine.h"
#if !_WIN32
// MAC-specific header
#include <netinet/in.h>
#endif
// global application context for wsrep callbacks
static int64_t GetQueryTimeout ( int64_t iTimeout=0 ); // 2 minutes in msec
// 200 msec is enough as we do not need to wait for missed nodes in the cluster node list
static int g_iAnyNodesTimeout = 200;
// retry count and delay (msec) for talking to remote cluster nodes
static int g_iNodeRetry = 3;
static int g_iNodeRetryWait = 500;
// debug options passed into Galera for our logreplication command line key
static const char * g_sDebugOptions = "debug=on;cert.log_conflicts=yes";
// prefix added for Galera nodes
static const char * g_sDefaultReplicationNodes = "gcomm://";
// verbose logging of replicating transactions, ruled by this env variable
static bool LOG_LEVEL_RPL_TNX = val_from_env ( "MANTICORE_LOG_RPL_TNX", false );
#define LOG_COMPONENT_RPL_TNX ""
#define RPL_TNX LOGMSG ( RPL_DEBUG, RPL_TNX, RPL_TNX )
// cluster state as this node sees it
enum class ClusterState_e
{
	// terminal "stopped" states
	CLOSED,		// node shut down cleanly or was never started
	DESTROYED,	// node closed with an error
	// transitional states
	JOINING,	// node is joining the cluster
	DONOR,		// node is a donor for another node joining the cluster
	// terminal "ready" state
	SYNCED		// node works as usual
};
// scoped holder of a Galera ports pair; the managed port goes back into the global ports list on destruction
class ScopedPort_c : public ISphNoncopyable
{
	int m_iPort = -1;	// -1 means "no port owned"

public:
	explicit ScopedPort_c ( int iPort ) : m_iPort ( iPort ) {}
	~ScopedPort_c();			// calls Free()
	int Get() const { return m_iPort; }
	void Set ( int iPort );		// release the current port (if any), then own the new one
	int Leak();					// give up ownership without releasing; returns the port
	void Free();				// return the port to the global free list and reset to -1
};
// all runtime data related to a single replication cluster
class ReplicationCluster_t : public ClusterDesc_t
{
public:
	ReplicationCluster_t() = default;
	virtual ~ReplicationCluster_t();

	// Galera provider (replicator) instance
	wsrep_t * m_pProvider = nullptr;

	// serializer for replicator - guards for only one replication Op a time
	Threads::Coro::Mutex_c m_tReplicationMutex;

	// receiver thread; true while the worker is running
	Threads::Coro::Waitable_T<bool> m_bWorkerActive { false };

	// nodes at cluster
	CSphString m_sViewNodes; // raw nodes addresses (API and replication) from whole cluster
	// lock protects access to m_sViewNodes and m_sClusterNodes
	CSphMutex m_tViewNodesLock;

	// representation of m_dIndexes for binarySearch
	// to quickly validate query to cluster:index
	CSphVector<uint64_t> m_dIndexHashes;

	// Galera port got from global ports list on cluster created
	ScopedPort_c m_tPort { -1 };
	bool m_bUserRequest { false };

	// lock protects access to m_hOptions
	CSphMutex m_tOptsLock;

	// state variables cached from Galera
	CSphFixedVector<char> m_dUUID { 0 };		// raw cluster state UUID bytes
	int m_iConfID = 0;							// view (configuration) sequence number
	int m_iStatus = WSREP_VIEW_DISCONNECTED;	// primary / non-primary / disconnected
	int m_iSize = 0;							// amount of nodes in the current view
	int m_iIdx = 0;								// this node's index within the view

	// error that got reported to main thread
	CSphMutex m_tErrorLock;
	StringBuilder_c m_sError { ";" };

	void UpdateIndexHashes();

	// set the node state and wake everybody waiting on it
	void SetNodeState ( ClusterState_e eNodeState )
	{
		m_tNodeState.SetValue ( eNodeState );
		m_tNodeState.NotifyAll();
	}
	ClusterState_e GetNodeState() const { return m_tNodeState.GetValue(); }

	// wake all heartbeat waiters (e.g. WaitClusterCommit pollers re-check their predicate)
	void HeartBeat ()
	{
		m_tHeardBeat.NotifyAll();
	}
	// block until fnPred is satisfied, re-evaluated on every heartbeat
	template<typename PRED>
	void WaitHeartBeat ( PRED&& fnPred )
	{
		m_tHeardBeat.WaitVoid ( std::forward<PRED> ( fnPred ) );
	}
	// same with a timeout; returns false on timeout
	template<typename PRED>
	bool WaitHeartBeatForMs ( PRED&& fnPred, int64_t iPeriodMs )
	{
		return m_tHeardBeat.WaitVoidForMs ( std::forward<PRED> ( fnPred ), iPeriodMs );
	}
	// block until the node state satisfies fnPred; returns the matched state
	template<typename PRED>
	ClusterState_e WaitAny (PRED&& fnPred)
	{
		return m_tNodeState.Wait ( std::forward<PRED> ( fnPred ) );
	}
	// same with a timeout; returns the last observed state
	template<typename PRED>
	ClusterState_e WaitAnyForMs (PRED&& fnPred, int64_t iPeriodMs )
	{
		return m_tNodeState.WaitForMs ( std::forward<PRED> ( fnPred ), iPeriodMs );
	}
	// wait until the node leaves all transitional states (CLOSED/JOINING/DONOR)
	ClusterState_e WaitReady()
	{
		return WaitAny ( [] ( ClusterState_e i ) { return i != ClusterState_e::CLOSED && i != ClusterState_e::JOINING && i != ClusterState_e::DONOR; } );
	}
	void SetPrimary ( wsrep_view_status_t eStatus )
	{
		m_iStatus = eStatus;
	}
	bool IsPrimary() const { return ( m_iStatus==WSREP_VIEW_PRIMARY ); }

private:
	Threads::Coro::Waitable_T<ClusterState_e> m_tNodeState { ClusterState_e::CLOSED };
	Threads::Coro::Waitable_T<bool> m_tHeardBeat { false };
};
// arguments to replication static functions
struct ReplicationArgs_t
{
	// cluster description
	SharedPtr_t<ReplicationCluster_t> m_pCluster { new ReplicationCluster_t };
	// to create a new cluster (true) or join an existing one (false)
	bool m_bNewCluster = false;
	// node address for Galera to listen on
	CSphString m_sListenAddr;
	// node incoming port passed into the cluster; the IP part is taken from the API listener or node_address
	int m_iListenPort = -1;
	// nodes list for Galera to join
	CSphString m_sNodes;
};
// interface to pass into static Replicate to issue commit for a specific command;
// wraps an accumulator plus optional out-parameters for delete/update statistics
class CommitMonitor_c
{
public:
	explicit CommitMonitor_c ( RtAccum_t & tAcc )
		: m_tAcc ( tAcc )
	{}
	// variant that also reports update counters and warnings
	CommitMonitor_c ( RtAccum_t & tAcc, CSphString * pWarning, int * pUpdated )
		: m_tAcc ( tAcc )
		, m_pWarning ( pWarning )
		, m_pUpdated ( pUpdated )
	{}
	// variant that additionally reports the amount of deleted documents
	CommitMonitor_c ( RtAccum_t & tAcc, int * pDeletedCount, CSphString * pWarning, int * pUpdated )
		: m_tAcc ( tAcc )
		, m_pDeletedCount ( pDeletedCount )
		, m_pWarning ( pWarning )
		, m_pUpdated ( pUpdated )
	{}
	~CommitMonitor_c();

	// commit for common commands
	bool Commit ( CSphString & sError );
	// commit for Total Order Isolation commands
	bool CommitTOI ( ServedClone_c* pDesc, CSphString& sError );
	// update with Total Order Isolation
	bool Update ( bool bCluster, CSphString & sError );

private:
	RtAccum_t & m_tAcc;
	int * m_pDeletedCount = nullptr;
	CSphString * m_pWarning = nullptr;
	int * m_pUpdated = nullptr;

	bool CommitNonEmptyCmds ( RtIndex_i* pIndex, const ReplicationCommand_t& tCmd, bool bOnlyTruncate, CSphString& sError ) const;
};
typedef SharedPtr_t<ReplicationCluster_t> ReplicationClusterPtr_t;

// lock that protects all operations on g_hClusters
static Threads::Coro::RWLock_c g_tClustersLock;
// cluster name -> cluster map
static SmallStringHash_T<ReplicationClusterPtr_t> g_hClusters GUARDED_BY ( g_tClustersLock );
// hack for the abort callback to invalidate only the specific cluster it belongs to
static thread_local ReplicationCluster_t* g_pTlsCluster;
/////////////////////////////////////////////////////////////////////////////
// forward declarations
/////////////////////////////////////////////////////////////////////////////

// abort callback that invalidates a specific cluster
static void ReplicationAbort();

/////////////////////////////////////////////////////////////////////////////
// JSON config related functions
/////////////////////////////////////////////////////////////////////////////

// create a string by joining the global data_dir and the cluster path
static CSphString GetClusterPath ( const CSphString & sPath );

/////////////////////////////////////////////////////////////////////////////
// remote commands for cluster and index management
/////////////////////////////////////////////////////////////////////////////

struct PQRemoteReply_t;
struct PQRemoteData_t;

// command at remote node for CLUSTER_DELETE to delete a cluster
static bool RemoteClusterDelete ( const CSphString & sCluster, CSphString & sError );
// command at remote node for CLUSTER_FILE_RESERVE to check
// - file could be allocated on disk at cluster path and reserve disk space for a file
// - or make sure that index has exact same index file, ie sha1 matched
static bool RemoteFileReserve ( const PQRemoteData_t & tCmd, PQRemoteReply_t & tRes, CSphString & sError );
// command at remote node for CLUSTER_FILE_SEND to store data into a file; data size and file offset defined by sender
static void RemoteFileStore ( const PQRemoteData_t & tCmd, PQRemoteReply_t & tRes, CSphString & sError );
// command at remote node for CLUSTER_INDEX_ADD_LOCAL to check sha1 of index file matched and load index into daemon
static bool RemoteLoadIndex ( const PQRemoteData_t & tCmd, PQRemoteReply_t & tRes, CSphString & sError );
// send local indexes to remote nodes via API
static bool SendClusterIndexes ( const ReplicationCluster_t * pCluster, const CSphString & sNode, bool bBypass, const wsrep_gtid_t & tStateID, CSphString & sError );
// callback at remote node for CLUSTER_SYNCED to pick up received indexes then call Galera sst_received
static bool RemoteClusterSynced ( const PQRemoteData_t & tCmd, CSphString & sError );
// callback for Galera apply_cb to parse a replicated command
static bool ParseCmdReplicated ( const BYTE * pData, int iLen, bool bIsolated, const CSphString & sCluster, RtAccum_t & tAcc, CSphQuery & tQuery );
// callback for Galera commit_cb to commit a replicated command
static bool HandleCmdReplicated ( RtAccum_t & tAcc );
// command to all remote nodes at cluster to get the actual nodes list
static bool ClusterGetNodes ( const CSphString & sClusterNodes, const CSphString & sCluster, const CSphString & sGTID, Proto_e eProto, CSphString & sError, CSphString & sNodes );
// callback at remote node for CLUSTER_GET_NODES to return the actual nodes list at cluster
static bool RemoteClusterGetNodes ( const CSphString & sCluster, const CSphString & sGTID, CSphString & sError, CSphString & sNodes );
// utility function to filter a nodes list provided as string by a specific protocol
static bool ClusterFilterNodes ( Str_t sSrcNodes, Proto_e eProto, CSphString & sDstNodes, CSphString & sError );
// callback at remote node for CLUSTER_UPDATE_NODES to update nodes list at cluster from the actual nodes list
static bool RemoteClusterUpdateNodes ( const CSphString & sCluster, CSphString * pNodes, bool bSaveConf, bool bMergeNodes, CSphString & sError );

static bool IsClusterCommand ( const RtAccum_t & tAcc );
static bool IsUpdateCommand ( const RtAccum_t & tAcc );

// (de)serialization of replicated update statements
static void SaveUpdate ( const CSphAttrUpdate & tUpd, CSphVector<BYTE> & dOut );
static int LoadUpdate ( const BYTE * pBuf, int iLen, CSphAttrUpdate & tUpd, bool & bBlob );
static void SaveUpdate ( const CSphQuery & tQuery, CSphVector<BYTE> & dOut );
static int LoadUpdate ( const BYTE * pBuf, int iLen, CSphQuery & tQuery );

static bool ValidateUpdate ( const ReplicationCommand_t & tCmd, CSphString & sError );
static bool DoClusterAlterUpdate ( const CSphString & sCluster, const CSphString & sUpdate, bool bRemoteError, bool bJoinUpdate, CSphString & sError );
static CSphString GetAddr ( const ListenerDesc_t & tListen );
// check that IP:port is free on this box by attempting an actual bind;
// the probe socket is closed right away, only the bind result matters
static bool IsInetAddrFree ( DWORD uAddr, int iPort )
{
	// FIXED: the sockaddr used to be a function-local 'static', which is a data race
	// when several threads probe ports concurrently; a plain automatic variable is correct
	struct sockaddr_in iaddr;
	memset ( &iaddr, 0, sizeof(iaddr) );
	iaddr.sin_family = AF_INET;
	iaddr.sin_addr.s_addr = uAddr;
	// unsigned cast: ports above 32767 must not go through a signed short
	iaddr.sin_port = htons ( (unsigned short)iPort );

	int iSock = (int)socket ( AF_INET, SOCK_STREAM, 0 );
	if ( iSock==-1 )
	{
		sphWarning ( "failed to create TCP socket: %s", sphSockError() );
		return false;
	}

	int iRes = bind ( iSock, (struct sockaddr *)&iaddr, sizeof(iaddr) );
	SafeClose ( iSock );
	return ( iRes==0 );
}
// defaults for the Galera replication ports range; used by the code that sets up listeners
// (outside this chunk) — presumably bias is the offset from the base port, range is the size
static const int g_iDefaultPortBias = 10;
static const int g_iDefaultPortRange = 200;

// a contiguous range of ports: [m_iPort, m_iPort + m_iCount)
struct PortsRange_t
{
	int m_iPort = 0;
	int m_iCount = 0;
};
// manage ports pairs for clusters set as Galera replication listener ports range;
// ports are always handed out and returned in pairs (iPort, iPort+1)
class FreePortList_c
{
private:
	CSphVector<int> m_dFree;				// recently released pair heads, reused first
	CSphTightVector<PortsRange_t> m_dPorts;	// configured ranges not yet handed out
	CSphMutex m_tLock;						// guards both vectors
	DWORD m_uAddr = 0;						// IP the pairs are probed against

public:
	// set range of ports there is could generate ports pairs;
	// count must be even since ports are consumed two at a time
	void AddRange ( const PortsRange_t & tPorts )
	{
		assert ( tPorts.m_iPort && tPorts.m_iCount && ( tPorts.m_iCount%2 )==0 );
		ScopedMutex_t tLock ( m_tLock );
		m_dPorts.Add ( tPorts );
	}

	void AddAddr ( const CSphString & sAddr )
	{
		m_uAddr = sphGetAddress ( sAddr.cstr (), false, false );
	}

	// get next available pair of ports for a Galera listener;
	// first reuse a pair that was recently released, then take one from the range set;
	// returns the first port of the pair, or -1 when nothing is available.
	// NOTE(review): a pair that fails the IsInetAddrFree probe is dropped permanently
	// (not returned to any list) — confirm this is intended
	int Get ()
	{
		int iPortMin = -1;
		ScopedMutex_t tLock ( m_tLock );
		while ( iPortMin==-1 && ( m_dPorts.GetLength() || m_dFree.GetLength () ) )
		{
			if ( m_dFree.GetLength() )
			{
				iPortMin = m_dFree.Pop();
			} else if ( m_dPorts.GetLength() )
			{
				assert ( m_dPorts.Last().m_iCount>=2 );
				PortsRange_t & tPorts = m_dPorts.Last();
				iPortMin = tPorts.m_iPort;
				tPorts.m_iPort += 2;
				tPorts.m_iCount -= 2;
				if ( !tPorts.m_iCount )
					m_dPorts.Pop();
			}
			// both ports of the pair must be actually bindable
			if ( IsInetAddrFree ( m_uAddr, iPortMin ) && IsInetAddrFree ( m_uAddr, iPortMin+1 ) )
				break;
			iPortMin = -1;
		}
		return iPortMin;
	}

	// return a ports pair to the free list for reuse
	void Free ( int iPort )
	{
		ScopedMutex_t tLock ( m_tLock );
		m_dFree.Add ( iPort );
	}
};
// replication configured / receiver actually running
static bool g_bReplicationEnabled = false;
static bool g_bReplicationStarted = false;

// incoming address guessed (false) or set via searchd.node_address (true)
static bool g_bHasIncoming = false;
// incoming IP part of the address, set by searchd.node_address or taken from a listener
static CSphString g_sIncomingIP;
// incoming address (IP:port from the API listener) used for requests to this node from other daemons
static CSphString g_sIncomingProto;
// listen IP part of the address for Galera
static CSphString g_sListenReplicationIP;
// ports pairs manager
static FreePortList_c g_tPorts;
// whether replication was configured and enabled for this daemon
bool ReplicationIsEnabled()
{
	return g_bReplicationEnabled;
}
// store the incoming node address (searchd.node_address); an empty value
// means the address was not explicitly configured
void ReplicationSetIncoming ( CSphString sIncoming )
{
	g_bHasIncoming = !sIncoming.IsEmpty();
	g_sIncomingIP = sIncoming;
}
// return the owned ports pair to the global list on scope exit
ScopedPort_c::~ScopedPort_c()
{
	Free();
}
// hand the ports pair back to the global free list and mark this holder empty
void ScopedPort_c::Free()
{
	const int iReleased = m_iPort;
	m_iPort = -1;
	if ( iReleased!=-1 )
		g_tPorts.Free ( iReleased );
}
// release the currently owned pair (if any), then take ownership of the new one
void ScopedPort_c::Set ( int iPort )
{
	Free();
	m_iPort = iPort;
}
int ScopedPort_c::Leak()
{
int iPort = m_iPort;
m_iPort = -1;
return iPort;
}
// data passed to Galera and used at callbacks
struct ReceiverCtx_t
{
	ReplicationClusterPtr_t m_pCluster;

	// share of remote commands received between apply and commit callbacks
	RtAccum_t m_tAcc;
	// query object reused by ParseCmdReplicated; its filters are reset in Cleanup()
	CSphQuery m_tQuery;

	void Cleanup();
};
// log strings: enum -> strings
// indexed by wsrep_view_status_t
static const char * g_sClusterStatus[WSREP_VIEW_MAX] = { "primary", "non-primary", "disconnected" };
// indexed by wsrep_status_t (see GetStatus); must stay in sync with the wsrep API enum
static const char * g_sStatusDesc[] = {
	"success",
	"warning",
	"transaction is not known",
	"transaction aborted, server can continue",
	"transaction was victim of brute force abort",
	"data exceeded maximum supported size",
	"error in client connection, must abort",
	"error in node state, must reinit",
	"fatal error, server must abort",
	"transaction was aborted before commencing pre-commit",
	"feature not implemented"
};
// convert a node state to a human-readable name (used in status outputs and error messages)
static const char * GetNodeState ( ClusterState_e eState )
{
	switch ( eState )
	{
		case ClusterState_e::CLOSED: return "closed";
		case ClusterState_e::DESTROYED: return "destroyed";
		case ClusterState_e::JOINING: return "joining";
		case ClusterState_e::DONOR: return "donor";
		case ClusterState_e::SYNCED: return "synced";
		// default covers any value outside the enum (e.g. a corrupted state)
		default: return "undefined";
	}
}
// FIXED: dropped the stray ';' that used to follow the function body
// turn a Galera status code into a human-readable message
static const char * GetStatus ( wsrep_status_t tStatus )
{
	if ( tStatus>=WSREP_OK && tStatus<=WSREP_NOT_IMPLEMENTED )
		return g_sStatusDesc[tStatus];
	else
		// NOTE(review): wsrep codes outside the table are not errno values,
		// so strerror may produce a misleading message here — confirm
		return strerror ( tStatus );
}
// varargs adapter between Galera log levels and the daemon logger
static void LoggerWrapper ( wsrep_log_level_t eLevel, const char * sFmt, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
void LoggerWrapper ( wsrep_log_level_t eLevel, const char * sFmt, ... )
{
	// map Galera severity onto a daemon log level; anything informational
	// (INFO, DEBUG, unknown) goes to the replication-debug channel
	// FIXME!!! add --logreplicationv daemon option to show galera debug messages in this mode
	ESphLogLevel eLevelDst;
	switch ( eLevel )
	{
		case WSREP_LOG_FATAL:
		case WSREP_LOG_ERROR:
			eLevelDst = SPH_LOG_FATAL;
			break;
		case WSREP_LOG_WARN:
			eLevelDst = SPH_LOG_WARNING;
			break;
		default:
			eLevelDst = SPH_LOG_RPL_DEBUG;
			break;
	}

	va_list ap;
	va_start ( ap, sFmt );
	sphLogVa ( sFmt, ap, eLevelDst );
	va_end ( ap );
}
static bool CheckNoWarning ( const char * sMsg );

// callback for Galera logger_cb to log messages and errors
static void Logger_fn ( wsrep_log_level_t eLevel, const char * sMsg )
{
	// in the normal flow certain Galera warnings are suppressed, but they are kept
	// at the debug replication verbosity level; we don't want to patch Galera sources
	if ( g_eLogLevel<SPH_LOG_RPL_DEBUG && eLevel==WSREP_LOG_WARN && CheckNoWarning ( sMsg ) )
		return;

	LoggerWrapper ( eLevel, "%s", sMsg );
}
// block until the named cluster becomes ready (SYNCED or DONOR), optionally with a
// timeout in seconds; returns the final state name, or an error for an unknown cluster
CSphString WaitClusterReady ( const CSphString& sCluster, int64_t iTimeoutS )
{
	// take a shared reference to the cluster under the read lock
	ReplicationClusterPtr_t pCluster { nullptr };
	{
		Threads::SccRL_t rLock ( g_tClustersLock );
		auto ppCluster = g_hClusters(sCluster);
		if ( !ppCluster )
			return SphSprintf ( "unknown cluster '%s'", sCluster.cstr() );
		pCluster = *ppCluster;
	}

	auto fnReady = [] ( ClusterState_e i ) { return i == ClusterState_e::SYNCED || i == ClusterState_e::DONOR; };
	ClusterState_e eState = ( iTimeoutS<=0 )
		? pCluster->WaitAny ( fnReady )
		: pCluster->WaitAnyForMs ( fnReady, iTimeoutS*1000 );

	return GetNodeState ( eState );
}
// wait until the cluster's 'last_committed' sequence number reaches iTxn, or the
// timeout (seconds) expires; returns { last seen value, error message or empty }
std::pair<int, CSphString> WaitClusterCommit ( const CSphString& sCluster, int iTxn, int64_t iTimeoutS )
{
	// take a shared reference to the cluster under the read lock
	ReplicationClusterPtr_t pCluster { nullptr };
	{
		Threads::SccRL_t rLock ( g_tClustersLock );
		auto ppCluster = g_hClusters ( sCluster );
		if ( !ppCluster )
			return {-1, SphSprintf ( "unknown cluster '%s'", sCluster.cstr() ) };
		pCluster = *ppCluster;
	}

	int64_t iVal = -1;
	// predicate re-evaluated on every cluster heartbeat: poll the provider's
	// stats for 'last_committed' and compare against the requested txn
	auto fnPred = [iTxn, pCluster, &iVal]()
	{
		auto pProvider = pCluster->m_pProvider;
		if ( !pProvider )
			return false;

		wsrep_stats_var* pAllVars = pProvider->stats_get ( pProvider );
		for ( wsrep_stats_var* pVars = pAllVars; pVars->name; ++pVars )
			if ( !strcmp ( pVars->name, "last_committed" ) )
			{
				// FIXED: value used to be truncated through an (int) cast;
				// the seqno is 64-bit and may not fit into int
				iVal = pVars->value._int64;
				break;
			}
		pProvider->stats_free ( pProvider, pAllVars );
		return iVal >= iTxn;
	};

	bool bSuccess = true;
	if ( iTimeoutS <= 0 )
		pCluster->WaitHeartBeat ( std::move ( fnPred ) );
	else
		bSuccess = pCluster->WaitHeartBeatForMs ( std::move ( fnPred ), iTimeoutS * 1000 );

	if ( !bSuccess )
		return { iVal, "timeout" };
	return { iVal, "" };
}
// version of the replicated commands wire format (commands that get replicated via Galera)
// ver 0x104 added docstore from RT index
// ver 0x105 fixed CSphWordHit serialization - instead of direct raw blob copy only fields sent (16 bytes vs 24)
// ver 0x106 add total indexed bytes to accum
// ver 0x107 add blobs vector to replicate update statement
static const WORD g_iReplicateCommandVer = 0x107;
// log debug info about cluster nodes as current nodes views that
static void LogGroupView ( const wsrep_view_info_t * pView )
{
if ( g_eLogLevel<SPH_LOG_RPL_DEBUG )
return;
sphLogDebugRpl ( "new cluster membership: %d(%d), global seqno: " INT64_FMT ", status %s, gap %d",
pView->my_idx, pView->memb_num, (int64_t)pView->state_id.seqno, g_sClusterStatus[pView->status], (int)pView->state_gap );
StringBuilder_c sBuf;
const wsrep_member_info_t * pBoxes = pView->members;
for ( int i=0; i<pView->memb_num; i++ )
sBuf.Appendf ( "\n'%s', '%s' %s", pBoxes[i].name, pBoxes[i].incoming, ( i==pView->my_idx ? "*" : "" ) );
sphLogDebugRpl ( "%s", sBuf.cstr() );
}
// check cluster state prior to passing a write transaction / command into the cluster:
// refuse unless the view is primary AND the node itself is fully synced (or acting as donor)
static bool CheckClasterState ( ClusterState_e eState, bool bPrimary, const CSphString & sCluster, CSphString & sError )
{
	if ( !bPrimary )
	{
		sError.SetSprintf ( "cluster '%s' is not ready, not primary state (%s)", sCluster.cstr(), GetNodeState ( eState ) );
		return false;
	}

	const bool bNodeReady = ( eState==ClusterState_e::SYNCED || eState==ClusterState_e::DONOR );
	if ( !bNodeReady )
	{
		sError.SetSprintf ( "cluster '%s' is not ready, current state is %s", sCluster.cstr(), GetNodeState ( eState ) );
		return false;
	}

	return true;
}
// check cluster state wrapper
static bool CheckClasterState ( const ReplicationCluster_t * pCluster, CSphString & sError )
{
assert ( pCluster );
const ClusterState_e eState = pCluster->GetNodeState();
const bool bPrimary = pCluster->IsPrimary();
return CheckClasterState ( eState, bPrimary, pCluster->m_sName, sError );
}
// update the cluster nodes list from the Galera callback on cluster view changes
static void UpdateGroupView ( const wsrep_view_info_t * pView, ReplicationCluster_t * pCluster )
{
	// collect comma-separated incoming addresses of all view members
	StringBuilder_c sBuf ( "," );
	const wsrep_member_info_t * pBoxes = pView->members;
	for ( int i=0; i<pView->memb_num; i++ )
		sBuf += pBoxes[i].incoming;

	// change of nodes happens only here with single thread
	// its safe to compare nodes string here without lock
	if ( pCluster->m_sViewNodes!=sBuf.cstr() )
	{
		ScopedMutex_t tLock ( pCluster->m_tViewNodesLock );
		// NOTE(review): the message prints the NEW list first and the OLD one second,
		// which reads backwards for "%s > %s" — confirm the intended argument order
		sphLogDebugRpl ( "view nodes changed: %s > %s", sBuf.cstr(), pCluster->m_sViewNodes.cstr() );
		pCluster->m_sViewNodes = sBuf.cstr();
	}
}
// callback for Galera view_handler_cb that cluster view got changed, ie node either added or removed from cluster
// This will be called on cluster view change (nodes joining, leaving, etc.).
// Each view change is the point where application may be pronounced out of
// sync with the current cluster view and need state transfer.
// It is guaranteed that no other callbacks are called concurrently with it.
static wsrep_cb_status_t ViewChanged_fn ( void * pAppCtx, void * pRecvCtx, const wsrep_view_info_t * pView, const char * pState, size_t iStateLen, void ** ppSstReq, size_t * pSstReqLen )
{
	ReceiverCtx_t * pLocalCtx = (ReceiverCtx_t *)pRecvCtx;
	LogGroupView ( pView );
	ReplicationCluster_t * pCluster = pLocalCtx->m_pCluster;

	// cache the view state: cluster UUID, view seq, size and this node's index
	memcpy ( pCluster->m_dUUID.Begin(), pView->state_id.uuid.data, pCluster->m_dUUID.GetLengthBytes() );
	pCluster->m_iConfID = (int)pView->view;
	pCluster->m_iSize = pView->memb_num;
	pCluster->m_iIdx = pView->my_idx;
	pCluster->SetPrimary ( pView->status );
	if ( pCluster->IsPrimary() )
		UpdateGroupView ( pView, pCluster );

	// no state transfer request by default
	*ppSstReq = nullptr;
	*pSstReqLen = 0;

	if ( pView->state_gap )
	{
		// this node is behind the cluster: request a state transfer, passing our incoming
		// address so the donor knows where to push indexes
		// NOTE(review): the strdup'ed buffer is handed to Galera via *ppSstReq —
		// presumably the provider takes ownership and frees it; confirm
		auto sAddr = ::strdup( g_sIncomingProto.scstr() );
		sphLogDebugRpl ( "join %s to %s", sAddr, pCluster->m_sName.cstr() );
		*pSstReqLen = strlen(sAddr) + 1;
		*ppSstReq = sAddr;
		pCluster->SetNodeState ( ClusterState_e::JOINING );
	}

	return WSREP_CB_SUCCESS;
}
// callback for Galera sst_donate_cb: this node becomes a donor and starts sending
// SST (all cluster indexes) to the joiner whose address arrives in sMsg
static wsrep_cb_status_t SstDonate_fn ( void * pAppCtx, void * pRecvCtx, const void * sMsg, size_t iMsgLen, const wsrep_gtid_t * pStateID, const char * pState, size_t iStateLen, wsrep_bool_t bBypass )
{
	ReceiverCtx_t * pLocalCtx = (ReceiverCtx_t *)pRecvCtx;
	// joiner address comes as a non-terminated binary blob
	CSphString sNode;
	sNode.SetBinary ( (const char *)sMsg, (int) iMsgLen );

	wsrep_gtid_t tGtid = *pStateID;
	char sGtid[WSREP_GTID_STR_LEN];
	wsrep_gtid_print ( &tGtid, sGtid, sizeof(sGtid) );

	ReplicationCluster_t * pCluster = pLocalCtx->m_pCluster;
	sphLogDebugRpl ( "donate %s to %s, gtid %s, bypass %d", pCluster->m_sName.cstr(), sNode.cstr(), sGtid, (int)bBypass );

	CSphString sError;

	// stay in DONOR state for the duration of the transfer, then go back to SYNCED
	pCluster->SetNodeState ( ClusterState_e::DONOR );
	const bool bOk = SendClusterIndexes ( pCluster, sNode, bBypass, tGtid, sError );
	pCluster->SetNodeState ( ClusterState_e::SYNCED );

	if ( !bOk )
	{
		sphWarning ( "%s", sError.cstr() );
		// undefined seqno tells Galera the transfer failed
		tGtid.seqno = WSREP_SEQNO_UNDEFINED;
	}

	// report the transfer result back to the provider
	pCluster->m_pProvider->sst_sent( pCluster->m_pProvider, &tGtid, ( bOk ? 0 : -ECANCELED ) );

	sphLogDebugRpl ( "donate cluster %s to %s, gtid %s, bypass %d, done %d", pCluster->m_sName.cstr(), sNode.cstr(), sGtid, (int)bBypass, (int)bOk );

	return ( bOk ? WSREP_CB_SUCCESS : WSREP_CB_FAILURE );
}
// callback for Galera synced_cb: the node has fully caught up with the cluster
// and may now accept transactions
static void Synced_fn ( void * pAppCtx )
{
	auto pCtx = (ReceiverCtx_t *)pAppCtx;
	assert ( pCtx );

	ReplicationCluster_t * pCluster = pCtx->m_pCluster;
	pCluster->SetNodeState ( ClusterState_e::SYNCED );
	sphLogDebugRpl ( "synced cluster %s", pCluster->m_sName.cstr() );
}
// reset per-writeset state after a commit/rollback and wake heartbeat waiters
void ReceiverCtx_t::Cleanup()
{
	m_tAcc.Cleanup();
	// drop filters of the replicated update statement kept in m_tQuery
	m_tQuery.m_dFilters.Reset();
	m_tQuery.m_dFilterTree.Reset();
	// heartbeat lets pollers (e.g. WaitClusterCommit) re-check their predicate
	m_pCluster->HeartBeat();
}
// callback for Galera apply_cb that a transaction was received from the cluster by this node.
// This is called to "apply" a writeset; if writesets don't conflict on keys,
// it may be called concurrently to utilize several CPU cores.
static wsrep_cb_status_t Apply_fn ( void * pCtx, const void * pData, size_t uSize, uint32_t uFlags, const wsrep_trx_meta_t * pMeta )
{
	auto pLocalCtx = (ReceiverCtx_t *)pCtx;
	const bool bCommit = ( ( uFlags & WSREP_FLAG_COMMIT )!=0 );
	const bool bIsolated = ( ( uFlags & WSREP_FLAG_ISOLATION )!=0 );
	RPL_TNX << "writeset at apply, seq " << (int64_t)pMeta->gtid.seqno << ", size " << (int)uSize << ", flags " << uFlags << ", on " << ( bCommit ? "commit" : "rollback" );

	// parse the command into the shared accumulator now; the actual commit happens in Commit_fn
	const bool bParsed = ParseCmdReplicated ( (const BYTE *)pData, (int) uSize, bIsolated, pLocalCtx->m_pCluster->m_sName, pLocalCtx->m_tAcc, pLocalCtx->m_tQuery );
	return bParsed ? WSREP_CB_SUCCESS : WSREP_CB_FAILURE;
}
// callback for Galera commit_cb: a transaction received and parsed earlier (Apply_fn)
// should now be committed or rolled back
static wsrep_cb_status_t Commit_fn ( void * pCtx, const void * hndTrx, uint32_t uFlags, const wsrep_trx_meta_t * pMeta, wsrep_bool_t * pExit, wsrep_bool_t bCommit )
{
	ReceiverCtx_t * pLocalCtx = (ReceiverCtx_t *)pCtx;
	wsrep_t * pWsrep = pLocalCtx->m_pCluster->m_pProvider;
	RPL_TNX << "writeset at " << ( bCommit ? "commit" : "rollback" ) << ", seq " << (int64_t)pMeta->gtid.seqno << ", flags " << uFlags;

	wsrep_cb_status_t eRes = WSREP_CB_SUCCESS;
	// rollback needs no work here: accumulated commands are just discarded in Cleanup()
	if ( bCommit && pLocalCtx->m_tAcc.m_dCmd.GetLength() )
	{
		bool bOk = false;
		// the isolation flag of the first command governs the whole accumulated batch
		bool bIsolated = ( pLocalCtx->m_tAcc.m_dCmd[0]->m_bIsolated );
		if ( bIsolated )
		{
			// TOI commands commit on their own, no applier bracketing needed
			bOk = HandleCmdReplicated ( pLocalCtx->m_tAcc );
		} else
		{
			// ordered commit: bracket the local commit with the provider's
			// pre/interim/post commit calls to keep the global commit order
			pWsrep->applier_pre_commit ( pWsrep, const_cast<void *>( hndTrx ) );
			bOk = HandleCmdReplicated ( pLocalCtx->m_tAcc );
			pWsrep->applier_interim_commit ( pWsrep, const_cast<void *>( hndTrx ) );
			pWsrep->applier_post_commit ( pWsrep, const_cast<void *>( hndTrx ) );
		}
		RPL_TNX << "seq " << (int64_t)pMeta->gtid.seqno << ", committed " << (int)bOk << ", isolated " << (int)bIsolated;
		eRes = ( bOk ? WSREP_CB_SUCCESS : WSREP_CB_FAILURE );
	}
	// always reset the shared accumulator and signal a heartbeat
	pLocalCtx->Cleanup();

	return eRes;
}
// callback for Galera unordered_cb; unordered writesets are not used by the daemon,
// so just note the event and report success
static wsrep_cb_status_t Unordered_fn ( void * pCtx, const void * pData, size_t uSize )
{
	sphLogDebugRpl ( "unordered" );
	return WSREP_CB_SUCCESS;
}
// main recv thread of Galera that handles the cluster.
// This is the listening thread. It blocks in wsrep::recv() call until
// disconnect from cluster. It will apply and commit writesets through the
// callbacks defined above.
void ReplicationRecv_fn ( std::unique_ptr<ReceiverCtx_t> pCtx )
{
	// remember the cluster for the thread-local abort-callback hack
	g_pTlsCluster = pCtx->m_pCluster;
	// mark the worker active for the whole lifetime of this thread;
	// the scope guard clears the flag and notifies waiters however recv() exits
	pCtx->m_pCluster->m_bWorkerActive.SetValue ( true );
	auto tFinish = AtScopeExit ( [&pCtx] { pCtx->m_pCluster->m_bWorkerActive.SetValueAndNotifyAll ( false ); } );
	pCtx->m_pCluster->SetNodeState ( ClusterState_e::JOINING );

	sphLogDebugRpl ( "receiver %s thread started", pCtx->m_pCluster->m_sName.cstr() );

	// blocks here until the node disconnects from the cluster
	wsrep_t * pWsrep = pCtx->m_pCluster->m_pProvider;
	wsrep_status_t eState = pWsrep->recv ( pWsrep, pCtx.get() );

	sphLogDebugRpl ( "receiver %s done, code %d, %s", pCtx->m_pCluster->m_sName.cstr(), eState, GetStatus ( eState ) );

	// failure codes end up in the DESTROYED terminal state, a clean exit in CLOSED
	if ( eState==WSREP_CONN_FAIL || eState==WSREP_NODE_FAIL || eState==WSREP_FATAL )
	{
		pCtx->m_pCluster->SetNodeState ( ClusterState_e::DESTROYED );
	} else
	{
		pCtx->m_pCluster->SetNodeState ( ClusterState_e::CLOSED );
	}
}
// callback for Galera pfs_instr_cb where all mutex / thread / event primitives should be
// implemented; could also count these operations for extended stats
// @brief a callback to create PFS instrumented mutex/condition variables
//
// @param type mutex or condition variable
// @param ops add/init or remove/destroy mutex/condition variable
// @param tag tag/name of instrument to monitor
// @param value created mutex or condition variable
// @param alliedvalue allied value for supporting operation.
// for example: while waiting for cond-var corresponding
// mutex is passed through this variable.
// @param ts time to wait for condition.
static void Instr_fn ( wsrep_pfs_instr_type_t type, wsrep_pfs_instr_ops_t ops, wsrep_pfs_instr_tag_t tag, void ** value, void ** alliedvalue, const void * ts )
{
	// threads and files are left to Galera's own defaults
	if ( type==WSREP_PFS_INSTR_TYPE_THREAD || type==WSREP_PFS_INSTR_TYPE_FILE )
		return;

#if !_WIN32
	if ( type==WSREP_PFS_INSTR_TYPE_MUTEX )
	{
		// plain heap-allocated pthread mutexes; Galera keeps ownership via *value
		switch ( ops )
		{
			case WSREP_PFS_INSTR_OPS_INIT:
			{
				pthread_mutex_t* pMutex = new pthread_mutex_t;
				pthread_mutex_init ( pMutex, nullptr );
				*value = pMutex;
			}
				break;

			case WSREP_PFS_INSTR_OPS_DESTROY:
			{
				pthread_mutex_t* pMutex = ( pthread_mutex_t* ) ( *value );
				assert ( pMutex );
				pthread_mutex_destroy ( pMutex );
				delete ( pMutex );
				*value = nullptr;
			}
				break;

			case WSREP_PFS_INSTR_OPS_LOCK:
			{
				pthread_mutex_t* pMutex = ( pthread_mutex_t* ) ( *value );
				assert ( pMutex );
				pthread_mutex_lock ( pMutex );
			}
				break;

			case WSREP_PFS_INSTR_OPS_UNLOCK:
			{
				pthread_mutex_t* pMutex = ( pthread_mutex_t* ) ( *value );
				assert ( pMutex );
				pthread_mutex_unlock ( pMutex );
			}
				break;

			default: assert( 0 );
				break;
		}
	} else if ( type==WSREP_PFS_INSTR_TYPE_CONDVAR )
	{
		// plain heap-allocated pthread condition variables; the companion mutex
		// for wait operations arrives through *alliedvalue
		switch ( ops )
		{
			case WSREP_PFS_INSTR_OPS_INIT:
			{
				pthread_cond_t* pCond = new pthread_cond_t;
				pthread_cond_init ( pCond, nullptr );
				*value = pCond;
			}
				break;

			case WSREP_PFS_INSTR_OPS_DESTROY:
			{
				pthread_cond_t* pCond = ( pthread_cond_t* ) ( *value );
				assert ( pCond );
				pthread_cond_destroy ( pCond );
				delete ( pCond );
				*value = nullptr;
			}
				break;

			case WSREP_PFS_INSTR_OPS_WAIT:
			{
				pthread_cond_t* pCond = ( pthread_cond_t* ) ( *value );
				pthread_mutex_t* pMutex = ( pthread_mutex_t* ) ( *alliedvalue );
				assert ( pCond && pMutex );
				pthread_cond_wait ( pCond, pMutex );
			}
				break;

			case WSREP_PFS_INSTR_OPS_TIMEDWAIT:
			{
				pthread_cond_t* pCond = ( pthread_cond_t* ) ( *value );
				pthread_mutex_t* pMutex = ( pthread_mutex_t* ) ( *alliedvalue );
				// absolute deadline; passed through the opaque ts argument
				const timespec* wtime = ( const timespec* ) ts;
				assert ( pCond && pMutex );
				pthread_cond_timedwait ( pCond, pMutex, wtime );
			}
				break;

			case WSREP_PFS_INSTR_OPS_SIGNAL:
			{
				pthread_cond_t* pCond = ( pthread_cond_t* ) ( *value );
				assert ( pCond );
				pthread_cond_signal ( pCond );
			}
				break;

			case WSREP_PFS_INSTR_OPS_BROADCAST:
			{
				pthread_cond_t* pCond = ( pthread_cond_t* ) ( *value );
				assert ( pCond );
				pthread_cond_broadcast ( pCond );
			}
				break;

			default: assert( 0 );
				break;
		}
	}
#endif
}
static int GetClusterMemLimit ( const StrVec_t & dIndexes )
{
int64_t iMemLimit = 0;
int iIndexes = 0;
for ( const CSphString & sIndex : dIndexes )
{
cServedIndexRefPtr_c pServed = GetServed ( sIndex );
if ( !pServed )
continue;
iMemLimit = Max ( iMemLimit, pServed->m_tSettings.m_iMemLimit );
iIndexes++;
}
const int CACHE_PER_INDEX = 16;
// change default cache size to 16Mb per added index or size of largest rt_mem_limit of RT index
int iSize = int ( iMemLimit / 1024 / 1024 );
int iCount = Max ( 1, iIndexes );
iSize = Max ( iCount * CACHE_PER_INDEX, iSize );
return iSize;
}