/
sphinx.h
3563 lines (2791 loc) · 115 KB
/
sphinx.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//
// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
// Copyright (c) 2001-2016, Andrew Aksyonoff
// Copyright (c) 2008-2016, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//
#ifndef _sphinx_
#define _sphinx_
/////////////////////////////////////////////////////////////////////////////
#include "sphinxstd.h"
#include "sphinxexpr.h" // to remove?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if USE_PGSQL
#include <libpq-fe.h>
#endif
#if USE_WINDOWS
#include <winsock2.h>
#include <WS2tcpip.h>
#else
#include <sys/types.h>
#include <unistd.h>
#endif
#if USE_MYSQL
#include <mysql.h>
#endif
#if USE_WINDOWS
typedef __int64 SphOffset_t;
#define STDOUT_FILENO fileno(stdout)
#define STDERR_FILENO fileno(stderr)
#else
typedef off_t SphOffset_t;
#endif
#if USE_ODBC
#include <sqlext.h>
#endif
/////////////////////////////////////////////////////////////////////////////
typedef uint64_t SphWordID_t;
typedef uint64_t SphDocID_t;
#define DOCID_MAX U64C(0xffffffffffffffff)
#define WORDID_MAX U64C(0xffffffffffffffff)
#define DOCID_FMT UINT64_FMT
#define DOCINFO_IDSIZE 2
STATIC_SIZE_ASSERT ( SphWordID_t, 8 );
STATIC_SIZE_ASSERT ( SphDocID_t, 8 );
#define DWSIZEOF(a) ( sizeof(a) / sizeof(DWORD) )
//////////////////////////////////////////////////////////////////////////
/// row entry (storage only, does not necessarily map 1:1 to attributes)
typedef DWORD CSphRowitem;
typedef const BYTE * CSphRowitemPtr;
/// widest integer type that can be be stored as an attribute (ideally, fully decoupled from rowitem size!)
typedef int64_t SphAttr_t;
const CSphRowitem ROWITEM_MAX = UINT_MAX;
const int ROWITEM_BITS = 8*sizeof(CSphRowitem);
const int ROWITEMPTR_BITS = 8*sizeof(CSphRowitemPtr);
const int ROWITEM_SHIFT = 5;
STATIC_ASSERT ( ( 1 << ROWITEM_SHIFT )==ROWITEM_BITS, INVALID_ROWITEM_SHIFT );
#ifndef USE_LITTLE_ENDIAN
#error Please define endianness
#endif
/// extract a document id of type DOCID from the head of a raw docinfo row
/// (the id occupies the first DWSIZEOF(DOCID) dwords of the row)
template < typename DOCID >
inline DOCID DOCINFO2ID_T ( const DWORD * pDocinfo );

/// 32-bit id: stored as-is in the first dword
template<> inline DWORD DOCINFO2ID_T ( const DWORD * pDocinfo )
{
	return pDocinfo[0];
}

/// 64-bit id: stored in two dwords; layout depends on endianness
template<> inline uint64_t DOCINFO2ID_T ( const DWORD * pDocinfo )
{
#if USE_LITTLE_ENDIAN
	// use memcpy instead of the old *(uint64_t*) cast; the cast was a
	// strict-aliasing violation and a potentially misaligned 8-byte load
	// (rows are only DWORD-aligned). memcpy compiles to the same single
	// load on platforms that allow it.
	uint64_t uID;
	memcpy ( &uID, pDocinfo, sizeof(uID) );
	return uID;
#else
	return uint64_t(pDocinfo[1]) + (uint64_t(pDocinfo[0])<<32);
#endif
}
/// store a 32-bit document id at the head of a raw docinfo row
inline void DOCINFOSETID ( DWORD * pDocinfo, DWORD uValue )
{
	*pDocinfo = uValue;
}

/// store a 64-bit document id at the head of a raw docinfo row
/// (two dwords; layout depends on endianness, mirrors DOCINFO2ID_T)
inline void DOCINFOSETID ( DWORD * pDocinfo, uint64_t uValue )
{
#if USE_LITTLE_ENDIAN
	// memcpy instead of *(uint64_t*) store; avoids strict-aliasing UB and
	// a potentially misaligned 8-byte write (rows are only DWORD-aligned)
	memcpy ( pDocinfo, &uValue, sizeof(uValue) );
#else
	pDocinfo[0] = (DWORD)(uValue>>32);
	pDocinfo[1] = (DWORD)uValue;
#endif
}
/// convenience wrapper: extract the id using the build-wide SphDocID_t type
inline SphDocID_t DOCINFO2ID ( const DWORD * pDocinfo )
{
	return DOCINFO2ID_T<SphDocID_t> ( pDocinfo );
}
// helpers to step between the start of a docinfo row and its attribute block:
// attributes are laid out immediately after the DWSIZEOF(DOCID)-dword id.
// PARANOID builds add null-pointer asserts; release builds skip them.
#if PARANOID
template < typename DOCID > inline DWORD * DOCINFO2ATTRS_T ( DWORD * pDocinfo ) { assert ( pDocinfo ); return pDocinfo+DWSIZEOF(DOCID); }
template < typename DOCID > inline const DWORD * DOCINFO2ATTRS_T ( const DWORD * pDocinfo ) { assert ( pDocinfo ); return pDocinfo+DWSIZEOF(DOCID); }
template < typename DOCID > inline DWORD * STATIC2DOCINFO_T ( DWORD * pAttrs ) { assert ( pAttrs ); return pAttrs-DWSIZEOF(DOCID); }
template < typename DOCID > inline const DWORD * STATIC2DOCINFO_T ( const DWORD * pAttrs ) { assert ( pAttrs ); return pAttrs-DWSIZEOF(DOCID); }
#else
template < typename DOCID > inline DWORD * DOCINFO2ATTRS_T ( DWORD * pDocinfo ) { return pDocinfo + DWSIZEOF(DOCID); }
template < typename DOCID > inline const DWORD * DOCINFO2ATTRS_T ( const DWORD * pDocinfo ) { return pDocinfo + DWSIZEOF(DOCID); }
template < typename DOCID > inline DWORD * STATIC2DOCINFO_T ( DWORD * pAttrs ) { return pAttrs - DWSIZEOF(DOCID); }
template < typename DOCID > inline const DWORD * STATIC2DOCINFO_T ( const DWORD * pAttrs ) { return pAttrs - DWSIZEOF(DOCID); }
#endif
/////////////////////////////////////////////////////////////////////////////
#ifdef BUILD_WITH_CMAKE
#include "gen_sphinxversion.h"
#else
#include "sphinxversion.h"
#endif
#ifndef SPHINX_TAG
#define BANNER_TAG "dev"
#else
#define BANNER_TAG SPHINX_TAG
#endif
#ifndef GIT_TIMESTAMP_ID
#define GIT_TIMESTAMP_ID "000101"
#endif
// this line is deprecated and no more used. Leaved here for a while.
// numbers now to be defined via sphinxversion.h
#ifndef VERNUMBERS
#define VERNUMBERS "7.7.7"
#endif
#define SPHINX_VERSION VERNUMBERS " " SPH_GIT_COMMIT_ID "@" GIT_TIMESTAMP_ID " " BANNER_TAG
#define SPHINX_BANNER "Manticore " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\n" \
"Copyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n" \
"Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
#define SPH_MAX_WORD_LEN 42 // so that any UTF-8 word fits 127 bytes
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
extern int64_t g_iIndexerCurrentHits;
extern int64_t g_iIndexerCurrentRangeMin;
extern int64_t g_iIndexerCurrentRangeMax;
extern int64_t g_iIndexerPoolStartDocID;
extern int64_t g_iIndexerPoolStartHit;
/////////////////////////////////////////////////////////////////////////////
/// Sphinx CRC32 implementation
extern DWORD g_dSphinxCRC32 [ 256 ];
DWORD sphCRC32 ( const void * pString );
DWORD sphCRC32 ( const void * pString, int iLen );
DWORD sphCRC32 ( const void * pString, int iLen, DWORD uPrevCRC );
/// Fast check if our endianess is correct
const char* sphCheckEndian();
/// Sphinx FNV64 implementation
const uint64_t SPH_FNV64_SEED = 0xcbf29ce484222325ULL;
uint64_t sphFNV64 ( const void * pString );
uint64_t sphFNV64 ( const void * s, int iLen, uint64_t uPrev = SPH_FNV64_SEED );
uint64_t sphFNV64cont ( const void * pString, uint64_t uPrev );
/// calculate file crc32
bool sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 );
/// try to obtain an exclusive lock on specified file
/// bWait specifies whether to wait
bool sphLockEx ( int iFile, bool bWait );
/// remove existing locks
void sphLockUn ( int iFile );
/// millisecond-precision sleep
void sphSleepMsec ( int iMsec );
/// check if file exists and is a readable file
bool sphIsReadable ( const char * sFilename, CSphString * pError=NULL );
bool sphIsReadable ( const CSphString& sFilename, CSphString * pError = NULL );
/// set throttling options
void sphSetThrottling ( int iMaxIOps, int iMaxIOSize );
/// immediately interrupt current query
void sphInterruptNow();
/// check if we got interrupted
bool sphInterrupted();
/// initialize IO statistics collecting
bool sphInitIOStats ();
/// clean up IO statistics collector
void sphDoneIOStats ();
/// IO statistics collector
/// accumulates op counts, byte totals, and time totals for reads and writes
class CSphIOStats
{
public:
	int64_t		m_iReadTime = 0;	///< accumulated read time
	DWORD		m_iReadOps = 0;		///< number of read ops
	int64_t		m_iReadBytes = 0;	///< total bytes read
	int64_t		m_iWriteTime = 0;	///< accumulated write time
	DWORD		m_iWriteOps = 0;	///< number of write ops
	int64_t		m_iWriteBytes = 0;	///< total bytes written

	~CSphIOStats ();

	void		Start();
	void		Stop();
	void		Add ( const CSphIOStats & b );	///< merge another collector's totals into this one
	bool		IsEnabled() { return m_bEnabled; }

private:
	bool		m_bEnabled = false;			///< set between Start() and Stop()
	CSphIOStats * m_pPrev = nullptr;		///< NOTE(review): suggests collectors form a chain/stack — confirm in the .cpp
};
//////////////////////////////////////////////////////////////////////////
#if UNALIGNED_RAM_ACCESS

/// pass-through wrapper; this platform handles misaligned loads in hardware
template < typename T > inline T sphUnalignedRead ( const T & tRef )
{
	return tRef;
}

/// pass-through wrapper; this platform handles misaligned stores in hardware
template < typename T > void sphUnalignedWrite ( void * pPtr, const T & tVal )
{
	*(T*)pPtr = tVal;
}

#else

/// unaligned read wrapper for some architectures (eg. SPARC)
/// memcpy is the portable way to express a possibly-misaligned load;
/// compilers lower it to a plain load where the CPU permits, and to
/// byte moves where it does not (same effect as the old manual loop)
template < typename T >
inline T sphUnalignedRead ( const T & tRef )
{
	T uTmp;
	memcpy ( &uTmp, &tRef, sizeof(T) );
	return uTmp;
}

/// unaligned write wrapper for some architectures (eg. SPARC)
template < typename T >
void sphUnalignedWrite ( void * pPtr, const T & tVal )
{
	memcpy ( pPtr, &tVal, sizeof(T) );
}
#endif // unalgined
#if UNALIGNED_RAM_ACCESS && USE_LITTLE_ENDIAN
/// get a dword from memory, intel version
inline DWORD sphGetDword ( const BYTE * p )
{
	return *(const DWORD*)p;
}
#else
/// get a dword from memory, non-intel version
/// assembles the value byte by byte, little-endian order
inline DWORD sphGetDword ( const BYTE * p )
{
	// widen each byte to DWORD before shifting; the old p[3]<<24 shifted
	// a promoted (signed) int into the sign bit, which is UB for bytes>=0x80
	return DWORD(p[0]) + ( DWORD(p[1])<<8 ) + ( DWORD(p[2])<<16 ) + ( DWORD(p[3])<<24 );
}
#endif
int sphUTF8Len ( const char * pStr );
/// check for valid attribute name char
/// check for valid attribute name char
/// accepts ASCII digits, latin letters, and underscore;
/// unlike sphIsAlpha(), minus is NOT a valid attribute name char
inline int sphIsAttr ( int c )
{
	if ( c=='_' )
		return 1;
	if ( c>='0' && c<='9' )
		return 1;
	bool bLetter = ( c>='a' && c<='z' ) || ( c>='A' && c<='Z' );
	return bLetter ? 1 : 0;
}
/////////////////////////////////////////////////////////////////////////////
// TOKENIZERS
/////////////////////////////////////////////////////////////////////////////
extern const char * SPHINX_DEFAULT_UTF8_TABLE;
/////////////////////////////////////////////////////////////////////////////
/// lowercaser remap range
struct CSphRemapRange
{
	int m_iStart { -1 };		///< first source codepoint of the range
	int m_iEnd { -1 };			///< last source codepoint of the range
	int m_iRemapStart { -1 };	///< codepoint that m_iStart maps to

	/// all fields default to -1 (an empty/invalid range)
	CSphRemapRange() = default;

	CSphRemapRange ( int iStart, int iEnd, int iRemapStart )
		: m_iStart { iStart }
		, m_iEnd { iEnd }
		, m_iRemapStart { iRemapStart }
	{}
};
/// ranges order by their starting codepoint
inline bool operator < ( const CSphRemapRange & a, const CSphRemapRange & b )
{
	return a.m_iStart < b.m_iStart;
}
/// lowercaser
class CSphLowercaser
{
	friend class ISphTokenizer;
	friend class CSphTokenizerBase;
	friend class CSphTokenizerBase2;

public:
	~CSphLowercaser ();

	void Reset ();
	void SetRemap ( const CSphLowercaser * pLC );
	void AddRemaps ( const CSphVector<CSphRemapRange> & dRemaps, DWORD uFlags );
	void AddSpecials ( const char * sSpecials );
	uint64_t GetFNV () const;

public:
	CSphLowercaser & operator = ( const CSphLowercaser & rhs );

public:
	/// remap a codepoint through the two-level chunked table
	/// out-of-table codepoints are passed through unchanged;
	/// codepoints in an unallocated chunk map to 0 (ie. get dropped)
	inline int ToLower ( int iCode ) const
	{
		if ( iCode<0 || iCode>=MAX_CODE )
			return iCode;
		// 'register' keyword removed; deprecated since C++11, gone in C++17
		int * pChunk = m_pChunk [ iCode >> CHUNK_BITS ];
		if ( pChunk )
			return pChunk [ iCode & CHUNK_MASK ];
		return 0;
	}

	int GetMaxCodepointLength () const;

protected:
	static const int CHUNK_COUNT = 0x300;
	static const int CHUNK_BITS = 8;

	static const int CHUNK_SIZE = 1 << CHUNK_BITS;
	static const int CHUNK_MASK = CHUNK_SIZE - 1;
	static const int MAX_CODE = CHUNK_COUNT * CHUNK_SIZE;

	int		m_iChunks = 0;							///< how much chunks are actually allocated
	int *	m_pData = nullptr;						///< chunks themselves
	int *	m_pChunk [ CHUNK_COUNT ] { nullptr };	///< pointers to non-empty chunks
};
/////////////////////////////////////////////////////////////////////////////
/// tokenizer type ids
enum
{
	// where was TOKENIZER_SBCS=1 once
	TOKENIZER_UTF8 = 2,
	TOKENIZER_NGRAM = 3
};
/// saved-file metadata: name, size, timestamps, and content CRC
struct CSphSavedFile
{
	CSphString	m_sFilename;
	SphOffset_t	m_uSize = 0;	///< file size, bytes
	SphOffset_t	m_uCTime = 0;	///< creation/change time
	SphOffset_t	m_uMTime = 0;	///< modification time
	DWORD		m_uCRC32 = 0;	///< content crc32
};
/// synonyms, stopwords, and wordforms data embedded into an index,
/// along with the metadata of the files they originally came from
struct CSphEmbeddedFiles
{
	bool						m_bEmbeddedSynonyms = false;	///< whether embedded synonyms data is present
	bool						m_bEmbeddedStopwords = false;	///< whether embedded stopwords data is present
	bool						m_bEmbeddedWordforms = false;	///< whether embedded wordforms data is present
	CSphSavedFile				m_tSynonymFile;
	StrVec_t					m_dSynonyms;
	CSphVector<CSphSavedFile>	m_dStopwordFiles;
	CSphVector<SphWordID_t>		m_dStopwords;
	StrVec_t					m_dWordforms;
	CSphVector<CSphSavedFile>	m_dWordformFiles;

	void Reset();
};
/// tokenizer configuration
struct CSphTokenizerSettings
{
	int			m_iType { TOKENIZER_UTF8 };	///< one of the TOKENIZER_xxx ids
	CSphString	m_sCaseFolding;
	int			m_iMinWordLen = 1;			///< shorter tokens get skipped
	CSphString	m_sSynonymsFile;
	CSphString	m_sBoundary;				///< phrase boundary chars
	CSphString	m_sIgnoreChars;
	int			m_iNgramLen = 0;			///< n-gram length (CJK n-gram indexing)
	CSphString	m_sNgramChars;
	CSphString	m_sBlendChars;
	CSphString	m_sBlendMode;
};
/// bigram indexing modes
enum ESphBigram
{
	SPH_BIGRAM_NONE			= 0,	///< no bigrams
	SPH_BIGRAM_ALL			= 1,	///< index all word pairs
	SPH_BIGRAM_FIRSTFREQ	= 2,	///< only index pairs where one of the words is in a frequent words list
	SPH_BIGRAM_BOTHFREQ		= 3		///< only index pairs where both words are in a frequent words list
};
/// tokenizer clone modes (see ISphTokenizer::Clone)
enum ESphTokenizerClone
{
	SPH_CLONE_INDEX,				///< clone tokenizer and set indexing mode
	SPH_CLONE_QUERY,				///< clone tokenizer and set querying mode
	SPH_CLONE_QUERY_LIGHTWEIGHT		///< lightweight clone for querying (can parse, can NOT modify settings, shares pointers to the original lowercaser table)
};
/// morphology status of the last returned token
enum ESphTokenMorph
{
	SPH_TOKEN_MORPH_RAW,			///< no morphology applied, tokenizer does not handle morphology
	SPH_TOKEN_MORPH_ORIGINAL,		///< no morphology applied, but tokenizer handles morphology
	SPH_TOKEN_MORPH_GUESS			///< morphology applied
};
struct CSphMultiformContainer;
class CSphWriter;
/// generic tokenizer
class ISphTokenizer : public ISphRefcountedMT
{
	/// trivial dtor - inherited from Refcounted
protected:
	~ISphTokenizer() override {};

public:
	/// set new translation table
	/// returns true on success, false on failure
	virtual bool SetCaseFolding ( const char * sConfig, CSphString & sError );

	/// add additional character as valid (with folding to itself)
	virtual void AddPlainChar ( char c );

	/// add special chars to translation table
	/// updates lowercaser so that these remap to -1
	virtual void AddSpecials ( const char * sSpecials );

	/// set ignored characters
	virtual bool SetIgnoreChars ( const char * sIgnored, CSphString & sError );

	/// set n-gram characters (for CJK n-gram indexing)
	virtual bool SetNgramChars ( const char *, CSphString & ) { return true; }

	/// set n-gram length (for CJK n-gram indexing)
	virtual void SetNgramLen ( int ) {}

	/// load synonyms list
	virtual bool LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError ) = 0;

	/// write synonyms to file
	virtual void WriteSynonyms ( CSphWriter & tWriter ) const = 0;

	/// set phrase boundary chars
	virtual bool SetBoundary ( const char * sConfig, CSphString & sError );

	/// set blended characters
	virtual bool SetBlendChars ( const char * sConfig, CSphString & sError );

	/// set blended tokens processing mode
	virtual bool SetBlendMode ( const char * sMode, CSphString & sError );

	/// setup tokenizer using given settings
	virtual void Setup ( const CSphTokenizerSettings & tSettings );

	/// create a tokenizer using the given settings
	static ISphTokenizer * Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError );

	/// create a token filter
	static ISphTokenizer * CreateMultiformFilter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer );

	/// create a token filter
	static ISphTokenizer * CreateBigramFilter ( ISphTokenizer * pTokenizer, ESphBigram eBigramIndex, const CSphString & sBigramWords, CSphString & sError );

	/// create a plugin filter
	/// sSspec is a library, name, and options specification string, eg "myplugins.dll:myfilter1:arg1=123"
	static ISphTokenizer * CreatePluginFilter ( ISphTokenizer * pTokenizer, const CSphString & sSpec, CSphString & sError );

	/// get current tokenizer settings
	virtual const CSphTokenizerSettings & GetSettings () const { return m_tSettings; }

	/// get synonym file info
	virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }

public:
	/// pass next buffer
	virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;

	/// set current index schema (only intended for the token filter plugins)
	virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }

	/// set per-document options from INSERT
	virtual bool SetFilterOptions ( const char *, CSphString & ) { return true; }

	/// notify tokenizer that we now begin indexing a field with a given number (only intended for the token filter plugins)
	virtual void BeginField ( int ) {}

	/// get next token
	virtual BYTE * GetToken () = 0;

	/// calc codepoint length
	virtual int GetCodepointLength ( int iCode ) const = 0;

	/// get max codepoint length
	virtual int GetMaxCodepointLength () const = 0;

	/// enable indexing-time sentence boundary detection, and paragraph indexing
	virtual bool EnableSentenceIndexing ( CSphString & sError );

	/// enable zone indexing
	virtual bool EnableZoneIndexing ( CSphString & sError );

	/// enable tokenized multiform tracking
	virtual void EnableTokenizedMultiformTracking () {}

	/// get last token length, in codepoints
	virtual int GetLastTokenLen () const { return m_iLastTokenLen; }

	/// get last token boundary flag (true if there was a boundary before the token)
	virtual bool GetBoundary () { return m_bTokenBoundary; }

	/// get byte offset of the last boundary character
	virtual int GetBoundaryOffset () { return m_iBoundaryOffset; }

	/// was last token a special one?
	virtual bool WasTokenSpecial () { return m_bWasSpecial; }

	virtual bool WasTokenSynonym () const { return m_bWasSynonym; }

	/// get amount of overshort keywords skipped before this token
	virtual int GetOvershortCount () { return ( !m_bBlended && m_bBlendedPart ? 0 : m_iOvershortCount ); }

	/// get original tokenized multiform (if any); NULL means there was none
	virtual BYTE * GetTokenizedMultiform () { return NULL; }

	/// was last token a part of multi-wordforms destination
	/// head parameter might be useful to distinguish between sequence of different multi-wordforms
	virtual bool WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const = 0;

	/// check whether this token is a generated morphological guess
	ESphTokenMorph GetTokenMorph() const { return m_eTokenMorph; }

	virtual bool TokenIsBlended () const { return m_bBlended; }
	virtual bool TokenIsBlendedPart () const { return m_bBlendedPart; }
	virtual int SkipBlended () { return 0; }

public:
	/// spawn a clone of my own
	virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const = 0;

	/// start buffer point of last token
	virtual const char * GetTokenStart () const = 0;

	/// end buffer point of last token (exclusive, ie. *GetTokenEnd() is already NOT part of a token!)
	virtual const char * GetTokenEnd () const = 0;

	/// current buffer ptr
	virtual const char * GetBufferPtr () const = 0;

	/// buffer end
	virtual const char * GetBufferEnd () const = 0;

	/// set new buffer ptr (must be within current bounds)
	virtual void SetBufferPtr ( const char * sNewPtr ) = 0;

	/// get settings hash
	virtual uint64_t GetSettingsFNV () const;

	/// get (readonly) lowercaser
	const CSphLowercaser & GetLowercaser() const { return m_tLC; }

protected:
	virtual bool RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError );
	virtual bool AddSpecialsSPZ ( const char * sSpecials, const char * sDirective, CSphString & sError );

protected:
	// blend_mode trimming variants, bitmask values (see m_uBlendVariants)
	static const BYTE BLEND_TRIM_NONE = 1;
	static const BYTE BLEND_TRIM_HEAD = 2;
	static const BYTE BLEND_TRIM_TAIL = 4;
	static const BYTE BLEND_TRIM_BOTH = 8;
	static const BYTE BLEND_TRIM_ALL = 16;

	CSphLowercaser m_tLC;						///< my lowercaser
	int m_iLastTokenLen = 0;					///< last token length, in codepoints
	bool m_bTokenBoundary = false;				///< last token boundary flag (true after boundary codepoint followed by separator)
	bool m_bBoundary = false;					///< boundary flag (true immediately after boundary codepoint)
	int m_iBoundaryOffset = 0;					///< boundary character offset (in bytes)
	bool m_bWasSpecial = false;					///< special token flag
	bool m_bWasSynonym = false;					///< last token is a synonym token
	int m_iOvershortCount = 0;					///< skipped overshort tokens count
	ESphTokenMorph m_eTokenMorph {SPH_TOKEN_MORPH_RAW};	///< whether last token was a generated morphological guess
	bool m_bBlended = false;					///< whether last token (as in just returned from GetToken()) was blended
	bool m_bNonBlended = true;					///< internal, whether there were any normal chars in that blended token
	bool m_bBlendedPart = false;				///< whether last token is a normal subtoken of a blended token
	bool m_bBlendAdd = false;					///< whether we have more pending blended variants (of current accumulator) to return
	BYTE m_uBlendVariants {BLEND_TRIM_NONE};	///< mask of blended variants as requested by blend_mode (see BLEND_TRIM_xxx flags)
	BYTE m_uBlendVariantsPending = 0;			///< mask of pending blended variants (we clear bits as we return variants)
	bool m_bBlendSkipPure = false;				///< skip purely blended tokens
	bool m_bShortTokenFilter = false;			///< short token filter flag
	bool m_bDetectSentences = false;			///< should we detect sentence boundaries?
	CSphTokenizerSettings m_tSettings;			///< tokenizer settings
	CSphSavedFile m_tSynFileInfo;				///< synonyms file info

public:
	bool m_bPhrase = false;		///< NOTE(review): public and set externally; appears to mark phrase-query tokenizing — confirm at call sites
};
using ISphTokenizerRefPtr_c = CSphRefcountedPtr<ISphTokenizer>;
/// parse charset table
bool sphParseCharset ( const char * sCharset, CSphVector<CSphRemapRange> & dRemaps );
/// create UTF-8 tokenizer
ISphTokenizer * sphCreateUTF8Tokenizer ();
/// create UTF-8 tokenizer with n-grams support (for CJK n-gram indexing)
ISphTokenizer * sphCreateUTF8NgramTokenizer ();
/////////////////////////////////////////////////////////////////////////////
// DICTIONARIES
/////////////////////////////////////////////////////////////////////////////
/// dictionary settings
/// uses in-class default member initializers for consistency with the other
/// settings structs in this file (CSphTokenizerSettings, CSphSavedFile, ...);
/// defaults are unchanged from the old hand-written constructor
struct CSphDictSettings
{
	CSphString		m_sMorphology;
	CSphString		m_sMorphFields;
	CSphString		m_sStopwords;
	StrVec_t		m_dWordforms;
	int				m_iMinStemmingLen = 1;
	bool			m_bWordDict = true;
	bool			m_bStopwordsUnstemmed = false;
	CSphString		m_sMorphFingerprint;		///< not used for creation; only for a check when loading
};
/// dictionary entry
/// some of the fields might be unused depending on specific dictionary type
struct CSphDictEntry
{
	SphWordID_t		m_uWordID = 0;				///< keyword id (for dict=crc)
	const BYTE *	m_sKeyword = nullptr;		///< keyword text (for dict=keywords)
	int				m_iDocs = 0;				///< number of matching documents
	int				m_iHits = 0;				///< number of occurrences
	SphOffset_t		m_iDoclistOffset = 0;		///< absolute document list offset (into .spd)
	SphOffset_t		m_iDoclistLength = 0;		///< document list length in bytes
	SphOffset_t		m_iSkiplistOffset = 0;		///< absolute skiplist offset (into .spe)
	int				m_iDoclistHint = 0;			///< raw document list length hint value (0..255 range, 1 byte)
};
/// stored normal form
struct CSphStoredNF
{
	CSphString	m_sWord;
	// initialized to false; was previously left indeterminate, unlike
	// every other POD member in this file's structs
	bool		m_bAfterMorphology = false;
};
/// wordforms container
struct CSphWordforms
{
	int							m_iRefCount;			///< reference count (managed by the ctor/dtor in the .cpp)
	CSphVector<CSphSavedFile>	m_dFiles;				///< source wordform files (compared by IsEqual)
	uint64_t					m_uTokenizerFNV;		///< hash of the tokenizer settings these forms were built with
	CSphString					m_sIndexName;
	bool						m_bHavePostMorphNF;		///< whether any stored form applies after morphology
	CSphVector <CSphStoredNF>	m_dNormalForms;			///< stored normal forms, indexed via m_dHash
	CSphMultiformContainer *	m_pMultiWordforms;
	CSphOrderedHash < int, CSphString, CSphStrHashFunc, 1048576 > m_dHash;	///< word -> index into m_dNormalForms

	CSphWordforms ();
	~CSphWordforms ();

	/// check whether this container was built from exactly the given files
	bool IsEqual ( const CSphVector<CSphSavedFile> & dFiles );

	/// rewrite pWord in-place to its stored normal form (when bOnlyCheck is false)
	bool ToNormalForm ( BYTE * pWord, bool bBefore, bool bOnlyCheck ) const;
};
/// abstract word dictionary interface
struct CSphWordHit;
class CSphAutofile;
struct DictHeader_t;
class CSphDict : public ISphRefcountedMT
{
protected:
	/// virtualizing dtor. Protected to follow refcounted rules.
	virtual ~CSphDict () {}

public:
	// status codes, as returned eg. by SetMorphology()
	static const int	ST_OK = 0;
	static const int	ST_ERROR = 1;
	static const int	ST_WARNING = 2;

public:
	/// Get word ID by word, "text" version
	/// may apply stemming and modify word inplace
	/// modified word may become bigger than the original one, so make sure you have enough space in buffer which is pointer by pWord
	/// a general practice is to use char[3*SPH_MAX_WORD_LEN+4] as a buffer
	/// returns 0 for stopwords
	virtual SphWordID_t GetWordID ( BYTE * pWord ) = 0;

	/// get word ID by word, "text" version
	/// may apply stemming and modify word inplace
	/// accepts words with already prepended MAGIC_WORD_HEAD
	/// appends MAGIC_WORD_TAIL
	/// returns 0 for stopwords
	virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord ) { return GetWordID ( pWord ); }

	/// get word ID by word, "text" version
	/// does NOT apply stemming
	/// accepts words with already prepended MAGIC_WORD_HEAD_NONSTEMMED
	/// returns 0 for stopwords
	virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord ) { return GetWordID ( pWord ); }

	/// get word ID by word, "binary" version
	/// only used with prefix/infix indexing
	/// must not apply stemming and modify anything
	/// filters stopwords on request
	virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) = 0;

	/// apply stemmers to the given word
	virtual void ApplyStemmers ( BYTE * ) const {}

	/// load stopwords from given files
	virtual void LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer, bool bStripFile ) = 0;

	/// load stopwords from an array
	virtual void LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) = 0;

	/// write stopwords to a file
	virtual void WriteStopwords ( CSphWriter & tWriter ) const = 0;

	/// load wordforms from a given list of files
	virtual bool LoadWordforms ( const StrVec_t &, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) = 0;

	/// write wordforms to a file
	virtual void WriteWordforms ( CSphWriter & tWriter ) const = 0;

	/// get wordforms
	virtual const CSphWordforms * GetWordforms() { return NULL; }

	/// disable wordforms processing
	virtual void DisableWordforms() {}

	/// set morphology
	/// returns 0 on success, 1 on hard error, 2 on a warning (see ST_xxx constants)
	virtual int SetMorphology ( const char * szMorph, CSphString & sMessage ) = 0;

	/// are there any morphological processors?
	virtual bool HasMorphology () const { return false; }

	/// morphological data fingerprint (lemmatizer filenames and crc32s)
	virtual const CSphString & GetMorphDataFingerprint () const { return m_sMorphFingerprint; }

	/// setup dictionary using settings
	virtual void Setup ( const CSphDictSettings & tSettings ) = 0;

	/// get dictionary settings
	virtual const CSphDictSettings & GetSettings () const = 0;

	/// stopwords file infos
	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () const = 0;

	/// wordforms file infos
	virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () const = 0;

	/// get multiwordforms
	virtual const CSphMultiformContainer * GetMultiWordforms () const = 0;

	/// check what given word is stopword
	virtual bool IsStopWord ( const BYTE * pWord ) const = 0;

public:
	// hit block processing hooks (no-ops by default; dict=keywords overrides)

	/// enable actually collecting keywords (needed for stopwords/wordforms loading)
	virtual void HitblockBegin () {}

	/// callback to let dictionary do hit block post-processing
	virtual void HitblockPatch ( CSphWordHit *, int ) const {}

	/// resolve temporary hit block wide wordid (!) back to keyword
	virtual const char * HitblockGetKeyword ( SphWordID_t ) { return NULL; }

	/// check current memory usage
	virtual int HitblockGetMemUse () { return 0; }

	/// hit block dismissed
	virtual void HitblockReset () {}

public:
	// dictionary file creation, in order: DictBegin, DictEntry*, DictEndEntries, DictEnd

	/// begin creating dictionary file, setup any needed internal structures
	virtual void DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit );

	/// add next keyword entry to final dict
	virtual void DictEntry ( const CSphDictEntry & tEntry );

	/// flush last entry
	virtual void DictEndEntries ( SphOffset_t iDoclistOffset );

	/// end indexing, store dictionary and checkpoints
	virtual bool DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError );

	/// check whether there were any errors during indexing
	virtual bool DictIsError () const;

public:
	/// check whether this dict is stateful (when it comes to lookups)
	virtual bool HasState () const { return false; }

	/// make a clone
	virtual CSphDict * Clone () const { return nullptr; }

	/// get settings hash
	virtual uint64_t GetSettingsFNV () const = 0;

protected:
	CSphString		m_sMorphFingerprint;	///< backing storage for GetMorphDataFingerprint()
};
using CSphDictRefPtr_c = CSphRefcountedPtr<CSphDict>;
/// returns pDict, if stateless. Or it's clone, if not
CSphDict * GetStatelessDict ( CSphDict * pDict );
/// traits dictionary factory (no storage, only tokenizing, lemmatizing, etc.)
CSphDict * sphCreateDictionaryTemplate ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, bool bStripFile, CSphString & sError );
/// CRC32/FNV64 dictionary factory
CSphDict * sphCreateDictionaryCRC ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, bool bStripFile, CSphString & sError );
/// keyword-storing dictionary factory
CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, bool bStripFile, CSphString & sError );
/// clear wordform cache
void sphShutdownWordforms ();
/// update/clear global IDF cache
bool sphPrereadGlobalIDF ( const CSphString & sPath, CSphString & sError );
void sphUpdateGlobalIDFs ( const StrVec_t & dFiles );
void sphInitGlobalIDFs ();
void sphShutdownGlobalIDFs ();
/////////////////////////////////////////////////////////////////////////////
// DATASOURCES
/////////////////////////////////////////////////////////////////////////////
/// hit position storage type
typedef DWORD Hitpos_t;
/// empty hit value
#define EMPTY_HIT 0
/// hit processing tools
/// Hitpos_t consists of three things:
/// 1) high bits store field number
/// 2) middle bit - field end marker
/// 3) lower bits store hit position in field
template < int FIELD_BITS >
class Hitman_c
{
protected:
	enum
	{
		POS_BITS		= 31 - FIELD_BITS,
		FIELD_OFF		= 32 - FIELD_BITS,
		FIELDEND_OFF	= 31 - FIELD_BITS,
		FIELDEND_MASK	= (1UL << POS_BITS),
		POS_MASK		= (1UL << POS_BITS) - 1
	};

public:
	/// pack (field, position); field-end marker left clear
	static Hitpos_t Create ( int iField, int iPos )
	{
		// widen to Hitpos_t BEFORE shifting; shifting a plain (signed) int
		// into or past the sign bit is undefined behavior
		return ( Hitpos_t(iField) << FIELD_OFF ) + ( iPos & POS_MASK );
	}

	/// pack (field, position, field-end flag)
	static Hitpos_t Create ( int iField, int iPos, bool bEnd )
	{
		return ( Hitpos_t(iField) << FIELD_OFF ) + ( Hitpos_t(bEnd) << FIELDEND_OFF ) + ( iPos & POS_MASK );
	}

	static inline int GetField ( Hitpos_t uHitpos )
	{
		return uHitpos >> FIELD_OFF;
	}

	static inline int GetPos ( Hitpos_t uHitpos )
	{
		return uHitpos & POS_MASK;
	}

	static inline bool IsEnd ( Hitpos_t uHitpos )
	{
		return ( uHitpos & FIELDEND_MASK )!=0;
	}

	/// field and position together, with the end-marker bit stripped
	static inline DWORD GetPosWithField ( Hitpos_t uHitpos )
	{
		return uHitpos & ~FIELDEND_MASK;
	}

	static void AddPos ( Hitpos_t * pHitpos, int iAdd )
	{
		// FIXME! add range checks (eg. so that 0:0-1 does not overflow)
		*pHitpos += iAdd;
	}

	static Hitpos_t CreateSum ( Hitpos_t uHitpos, int iAdd )
	{
		// FIXME! add range checks (eg. so that 0:0-1 does not overflow)
		return ( uHitpos+iAdd ) & ~FIELDEND_MASK;
	}

	static void SetEndMarker ( Hitpos_t * pHitpos )
	{
		*pHitpos |= FIELDEND_MASK;
	}
};
// this could be just DWORD[] but it's methods are very handy
// used to store field information e.g. which fields do we need to search in
struct FieldMask_t