Skip to content
Permalink
Browse files

fixed #474: index with only wordforms has no matches for prefixes due to wrong exact form check; added regression to test 196
  • Loading branch information...
tomatolog committed Feb 22, 2019
1 parent 8af8101 commit 0721696d6780c200c65d596624a8187983fb7fcb
Showing with 54 additions and 17 deletions.
  1. +9 −9 src/sphinx.cpp
  2. +3 −3 src/sphinxint.h
  3. +4 −4 src/sphinxrt.cpp
  4. +1 −1 test/test_196/model.bin
  5. +37 −0 test/test_196/test.xml
@@ -16613,7 +16613,7 @@ void ISphQueryFilter::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, co
{
dQposWildcards.Add ( iQpos );

ISphWordlist::Args_t tWordlist ( false, tCtx.m_iExpansionLimit, tCtx.m_bHasMorphology, tCtx.m_eHitless, tCtx.m_pIndexData );
ISphWordlist::Args_t tWordlist ( false, tCtx.m_iExpansionLimit, tCtx.m_bHasExactForms, tCtx.m_eHitless, tCtx.m_pIndexData );
bool bExpanded = sphExpandGetWords ( (const char *)sWord, tCtx, tWordlist );

int iDocs = 0;
@@ -16903,7 +16903,7 @@ bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
tExpCtx.m_pWordlist = &m_tWordlist;
tExpCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
tExpCtx.m_iMinInfixLen = m_tSettings.m_iMinInfixLen;
tExpCtx.m_bHasMorphology = m_pDict->HasMorphology();
tExpCtx.m_bHasExactForms = ( m_pDict->HasMorphology() || m_tSettings.m_bIndexExactWords );
tExpCtx.m_bMergeSingles = false;
tExpCtx.m_eHitless = m_tSettings.m_eHitless;

@@ -17302,7 +17302,7 @@ XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx )
return pNode;

bool bUseTermMerge = ( tCtx.m_bMergeSingles && pNode->m_dSpec.m_dZones.GetLength()==0 );
ISphWordlist::Args_t tWordlist ( bUseTermMerge, tCtx.m_iExpansionLimit, tCtx.m_bHasMorphology, tCtx.m_eHitless, tCtx.m_pIndexData );
ISphWordlist::Args_t tWordlist ( bUseTermMerge, tCtx.m_iExpansionLimit, tCtx.m_bHasExactForms, tCtx.m_eHitless, tCtx.m_pIndexData );

if ( !sphExpandGetWords ( sFull, tCtx, tWordlist ) )
{
@@ -17409,7 +17409,7 @@ bool sphExpandGetWords ( const char * sWord, const ExpansionContext_t & tCtx, IS
int iBytes = sCodes - sPrefix;
// prefix expansion should work on nonstemmed words only
char sFixed[MAX_KEYWORD_BYTES];
if ( tCtx.m_bHasMorphology )
if ( tCtx.m_bHasExactForms )
{
sFixed[0] = MAGIC_WORD_HEAD_NONSTEMMED;
memcpy ( sFixed+1, sPrefix, iBytes );
@@ -17480,7 +17480,7 @@ XQNode_t * CSphIndex_VLN::ExpandPrefix ( XQNode_t * pNode, CSphQueryResultMeta *
tCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
tCtx.m_iMinInfixLen = m_tSettings.m_iMinInfixLen;
tCtx.m_iExpansionLimit = m_iExpansionLimit;
tCtx.m_bHasMorphology = m_pDict->HasMorphology();
tCtx.m_bHasExactForms = ( m_pDict->HasMorphology() || m_tSettings.m_bIndexExactWords );
tCtx.m_bMergeSingles = ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE && ( uQueryDebugFlags & QUERY_DEBUG_NO_PAYLOAD )==0 );
tCtx.m_pPayloads = pPayloads;
tCtx.m_eHitless = m_tSettings.m_eHitless;
}


ISphWordlist::Args_t::Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology, ESphHitless eHitless, const void * pIndexData )
ISphWordlist::Args_t::Args_t ( bool bPayload, int iExpansionLimit, bool bHasExactForms, ESphHitless eHitless, const void * pIndexData )
: m_bPayload ( bPayload )
, m_iExpansionLimit ( iExpansionLimit )
, m_bHasMorphology ( bHasMorphology )
, m_bHasExactForms ( bHasExactForms )
, m_eHitless ( eHitless )
, m_pIndexData ( pIndexData )
{
return;

DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );
const int iSkipMagic = ( tArgs.m_bHasMorphology ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
const int iSkipMagic = ( tArgs.m_bHasExactForms ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker

int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;
break;

// stemmed terms should not match suffixes
if ( tArgs.m_bHasMorphology && *tDictReader.m_sKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
if ( tArgs.m_bHasExactForms && *tDictReader.m_sKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
continue;

if ( sphWildcardMatch ( (const char *)tDictReader.m_sKeyword+iSkipMagic, sWildcard, pWildcard ) )
@@ -2223,15 +2223,15 @@ class ISphWordlist
CSphVector<SphExpanded_t> m_dExpanded;
const bool m_bPayload;
int m_iExpansionLimit;
const bool m_bHasMorphology;
const bool m_bHasExactForms;
const ESphHitless m_eHitless;

ISphSubstringPayload * m_pPayload;
int m_iTotalDocs;
int m_iTotalHits;
const void * m_pIndexData;

Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology, ESphHitless eHitless, const void * pIndexData );
Args_t ( bool bPayload, int iExpansionLimit, bool bHasExactForms, ESphHitless eHitless, const void * pIndexData );
~Args_t ();
void AddExpanded ( const BYTE * sWord, int iLen, int iDocs, int iHits );
const char * GetWordExpanded ( int iIndex ) const;
@@ -2270,7 +2270,7 @@ struct ExpansionContext_t
int m_iMinPrefixLen = 0;
int m_iMinInfixLen = 0;
int m_iExpansionLimit = 0;
bool m_bHasMorphology = false;
bool m_bHasExactForms = false;
bool m_bMergeSingles = false;
CSphScopedPayload * m_pPayloads = nullptr;
ESphHitless m_eHitless {SPH_HITLESS_NONE};
@@ -6362,7 +6362,7 @@ void RtIndex_t::GetInfixedWords ( const char * sSubstring, int iSubLen, const ch

// find those prefixes
CSphVector<DWORD> dPoints;
const int iSkipMagic = ( tArgs.m_bHasMorphology ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
const int iSkipMagic = ( tArgs.m_bHasExactForms ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
const CSphFixedVector<RtSegment_t*> & dSegments = *((CSphFixedVector<RtSegment_t*> *)tArgs.m_pIndexData);

DictEntryRtPayload_t tDict2Payload ( tArgs.m_bPayload, dSegments.GetLength() );
@@ -6393,7 +6393,7 @@ void RtIndex_t::GetInfixedWords ( const char * sSubstring, int iSubLen, const ch
const RtWord_t * pWord = NULL;
while ( ( pWord = tReader.UnzipWord() )!=NULL )
{
if ( tArgs.m_bHasMorphology && pWord->m_sWord[1]!=MAGIC_WORD_HEAD_NONSTEMMED )
if ( tArgs.m_bHasExactForms && pWord->m_sWord[1]!=MAGIC_WORD_HEAD_NONSTEMMED )
continue;

// check it
@@ -7089,7 +7089,7 @@ bool RtIndex_t::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult
tExpCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
tExpCtx.m_iMinInfixLen = m_tSettings.m_iMinInfixLen;
tExpCtx.m_iExpansionLimit = m_iExpansionLimit;
tExpCtx.m_bHasMorphology = m_pDict->HasMorphology();
tExpCtx.m_bHasExactForms = ( m_pDict->HasMorphology() || m_tSettings.m_bIndexExactWords );
tExpCtx.m_bMergeSingles = ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE && ( pQuery->m_uDebugFlags & QUERY_DEBUG_NO_PAYLOAD )==0 );
tExpCtx.m_pPayloads = &tPayloads;
tExpCtx.m_pIndexData = &tGuard.m_dRamChunks;
@@ -7540,7 +7540,7 @@ bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const c
tExpCtx.m_pWordlist = this;
tExpCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
tExpCtx.m_iMinInfixLen = m_tSettings.m_iMinInfixLen;
tExpCtx.m_bHasMorphology = m_pDict->HasMorphology();
tExpCtx.m_bHasExactForms = ( m_pDict->HasMorphology() || m_tSettings.m_bIndexExactWords );
tExpCtx.m_bMergeSingles = false;
tExpCtx.m_pIndexData = &tGuard.m_dRamChunks;

@@ -1 +1 @@
a:1:{i:0;a:13:{i:0;a:3:{s:8:"sphinxql";s:46:"select * from test where match('multi_before')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"100";}}}i:1;a:3:{s:8:"sphinxql";s:42:"select * from test where match('wordform')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"101";}}}i:2;a:2:{s:8:"sphinxql";s:41:"select * from test where match('result1')";s:10:"total_rows";i:0;}i:3;a:3:{s:8:"sphinxql";s:41:"select * from test where match('result2')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"102";}}}i:4;a:2:{s:8:"sphinxql";s:39:"select * from test where match('book1')";s:10:"total_rows";i:0;}i:5;a:3:{s:8:"sphinxql";s:39:"select * from test where match('book2')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"103";}}}i:6;a:2:{s:8:"sphinxql";s:45:"select * from test where match('shouldnotbe')";s:10:"total_rows";i:0;}i:7;a:3:{s:8:"sphinxql";s:37:"select * from test where match('ran')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"105";}}}i:8;a:3:{s:8:"sphinxql";s:38:"select * from test where match('halt')";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:1:{s:2:"id";s:3:"107";}i:1;a:1:{s:2:"id";s:3:"108";}}}i:9;a:3:{s:8:"sphinxql";s:43:"select * from test where match('test test')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"111";}}}i:10;a:3:{s:8:"sphinxql";s:43:"select * from test where match('test pest')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"111";}}}i:11;a:3:{s:8:"sphinxql";s:41:"select * from ord1 where match('^wound$')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"120";}}}i:12;a:3:{s:8:"sphinxql";s:41:"select * from ord2 where match('^wound$')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"120";}}}}}
a:1:{i:0;a:15:{i:0;a:3:{s:8:"sphinxql";s:46:"select * from test where match('multi_before')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"100";}}}i:1;a:3:{s:8:"sphinxql";s:42:"select * from test where match('wordform')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"101";}}}i:2;a:2:{s:8:"sphinxql";s:41:"select * from test where match('result1')";s:10:"total_rows";i:0;}i:3;a:3:{s:8:"sphinxql";s:41:"select * from test where match('result2')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"102";}}}i:4;a:2:{s:8:"sphinxql";s:39:"select * from test where match('book1')";s:10:"total_rows";i:0;}i:5;a:3:{s:8:"sphinxql";s:39:"select * from test where match('book2')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"103";}}}i:6;a:2:{s:8:"sphinxql";s:45:"select * from test where match('shouldnotbe')";s:10:"total_rows";i:0;}i:7;a:3:{s:8:"sphinxql";s:37:"select * from test where match('ran')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"105";}}}i:8;a:3:{s:8:"sphinxql";s:38:"select * from test where match('halt')";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:1:{s:2:"id";s:3:"107";}i:1;a:1:{s:2:"id";s:3:"108";}}}i:9;a:3:{s:8:"sphinxql";s:43:"select * from test where match('test test')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"111";}}}i:10;a:3:{s:8:"sphinxql";s:43:"select * from test where match('test pest')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"111";}}}i:11;a:3:{s:8:"sphinxql";s:41:"select * from ord1 where match('^wound$')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"120";}}}i:12;a:3:{s:8:"sphinxql";s:41:"select * from ord2 where match('^wound$')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:1:{s:2:"id";s:3:"120";}}}i:13;a:3:{s:8:"sphinxql";s:61:"select id from idx_pre1 where match('forty*') order by id asc";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:1:{s:2:"id";s:3:"100";}i:1;a:1:{s:2:"id";s:3:"101";}}}i:14;a:3:{s:8:"sphinxql";s:61:"select id from 
idx_pre2 where match('forty*') order by id asc";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:1:{s:2:"id";s:3:"100";}i:1;a:1:{s:2:"id";s:3:"101";}}}}}
@@ -44,6 +44,29 @@ index ord2
path = <data_path/>/ord2
wordforms = <this_test/>/wf2.txt
}

source test1
{
type = mysql
<sql_settings/>
sql_query = select *, 11 as gid from test_table1
sql_attr_uint = gid
}

index idx_pre1
{
source = test1
path = <data_path/>/idx_pre1
index_exact_words = 1
min_prefix_len = 1
}

index idx_pre2 : idx_pre1
{
path = <data_path/>/idx_pre2
wordforms = <this_test/>/wf_pre.txt
}

</config>

<db_create>
@@ -69,6 +92,16 @@ INSERT INTO test_table VALUES
( 120, 'wound care and ostomy' )
</db_insert>

<db_create>
CREATE TABLE test_table1
(
id INTEGER PRIMARY KEY NOT NULL,
content VARCHAR(255) NOT NULL
);
</db_create>
<db_drop>DROP TABLE IF EXISTS test_table1</db_drop>
<db_insert>INSERT INTO test_table1 VALUES ( 100, 'forty' ), ( 101, 'fortyfour' )</db_insert>

<sphqueries>
<sphinxql>select * from test where match('multi_before')</sphinxql>
<sphinxql>select * from test where match('wordform')</sphinxql>
@@ -85,6 +118,10 @@ INSERT INTO test_table VALUES
<!-- regression for multiforms applied in the wrong order -->
<sphinxql>select * from ord1 where match('^wound$')</sphinxql>
<sphinxql>select * from ord2 where match('^wound$')</sphinxql>

<!-- regression: prefix query found no matches on an index with only wordforms (no morphology) -->
<sphinxql>select id from idx_pre1 where match('forty*') order by id asc</sphinxql>
<sphinxql>select id from idx_pre2 where match('forty*') order by id asc</sphinxql>
</sphqueries>

</test>

0 comments on commit 0721696

Please sign in to comment.
You can’t perform that action at this time.