Skip to content
Permalink
Browse files

fixed #517 special symbols inside words for CALL KEYWORDS result set;…

… fixed #532 missed terms from disk chunks of RT index for CALL KEYWORDS; added regressions to test 365
  • Loading branch information
tomatolog committed Oct 12, 2018
1 parent eabe30e commit f974f20bda3214a56877c393a192be1a77150958
Showing with 155 additions and 12 deletions.
  1. +33 −7 src/sphinx.cpp
  2. +3 −0 src/sphinxint.h
  3. +46 −5 src/sphinxrt.cpp
  4. +1 −0 test/test_365/model.bin
  5. +72 −0 test/test_365/test.xml
@@ -1254,8 +1254,7 @@ struct CSphTemplateQueryFilter : public ISphQueryFilter
tInfo.m_iHits = 0;
tInfo.m_iQpos = iQpos;

if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
*(char *)tInfo.m_sNormalized.cstr() = '=';
RemoveDictSpecials ( tInfo.m_sNormalized );
}
};

@@ -16685,6 +16684,8 @@ void ISphQueryFilter::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, co
tInfo.m_iDocs = tWordlist.m_dExpanded[i].m_iDocs;
tInfo.m_iHits = tWordlist.m_dExpanded[i].m_iHits;
tInfo.m_iQpos = iQpos;

RemoveDictSpecials ( tInfo.m_sNormalized );
}
}

@@ -16804,6 +16805,8 @@ void ISphQueryFilter::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, co
dKeywords[iTokenized].m_iDocs = iDocs;
dKeywords[iTokenized].m_iHits = iHits;
dKeywords[iTokenized].m_sNormalized = sNormalizedWithMaxHits;

RemoveDictSpecials ( dKeywords[iTokenized].m_sNormalized );
}
}
}
@@ -16843,8 +16846,7 @@ struct CSphPlainQueryFilter : public ISphQueryFilter
tInfo.m_iHits = m_tFoldSettings.m_bStats ? m_pQueryWord->m_iHits : 0;
tInfo.m_iQpos = iQpos;

if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
*(char *)tInfo.m_sNormalized.cstr() = '=';
RemoveDictSpecials ( tInfo.m_sNormalized );
}
};

// CSphQueryResultMeta
//////////////////////////////////////////////////////////////////////////

void CSphQueryResultMeta::AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits )
/// Replaces a dictionary-internal marker byte in a keyword with its printable form, in place.
///
/// - a leading MAGIC_WORD_HEAD becomes '*'
/// - a leading MAGIC_WORD_HEAD_NONSTEMMED becomes '='
/// - otherwise the first MAGIC_WORD_BIGRAM found anywhere in the word becomes ' '
///
/// At most one byte is rewritten per call.
/// NOTE(review): the first byte is read unconditionally, so this assumes cstr() never
/// returns a null pointer here - confirm for empty strings.
void RemoveDictSpecials ( CSphString & sWord )
{
	// the string buffer is patched through its const accessor, hence the cast
	char * sBuf = const_cast<char *> ( sWord.cstr() );
	const char cFirst = sBuf[0];

	if ( cFirst==MAGIC_WORD_HEAD )
	{
		sBuf[0] = '*';
		return;
	}

	if ( cFirst==MAGIC_WORD_HEAD_NONSTEMMED )
	{
		sBuf[0] = '=';
		return;
	}

	// no head marker: only the first bigram separator (if any) gets replaced
	char * pBigram = strchr ( sBuf, MAGIC_WORD_BIGRAM );
	if ( pBigram )
		*pBigram = ' ';
}

const CSphString & RemoveDictSpecials ( const CSphString & sWord, CSphString & sFixed )
{
CSphString sFixed;
const CSphString * pFixed = &sWord;
if ( sWord.cstr()[0]==MAGIC_WORD_HEAD )
{
}
}

WordStat_t & tStats = m_hWordStats.AddUnique ( *pFixed );
return *pFixed;
}

void CSphQueryResultMeta::AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits )
{
CSphString sBuf;
const CSphString & tFixed = RemoveDictSpecials ( sWord, sBuf );
WordStat_t & tStats = m_hWordStats.AddUnique ( tFixed );
tStats.m_iDocs += iDocs;
tStats.m_iHits += iHits;
}
@@ -1638,6 +1638,9 @@ class CSphDictExact : public CSphDictTraits
SphWordID_t GetWordIDNonStemmed ( BYTE * pWord ) final { return m_pDict->GetWordIDNonStemmed ( pWord ); }
};

/// Rewrites dictionary marker bytes in sWord in place: a leading head marker becomes '*' or '=',
/// otherwise the first bigram separator becomes ' '.
void RemoveDictSpecials ( CSphString & sWord );
/// Variant that takes the word by const reference and returns a reference to the cleaned-up word.
/// NOTE(review): presumably sBuf is scratch storage and the result refers to either sWord or sBuf,
/// so sBuf has to stay alive while the result is in use - confirm against the definition.
const CSphString & RemoveDictSpecials ( const CSphString & sWord, CSphString & sBuf );

//////////////////////////////////////////////////////////////////////////
// TOKEN FILTER
//////////////////////////////////////////////////////////////////////////
@@ -7421,8 +7421,7 @@ void RtIndex_t::AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, CSphDic
tInfo.m_iHits = bGetStats ? pQueryWord->m_iHits : 0;
tInfo.m_iQpos = iQpos;

if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
*(char *)tInfo.m_sNormalized.cstr() = '=';
RemoveDictSpecials ( tInfo.m_sNormalized );
}


@@ -7446,8 +7445,20 @@ struct CSphRtQueryFilter : public ISphQueryFilter, public ISphNoncopyable
}
};

/// Folds a list of keyword infos into a hash keyed by the normalized keyword.
///
/// For every source entry the matching hash slot is fetched via AddUnique(); docs and hits
/// are added to whatever the slot already holds, while qpos and both strings are overwritten
/// by the source entry.
///
/// The strings are moved out of dKeywords, so its entries should not be read afterwards.
/// NOTE(review): accumulation assumes a newly inserted CSphKeywordInfo starts with zero
/// docs/hits - confirm against the struct definition.
///
/// @param dKeywords  source entries; their strings are consumed
/// @param hKeywords  destination hash, merged into (not cleared)
static void HashKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, SmallStringHash_T<CSphKeywordInfo> & hKeywords )
{
	for ( auto & tKeyword : dKeywords )
	{
		// fetch the slot first: the key is read from m_sNormalized before that string is moved away
		CSphKeywordInfo & tMerged = hKeywords.AddUnique ( tKeyword.m_sNormalized );

		// counters add up across all sources sharing the same normalized form
		tMerged.m_iHits += tKeyword.m_iHits;
		tMerged.m_iDocs += tKeyword.m_iDocs;

		// the remaining fields simply take the latest source values
		tMerged.m_iQpos = tKeyword.m_iQpos;
		tMerged.m_sNormalized = std::move ( tKeyword.m_sNormalized );
		tMerged.m_sTokenized = std::move ( tKeyword.m_sTokenized );
	}
}

bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const char * sQuery, const GetKeywordsSettings_t & tSettings, bool bFillOnly, CSphString * , const SphChunkGuard_t & tGuard ) const
bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const char * sQuery, const GetKeywordsSettings_t & tSettings, bool bFillOnly, CSphString * pError, const SphChunkGuard_t & tGuard ) const
{
if ( !bFillOnly )
dKeywords.Resize ( 0 );
@@ -7532,8 +7543,38 @@ bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const c
if ( !tSettings.m_bStats )
return true;

ARRAY_FOREACH ( iChunk, tGuard.m_dDiskChunks )
tGuard.m_dDiskChunks[iChunk]->FillKeywords ( dKeywords );
if ( bFillOnly )
{
ARRAY_FOREACH ( iChunk, tGuard.m_dDiskChunks )
tGuard.m_dDiskChunks[iChunk]->FillKeywords ( dKeywords );
} else
{
// bigram and expanded keywords might differ between chunks, so the per-chunk infos need to be merged
CSphVector<CSphKeywordInfo> dChunkKeywords;
SmallStringHash_T<CSphKeywordInfo> hKeywords;
ARRAY_FOREACH ( iChunk, tGuard.m_dDiskChunks )
{
tGuard.m_dDiskChunks[iChunk]->GetKeywords ( dChunkKeywords, sQuery, tSettings, pError );
HashKeywords ( dChunkKeywords, hKeywords );
dChunkKeywords.Resize ( 0 );
}

if ( hKeywords.GetLength() )
{
// merge keywords from RAM parts with disk keywords into hash
HashKeywords ( dKeywords, hKeywords );
dKeywords.Resize ( 0 );
dKeywords.Reserve ( hKeywords.GetLength() );

hKeywords.IterateStart();
while ( hKeywords.IterateNext() )
{
const CSphKeywordInfo & tSrc = hKeywords.IterateGet();
dKeywords.Add ( tSrc );
}
sphSort ( dKeywords.Begin(), dKeywords.GetLength(), bind ( &CSphKeywordInfo::m_iQpos ) );
}
}

return true;
}
@@ -0,0 +1 @@
a:1:{i:0;a:6:{i:0;a:2:{s:8:"sphinxql";s:118:"INSERT INTO rt_bi (id, gid, title) VALUES ( 1, 1, 'manticore search is the best'), (2, 2, 'manticore is more better' )";s:14:"total_affected";i:2;}i:1;a:3:{s:8:"sphinxql";s:69:"CALL KEYWORDS ( 'manti*', 'idx_bi', 1 AS stats, 0 AS fold_wildcards )";s:10:"total_rows";i:4;s:4:"rows";a:4:{i:0;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:9:"manticore";s:4:"docs";s:1:"3";s:4:"hits";s:1:"3";}i:1;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:12:"manticore is";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}i:2;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:14:"manticore like";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}i:3;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:16:"manticore search";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}}}i:2;a:3:{s:8:"sphinxql";s:68:"CALL KEYWORDS ( 'manti*', 'rt_bi', 1 AS stats, 0 AS fold_wildcards )";s:10:"total_rows";i:3;s:4:"rows";a:3:{i:0;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:16:"manticore search";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}i:1;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:9:"manticore";s:4:"docs";s:1:"2";s:4:"hits";s:1:"2";}i:2;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:12:"manticore is";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}}}i:3;a:2:{s:8:"sphinxql";s:20:"FLUSH RAMCHUNK rt_bi";s:14:"total_affected";i:0;}i:4;a:2:{s:8:"sphinxql";s:84:"INSERT INTO rt_bi (id, gid, title) VALUES (3, 3, 'search with manticore like many' )";s:14:"total_affected";i:1;}i:5;a:3:{s:8:"sphinxql";s:68:"CALL KEYWORDS ( 'manti*', 'rt_bi', 1 AS stats, 0 AS fold_wildcards )";s:10:"total_rows";i:4;s:4:"rows";a:4:{i:0;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:14:"manticore like";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}i:1;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:16:"manticore 
search";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}i:2;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:12:"manticore is";s:4:"docs";s:1:"1";s:4:"hits";s:1:"1";}i:3;a:5:{s:4:"qpos";s:1:"1";s:9:"tokenized";s:6:"manti*";s:10:"normalized";s:9:"manticore";s:4:"docs";s:1:"3";s:4:"hits";s:1:"3";}}}}}
@@ -0,0 +1,72 @@
<?xml version="1.0" encoding="utf-8"?>
<test>

<name>KEYWORDS for bigram</name>

<config>
indexer
{
mem_limit = 16M
}

searchd
{
<searchd_settings/>
}

source src_bi
{
type = mysql
<sql_settings/>
sql_query = select * from test_table
sql_attr_uint = gid
}

index idx_bi
{
source = src_bi
path = <data_path/>/idx_bi

bigram_index = all
dict = keywords
min_prefix_len = 2
}

index rt_bi
{
type = rt
path = <data_path/>/rt_bi
docinfo = extern
rt_mem_limit = 128k

rt_field = title
rt_attr_uint = gid

bigram_index = all
dict = keywords
min_prefix_len = 2
}
</config>
<db_create>
CREATE TABLE `test_table` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`gid` int(11) NOT NULL,
`title` varchar(255) NOT NULL,
PRIMARY KEY (`id`)
)
</db_create>
<db_drop>DROP TABLE IF EXISTS `test_table`;</db_drop>
<db_insert>INSERT INTO `test_table` VALUES ( 1, 1, 'manticore search is the best'), (2, 2, 'manticore is more better' ), (3, 3, 'search with manticore like many' )</db_insert>

<sphqueries>
<sphinxql>INSERT INTO rt_bi (id, gid, title) VALUES ( 1, 1, 'manticore search is the best'), (2, 2, 'manticore is more better' )</sphinxql>

<sphinxql>CALL KEYWORDS ( 'manti*', 'idx_bi', 1 AS stats, 0 AS fold_wildcards )</sphinxql>
<sphinxql>CALL KEYWORDS ( 'manti*', 'rt_bi', 1 AS stats, 0 AS fold_wildcards )</sphinxql>

<sphinxql>FLUSH RAMCHUNK rt_bi</sphinxql>
<sphinxql>INSERT INTO rt_bi (id, gid, title) VALUES (3, 3, 'search with manticore like many' )</sphinxql>
<sphinxql>CALL KEYWORDS ( 'manti*', 'rt_bi', 1 AS stats, 0 AS fold_wildcards )</sphinxql>
</sphqueries>

</test>

0 comments on commit f974f20

Please sign in to comment.
You can’t perform that action at this time.