Skip to content
Permalink
Browse files

fixed #394 github#103 wrong embedded stopwords in disk chunk by RT in…

…dex after daemon restart; added regression to test 360
  • Loading branch information
tomatolog committed Aug 14, 2018
1 parent cdac6d1 commit e61ec00b6b27d1d5878247e2ee817f3b1e7fde16
Showing with 79 additions and 3 deletions.
  1. +4 −3 src/sphinx.cpp
  2. +1 −0 src/sphinxrt.cpp
  3. +1 −0 test/test_360/model.bin
  4. +1 −0 test/test_360/stops.txt
  5. +72 −0 test/test_360/test.xml
@@ -3337,7 +3337,7 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, const ISphTokenizer * pTokeni
tWriter.PutString ( tSettings.m_sCaseFolding.cstr () );
tWriter.PutDword ( tSettings.m_iMinWordLen );

bool bEmbedSynonyms = pTokenizer->GetSynFileInfo ().m_uSize<=(SphOffset_t)iEmbeddedLimit;
bool bEmbedSynonyms = ( iEmbeddedLimit>0 && pTokenizer->GetSynFileInfo ().m_uSize<=(SphOffset_t)iEmbeddedLimit );
tWriter.PutByte ( bEmbedSynonyms ? 1 : 0 );
if ( bEmbedSynonyms )
pTokenizer->WriteSynonyms ( tWriter );
@@ -3447,7 +3447,8 @@ void SaveDictionarySettings ( CSphWriter & tWriter, const CSphDict * pDict, bool
ARRAY_FOREACH ( i, dSWFileInfos )
uTotalSize += dSWFileInfos[i].m_uSize;

bool bEmbedStopwords = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
// embed only when embedding is allowed (limit is non-zero) and the files fit within the limit
bool bEmbedStopwords = ( iEmbeddedLimit>0 && uTotalSize<=(SphOffset_t)iEmbeddedLimit );
tWriter.PutByte ( bEmbedStopwords ? 1 : 0 );
if ( bEmbedStopwords )
pDict->WriteStopwords ( tWriter );
@@ -3465,7 +3466,7 @@ void SaveDictionarySettings ( CSphWriter & tWriter, const CSphDict * pDict, bool
ARRAY_FOREACH ( i, dWFFileInfos )
uTotalSize += dWFFileInfos[i].m_uSize;

bool bEmbedWordforms = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
bool bEmbedWordforms = ( iEmbeddedLimit>0 && uTotalSize<=(SphOffset_t)iEmbeddedLimit );
tWriter.PutByte ( bEmbedWordforms ? 1 : 0 );
if ( bEmbedWordforms )
pDict->WriteWordforms ( tWriter );
@@ -3922,6 +3922,7 @@ void RtIndex_t::SaveDiskHeader ( const char * sFilename, SphDocID_t iMinDocID, i
SaveTokenizerSettings ( tWriter, m_pTokenizer, m_tSettings.m_iEmbeddedLimit );

// dictionary
// cannot use embedding here, as stopword IDs differ between RT and plain dictionaries
SaveDictionarySettings ( tWriter, m_pDict, m_bKeywordDict, 0 );

// kill-list size
@@ -0,0 +1 @@
a:1:{i:0;a:9:{i:0;a:2:{s:8:"sphinxql";s:64:"insert into testrt (id, gid, title) values (1, 11, 'test me up')";s:14:"total_affected";i:1;}i:1;a:2:{s:8:"sphinxql";s:21:"flush ramchunk testrt";s:14:"total_affected";i:0;}i:2;a:3:{s:8:"sphinxql";s:63:"select * from testrt where match ( 'test me ') order by id desc";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:2:"id";s:1:"1";s:3:"gid";s:2:"11";}}}i:3;a:1:{s:8:"sphinxql";s:65:"restart-daemon => stop=ok, return code=0; start=ok, return code=2";}i:4;a:3:{s:8:"sphinxql";s:63:"select * from testrt where match ( 'test me ') order by id desc";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:2:"id";s:1:"1";s:3:"gid";s:2:"11";}}}i:5;a:2:{s:8:"sphinxql";s:64:"insert into testrt (id, gid, title) values (2, 22, 'test me up')";s:14:"total_affected";i:1;}i:6;a:3:{s:8:"sphinxql";s:63:"select * from testrt where match ( 'test me ') order by id desc";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:2:{s:2:"id";s:1:"2";s:3:"gid";s:2:"22";}i:1;a:2:{s:2:"id";s:1:"1";s:3:"gid";s:2:"11";}}}i:7;a:2:{s:8:"sphinxql";s:21:"flush ramchunk testrt";s:14:"total_affected";i:0;}i:8;a:3:{s:8:"sphinxql";s:63:"select * from testrt where match ( 'test me ') order by id desc";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:2:{s:2:"id";s:1:"2";s:3:"gid";s:2:"22";}i:1;a:2:{s:2:"id";s:1:"1";s:3:"gid";s:2:"11";}}}}}
@@ -0,0 +1 @@
test
@@ -0,0 +1,72 @@
<?xml version="1.0" encoding="utf-8"?>
<test>

<name>stopwords vs RT index disk chunks</name>

<config>
indexer
{
mem_limit = 16M
}

searchd
{
<searchd_settings/>
workers = threads
}

source dummy
{
type = mysql
<sql_settings/>
sql_query = select * from test_table
sql_attr_uint = gid
}

index dummy
{
source = dummy
path = <data_path/>/dummy
}

index testrt
{
type = rt
rt_mem_limit = 128M
path = <data_path/>/testrt
rt_field = title
rt_attr_uint = gid

stopwords = <this_test/>/stops.txt
}

</config>

<db_create>
CREATE TABLE `test_table` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`gid` int(11) NOT NULL,
`title` varchar(255) NOT NULL,
PRIMARY KEY (`id`)
)
</db_create>
<db_drop>DROP TABLE IF EXISTS `test_table`;</db_drop>
<db_insert>INSERT INTO `test_table` VALUES ( 1, 1, 'test 1' )</db_insert>

<sphqueries>
<!-- the first flush on a just-created RT index is fine -->
<sphinxql>insert into testrt (id, gid, title) values (1, 11, 'test me up')</sphinxql>
<sphinxql>flush ramchunk testrt</sphinxql>
<sphinxql>select * from testrt where match ( 'test me ') order by id desc</sphinxql>

<!-- subsequent disk chunks saved by the RT index after a daemon restart carried embedded stopwords from the RT index, which is wrong -->
<sphinxql system="restart-daemon"></sphinxql>

<sphinxql>select * from testrt where match ( 'test me ') order by id desc</sphinxql>
<sphinxql>insert into testrt (id, gid, title) values (2, 22, 'test me up')</sphinxql>
<sphinxql>select * from testrt where match ( 'test me ') order by id desc</sphinxql>
<sphinxql>flush ramchunk testrt</sphinxql>
<sphinxql>select * from testrt where match ( 'test me ') order by id desc</sphinxql>
</sphqueries>

</test>

0 comments on commit e61ec00

Please sign in to comment.
You can’t perform that action at this time.