Permalink
Browse files

Merge branch '195_utf8_substring' into 'master'

Fixed #195: UTF-8 substring (prefix/infix) matching when minimum prefix/infix lengths are defined

Closes #195

See merge request manticoresearch/dev!27
  • Loading branch information...
tomatolog committed Feb 27, 2018
2 parents aec6bc3 + f304323 commit 8740fd63ae842c2ab0e51a8fcd6180d787ed57ec
Showing with 96 additions and 13 deletions.
  1. +27 −13 src/sphinx.cpp
  2. +1 −0 test/test_350/model.bin
  3. +68 −0 test/test_350/test.xml
@@ -17679,25 +17679,27 @@ bool sphExpandGetWords ( const char * sWord, const ExpansionContext_t & tCtx, IS
// compute non-wildcard prefix length
int iPrefix = 0;
for ( const char * s = sPrefix; *s && !sphIsWild ( *s ); s++ )
const char * sCodes = sPrefix;
for ( ; *sCodes && !sphIsWild ( *sCodes ); sCodes+=sphUtf8CharBytes ( *sCodes ) )
iPrefix++;
// do not expand prefixes under min length
int iMinLen = Max ( tCtx.m_iMinPrefixLen, tCtx.m_iMinInfixLen );
if ( iPrefix<iMinLen )
return false;
int iBytes = sCodes - sPrefix;
// prefix expansion should work on nonstemmed words only
char sFixed[MAX_KEYWORD_BYTES];
if ( tCtx.m_bHasMorphology )
{
sFixed[0] = MAGIC_WORD_HEAD_NONSTEMMED;
memcpy ( sFixed+1, sPrefix, iPrefix );
memcpy ( sFixed+1, sPrefix, iBytes );
sPrefix = sFixed;
iPrefix++;
iBytes++;
}
tCtx.m_pWordlist->GetPrefixedWords ( sPrefix, iPrefix, sWildcard, tWordlist );
tCtx.m_pWordlist->GetPrefixedWords ( sPrefix, iBytes, sWildcard, tWordlist );
} else
{
@@ -17706,28 +17708,40 @@ bool sphExpandGetWords ( const char * sWord, const ExpansionContext_t & tCtx, IS
assert ( tCtx.m_iMinInfixLen>0 );
// find the longest substring of non-wildcards
int iCodepoints = 0;
int iInfixCodepoints = 0;
int iInfixBytes = 0;
const char * sMaxInfix = NULL;
int iMaxInfix = 0;
int iCur = 0;
const char * sInfix = sWord;
for ( const char * s = sWord; *s; s++ )
for ( const char * s = sWord; *s; )
{
int iCodeLen = sphUtf8CharBytes ( *s );
if ( sphIsWild ( *s ) )
{
iCur = 0;
} else if ( ++iCur > iMaxInfix )
sInfix = s + 1;
iCodepoints = 0;
} else
{
sMaxInfix = s-iCur+1;
iMaxInfix = iCur;
iCodepoints++;
if ( s - sInfix + iCodeLen > iInfixBytes )
{
sMaxInfix = sInfix;
iInfixBytes = s - sInfix + iCodeLen;
iInfixCodepoints = iCodepoints;
}
}
s += iCodeLen;
}
// do not expand infixes under min_infix_len
if ( iMaxInfix < tCtx.m_iMinInfixLen )
if ( iInfixCodepoints < tCtx.m_iMinInfixLen )
return false;
// ignore heading star
tCtx.m_pWordlist->GetInfixedWords ( sMaxInfix, iMaxInfix, sWord, tWordlist );
tCtx.m_pWordlist->GetInfixedWords ( sMaxInfix, iInfixBytes, sWord, tWordlist );
}
return true;
@@ -0,0 +1 @@
a:1:{i:0;a:8:{i:0;a:2:{s:8:"sphinxql";s:42:"select * from test where match ('как*')";s:10:"total_rows";i:0;}i:1;a:3:{s:8:"sphinxql";s:13:"show warnings";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:3:{s:5:"Level";s:7:"warning";s:4:"Code";s:4:"1000";s:7:"Message";s:65:"Query word length is less than min infix length. word: 'как*' ";}}}i:2;a:3:{s:8:"sphinxql";s:44:"select * from test where match ('какт*')";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:3:{s:2:"id";s:2:"11";s:3:"idd";s:2:"11";s:4:"body";s:25:"както неочень";}i:1;a:3:{s:2:"id";s:2:"12";s:3:"idd";s:2:"12";s:4:"body";s:27:"кактотак очень";}}}i:3;a:2:{s:8:"sphinxql";s:13:"show warnings";s:14:"total_affected";i:0;}i:4;a:2:{s:8:"sphinxql";s:43:"select * from test where match ('*акт*')";s:10:"total_rows";i:0;}i:5;a:3:{s:8:"sphinxql";s:13:"show warnings";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:3:{s:5:"Level";s:7:"warning";s:4:"Code";s:4:"1000";s:7:"Message";s:66:"Query word length is less than min infix length. word: '*акт*' ";}}}i:6;a:3:{s:8:"sphinxql";s:45:"select * from test where match ('*акто*')";s:10:"total_rows";i:2;s:4:"rows";a:2:{i:0;a:3:{s:2:"id";s:2:"11";s:3:"idd";s:2:"11";s:4:"body";s:25:"както неочень";}i:1;a:3:{s:2:"id";s:2:"12";s:3:"idd";s:2:"12";s:4:"body";s:27:"кактотак очень";}}}i:7;a:2:{s:8:"sphinxql";s:13:"show warnings";s:14:"total_affected";i:0;}}}
@@ -0,0 +1,68 @@
<?xml version="1.0" encoding="utf-8"?>
<test>
<name>warnings from short utf8 prefix and infix</name>
<config>
indexer
{
mem_limit = 16M
}
searchd
{
<searchd_settings/>
}
source test
{
type = mysql
<sql_settings/>
sql_query_pre = SET NAMES utf8
sql_query = SELECT * FROM test_table
sql_attr_uint = idd
sql_field_string = body
}
index test
{
source = test
path = <data_path/>/plain1
docinfo = extern
dict = keywords
charset_table = a..z, _, A..Z->a..z, russian
min_infix_len = 4
}
</config>
<db_create>
CREATE TABLE test_table
(
id INTEGER PRIMARY KEY NOT NULL,
idd INTEGER NOT NULL,
body VARCHAR(255) NOT NULL
) CHARACTER SET utf8;
</db_create>
<db_drop>DROP TABLE IF EXISTS test_table</db_drop>
<db_insert>SET NAMES utf8</db_insert>
<db_insert>INSERT INTO test_table VALUES
( 1, 1, 'test' ), (11, 11, 'както неочень') , (12, 12, 'кактотак очень')
</db_insert>
<sphqueries>
<!-- regression (issue #195): wildcard terms made of multi-byte UTF-8 letters.
     The index sets min_infix_len = 4 and every Cyrillic letter below takes
     2 bytes, so a 3-letter term is 6 bytes long. The minimum length has to be
     compared against the number of letters (codepoints), not bytes.
     Each query is followed by "show warnings" to capture whether the
     "less than min infix length" warning was raised.
     Expected results are recorded in model.bin. -->
<!-- prefix, 3 letters (below the minimum of 4): expect 0 rows and a warning -->
<sphinxql>select * from test where match ('как*')</sphinxql>
<sphinxql>show warnings</sphinxql>
<!-- prefix, 4 letters (meets the minimum): expect documents 11 and 12, no warning -->
<sphinxql>select * from test where match ('какт*')</sphinxql>
<sphinxql>show warnings</sphinxql>
<!-- infix, 3 letters (below the minimum of 4): expect 0 rows and a warning -->
<sphinxql>select * from test where match ('*акт*')</sphinxql>
<sphinxql>show warnings</sphinxql>
<!-- infix, 4 letters (meets the minimum): expect documents 11 and 12, no warning -->
<sphinxql>select * from test where match ('*акто*')</sphinxql>
<sphinxql>show warnings</sphinxql>
</sphqueries>
</test>

0 comments on commit 8740fd6

Please sign in to comment.