Skip to content

Commit

Permalink
fixed UTF-8 handling in the upper and lower string functions…
Browse files Browse the repository at this point in the history
…; added regressions to test 455; added uni-algo library to convert strings; set case conversion fast path only in case C locale explicitly set; fixed #2360
  • Loading branch information
tomatolog committed Mar 27, 2023
1 parent b0fa697 commit 0bf17d9
Show file tree
Hide file tree
Showing 10 changed files with 245 additions and 61 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Expand Up @@ -301,6 +301,7 @@ dl_package ( PostgreSQL "pgsql" )
win_install ( PostgreSQL indexer )

include(GetNLJSON)
include(GetUniAlgo)

# Storing compiler version
set ( COMPILER "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}" )
Expand Down
5 changes: 5 additions & 0 deletions cmake/GetNLJSON.cmake
Expand Up @@ -11,6 +11,11 @@ include ( FetchContent )
set(JSON_BuildTests OFF CACHE INTERNAL "")
set(JSON_MultipleHeaders ON CACHE BOOL "")
set(JSON_GlobalUDLs OFF CACHE BOOL "")

if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()

set ( NLJSON_URL_GITHUB "http://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip" )
message ( STATUS "Use nljson from github ${NLJSON_URL_GITHUB}" )
FetchContent_Declare ( nljson
Expand Down
39 changes: 39 additions & 0 deletions cmake/GetUniAlgo.cmake
@@ -0,0 +1,39 @@
# Fetch the uni-algo library (Unicode case conversion) at configure time and
# add it to the build as a subdirectory.
# NOTE(review): the FetchContent module was introduced in CMake 3.11, so the 3.1 floor
# below is lower than what this script actually needs - confirm against the top-level
# minimum before tightening it.
cmake_minimum_required ( VERSION 3.1 FATAL_ERROR )

set ( UNIALGO_LIBDIR "${MANTICORE_BINARY_DIR}/unialgo" )
set ( UNIALGO_SRC "${MANTICORE_BINARY_DIR}/unialgo-src" )
mark_as_advanced ( UNIALGO_SRC UNIALGO_LIBDIR )

include ( FetchContent )

# Only the case-mapping part of uni-algo is used; switch the other modules off.
set ( UNI_ALGO_DISABLE_NORM ON CACHE BOOL "" )
set ( UNI_ALGO_DISABLE_PROP ON CACHE BOOL "" )
set ( UNI_ALGO_DISABLE_BREAK_WORD ON CACHE BOOL "" )
set ( UNI_ALGO_DISABLE_COLLATE ON CACHE BOOL "" )
set ( UNI_ALGO_DISABLE_NFKC_NFKD ON CACHE BOOL "" )
set ( UNI_ALGO_DISABLE_SHRINK_TO_FIT ON CACHE BOOL "" )

# CMP0135 (when known to this CMake): extracted files get the extraction timestamp.
if ( POLICY CMP0135 )
	cmake_policy ( SET CMP0135 NEW )
endif ()

set ( UNIALGO_URL_GITHUB "https://github.com/manticoresoftware/uni-algo/archive/refs/tags/v0.7.2.tar.gz" )
message ( STATUS "Use UNIALGO from github ${UNIALGO_URL_GITHUB}" )

# The archive is downloaded by URL, so git-only options (GIT_TAG, GIT_SHALLOW) do not apply here.
# A local copy of the sources can be supplied through FETCHCONTENT_SOURCE_DIR_UNIALGO.
FetchContent_Declare ( unialgo
	SOURCE_DIR "${UNIALGO_SRC}"
	BINARY_DIR "${UNIALGO_LIBDIR}"
	URL ${UNIALGO_URL_GITHUB}
)

FetchContent_GetProperties ( unialgo )
if ( NOT unialgo_POPULATED )
	FetchContent_Populate ( unialgo )
	add_subdirectory ( ${unialgo_SOURCE_DIR} ${unialgo_BINARY_DIR} )
endif ()
mark_as_advanced ( FETCHCONTENT_FULLY_DISCONNECTED FETCHCONTENT_QUIET FETCHCONTENT_UPDATES_DISCONNECTED
	FETCHCONTENT_SOURCE_DIR_UNIALGO FETCHCONTENT_UPDATES_DISCONNECTED_UNIALGO )
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Expand Up @@ -204,6 +204,7 @@ set_property ( SOURCE sphinxql_debug.cpp APPEND PROPERTY OBJECT_DEPENDS ${FLEX_S

# our mega-lib
target_sources ( lmanticore PUBLIC ${LMANTICORE_BISON} ${LMANTICORE_FLEX} ${HEADERS} ${CHARSET_FILES} ${CHARSET_TEMPLATE} "../misc/manticore.natvis" )
target_link_libraries ( lmanticore PUBLIC uni-algo::uni-algo )

add_library ( lsearchd OBJECT searchdha.cpp http/http_parser.c searchdhttp.cpp
searchdtask.cpp taskping.cpp taskmalloctrim.cpp taskoptimize.cpp taskglobalidf.cpp tasksavestate.cpp
Expand Down
25 changes: 25 additions & 0 deletions src/collation.cpp
Expand Up @@ -424,3 +424,28 @@ ESphCollation sphCollationFromName ( const CSphString & sName, CSphString * pErr
pError->SetSprintf ( "Unknown collation: '%s'", sName.cstr() );
return SPH_COLLATION_DEFAULT;
}

// Name of the locale most recently passed to SetLocale().
static CSphString g_sLocale;
// C++ locale object handed out by GlobalLocale(); default-constructed here and
// replaced by SetLocale().
static std::locale g_tLocale;
// Flag reported by IsGlobalLocaleSet(); updated by SetLocale() from its bSet argument.
static bool g_bGlobalLocaleSet = false;

void SetLocale ( const CSphString & sLocale, bool bSet )
{
g_sLocale = sLocale;
g_tLocale = std::locale();
if ( g_sLocale.IsEmpty() )
return;

g_bGlobalLocaleSet = bSet;
g_tLocale = std::locale ( sLocale.cstr() );
}

/// Returns the locale object stored by the last SetLocale() call
/// (a default-constructed std::locale if SetLocale() was never called).
const std::locale & GlobalLocale()
{
return g_tLocale;
}

/// Returns the flag recorded by SetLocale(); false until SetLocale() stores a value.
bool IsGlobalLocaleSet()
{
return g_bGlobalLocaleSet;
}
4 changes: 4 additions & 0 deletions src/collation.h
Expand Up @@ -14,6 +14,7 @@
#define _collation_

#include "std/fnv64.h"
#include <locale>

class LibcCSHash_fn
{
Expand Down Expand Up @@ -64,5 +65,8 @@ StrHashCalc_fn GetStringHashCalcFunc ( ESphCollation eCollation );
void sphCollationInit();
volatile ESphCollation& GlobalCollation();
ESphCollation sphCollationFromName ( const CSphString & sName, CSphString * pError );
/// Stores the locale name and the matching std::locale; for a non-empty name
/// bSet is remembered and later reported by IsGlobalLocaleSet().
void SetLocale ( const CSphString & sLocale, bool bSet );
/// Locale object stored by SetLocale().
const std::locale & GlobalLocale();
/// Flag stored by SetLocale().
bool IsGlobalLocaleSet();

#endif // _collation_
7 changes: 6 additions & 1 deletion src/searchd.cpp
Expand Up @@ -18972,12 +18972,17 @@ void ConfigureSearchd ( const CSphConfig & hConf, bool bOptPIDFile, bool bTestMo
{
g_iTFO &= ~TFO_LISTEN;
}

bool bLocaleSet = false;
if ( hSearchd ( "collation_libc_locale" ) )
{
auto sLocale = hSearchd.GetStr ( "collation_libc_locale" );
if ( !setlocale ( LC_COLLATE, sLocale.cstr() ) )
bLocaleSet = setlocale ( LC_COLLATE, sLocale.cstr() );
if ( !bLocaleSet )
sphWarning ( "setlocale failed (locale='%s')", sLocale.cstr() );
}
CSphString sLoc = setlocale ( LC_COLLATE, nullptr );
SetLocale( sLoc, bLocaleSet );

if ( hSearchd ( "collation_server" ) )
{
Expand Down
199 changes: 142 additions & 57 deletions src/sphinxexpr.cpp
Expand Up @@ -26,6 +26,7 @@
#include "conversion.h"
#include <time.h>
#include <math.h>
#include "uni_algo/case.h"

#if WITH_RE2
#include <re2/re2.h>
Expand Down Expand Up @@ -2466,15 +2467,12 @@ int Expr_SubstringIndex_c::RightSearch ( const char * pDoc, int iDocLen, int iCo
return LeftSearch ( pDoc, iDocLen, iCount, true, ppResStr, pResLen );
}

template<bool UPPER>
class Expr_Case_c : public ISphStringExpr
class ExprCaseBase_c : public ISphStringExpr
{
private:
CSphRefcountedPtr<ISphExpr> m_pArg;

public:
explicit Expr_Case_c ( ISphExpr * pArg )
explicit ExprCaseBase_c ( ISphExpr * pArg, const char * sClassName )
: m_pArg ( pArg )
, m_sClassName ( sClassName )
{
assert( pArg );
SafeAddRef( pArg );
Expand All @@ -2492,36 +2490,6 @@ class Expr_Case_c : public ISphStringExpr
m_pArg->Command ( eCmd, pArg );
}

int StringEval ( const CSphMatch & tMatch, const BYTE ** ppStr ) const final
{
const char * pDoc = nullptr;
int iDocLen = m_pArg->StringEval ( tMatch, (const BYTE **)&pDoc );
*ppStr = nullptr;

// create CSphVector and store the value
CSphVector<BYTE> dStrBuffer;
dStrBuffer.Append ( pDoc, iDocLen );

BYTE * pStrBeg = dStrBuffer.begin();
const BYTE * pStrEnd = ( pStrBeg + iDocLen );

if ( pDoc && iDocLen>0 )
{
while( pStrBeg<pStrEnd )
{
// convert the current character to its uppercase or lowercase version if it exists
DoCase ( (char *)pStrBeg );
pStrBeg++;
}
}

*ppStr = dStrBuffer.LeakData();
FreeDataPtr ( *m_pArg, pDoc );

// return the resultant string
return iDocLen;
}

bool IsDataPtrAttr() const final
{
return true;
Expand Down Expand Up @@ -2604,43 +2572,150 @@ class Expr_Case_c : public ISphStringExpr
FreeDataPtr ( *this, pBuf );
return iVal;
}

bool IsConst () const final { return false; }

uint64_t GetHash ( const ISphSchema & tSorterSchema, uint64_t uPrevHash, bool & bDisable ) final
{
EXPR_CLASS_NAME("Expr_Case_c");
CALC_CHILD_HASH(m_pArg);
return CALC_DEP_HASHES();
}
bool IsConst () const final { return false; }

ISphExpr * Clone () const final
{
return new Expr_Case_c ( *this );
}
uint64_t GetHash ( const ISphSchema & tSorterSchema, uint64_t uPrevHash, bool & bDisable ) final
{
EXPR_CLASS_NAME(m_sClassName);
CALC_CHILD_HASH(m_pArg);
return CALC_DEP_HASHES();
}

private:
void DoCase ( char * pString ) const;
protected:
CSphRefcountedPtr<ISphExpr> m_pArg;

Expr_Case_c ( const Expr_Case_c & rhs )
ExprCaseBase_c ( const ExprCaseBase_c & rhs )
: m_pArg ( SafeClone ( rhs.m_pArg ) )
{}
, m_sClassName ( rhs.m_sClassName )
{}

private:
const char * m_sClassName = nullptr;
};

template<bool UPPER>
class ExprCaseTrival_c : public ExprCaseBase_c
{
public:
explicit ExprCaseTrival_c ( ISphExpr * pArg )
: ExprCaseBase_c ( pArg, "ExprCaseTrival_c" )
{
}

int StringEval ( const CSphMatch & tMatch, const BYTE ** ppStr ) const final
{
const char * pDoc = nullptr;
int iDocLen = m_pArg->StringEval ( tMatch, (const BYTE **)&pDoc );
*ppStr = nullptr;

// create CSphVector and store the value
CSphVector<BYTE> dStrBuffer;
dStrBuffer.Append ( pDoc, iDocLen );

BYTE * pStrBeg = dStrBuffer.begin();
const BYTE * pStrEnd = ( pStrBeg + iDocLen );

if ( pDoc && iDocLen>0 )
{
while( pStrBeg<pStrEnd )
{
// convert the current character to its uppercase or lowercase version if it exists
DoCase ( (char *)pStrBeg );
pStrBeg++;
}
}

*ppStr = dStrBuffer.LeakData();
FreeDataPtr ( *m_pArg, pDoc );

// return the resultant string
return iDocLen;
}

ISphExpr * Clone () const final
{
return new ExprCaseTrival_c ( *this );
}

private:
void DoCase ( char * pString ) const;

ExprCaseTrival_c ( const ExprCaseTrival_c & rhs )
: ExprCaseBase_c ( rhs )
{}
};

// For upper() function: converts one char in place with the C library toupper().
template<>
void ExprCaseTrival_c<true>::DoCase ( char * pString ) const
{
	// toupper() requires a value representable as unsigned char (or EOF); a plain char
	// holding a byte >= 0x80 may be negative, which is undefined behavior, so widen
	// through unsigned char first
	*pString = static_cast<char> ( toupper ( static_cast<unsigned char> ( *pString ) ) );
}

// For lower() function: converts one char in place with the C library tolower().
template<>
void ExprCaseTrival_c<false>::DoCase ( char * pString ) const
{
	// tolower() requires a value representable as unsigned char (or EOF); a plain char
	// holding a byte >= 0x80 may be negative, which is undefined behavior, so widen
	// through unsigned char first
	*pString = static_cast<char> ( tolower ( static_cast<unsigned char> ( *pString ) ) );
}

/// Lowercases the UTF-8 text in `source` with uni-algo's case mapping;
/// the converted text is delivered through `dBuf`.
/// NOTE(review): relies on una::detail internals of the bundled uni-algo - verify on upgrades.
void UTF8ToLower( std::vector<char> & dBuf, std::basic_string_view<char> source )
{
	using Dst_t = std::vector<char>;
	using Src_t = std::basic_string_view<char>;

	una::detail::t_map<Dst_t, Src_t, una::detail::impl_x_case_map_utf8, una::detail::impl_case_map_loc_utf8> (
		dBuf, source, una::detail::impl_case_map_mode_lowercase );
}

/// Uppercases the UTF-8 text in `source` with uni-algo's case mapping;
/// the converted text is delivered through `dBuf`.
/// NOTE(review): relies on una::detail internals of the bundled uni-algo - verify on upgrades.
void UTF8ToUpper( std::vector<char> & dBuf, std::basic_string_view<char> source )
{
	using Dst_t = std::vector<char>;
	using Src_t = std::basic_string_view<char>;

	una::detail::t_map<Dst_t, Src_t, una::detail::impl_x_case_map_utf8, una::detail::impl_case_map_loc_utf8> (
		dBuf, source, una::detail::impl_case_map_mode_uppercase );
}

template<bool UPPER>
class ExprCaseComplex_c : public ExprCaseBase_c
{
public:
explicit ExprCaseComplex_c ( ISphExpr * pArg )
: ExprCaseBase_c ( pArg, "ExprCaseComplex_c" )
{
}

int StringEval ( const CSphMatch & tMatch, const BYTE ** ppStr ) const final
{
*ppStr = nullptr;

const char * pSrcDoc = nullptr;
int iSrcDocLen = m_pArg->StringEval ( tMatch, (const BYTE **)&pSrcDoc );

std::vector<char> & dBuf = const_cast<std::vector<char> &> ( m_dBuf );
dBuf.resize ( 0 );
if ( UPPER )
UTF8ToUpper ( dBuf, std::basic_string_view<char> ( pSrcDoc, iSrcDocLen ) );
else
UTF8ToLower ( dBuf, std::basic_string_view<char> ( pSrcDoc, iSrcDocLen ) );

int iDstDocLen = dBuf.size();
CSphFixedVector<BYTE> dDst ( iDstDocLen );
memcpy ( dDst.Begin(), dBuf.data(), iDstDocLen );
*ppStr = dDst.LeakData();

// return the resultant string
return iDstDocLen;
}

ISphExpr * Clone () const final
{
return new ExprCaseComplex_c ( *this );
}

private:

ExprCaseComplex_c ( const ExprCaseComplex_c & rhs )
: ExprCaseBase_c ( rhs )
{}

std::vector<char> m_dBuf;
};

class Expr_Iterator_c : public Expr_JsonField_c
{
public:
Expand Down Expand Up @@ -6869,9 +6944,19 @@ ISphExpr * ExprParser_t::CreateFuncExpr ( int iNode, VecRefPtrs_t<ISphExpr*> & d
return new Expr_SubstringIndex_c ( dArgs[0], dArgs[1], dArgs[2] );

case FUNC_UPPER:
return new Expr_Case_c<true> ( dArgs[0] );
{
if ( IsGlobalLocaleSet() && GlobalLocale()==std::locale::classic() )
return new ExprCaseTrival_c<true> ( dArgs[0] );
else
return new ExprCaseComplex_c<true> ( dArgs[0] );
}
case FUNC_LOWER:
return new Expr_Case_c<false> ( dArgs[0] );
{
if ( IsGlobalLocaleSet() && GlobalLocale()==std::locale::classic() )
return new ExprCaseTrival_c<false> ( dArgs[0] );
else
return new ExprCaseComplex_c<false> ( dArgs[0] );
}

case FUNC_LAST_INSERT_ID: return new Expr_LastInsertID_c();
case FUNC_CURRENT_USER:
Expand Down

0 comments on commit 0bf17d9

Please sign in to comment.