Skip to content

Commit

Permalink
Cache iconv context per-thread
Browse files Browse the repository at this point in the history
  • Loading branch information
sfan5 committed Mar 30, 2024
1 parent 008d6be commit 5df60d8
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 10 deletions.
7 changes: 7 additions & 0 deletions src/unittest/test_utilities.cpp
Expand Up @@ -318,6 +318,7 @@ void TestUtilities::testUTF8()
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("")), "");
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")),
"the shovel dug a crumbly node!");

UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide(u8"-ä-")),
u8"-ä-");
UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide(u8"-\U0002000b-")),
Expand All @@ -326,6 +327,12 @@ void TestUtilities::testUTF8()
const auto *literal = U"-\U0002000b-";
UASSERT(utf8_to_wide(u8"-\U0002000b-") == reinterpret_cast<const wchar_t*>(literal));
}

// Check that the conversion function does not carry its internal state
// over from one invocation to the next: "\xC4\x81" is the UTF-8 encoding of
// U+0101, so feeding the two bytes in separate calls must not produce it.
utf8_to_wide("\xC4");
UASSERT(utf8_to_wide("\x81") != L"\u0101");
}

void TestUtilities::testRemoveEscapes()
Expand Down
49 changes: 39 additions & 10 deletions src/util/string.cpp
Expand Up @@ -41,28 +41,49 @@ with this program; if not, write to the Free Software Foundation, Inc.,

#ifndef _WIN32

static bool convert(const char *to, const char *from, char *outbuf,
size_t *outbuf_size, char *inbuf, size_t inbuf_size)
namespace {
/**
 * Owning wrapper around an iconv conversion descriptor.
 *
 * Holds either a descriptor handed over via reset() or the sentinel value
 * (iconv_t) -1, which is also what iconv_open() reports on failure. A held
 * descriptor is closed with iconv_close() when the wrapper is destroyed or
 * when another descriptor replaces it.
 */
class IconvSmartPointer {
	iconv_t m_cd;
	static const iconv_t null_value;
public:
	IconvSmartPointer() : m_cd(null_value) {}
	~IconvSmartPointer() { reset(); }

	// Copying is forbidden: two owners would both close the same descriptor.
	IconvSmartPointer(const IconvSmartPointer &) = delete;
	IconvSmartPointer &operator=(const IconvSmartPointer &) = delete;

	// Moving hands the descriptor over and empties the source. This is spelled
	// out by hand because a defaulted move would merely copy the raw handle,
	// leaving both objects to call iconv_close() on it.
	IconvSmartPointer(IconvSmartPointer &&other) noexcept : m_cd(other.m_cd)
	{
		other.m_cd = null_value;
	}
	IconvSmartPointer &operator=(IconvSmartPointer &&other) noexcept
	{
		if (this != &other) {
			// closes whatever we currently hold before taking over
			reset(other.m_cd);
			other.m_cd = null_value;
		}
		return *this;
	}

	/// @return the held descriptor, or (iconv_t) -1 if none is held
	iconv_t get() const { return m_cd; }

	/// @return true if a descriptor is currently held
	explicit operator bool() const { return m_cd != null_value; }

	/// Closes the held descriptor (if any) and takes ownership of `cd`.
	void reset(iconv_t cd = null_value) {
		if (m_cd != null_value)
			iconv_close(m_cd);
		m_cd = cd;
	}
};

// note that this can't be constexpr if iconv_t is a pointer
const iconv_t IconvSmartPointer::null_value = (iconv_t) -1;
}

/**
 * Converts the whole of `inbuf` through the conversion descriptor `cd`.
 *
 * The descriptor is only borrowed: it is reset to its initial state before
 * converting and is never closed here, so the caller can keep reusing it.
 *
 * @param cd          conversion descriptor obtained from iconv_open()
 * @param outbuf      destination buffer
 * @param outbuf_size in: capacity of `outbuf` in bytes;
 *                    out (only when true is returned): number of bytes written
 * @param inbuf       input bytes
 * @param inbuf_size  number of input bytes
 * @return true if all input was consumed; false if `cd` is the invalid
 *         descriptor or if an iconv() call consumed no input
 */
static bool convert(iconv_t cd, char *outbuf, size_t *outbuf_size,
		char *inbuf, size_t inbuf_size)
{
	// iconv_open() failed in the caller; there is nothing to convert with
	if (cd == (iconv_t) -1)
		return false;

	// reset conversion state
	iconv(cd, nullptr, nullptr, nullptr, nullptr);

	char *inbuf_ptr = inbuf;
	char *outbuf_ptr = outbuf;

	const size_t old_outbuf_size = *outbuf_size;
	size_t old_size = inbuf_size;
	while (inbuf_size > 0) {
		iconv(cd, &inbuf_ptr, &inbuf_size, &outbuf_ptr, outbuf_size);
		// no input consumed means iconv is stuck: give up instead of spinning
		if (inbuf_size == old_size)
			return false;
		old_size = inbuf_size;
	}

	// iconv counted the remaining space down; report the bytes produced
	*outbuf_size = old_outbuf_size - *outbuf_size;
	return true;
}
Expand All @@ -80,6 +101,10 @@ constexpr auto DEFAULT_ENCODING = ([] () -> const char* {

std::wstring utf8_to_wide(std::string_view input)
{
thread_local IconvSmartPointer cd;
if (!cd)
cd.reset(iconv_open(DEFAULT_ENCODING, "UTF-8"));

const size_t inbuf_size = input.length();
// maximum possible size, every character is sizeof(wchar_t) bytes
size_t outbuf_size = input.length() * sizeof(wchar_t);
Expand All @@ -90,7 +115,7 @@ std::wstring utf8_to_wide(std::string_view input)
out.resize(outbuf_size / sizeof(wchar_t));

char *outbuf = reinterpret_cast<char*>(&out[0]);
if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, &outbuf_size, inbuf, inbuf_size)) {
if (!convert(cd.get(), outbuf, &outbuf_size, inbuf, inbuf_size)) {
infostream << "Couldn't convert UTF-8 string 0x" << hex_encode(input)
<< " into wstring" << std::endl;
delete[] inbuf;
Expand All @@ -104,6 +129,10 @@ std::wstring utf8_to_wide(std::string_view input)

std::string wide_to_utf8(std::wstring_view input)
{
thread_local IconvSmartPointer cd;
if (!cd)
cd.reset(iconv_open("UTF-8", DEFAULT_ENCODING));

const size_t inbuf_size = input.length() * sizeof(wchar_t);
// maximum possible size: utf-8 encodes codepoints using 1 up to 4 bytes
size_t outbuf_size = input.length() * 4;
Expand All @@ -113,7 +142,7 @@ std::string wide_to_utf8(std::wstring_view input)
std::string out;
out.resize(outbuf_size);

if (!convert("UTF-8", DEFAULT_ENCODING, &out[0], &outbuf_size, inbuf, inbuf_size)) {
if (!convert(cd.get(), &out[0], &outbuf_size, inbuf, inbuf_size)) {
infostream << "Couldn't convert wstring 0x" << hex_encode(inbuf, inbuf_size)
<< " into UTF-8 string" << std::endl;
delete[] inbuf;
Expand Down

0 comments on commit 5df60d8

Please sign in to comment.