[libc++] Fix UTF-8 decoding in codecvts #68442

dimztimz · 2023-10-06T20:01:35Z

This patch fixes one case where the decoding member function in() was returning partial instead of error. Additionally, it adds large testsuite that tests all codecvt facets that were added in C++11 and in C++20. The testsuite covers this bug.

Fixes #60177.

llvmbot · 2023-10-06T20:02:39Z

@llvm/pr-subscribers-libcxx

Changes

This patch fixes one case where the decoding member function in() was returning partial instead of error. Additionally, it adds large testsuite that tests all codecvt facets that were added in C++11 and in C++20. The testsuite covers this bug.

Patch is 88.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68442.diff

2 Files Affected:

(modified) libcxx/src/locale.cpp (+49-23)
(added) libcxx/test/std/localization/codecvt_unicode.pass.cpp (+2130)

diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 317b4dec7d241e5..18bcf02d2a12f5a 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -1985,10 +1985,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 3)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
             switch (c1)
             {
             case 0xE0:
@@ -2004,6 +2003,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
                     return codecvt_base::error;
                  break;
             }
+            if (frm_end-frm_nxt < 3)
+                return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
             uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
@@ -2016,11 +2018,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF5)
         {
-            if (frm_end-frm_nxt < 4)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
-            uint8_t c4 = frm_nxt[3];
             switch (c1)
             {
             case 0xF0:
@@ -2036,8 +2036,16 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
                     return codecvt_base::error;
                  break;
             }
-            if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
-                return codecvt_base::error;
+            if (frm_end-frm_nxt < 3)
+                 return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
+            if ((c3 & 0xC0) != 0x80)
+                 return codecvt_base::error;
+            if (frm_end-frm_nxt < 4)
+                 return codecvt_base::partial;
+            uint8_t c4 = frm_nxt[3];
+            if ((c4 & 0xC0) != 0x80)
+                 return codecvt_base::error;
             if (to_end-to_nxt < 2)
                 return codecvt_base::partial;
             if ((((c1 & 7UL) << 18) +
@@ -2106,10 +2114,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 3)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
             switch (c1)
             {
             case 0xE0:
@@ -2125,6 +2132,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
                     return codecvt_base::error;
                  break;
             }
+            if (frm_end-frm_nxt < 3)
+                 return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
             uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
@@ -2137,11 +2147,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF5)
         {
-            if (frm_end-frm_nxt < 4)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
-            uint8_t c4 = frm_nxt[3];
             switch (c1)
             {
             case 0xF0:
@@ -2157,8 +2165,16 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
                     return codecvt_base::error;
                  break;
             }
-            if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
-                return codecvt_base::error;
+            if (frm_end-frm_nxt < 3)
+                 return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
+            if ((c3 & 0xC0) != 0x80)
+                 return codecvt_base::error;
+            if (frm_end-frm_nxt < 4)
+                 return codecvt_base::partial;
+            uint8_t c4 = frm_nxt[3];
+            if ((c4 & 0xC0) != 0x80)
+                 return codecvt_base::error;
             if (to_end-to_nxt < 2)
                 return codecvt_base::partial;
             if ((((c1 & 7UL) << 18) +
@@ -2384,10 +2400,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 3)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
             switch (c1)
             {
             case 0xE0:
@@ -2403,6 +2418,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
                     return codecvt_base::error;
                  break;
             }
+            if (frm_end-frm_nxt < 3)
+                 return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
             uint32_t t = static_cast<uint32_t>(((c1 & 0x0F) << 12)
@@ -2415,11 +2433,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
         }
         else if (c1 < 0xF5)
         {
-            if (frm_end-frm_nxt < 4)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
-            uint8_t c4 = frm_nxt[3];
             switch (c1)
             {
             case 0xF0:
@@ -2435,8 +2451,16 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
                     return codecvt_base::error;
                  break;
             }
-            if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
-                return codecvt_base::error;
+            if (frm_end-frm_nxt < 3)
+                 return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
+            if ((c3 & 0xC0) != 0x80)
+                 return codecvt_base::error;
+            if (frm_end-frm_nxt < 4)
+                 return codecvt_base::partial;
+            uint8_t c4 = frm_nxt[3];
+            if ((c4 & 0xC0) != 0x80)
+                 return codecvt_base::error;
             uint32_t t = static_cast<uint32_t>(((c1 & 0x07) << 18)
                                              | ((c2 & 0x3F) << 12)
                                              | ((c3 & 0x3F) << 6)
@@ -2642,10 +2666,9 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 3)
+            if (frm_end-frm_nxt < 2)
                 return codecvt_base::partial;
             uint8_t c2 = frm_nxt[1];
-            uint8_t c3 = frm_nxt[2];
             switch (c1)
             {
             case 0xE0:
@@ -2661,6 +2684,9 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
                     return codecvt_base::error;
                  break;
             }
+            if (frm_end-frm_nxt < 3)
+                 return codecvt_base::partial;
+            uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
             uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
new file mode 100644
index 000000000000000..6abe3d7ba69da6a
--- /dev/null
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -0,0 +1,2130 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}}
+
+#include <algorithm>
+#include <cassert>
+#include <codecvt>
+#include <locale>
+#include <string>
+
+#include "test_macros.h"
+
+struct test_offsets_ok {
+  size_t in_size;
+  size_t out_size;
+};
+struct test_offsets_partial {
+  size_t in_size;
+  size_t out_size;
+  size_t expected_in_next;
+  size_t expected_out_next;
+};
+
+template <class CharT>
+struct test_offsets_error {
+  size_t in_size;
+  size_t out_size;
+  size_t expected_in_next;
+  size_t expected_out_next;
+  CharT replace_char;
+  size_t replace_pos;
+};
+
+#define array_size(x) (sizeof(x) / sizeof(x)[0])
+
+using std::begin;
+using std::char_traits;
+using std::codecvt_base;
+using std::copy;
+using std::end;
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+  // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+  const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+  static_assert(array_size(input) == 11, "");
+  static_assert(array_size(expected) == 5, "");
+
+  ExternT in[array_size(input)];
+  InternT exp[array_size(expected)];
+  copy(begin(input), end(input), begin(in));
+  copy(begin(expected), end(expected), begin(exp));
+  assert(char_traits<ExternT>::length(in) == 10);
+  assert(char_traits<InternT>::length(exp) == 4);
+  test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
+  for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
+    test_offsets_ok t                = *it;
+    InternT out[array_size(exp) - 1] = {};
+    assert(t.in_size <= array_size(in));
+    assert(t.out_size <= array_size(out));
+    mbstate_t state          = {};
+    const ExternT* in_next   = nullptr;
+    InternT* out_next        = nullptr;
+    codecvt_base::result res = codecvt_base::ok;
+
+    res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
+    assert(res == cvt.ok);
+    assert(in_next == in + t.in_size);
+    assert(out_next == out + t.out_size);
+    assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
+    if (t.out_size < array_size(out))
+      assert(out[t.out_size] == 0);
+  }
+
+  for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
+    test_offsets_ok t            = *it;
+    InternT out[array_size(exp)] = {};
+    assert(t.in_size <= array_size(in));
+    assert(t.out_size <= array_size(out));
+    mbstate_t state          = {};
+    const ExternT* in_next   = nullptr;
+    InternT* out_next        = nullptr;
+    codecvt_base::result res = codecvt_base::ok;
+
+    res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
+    assert(res == cvt.ok);
+    assert(in_next == in + t.in_size);
+    assert(out_next == out + t.out_size);
+    assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
+    if (t.out_size < array_size(out))
+      assert(out[t.out_size] == 0);
+  }
+}
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+  // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+  const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+  static_assert(array_size(input) == 11, "");
+  static_assert(array_size(expected) == 5, "");
+
+  ExternT in[array_size(input)];
+  InternT exp[array_size(expected)];
+  copy(begin(input), end(input), begin(in));
+  copy(begin(expected), end(expected), begin(exp));
+  assert(char_traits<ExternT>::length(in) == 10);
+  assert(char_traits<InternT>::length(exp) == 4);
+
+  test_offsets_partial offsets[] = {
+      {1, 0, 0, 0}, // no space for first CP
+
+      {3, 1, 1, 1}, // no space for second CP
+      {2, 2, 1, 1}, // incomplete second CP
+      {2, 1, 1, 1}, // incomplete second CP, and no space for it
+
+      {6, 2, 3, 2}, // no space for third CP
+      {4, 3, 3, 2}, // incomplete third CP
+      {5, 3, 3, 2}, // incomplete third CP
+      {4, 2, 3, 2}, // incomplete third CP, and no space for it
+      {5, 2, 3, 2}, // incomplete third CP, and no space for it
+
+      {10, 3, 6, 3}, // no space for fourth CP
+      {7, 4, 6, 3},  // incomplete fourth CP
+      {8, 4, 6, 3},  // incomplete fourth CP
+      {9, 4, 6, 3},  // incomplete fourth CP
+      {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
+      {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
+      {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
+  };
+
+  for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
+    test_offsets_partial t           = *it;
+    InternT out[array_size(exp) - 1] = {};
+    assert(t.in_size <= array_size(in));
+    assert(t.out_size <= array_size(out));
+    assert(t.expected_in_next <= t.in_size);
+    assert(t.expected_out_next <= t.out_size);
+    mbstate_t state          = {};
+    const ExternT* in_next   = nullptr;
+    InternT* out_next        = nullptr;
+    codecvt_base::result res = codecvt_base::ok;
+
+    res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
+    assert(res == cvt.partial);
+    assert(in_next == in + t.expected_in_next);
+    assert(out_next == out + t.expected_out_next);
+    assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
+    if (t.expected_out_next < array_size(out))
+      assert(out[t.expected_out_next] == 0);
+  }
+}
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+  // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
+  const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
+  const char32_t expected[]   = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
+  static_assert(array_size(input) == 11, "");
+  static_assert(array_size(expected) == 5, "");
+
+  ExternT in[array_size(input)];
+  InternT exp[array_size(expected)];
+  copy(begin(input), end(input), begin(in));
+  copy(begin(expected), end(expected), begin(exp));
+  assert(char_traits<ExternT>::length(in) == 10);
+  assert(char_traits<InternT>::length(exp) == 4);
+
+  // There are 5 classes of errors in UTF-8 decoding
+  // 1. Missing leading byte
+  // 2. Missing trailing byte
+  // 3. Surrogate CP
+  // 4. Overlong sequence
+  // 5. CP out of Unicode range
+  test_offsets_error<unsigned char> offsets[] = {
+
+      // 1. Missing leading byte. We will replace the leading byte with
+      // non-leading byte, such as a byte that is always invalid or a trailing
+      // byte.
+
+      // replace leading byte with invalid byte
+      {1, 4, 0, 0, 0xFF, 0},
+      {3, 4, 1, 1, 0xFF, 1},
+      {6, 4, 3, 2, 0xFF, 3},
+      {10, 4, 6, 3, 0xFF, 6},
+
+      // replace leading byte with trailing byte
+      {1, 4, 0, 0, 0b10101010, 0},
+      {3, 4, 1, 1, 0b10101010, 1},
+      {6, 4, 3, 2, 0b10101010, 3},
+      {10, 4, 6, 3, 0b10101010, 6},
+
+      // 2. Missing trailing byte. We will replace the trailing byte with
+      // non-trailing byte, such as a byte that is always invalid or a leading
+      // byte (simple ASCII byte in our case).
+
+      // replace first trailing byte with ASCII byte
+      {3, 4, 1, 1, 'z', 2},
+      {6, 4, 3, 2, 'z', 4},
+      {10, 4, 6, 3, 'z', 7},
+
+      // replace first trailing byte with invalid byte
+      {3, 4, 1, 1, 0xFF, 2},
+      {6, 4, 3, 2, 0xFF, 4},
+      {10, 4, 6, 3, 0xFF, 7},
+
+      // replace second trailing byte with ASCII byte
+      {6, 4, 3, 2, 'z', 5},
+      {10, 4, 6, 3, 'z', 8},
+
+      // replace second trailing byte with invalid byte
+      {6, 4, 3, 2, 0xFF, 5},
+      {10, 4, 6, 3, 0xFF, 8},
+
+      // replace third trailing byte
+      {10, 4, 6, 3, 'z', 9},
+      {10, 4, 6, 3, 0xFF, 9},
+
+      // 2.1 The following test-cases raise doubt whether error or partial should
+      // be returned. For example, we have 4-byte sequence with valid leading
+      // byte. If we hide the last byte we need to return partial. But, if the
+      // second or third byte, which are visible to the call to codecvt, are
+      // malformed then error should be returned.
+
+      // replace first trailing byte with ASCII byte, also incomplete at end
+      {5, 4, 3, 2, 'z', 4},
+      {8, 4, 6, 3, 'z', 7},
+      {9, 4, 6, 3, 'z', 7},
+
+      // replace first trailing byte with invalid byte, also incomplete at end
+      {5, 4, 3, 2, 0xFF, 4},
+      {8, 4, 6, 3, 0xFF, 7},
+      {9, 4, 6, 3, 0xFF, 7},
+
+      // replace second trailing byte with ASCII byte, also incomplete at end
+      {9, 4, 6, 3, 'z', 8},
+
+      // replace second trailing byte with invalid byte, also incomplete at end
+      {9, 4, 6, 3, 0xFF, 8},
+
+      // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
+      // CP U+D700
+      {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
+      {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
+      {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
+      {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
+
+      // 4. Overlong sequence. The CPs in the input are chosen such as modifying
+      // just the leading byte is enough to make them overlong, i.e. for the
+      // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
+      // zeroes.
+      {3, 4, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
+      {3, 4, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
+      {6, 4, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
+      {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
+
+      // 5. CP above range
+      // turn U+10AAAA into U+14AAAA by changing its leading byte
+      {10, 4, 6, 3, 0b11110101, 6},
+      // turn U+10AAAA into U+11AAAA by changing its 2nd byte
+      {10, 4, 6, 3, 0b10011010, 7},
+  };
+  for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
+    test_offsets_error<unsigned char> t = *it;
+    InternT out[array_size(exp) - 1]    = {};
+    assert(t.in_size <= array_size(in));
+    assert(t.out_size <= array_size(out));
+    assert(t.expected_in_next <= t.in_size);
+    assert(t.expected_out_next <= t.out_size);
+    ExternT old_char  = in[t.replace_pos];
+    in[t.replace_pos] = t.replace_char;
+
+    mbstate_t state          = {};
+    const ExternT* in_next   = nullptr;
+    InternT* out_next        = nullptr;
+    codecvt_base::result res = codecvt_base::ok;
+
+    res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
+    assert(res == cvt.error);
+    assert(in_next == in + t.expected_in_next);
+    assert(out_next == out + t.expected_out_next);
+    assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
+    if (t.expected_out_next < array_size(out))
+      assert(out[t.expected_out_next] == 0);
+
+    in[t.replace_pos] = old_char;
+  }
+}
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+  utf8_to_utf32_in_ok(cvt);
+  utf8_to_utf32_in_partial(cvt);
+  utf8_to_utf32_in_error(cvt);
+}
+
+template <class InternT, class ExternT>
+void utf32_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+  // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
+  const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 5, "");
+  static_assert(array_size(expected) == 11, "");
+
+  InternT in[array_size(input)];
+  ExternT exp[array_size(expected)];
+  copy(begin(input), end(input), begin(in));
+  copy(begin(expected), end(expected), begin(exp));
+  assert(char_traits<InternT>::length(in) == 4);
+  assert(char_traits<ExternT>::length(exp) == 10);
+
+  test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
+  for (test_off...
[truncated]

ldionne · 2023-10-10T18:14:30Z

This used to be https://reviews.llvm.org/D143349.

@mordante can you take another quick look, I assume the feedback that had been given on Phab was applied here and this should be pretty much good to go.

dimztimz · 2023-10-10T19:42:01Z

This used to be https://reviews.llvm.org/D143349.

@mordante can you take another quick look, I assume the feedback that had been given on Phab was applied here and this should be pretty much good to go.

It is not exactly the same it has even more tests that were not there previously, most notable it tests codecvt_utf16, too. That was not tested previously.

mordante

Sorry the patch slipped under my radar. LGTM but one more change is required to make it work.
Please apply the change and rebase the patch to test the CI before landing.

libcxx/test/std/localization/codecvt_unicode.pass.cpp

This patch fixes one case where the decoding member function in() was returning partial instead of error. Additionally, it adds large testsuite that tests all codecvt facets that were added in C++11 and in C++20. The testsuite covers this bug. Fixes llvm#60177.

Fix apple back-deployment

Fix includes

Add tests for codecvt::length()

Add -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT

github-actions · 2023-11-24T20:33:28Z

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:

git-clang-format --diff 20f634f275b431ff256ba45cbcbb6dc5bd945fb3 0b6b3bea4a1b0d6640ace345fa444d0874e5806b -- libcxx/test/std/localization/codecvt_unicode.pass.cpp libcxx/src/locale.cpp

View the diff from clang-format here.

diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 15cc2d7df6..ee17cbb9c3 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -1979,26 +1979,25 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xE0:
-                if ((c2 & 0xE0) != 0xA0)
-                    return codecvt_base::error;
-                 break;
-            case 0xED:
-                if ((c2 & 0xE0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xE0:
+            if ((c2 & 0xE0) != 0xA0)
+              return codecvt_base::error;
+            break;
+          case 0xED:
+            if ((c2 & 0xE0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
@@ -2012,34 +2011,33 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF5)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xF0:
-                if (!(0x90 <= c2 && c2 <= 0xBF))
-                    return codecvt_base::error;
-                 break;
-            case 0xF4:
-                if ((c2 & 0xF0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xF0:
+            if (!(0x90 <= c2 && c2 <= 0xBF))
+              return codecvt_base::error;
+            break;
+          case 0xF4:
+            if ((c2 & 0xF0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                 return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
-                 return codecvt_base::error;
-            if (frm_end-frm_nxt < 4)
-                 return codecvt_base::partial;
+              return codecvt_base::error;
+            if (frm_end - frm_nxt < 4)
+              return codecvt_base::partial;
             uint8_t c4 = frm_nxt[3];
             if ((c4 & 0xC0) != 0x80)
-                 return codecvt_base::error;
+              return codecvt_base::error;
             if (to_end-to_nxt < 2)
                 return codecvt_base::partial;
             if ((((c1 & 7UL) << 18) +
@@ -2108,26 +2106,25 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xE0:
-                if ((c2 & 0xE0) != 0xA0)
-                    return codecvt_base::error;
-                 break;
-            case 0xED:
-                if ((c2 & 0xE0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xE0:
+            if ((c2 & 0xE0) != 0xA0)
+              return codecvt_base::error;
+            break;
+          case 0xED:
+            if ((c2 & 0xE0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                 return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
@@ -2141,34 +2138,33 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
         }
         else if (c1 < 0xF5)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xF0:
-                if (!(0x90 <= c2 && c2 <= 0xBF))
-                    return codecvt_base::error;
-                 break;
-            case 0xF4:
-                if ((c2 & 0xF0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xF0:
+            if (!(0x90 <= c2 && c2 <= 0xBF))
+              return codecvt_base::error;
+            break;
+          case 0xF4:
+            if ((c2 & 0xF0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                 return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
-                 return codecvt_base::error;
-            if (frm_end-frm_nxt < 4)
-                 return codecvt_base::partial;
+              return codecvt_base::error;
+            if (frm_end - frm_nxt < 4)
+              return codecvt_base::partial;
             uint8_t c4 = frm_nxt[3];
             if ((c4 & 0xC0) != 0x80)
-                 return codecvt_base::error;
+              return codecvt_base::error;
             if (to_end-to_nxt < 2)
                 return codecvt_base::partial;
             if ((((c1 & 7UL) << 18) +
@@ -2394,26 +2390,25 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xE0:
-                if ((c2 & 0xE0) != 0xA0)
-                    return codecvt_base::error;
-                 break;
-            case 0xED:
-                if ((c2 & 0xE0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xE0:
+            if ((c2 & 0xE0) != 0xA0)
+              return codecvt_base::error;
+            break;
+          case 0xED:
+            if ((c2 & 0xE0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                 return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;
@@ -2427,34 +2422,33 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
         }
         else if (c1 < 0xF5)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xF0:
-                if (!(0x90 <= c2 && c2 <= 0xBF))
-                    return codecvt_base::error;
-                 break;
-            case 0xF4:
-                if ((c2 & 0xF0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xF0:
+            if (!(0x90 <= c2 && c2 <= 0xBF))
+              return codecvt_base::error;
+            break;
+          case 0xF4:
+            if ((c2 & 0xF0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                 return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
-                 return codecvt_base::error;
-            if (frm_end-frm_nxt < 4)
-                 return codecvt_base::partial;
+              return codecvt_base::error;
+            if (frm_end - frm_nxt < 4)
+              return codecvt_base::partial;
             uint8_t c4 = frm_nxt[3];
             if ((c4 & 0xC0) != 0x80)
-                 return codecvt_base::error;
+              return codecvt_base::error;
             uint32_t t = static_cast<uint32_t>(((c1 & 0x07) << 18)
                                              | ((c2 & 0x3F) << 12)
                                              | ((c3 & 0x3F) << 6)
@@ -2660,26 +2654,25 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
         }
         else if (c1 < 0xF0)
         {
-            if (frm_end-frm_nxt < 2)
-                return codecvt_base::partial;
-            uint8_t c2 = frm_nxt[1];
-            switch (c1)
-            {
-            case 0xE0:
-                if ((c2 & 0xE0) != 0xA0)
-                    return codecvt_base::error;
-                 break;
-            case 0xED:
-                if ((c2 & 0xE0) != 0x80)
-                    return codecvt_base::error;
-                 break;
-            default:
-                if ((c2 & 0xC0) != 0x80)
-                    return codecvt_base::error;
-                 break;
+          if (frm_end - frm_nxt < 2)
+            return codecvt_base::partial;
+          uint8_t c2 = frm_nxt[1];
+          switch (c1) {
+          case 0xE0:
+            if ((c2 & 0xE0) != 0xA0)
+              return codecvt_base::error;
+            break;
+          case 0xED:
+            if ((c2 & 0xE0) != 0x80)
+              return codecvt_base::error;
+            break;
+          default:
+            if ((c2 & 0xC0) != 0x80)
+              return codecvt_base::error;
+            break;
             }
-            if (frm_end-frm_nxt < 3)
-                 return codecvt_base::partial;
+            if (frm_end - frm_nxt < 3)
+              return codecvt_base::partial;
             uint8_t c3 = frm_nxt[2];
             if ((c3 & 0xC0) != 0x80)
                 return codecvt_base::error;

dimztimz · 2023-11-27T10:13:00Z

CI is mostly successful. The few failures are not related to this patch.

mordante · 2023-11-27T17:31:43Z

CI is mostly successful. The few failures are not related to this patch.

Thanks! For the future in GitHub you should be able to merge the changes yourself.

dimztimz requested a review from a team as a code owner October 6, 2023 20:01

llvmbot added the libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi. label Oct 6, 2023

dimztimz force-pushed the codecvt branch from bbf09e2 to d14e1d6 Compare October 6, 2023 20:24

dimztimz changed the title ~~[libc++] Fix UTF-8 decoding in codecvts. Fix #60177.~~ [libc++] Fix UTF-8 decoding in codecvts Oct 6, 2023

dimztimz force-pushed the codecvt branch from d14e1d6 to 80143f5 Compare October 9, 2023 16:42

ldionne requested a review from mordante October 10, 2023 18:14

tahonermann self-requested a review October 10, 2023 18:25

ldionne assigned mordante and tahonermann Oct 20, 2023

philnik777 added the locale issues related to localization label Oct 27, 2023

mordante approved these changes Nov 24, 2023

View reviewed changes

libcxx/test/std/localization/codecvt_unicode.pass.cpp Outdated Show resolved Hide resolved

dimztimz added 5 commits November 24, 2023 21:29

fixup! [libc++] Fix UTF-8 decoding in codecvts

ce015bb

Fix apple back-deployment

fixup! [libc++] Fix UTF-8 decoding in codecvts

42edaa2

Fix includes

fixup! [libc++] Fix UTF-8 decoding in codecvts

e1c22f5

Add tests for codecvt::length()

fixup! [libc++] Fix UTF-8 decoding in codecvts

0b6b3be

Add -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT

dimztimz force-pushed the codecvt branch from fbd9a8b to 0b6b3be Compare November 24, 2023 20:30

mordante merged commit 390840f into llvm:main Nov 27, 2023
35 of 39 checks passed

dimztimz deleted the codecvt branch November 28, 2023 15:07

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[libc++] Fix UTF-8 decoding in codecvts #68442

[libc++] Fix UTF-8 decoding in codecvts #68442

dimztimz commented Oct 6, 2023 •

edited

llvmbot commented Oct 6, 2023

ldionne commented Oct 10, 2023

dimztimz commented Oct 10, 2023

mordante left a comment

github-actions bot commented Nov 24, 2023

dimztimz commented Nov 27, 2023

mordante commented Nov 27, 2023

[libc++] Fix UTF-8 decoding in codecvts #68442

[libc++] Fix UTF-8 decoding in codecvts #68442

Conversation

dimztimz commented Oct 6, 2023 • edited

llvmbot commented Oct 6, 2023

ldionne commented Oct 10, 2023

dimztimz commented Oct 10, 2023

mordante left a comment

Choose a reason for hiding this comment

github-actions bot commented Nov 24, 2023

dimztimz commented Nov 27, 2023

mordante commented Nov 27, 2023

dimztimz commented Oct 6, 2023 •

edited