diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 381359cec6f1d..3cb1c483cea9e 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1261,6 +1261,8 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.sys.socket.recvmsg
 
     # wchar.h entrypoints
+    libc.src.wchar.mblen
+    libc.src.wchar.mbrlen
     libc.src.wchar.mbrtowc
     libc.src.wchar.mbtowc
     libc.src.wchar.wcrtomb
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 123d3440aeec3..4adf596abe650 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -53,6 +53,21 @@ functions:
       - type: wchar_t *__restrict
       - type: const char *__restrict
       - type: size_t
+  - name: mblen
+    standards:
+      - stdc
+    return_type: int
+    arguments:
+      - type: const char *
+      - type: size_t
+  - name: mbrlen
+    standards:
+      - stdc
+    return_type: size_t
+    arguments:
+      - type: const char *__restrict
+      - type: size_t
+      - type: mbstate_t *__restrict
   - name: wmemset
     standards:
       - stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 159778df6acca..2b95d94e4230a 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -169,6 +169,37 @@ add_entrypoint_object(
     libc.src.__support.wchar.mbstate
 )
 
+add_entrypoint_object(
+  mblen
+  SRCS
+    mblen.cpp
+  HDRS
+    mblen.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbrtowc
+    libc.src.__support.wchar.mbstate
+)
+
+add_entrypoint_object(
+  mbrlen
+  SRCS
+    mbrlen.cpp
+  HDRS
+    mbrlen.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.mbstate_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.wchar.mbrtowc
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+)
+
 add_entrypoint_object(
   wmemset
   SRCS
diff --git a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp
new file mode 100644
index 0000000000000..2d15b3e0e5648
--- /dev/null
+++ b/libc/src/wchar/mblen.cpp
@@ -0,0 +1,44 @@
+//===-- Implementation of mblen -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mblen.h"
+
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Returns the byte length of the multibyte character at `s`, examining at
+// most `n` bytes. Returns -1 if the bytes are invalid (errno is set to EILSEQ)
+// or if `n` bytes do not complete a character (errno is left untouched).
+LLVM_LIBC_FUNCTION(int, mblen, (const char *s, size_t n)) {
+  // A null `s` asks whether the encoding is state-dependent; UTF-8 is not.
+  if (s == nullptr)
+    return 0;
+
+  // mblen keeps no state between calls, so every call starts from a fresh,
+  // value-initialized conversion state.
+  internal::mbstate internal_mbstate{};
+  auto ret = internal::mbrtowc(nullptr, s, n, &internal_mbstate);
+  if (!ret.has_value()) {
+    // Encoding failure
+    libc_errno = EILSEQ;
+    return -1;
+  }
+  // (size_t)-2 reports that the bytes seen so far are only part of a
+  // character. mblen cannot resume later, so this is a failure without errno.
+  if (ret.value() == static_cast<size_t>(-2))
+    return -1;
+  return static_cast<int>(ret.value());
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mblen.h b/libc/src/wchar/mblen.h
new file mode 100644
index 0000000000000..a315a2f12f6a1
--- /dev/null
+++ b/libc/src/wchar/mblen.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for mblen -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBLEN_H
+#define LLVM_LIBC_SRC_WCHAR_MBLEN_H
+
+#include "hdr/types/size_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int mblen(const char *s, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBLEN_H
diff --git a/libc/src/wchar/mbrlen.cpp b/libc/src/wchar/mbrlen.cpp
new file mode 100644
index 0000000000000..8de78e099566b
--- /dev/null
+++ b/libc/src/wchar/mbrlen.cpp
@@ -0,0 +1,42 @@
+//===-- Implementation of mbrlen ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbrlen.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Forwards to internal::mbrtowc without storing a wide character and returns
+// its result. `ps` holds the conversion state; a null `ps` selects a state
+// private to mbrlen. On error, errno is set to the reported code and
+// (size_t)-1 is returned.
+LLVM_LIBC_FUNCTION(size_t, mbrlen,
+                   (const char *__restrict s, size_t n,
+                    mbstate_t *__restrict ps)) {
+  // Conversion state shared by every call that passes a null `ps`.
+  static internal::mbstate internal_mbstate;
+  internal::mbstate *state =
+      ps == nullptr ? &internal_mbstate
+                    : reinterpret_cast<internal::mbstate *>(ps);
+  auto ret = internal::mbrtowc(nullptr, s, n, state);
+  if (!ret.has_value()) {
+    // Encoding failure
+    libc_errno = ret.error();
+    return static_cast<size_t>(-1);
+  }
+  return ret.value();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbrlen.h b/libc/src/wchar/mbrlen.h
new file mode 100644
index 0000000000000..08b59cfc8651c
--- /dev/null
+++ b/libc/src/wchar/mbrlen.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for mbrlen ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBRLEN_H
+#define LLVM_LIBC_SRC_WCHAR_MBRLEN_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBRLEN_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 176cf7c3487cd..baa52b74c3d97 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -64,6 +64,33 @@ add_libc_test(
     libc.test.UnitTest.ErrnoCheckingTest
 )
 
+add_libc_test(
+  mblen_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mblen_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno
+    libc.src.wchar.mblen
+    libc.test.UnitTest.ErrnoCheckingTest
+)
+
+add_libc_test(
+  mbrlen_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mbrlen_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+    libc.src.string.memset
+    libc.src.wchar.mbrlen
+    libc.hdr.types.mbstate_t
+    libc.test.UnitTest.ErrnoCheckingTest
+)
+
 add_libc_test(
   wctob_test
   SUITE
diff --git a/libc/test/src/wchar/mblen_test.cpp b/libc/test/src/wchar/mblen_test.cpp
new file mode 100644
index 0000000000000..efd4df7020741
--- /dev/null
+++ b/libc/test/src/wchar/mblen_test.cpp
@@ -0,0 +1,104 @@
+//===-- Unittests for mblen -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/libc_errno.h"
+#include "src/wchar/mblen.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcMBLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcMBLenTest, OneByte) {
+  const char *ch = "A";
+  int n = LIBC_NAMESPACE::mblen(ch, 1);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(n, 1);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mblen(ch, 0);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(n, -1);
+}
+
+TEST_F(LlvmLibcMBLenTest, TwoByte) {
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; // U+008E, two-byte sequence
+  int n = LIBC_NAMESPACE::mblen(ch, 4);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(n, 2);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mblen(ch, 1);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_SUCCESS();
+  // Should fail after trying to read next byte too
+  n = LIBC_NAMESPACE::mblen(ch + 1, 1);
+  ASSERT_EQ(n, -1);
+  // This one should be an invalid starting byte so should set errno
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBLenTest, ThreeByte) {
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ (U+2211)
+  int n = LIBC_NAMESPACE::mblen(ch, 3);
+  ASSERT_EQ(n, 3);
+  ASSERT_ERRNO_SUCCESS();
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mblen(ch, 2);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBLenTest, FourByte) {
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+  int n = LIBC_NAMESPACE::mblen(ch, 4);
+  ASSERT_EQ(n, 4);
+  ASSERT_ERRNO_SUCCESS();
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mblen(ch, 2);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBLenTest, InvalidByte) {
+  const char ch[1] = {static_cast<char>(0x80)};
+  int n = LIBC_NAMESPACE::mblen(ch, 1);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBLenTest, InvalidMultiByte) {
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  // Trying to push all 4 should error
+  int n = LIBC_NAMESPACE::mblen(ch, 4);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+
+  // Starting at the second byte (NUL) should correspond to the null wc
+  n = LIBC_NAMESPACE::mblen(ch + 1, 2);
+  ASSERT_EQ(n, 0);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBLenTest, NullString) {
+  // reading on nullptr should return 0
+  int n = LIBC_NAMESPACE::mblen(nullptr, 2);
+  ASSERT_EQ(n, 0);
+  ASSERT_ERRNO_SUCCESS();
+  // reading a null terminator should return 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mblen(ch, 1);
+  ASSERT_EQ(n, 0);
+}
diff --git a/libc/test/src/wchar/mbrlen_test.cpp b/libc/test/src/wchar/mbrlen_test.cpp
new file mode 100644
index 0000000000000..e1452bf416054
--- /dev/null
+++ b/libc/test/src/wchar/mbrlen_test.cpp
@@ -0,0 +1,139 @@
+//===-- Unittests for mbrlen ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/string/memset.h"
+#include "src/wchar/mbrlen.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcMBRLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcMBRLenTest, OneByte) {
+  const char *ch = "A";
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(n, static_cast<size_t>(1));
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrlen(ch, 0, &mb);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(n, static_cast<size_t>(-2));
+}
+
+TEST_F(LlvmLibcMBRLenTest, TwoByte) {
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; // U+008E, two-byte sequence
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, nullptr);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_EQ(static_cast<int>(n), 2);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  ASSERT_ERRNO_SUCCESS();
+  // Should pass after trying to read next byte
+  n = LIBC_NAMESPACE::mbrlen(ch + 1, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), 1);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, ThreeByte) {
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ (U+2211)
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 3, &mb);
+  ASSERT_EQ(static_cast<int>(n), 3);
+  ASSERT_ERRNO_SUCCESS();
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, FourByte) {
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(n), 4);
+  ASSERT_ERRNO_SUCCESS();
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  ASSERT_ERRNO_SUCCESS();
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrlen(ch + 2, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  ASSERT_ERRNO_SUCCESS();
+
+  // Should pass after reading final byte
+  n = LIBC_NAMESPACE::mbrlen(ch + 3, 5, &mb);
+  ASSERT_EQ(static_cast<int>(n), 1);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, InvalidByte) {
+  const char ch[1] = {static_cast<char>(0x80)};
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBRLenTest, InvalidMultiByte) {
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+
+  // Starting at the second byte (NUL) should correspond to the null wc
+  n = LIBC_NAMESPACE::mbrlen(ch + 1, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  ASSERT_ERRNO_SUCCESS();
+}
+
+TEST_F(LlvmLibcMBRLenTest, NullString) {
+  // reading on nullptr should return 0
+  size_t n = LIBC_NAMESPACE::mbrlen(nullptr, 2, nullptr);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  ASSERT_ERRNO_SUCCESS();
+  // reading a null terminator should return 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr);
+  ASSERT_EQ(static_cast<int>(n), 0);
+}
+
+TEST_F(LlvmLibcMBRLenTest, InvalidMBState) {
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  mbstate_t *mb;
+  LIBC_NAMESPACE::internal::mbstate inv;
+  inv.total_bytes = 6;
+  mb = reinterpret_cast<mbstate_t *>(&inv);
+  // invalid mbstate should error
+  size_t n = LIBC_NAMESPACE::mbrlen(ch, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_ERRNO_EQ(EINVAL);
+}