Create a CharSetConverter class with both iconv and icu support #74516

abhina-sree · 2023-12-05T20:13:46Z

This patch adds a wrapper class called CharSetConverter for ConverterEBCDIC. This class is then extended to support the ICU library or iconv library. The ICU library currently takes priority over the iconv library.

Relevant RFCs:
https://discourse.llvm.org/t/rfc-adding-a-charset-converter-to-the-llvm-support-library/69795
https://discourse.llvm.org/t/rfc-enabling-fexec-charset-support-to-llvm-and-clang-reposting/71512

PR to enable fexec-charset that depends on this:
abhina-sree#1

llvmbot · 2023-12-05T20:14:16Z

@llvm/pr-subscribers-llvm-support

Author: Abhina Sree (abhina-sree)

Changes

This patch adds a wrapper class called CharSetConverter for ConverterEBCDIC. This class is then extended to support the ICU library or iconv library. The ICU library currently takes priority over the iconv library.

Patch is 32.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74516.diff

7 Files Affected:

(modified) llvm/cmake/config-ix.cmake (+16)
(modified) llvm/include/llvm/Config/config.h.cmake (+6)
(added) llvm/include/llvm/Support/CharSet.h (+160)
(modified) llvm/lib/Support/CMakeLists.txt (+17)
(added) llvm/lib/Support/CharSet.cpp (+370)
(modified) llvm/unittests/Support/CMakeLists.txt (+1)
(added) llvm/unittests/Support/CharSetTest.cpp (+279)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 7bb3e98333eff..b2505968e430d 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -257,6 +257,22 @@ else()
   set(LLVM_ENABLE_TERMINFO 0)
 endif()
 
+# Check for ICU (components uc and i18n); HAVE_ICU is 1 if found, else 0.
+find_package(ICU COMPONENTS uc i18n)
+if(ICU_FOUND)
+  set(HAVE_ICU 1)
+else()
+  set(HAVE_ICU 0)
+endif()
+
+# Check for iconv; HAVE_ICONV is 1 if found, else 0.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+else()
+  set(HAVE_ICONV 0)
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index fc1f9bf342f8d..74003e1b22494 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -281,6 +281,12 @@
 /* Have host's ___chkstk_ms */
 #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
 
+/* Define if the ICU library is available */
+#cmakedefine HAVE_ICU ${HAVE_ICU}
+
+/* Define if the iconv library is available */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Linker version detected at compile time. */
 #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
 
diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
index 0000000000000..856b3be65ff7e
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,160 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <string>
+#include <system_error>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+namespace details {
+/// Abstract interface implemented by each conversion backend.
+class CharSetConverterImplBase {
+public:
+  virtual ~CharSetConverterImplBase() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  ///
+  /// The following error codes can occur, among others:
+  ///   - std::errc::argument_list_too_long: The result requires more than
+  ///     std::numeric_limits<size_t>::max() bytes.
+  ///   - std::errc::illegal_byte_sequence: The input contains an invalid
+  ///     multibyte sequence.
+  ///   - std::errc::invalid_argument: The input contains an incomplete
+  ///     multibyte sequence.
+  ///
+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.
+  virtual std::error_code convert(StringRef Source,
+                                  SmallVectorImpl<char> &Result,
+                                  bool ShouldAutoFlush) const = 0;
+
+  /// Restore the conversion to the original state.
+  /// \return error code in case something went wrong
+  ///
+  /// If the original character set or the destination character set
+  /// are multi-byte character sets, set the shift state to the initial
+  /// state. Otherwise this is a no-op.
+  virtual std::error_code flush() const = 0;
+
+  /// As flush(), but bytes emitted while resetting are stored in \p Result.
+  virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
+};
+} // namespace details
+
+// Encoding identifiers; names inspired by https://wg21.link/p1885.
+namespace text_encoding {
+enum class id {
+  /// UTF-8 character set encoding.
+  UTF8,
+
+  /// IBM EBCDIC 1047 character set encoding.
+  IBM1047
+};
+} // end namespace text_encoding
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+/// Instances are move-only; a moved-from instance must not be used to convert.
+class CharSetConverter {
+  std::unique_ptr<details::CharSetConverterImplBase> Converter;
+
+  CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
+      : Converter(std::move(Converter)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom id of the source character encoding
+  /// \param[in] CSTo id of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(text_encoding::id CSFrom,
+                                 text_encoding::id CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CPFrom name of the source character encoding
+  /// \param[in] CPTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  CharSetConverter(CharSetConverter &&Other) noexcept = default;
+  CharSetConverter &operator=(CharSetConverter &&Other) noexcept = default;
+
+  ~CharSetConverter() = default;
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings.
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return Converter->convert(Source, Result, ShouldAutoFlush);
+  }
+
+  /// Converts a single character without appending a shift-back sequence.
+  /// \param[in] SingleChar character to convert
+  /// \return first converted byte, or \p SingleChar if conversion fails
+  char convert(char SingleChar) const {
+    SmallString<1> Result;
+    if (Converter->convert(StringRef(&SingleChar, 1), Result, false) ||
+        Result.empty())
+      return SingleChar;
+    return Result[0];
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
+  /// for multi-byte encodings iff true.
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush = true) const {
+    return convert(StringRef(Source), Result, ShouldAutoFlush);
+  }
+
+  /// Restores the conversion to its initial shift state.
+  std::error_code flush() const { return Converter->flush(); }
+
+  /// As flush(), passing \p Result to the backend for any emitted bytes.
+  std::error_code flush(SmallVectorImpl<char> &Result) const {
+    return Converter->flush(Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index b96d62c7a6224..b366b915df719 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -153,6 +153,7 @@ add_llvm_component_library(LLVMSupport
   CachePruning.cpp
   Caching.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
@@ -291,6 +292,22 @@ add_llvm_component_library(LLVMSupport
   Demangle
   )
 
+# Link against ICU when it was found; it takes priority over iconv.
+if(ICU_FOUND)
+  target_link_libraries(LLVMSupport
+  PRIVATE
+  ${ICU_LIBRARIES}
+  )
+else()
+  # Otherwise link the iconv library, but only if it is an external library.
+  if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+    target_link_libraries(LLVMSupport
+    PRIVATE
+    ${Iconv_LIBRARIES}
+    )
+  endif()
+endif()
+
 set(llvm_system_libs ${system_libs})
 
 # This block is only needed for llvm-config. When we deprecate llvm-config and
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
index 0000000000000..dbc2cb7c1839d
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,370 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertEBCDIC.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <limits>
+#include <system_error>
+
+#ifdef HAVE_ICU
+#include <unicode/ucnv.h>
+#elif defined(HAVE_ICONV)
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+// Normalize the charset name with the charset alias matching algorithm proposed
+// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching:
+// drop every non-alphanumeric character, lower-case the rest, and drop each
+// '0' that does not directly follow a kept digit. For example, "IBM-01047" and
+// "ibm1047" both normalize to "ibm1047". The output is appended to Normalized.
+//
+// This is a file-local helper, so it is given internal linkage.
+static void normalizeCharSetName(StringRef CSName,
+                                 SmallVectorImpl<char> &Normalized) {
+  bool PrevDigit = false;
+  for (char Ch : CSName) {
+    if (!isAlnum(Ch))
+      continue;
+    Ch = toLower(Ch);
+    // A zero is only significant when it follows a digit that was kept.
+    if (Ch == '0' && !PrevDigit)
+      continue;
+    PrevDigit = isDigit(Ch);
+    Normalized.push_back(Ch);
+  }
+}
+
+// Maps the charset name to an enum constant if possible. The comparison is
+// done on the normalized name, so differences in case and punctuation are
+// ignored. Returns std::nullopt for names that are not built in.
+//
+// This is a file-local helper, so it is given internal linkage.
+static std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
+  SmallString<16> Normalized;
+  normalizeCharSetName(CSName, Normalized);
+  const StringRef Name = Normalized.str();
+  if (Name == "utf8")
+    return text_encoding::id::UTF8;
+  if (Name == "ibm1047")
+    return text_encoding::id::IBM1047;
+  return std::nullopt;
+}
+
+namespace {
+// Direction of a table-driven conversion.
+enum ConversionType {
+  UTFToIBM1047, // UTF-8 input, IBM-1047 (EBCDIC) output.
+  IBM1047ToUTF, // IBM-1047 (EBCDIC) input, UTF-8 output.
+};
+
+// Support conversion between EBCDIC 1047 and UTF8. This class uses
+// built-in translation tables that allow for translation between the
+// aforementioned character sets. The use of tables for conversion is only
+// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
+// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase {
+  // Direction of the conversion performed by convert().
+  ConversionType ConvType;
+
+public:
+  CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
+
+  // Converts Source in the configured direction via ConverterEBCDIC.
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  // Both flush overloads do nothing and report success.
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+// Dispatches on the configured direction to the ConverterEBCDIC routines.
+// ShouldAutoFlush is not consulted by this implementation.
+std::error_code CharSetConverterTable::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  switch (ConvType) {
+  case IBM1047ToUTF:
+    // This direction reports no error, so success is returned unconditionally.
+    ConverterEBCDIC::convertToUTF8(Source, Result);
+    return std::error_code();
+  case UTFToIBM1047:
+    return ConverterEBCDIC::convertToEBCDIC(Source, Result);
+  }
+  llvm_unreachable("Invalid ConvType!");
+}
+
+// Nothing to reset for the table-driven conversion; always reports success.
+std::error_code CharSetConverterTable::flush() const { return {}; }
+
+// Nothing to emit for the table-driven conversion; Result is left untouched
+// and success is reported.
+std::error_code
+CharSetConverterTable::flush(SmallVectorImpl<char> &Result) const {
+  return {};
+}
+
+#ifdef HAVE_ICU
+// Converter backed by ICU. The object owns private clones of the ICU
+// converters it is constructed from and closes them on destruction. A clone
+// that could not be created is stored as nullptr.
+class CharSetConverterICU : public details::CharSetConverterImplBase {
+  UConverter *FromConvDesc = nullptr;
+  UConverter *ToConvDesc = nullptr;
+
+  // Returns a clone of Converter, or nullptr if ICU reports a failure. Each
+  // call uses a fresh status so one failed clone does not affect the next.
+  static UConverter *cloneOrNull(UConverter *Converter) {
+    UErrorCode EC = U_ZERO_ERROR;
+    UConverter *Clone = ucnv_safeClone(Converter, nullptr, nullptr, &EC);
+    return U_FAILURE(EC) ? nullptr : Clone;
+  }
+
+public:
+  // Stores a clone of Converter as the target converter; no source converter
+  // is set.
+  CharSetConverterICU(UConverter *Converter)
+      : ToConvDesc(cloneOrNull(Converter)) {}
+
+  // Stores clones of the source and target converters.
+  CharSetConverterICU(UConverter *FromConverter, UConverter *ToConverter)
+      : FromConvDesc(cloneOrNull(FromConverter)),
+        ToConvDesc(cloneOrNull(ToConverter)) {}
+
+  // The clones are owned exclusively, so copying is disabled.
+  CharSetConverterICU(const CharSetConverterICU &) = delete;
+  CharSetConverterICU &operator=(const CharSetConverterICU &) = delete;
+
+  // Releases the clones created by the constructors.
+  ~CharSetConverterICU() override {
+    if (FromConvDesc)
+      ucnv_close(FromConvDesc);
+    if (ToConvDesc)
+      ucnv_close(ToConvDesc);
+  }
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+// Converts Source with ICU, overwriting Result with the converted bytes.
+//
+// The whole input is converted by one ucnv_convertEx() call with reset and
+// flush enabled. If the output buffer is too small it is doubled and the
+// conversion is restarted from the beginning. ShouldAutoFlush is not consulted
+// because ICU is always asked to flush.
+//
+// Returns a default-constructed error code on success. On failure, Result is
+// trimmed to the bytes written before the error and the ICU status is mapped
+// to a std::errc value:
+//   - invalid or illegal character: illegal_byte_sequence
+//   - truncated character, illegal argument, missing converter:
+//     invalid_argument
+//   - output larger than the maximum buffer size: argument_list_too_long
+//   - allocation failure: not_enough_memory
+//   - anything else: io_error
+std::error_code CharSetConverterICU::convert(StringRef Source,
+                                             SmallVectorImpl<char> &Result,
+                                             bool ShouldAutoFlush) const {
+  // Nothing to convert; this also avoids handing null buffers to ICU.
+  if (Source.empty()) {
+    Result.clear();
+    return std::error_code();
+  }
+  // A converter that failed to clone (or was never set) cannot be used.
+  if (!FromConvDesc || !ToConvDesc) {
+    Result.clear();
+    return std::make_error_code(std::errc::invalid_argument);
+  }
+
+  constexpr size_t MaxCapacity = std::numeric_limits<size_t>::max();
+  // Start with at least one byte per input byte so that the doubling below
+  // always makes progress, even when Result starts with zero capacity.
+  size_t Capacity = std::max<size_t>(Result.capacity(), Source.size());
+
+  for (;;) {
+    // Setup the output. We directly write into the SmallVector.
+    Result.resize_for_overwrite(Capacity);
+    char *const OutBegin = Result.data();
+    char *Output = OutBegin;
+    const char *Input = Source.data();
+    UErrorCode EC = U_ZERO_ERROR;
+    ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, OutBegin + Capacity,
+                   &Input, Source.end(), /*pivotStart=*/nullptr,
+                   /*pivotSource=*/nullptr, /*pivotTarget=*/nullptr,
+                   /*pivotLimit=*/nullptr, /*reset=*/true, /*flush=*/true,
+                   &EC);
+
+    if (EC == U_BUFFER_OVERFLOW_ERROR && Capacity < MaxCapacity) {
+      // No space left in the output buffer. Double its size and redo the
+      // conversion; reset=true makes ICU start over from a clean state.
+      Capacity = (Capacity < MaxCapacity / 2) ? 2 * Capacity : MaxCapacity;
+      continue;
+    }
+
+    // Keep only the bytes that were actually written.
+    Result.resize(Output - OutBegin);
+    if (U_SUCCESS(EC))
+      return std::error_code();
+
+    // ICU reports failures through UErrorCode, not errno.
+    switch (EC) {
+    case U_BUFFER_OVERFLOW_ERROR:
+      return std::make_error_code(std::errc::argument_list_too_long);
+    case U_INVALID_CHAR_FOUND:
+    case U_ILLEGAL_CHAR_FOUND:
+      return std::make_error_code(std::errc::illegal_byte_sequence);
+    case U_TRUNCATED_CHAR_FOUND:
+    case U_ILLEGAL_ARGUMENT_ERROR:
+      return std::make_error_code(std::errc::invalid_argument);
+    case U_MEMORY_ALLOCATION_ERROR:
+      return std::make_error_code(std::errc::not_enough_memory);
+    default:
+      return std::make_error_code(std::errc::io_error);
+    }
+  }
+}
+
+// No-op that reports success; convert() already asks ICU to reset and flush.
+std::error_code CharSetConverterICU::flush() const { return {}; }
+
+// No-op that reports success; Result is left untouched.
+std::error_code
+CharSetConverterICU::flush(SmallVectorImpl<char> &Result) const {
+  return {};
+}
+
+#elif defined(HAVE_ICONV)
+// Converter backed by iconv. convert() and both flush() overloads operate on
+// the conversion descriptor supplied at construction.
+// NOTE(review): nothing visible in this class calls iconv_close() on ConvDesc;
+// confirm that the code creating the descriptor releases it.
+class CharSetConverterIconv : public details::CharSetConverterImplBase {
+  // iconv conversion descriptor used for every conversion and flush.
+  iconv_t ConvDesc;
+
+public:
+  CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {}
+
+  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
+                          bool ShouldAutoFlush) const override;
+  std::error_code flush() const override;
+  std::error_code flush(SmallVectorImpl<char> &Result) const override;
+};
+
+// Converts Source with iconv, overwriting Result with the converted bytes.
+//
+// The output is written directly into Result's storage, which is doubled
+// whenever iconv reports E2BIG. If ShouldAutoFlush is true, iconv is called
+// once more with a null input after the conversion so that it can emit a
+// shift-back sequence and return to its initial state.
+//
+// Returns a default-constructed error code on success. On failure, Result is
+// trimmed to the bytes written before the error, the descriptor is reset to
+// its initial state, and one of the following is returned:
+//   - the errno value reported by iconv, in the generic category;
+//   - std::errc::illegal_byte_sequence if iconv converted some characters in
+//     a nonreversible way.
+std::error_code CharSetConverterIconv::convert(StringRef Source,
+                                               SmallVectorImpl<char> &Result,
+                                               bool ShouldAutoFlush) const {
+  constexpr size_t MaxCapacity = std::numeric_limits<size_t>::max();
+  // Smallest size to grow to, so that doubling an empty buffer makes progress.
+  constexpr size_t MinGrownCapacity = 16;
+
+  // Setup the input. Use nullptr to reset iconv state if input length is zero.
+  size_t InputLength = Source.size();
+  char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
+  size_t OutputLength = Capacity;
+
+  // Common exit for unrecoverable errors: keep only the bytes written so far
+  // and put the descriptor back into its initial state for the next caller.
+  auto Fail = [&](std::error_code EC) {
+    Result.resize(Capacity - OutputLength);
+    (void)iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+    return EC;
+  };
+
+  // Handle a non-zero return value from iconv(). Returns a default-constructed
+  // error code if the buffer was enlarged and the call should be retried.
+  auto HandleError = [&](size_t Ret) -> std::error_code {
+    if (Ret != static_cast<size_t>(-1)) {
+      // A positive return value indicates that some characters were converted
+      // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+      // an error in this case makes sure that both conversion routines behave
+      // in the same way.
+      return Fail(std::make_error_code(std::errc::illegal_byte_sequence));
+    }
+    // Read errno before any other call can overwrite it.
+    const int Err = errno;
+    if (Err != E2BIG || Capacity == MaxCapacity)
+      return Fail(std::error_code(Err, std::generic_category()));
+    // No space left in output buffer. Double the size of the underlying
+    // memory in the SmallVectorImpl, adjust pointer and length and continue
+    // the conversion.
+    const size_t Used = Capacity - OutputLength;
+    Capacity = (Capacity < MaxCapacity / 2)
+                   ? std::max<size_t>(2 * Capacity, MinGrownCapacity)
+                   : MaxCapacity;
+    Result.resize_for_overwrite(Capacity);
+    Output = static_cast<char *>(Result.data()) + Used;
+    OutputLength = Capacity - Used;
+    return std::error_code();
+  };
+
+  size_t Ret;
+  // Convert the string.
+  while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength)))
+    if (auto EC = HandleError(Ret))
+      return EC;
+  if (ShouldAutoFlush) {
+    while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
+      if (auto EC = HandleError(Ret))
+        return EC;
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+
+// Calls iconv with null input and null output on the descriptor and reports
+// the errno value if that call fails.
+std::error_code CharSetConverterIconv::flush() const {
+  const size_t Status = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
+  return Status == static_cast<size_t>(-1)
+             ? std::error_code(errno, std::generic_category())
+             : std::error_code();
+}
+
+// Calls iconv with a null input so that it can emit any pending shift
+// sequence, writing the emitted bytes into Result from its beginning.
+//
+// Returns a default-constructed error code on success, with Result resized to
+// the number of bytes written. On failure, Result is likewise trimmed to the
+// bytes written so far and one of the following is returned:
+//   - the errno value reported by iconv, in the generic category;
+//   - std::errc::illegal_byte_sequence if iconv returned a positive count.
+std::error_code
+CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
+  constexpr size_t MaxCapacity = std::numeric_limits<size_t>::max();
+
+  // Setup the output. We directly write into the SmallVector.
+  size_t Capacity = Result.capacity();
+  Result.resize_for_overwrite(Capacity);
+  char *Output = Result.data();
+  size_t OutputLength = Capacity;
+
+  for (;;) {
+    const size_t Ret =
+        iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
+    if (Ret == 0)
+      break;
+
+    // Read errno before any other call can overwrite it; it is only
+    // meaningful when iconv returned -1.
+    const int Err = errno;
+    const bool Failed = Ret == static_cast<size_t>(-1);
+    const size_t Used = Capacity - OutputLength;
+
+    if (Failed && Err == E2BIG && Capacity < MaxCapacity) {
+      // No space left in output buffer. Increase the size of the underlying
+      // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
+      // and continue the conversion.
+      Capacity = (Capacity < MaxCapacity - 2) ? 2 + Capacity : MaxCapacity;
+      Result.resize_for_overwrite(Capacity);
+      Output = static_cast<char *>(Result.data()) + Used;
+      OutputLength = Capacity - Used;
+      continue;
+    }
+
+    // Unrecoverable: do not leave uninitialized bytes in Result.
+    Result.resize(Used);
+    if (Failed)
+      return std::error_code(Err, std::generic_category());
+    // A positive return value indicates that some characters were converted
+    // in a nonreversible way, that is, replaced with a SUB symbol. Returning
+    // an error in this case makes sure that both conversion routines behave
+    // in the same way.
+    return std::make_error_code(std::errc::illegal_byte_sequence);
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
...
[truncated]

github-actions · 2023-12-05T20:17:20Z

✅ With the latest revision this PR passed the C/C++ code formatter.

AaronBallman · 2023-12-08T16:53:07Z

Adding some more reviewers to cover cmake changes, packaging questions, and folks who expressed opinions on the RFC.

dwblaikie · 2023-12-08T18:32:02Z

Adding some more reviewers to cover cmake changes, packaging questions, and folks who expressed opinions on the RFC.

A link back to the RFC for those playing along at home: https://discourse.llvm.org/t/rfc-adding-a-charset-converter-to-the-llvm-support-library/69795

abhina-sree · 2023-12-08T18:39:25Z

Adding some more reviewers to cover cmake changes, packaging questions, and folks who expressed opinions on the RFC.

A link back to the RFC for those playing along at home: https://discourse.llvm.org/t/rfc-adding-a-charset-converter-to-the-llvm-support-library/69795

Thank you, that is the link to the first RFC. A continuation of that discussion can also be found here on my RFC for implementing the fexec-charset option https://discourse.llvm.org/t/rfc-enabling-fexec-charset-support-to-llvm-and-clang-reposting/71512

cor3ntin

Thanks for working on this.
I do believe the patch implements the resolution of the RFC and generally looks good.

llvm/include/llvm/Support/CharSet.h

llvm/lib/Support/CharSet.cpp

cor3ntin · 2024-01-04T16:08:29Z

llvm/lib/Support/CharSet.cpp

+  if (U_FAILURE(EC)) {
+    return std::error_code(errno, std::generic_category());
+  }


I wonder whether that diagnostic is sufficient for higher level concerns.
But maybe it is, and we can instead have an additional interface that lists the available encodings.

I remember there was a comment on my old patch about checking whether we can actually create a converter when we use the fexec-charset option and emitting an error in the driver. Maybe that can be an option as well

cor3ntin · 2024-01-04T16:09:51Z

llvm/lib/Support/CharSet.cpp

+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {


We might be better off taking const std::string & here, given we need a null termination in the common case

I'm not sure if you saw my fexec-charset PR abhina-sree#1, but it's mostly used in LiteralSupport.cpp, where there is usually no null terminator when we are doing the translation.

llvm/cmake/config-ix.cmake

abhina-sree · 2024-02-22T20:19:17Z

ping :)

llvm/CMakeLists.txt

efriedma-quic · 2024-02-22T22:36:19Z

llvm/cmake/config-ix.cmake

@@ -257,6 +257,26 @@ else()
  set(LLVM_ENABLE_TERMINFO 0)
 endif()

+#Check for icu.
+if(LLVM_ENABLE_ICU)
+  find_package(ICU COMPONENTS uc i18n)


https://discourse.llvm.org/t/rfc-enabling-fexec-charset-support-to-llvm-and-clang-reposting/71512/30 distinguishes between statically and dynamically linking against ICU; which one does this do?

I think I will need more work for this, thanks for catching! In my local testing I only had the shared library but I think the current implementation will allow static linking.

I looked into implementing this by setting the following cmake variable CMAKE_FIND_LIBRARY_SUFFIXES so that it will only find libraries ending in .so for ICU. Please let me know if there is a better way to do this. Thanks!

I can't think of anything better.

This won't work on Windows.

abhina-sree · 2024-03-26T15:34:39Z

ping :)

llvm/cmake/config-ix.cmake

efriedma-quic · 2024-03-26T22:19:58Z

llvm/cmake/config-ix.cmake

@@ -257,6 +257,26 @@ else()
  set(LLVM_ENABLE_TERMINFO 0)
 endif()

+#Check for icu.
+if(LLVM_ENABLE_ICU)
+  find_package(ICU COMPONENTS uc i18n)


I can't think of anything better.

This won't work on Windows.

abhina-sree · 2024-04-08T13:34:39Z

ping :)

llvm/unittests/Support/CharSetTest.cpp

hubert-reinterpretcast

Partial review comments

llvm/include/llvm/Support/CharSet.h

llvm/cmake/config-ix.cmake

llvm/include/llvm/Config/config.h.cmake

llvm/include/llvm/Support/CharSet.h

llvm/lib/Support/CMakeLists.txt

llvm/lib/Support/CharSet.cpp

hubert-reinterpretcast

Further partial review comments. My suggested changes will require additional changes to code that use the modified interfaces.

llvm/cmake/config-ix.cmake

llvm/include/llvm/Support/CharSet.h

llvm/lib/Support/CharSet.cpp

hubert-reinterpretcast · 2024-05-04T06:46:18Z

llvm/include/llvm/Support/CharSet.h

+  /// In case of an error, the result string contains the successfully converted
+  /// part of the input string.


Just noting that this is probably insufficiently tested. The functionality may be useful for printing at least the first part of static_assert messages where conversion to the encoding used for diagnostic messages fails. As a later improvement, a StringRef to the unconverted portion of the input buffer would be helpful.

hubert-reinterpretcast

Remove redundant "housekeeping" variable Out.

llvm/lib/Support/CharSet.cpp

Co-authored-by: Eli Friedman <efriedma@quicinc.com>

Remove comment that looks like code (unique_ptr should be easy enough to understand). Co-authored-by: Hubert Tong <hubert-reinterpretcast@users.noreply.github.com>

abhina-sree · 2024-06-05T18:24:32Z

ping :)

abhina-sree self-assigned this Dec 5, 2023

llvmbot added cmake Build system in general and CMake in particular llvm:support labels Dec 5, 2023

abhina-sree mentioned this pull request Dec 5, 2023

Enable fexec-charset option abhina-sree/llvm-project#1

Open

cor3ntin requested review from cor3ntin, tahonermann and AaronBallman December 8, 2023 13:21

AaronBallman requested review from petrhosek, MaskRay, zmodem, hubert-reinterpretcast and efriedma-quic December 8, 2023 16:51

cor3ntin reviewed Jan 4, 2024

View reviewed changes

abhina-sree force-pushed the abhina/charset_converter branch 2 times, most recently from 2d735d9 to 6e0b77b Compare January 9, 2024 19:51

petrhosek reviewed Jan 10, 2024

View reviewed changes

llvm/cmake/config-ix.cmake Outdated Show resolved Hide resolved

abhina-sree force-pushed the abhina/charset_converter branch from cf6559f to 0ef0947 Compare January 31, 2024 19:06

efriedma-quic reviewed Feb 22, 2024

View reviewed changes

abhina-sree force-pushed the abhina/charset_converter branch from 0ef0947 to e2fc524 Compare February 23, 2024 18:35

abhina-sree force-pushed the abhina/charset_converter branch from e2fc524 to 9d0371a Compare March 26, 2024 15:25

efriedma-quic reviewed Mar 26, 2024

View reviewed changes

abhina-sree force-pushed the abhina/charset_converter branch from 9d0371a to 00bbc15 Compare April 4, 2024 19:19

abhina-sree force-pushed the abhina/charset_converter branch from 03f35d6 to 19f64df Compare April 29, 2024 13:52

hubert-reinterpretcast reviewed Apr 29, 2024

View reviewed changes

llvm/unittests/Support/CharSetTest.cpp Outdated Show resolved Hide resolved

hubert-reinterpretcast reviewed May 1, 2024

View reviewed changes

abhina-sree force-pushed the abhina/charset_converter branch 2 times, most recently from a2a7e4d to 4e8527f Compare May 1, 2024 13:27

hubert-reinterpretcast reviewed May 4, 2024

View reviewed changes

hubert-reinterpretcast reviewed May 5, 2024

View reviewed changes

llvm/lib/Support/CharSet.cpp Outdated Show resolved Hide resolved

llvm/lib/Support/CharSet.cpp Outdated Show resolved Hide resolved

llvm/lib/Support/CharSet.cpp Outdated Show resolved Hide resolved

abhina-sree force-pushed the abhina/charset_converter branch 3 times, most recently from efbd02d to 5152f91 Compare May 24, 2024 16:03

abhina-sree and others added 17 commits May 31, 2024 15:23

Create a CharSetConverter class with both iconv and icu support.

2470789

address review comments

5b24a38

add LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV option

342cc42

remove single char conversion function

f715ffb

handle FORCE_ON, look for shared libraries only for ICU

8c54d36

only allow builtin iconv support

7dcca67

Update llvm/cmake/config-ix.cmake

48fd893

Co-authored-by: Eli Friedman <efriedma@quicinc.com>

address comments

8fdf464

remove function to get shift back characters, address comments

ba0d374

remove other flush function as well

2e2535f

update comments

6288aae

reset iconv if failed, cause overflow in testcase

e93d658

remove AutoFlush, remove stray comment

ddc16b1

formatting nits

9cee861

Remove comment that looks like code (unique_ptr should be easy enough to understand). Co-authored-by: Hubert Tong <hubert-reinterpretcast@users.noreply.github.com>

Refactor ICU code

0e6f005

refactor iconv

cf6bf69

resize output if error

1817bd3

abhina-sree force-pushed the abhina/charset_converter branch from 82cc1cf to 1817bd3 Compare May 31, 2024 19:23

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Create a CharSetConverter class with both iconv and icu support #74516

Create a CharSetConverter class with both iconv and icu support #74516

abhina-sree commented Dec 5, 2023 •

edited

llvmbot commented Dec 5, 2023

github-actions bot commented Dec 5, 2023 •

edited

AaronBallman commented Dec 8, 2023

dwblaikie commented Dec 8, 2023

abhina-sree commented Dec 8, 2023

cor3ntin left a comment

cor3ntin Jan 4, 2024

abhina-sree Feb 22, 2024

cor3ntin Jan 4, 2024

abhina-sree Jan 10, 2024

abhina-sree commented Feb 22, 2024

efriedma-quic Feb 22, 2024

abhina-sree Feb 27, 2024

abhina-sree Mar 26, 2024

efriedma-quic Mar 26, 2024

abhina-sree commented Mar 26, 2024

efriedma-quic Mar 26, 2024

abhina-sree commented Apr 8, 2024

hubert-reinterpretcast left a comment

hubert-reinterpretcast left a comment

hubert-reinterpretcast May 4, 2024

hubert-reinterpretcast left a comment

abhina-sree commented Jun 5, 2024

		ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
		StringRef CSTo) {

		/// In case of an error, the result string contains the successfully converted
		/// part of the input string.

Create a CharSetConverter class with both iconv and icu support #74516

Are you sure you want to change the base?

Create a CharSetConverter class with both iconv and icu support #74516

Conversation

abhina-sree commented Dec 5, 2023 • edited

llvmbot commented Dec 5, 2023

github-actions bot commented Dec 5, 2023 • edited

AaronBallman commented Dec 8, 2023

dwblaikie commented Dec 8, 2023

abhina-sree commented Dec 8, 2023

cor3ntin left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

abhina-sree commented Feb 22, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

abhina-sree commented Mar 26, 2024

Choose a reason for hiding this comment

abhina-sree commented Apr 8, 2024

hubert-reinterpretcast left a comment

Choose a reason for hiding this comment

hubert-reinterpretcast left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

hubert-reinterpretcast left a comment

Choose a reason for hiding this comment

abhina-sree commented Jun 5, 2024

abhina-sree commented Dec 5, 2023 •

edited

github-actions bot commented Dec 5, 2023 •

edited