-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[libc] Templatize strtofloatingpoint and implement wcstof. #167755
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This change follows the pattern of 315dfe5 by making strtofloat also accept wchar_t* strings (in addition to regular char*). It uses overloads from wctype_utils or specialized functions to ensure that comparisons with literal characters (or literal strings) pick the char or wchar_t variant based on the argument type. The wcstof implementation is added, with unit test cases copied from the strtof test suite.
|
@llvm/pr-subscribers-libc Author: Alexey Samsonov (vonosmas) Changes: This change follows the pattern of 315dfe5 by making strtofloat also accept wchar_t* strings. The wcstof implementation is added, with unit test cases copied from the strtof test suite. Patch is 32.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167755.diff 11 Files Affected:
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 8a46a7a1baae3..d3bcad470b3e1 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -398,6 +398,7 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.wchar.wmemchr
libc.src.wchar.wcpcpy
libc.src.wchar.wcpncpy
+ libc.src.wchar.wcstof
libc.src.wchar.wcstok
libc.src.wchar.wcstol
libc.src.wchar.wcstoll
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index fb5b19b523b31..faceb9bb4e12d 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -360,3 +360,10 @@ functions:
- type: const wchar_t *__restrict
- type: wchar_t **__restrict
- type: int
+ - name: wcstof
+ standards:
+ - stdc
+ return_type: float
+ arguments:
+ - type: const wchar_t *__restrict
+ - type: wchar_t **__restrict
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 96874702b1fdf..d33e7ae45c068 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -221,7 +221,9 @@ add_header_library(
HDRS
high_precision_decimal.h
DEPENDS
+ .ctype_utils
.str_to_integer
+ .wctype_utils
libc.hdr.stdint_proxy
)
@@ -236,6 +238,7 @@ add_header_library(
.str_to_integer
.str_to_num_result
.uint128
+ .wctype_utils
libc.hdr.errno_macros
libc.hdr.stdint_proxy
libc.src.__support.common
diff --git a/libc/src/__support/high_precision_decimal.h b/libc/src/__support/high_precision_decimal.h
index 08af78602d2ab..75f2a7607b425 100644
--- a/libc/src/__support/high_precision_decimal.h
+++ b/libc/src/__support/high_precision_decimal.h
@@ -20,6 +20,7 @@
#include "src/__support/ctype_utils.h"
#include "src/__support/macros/config.h"
#include "src/__support/str_to_integer.h"
+#include "src/__support/wctype_utils.h"
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -38,6 +39,24 @@ struct LShiftTableEntry {
// TODO: Figure out where to put this.
enum class RoundDirection { Up, Down, Nearest };
+// These constants are used in both this file and in the main str_to_float.h.
+// TODO: Figure out where to put this.
+template <typename CharType> struct constants;
+template <> struct constants<char> {
+ static constexpr char DECIMAL_POINT = '.';
+ static constexpr char DECIMAL_EXPONENT_MARKER = 'e';
+ static constexpr char HEX_EXPONENT_MARKER = 'p';
+ static constexpr char INF_STRING[] = "infinity";
+ static constexpr char NAN_STRING[] = "nan";
+};
+template <> struct constants<wchar_t> {
+ static constexpr wchar_t DECIMAL_POINT = L'.';
+ static constexpr wchar_t DECIMAL_EXPONENT_MARKER = L'e';
+ static constexpr wchar_t HEX_EXPONENT_MARKER = L'p';
+ static constexpr wchar_t INF_STRING[] = L"infinity";
+ static constexpr wchar_t NAN_STRING[] = L"nan";
+};
+
// This is based on the HPD data structure described as part of the Simple
// Decimal Conversion algorithm by Nigel Tao, described at this link:
// https://nigeltao.github.io/blog/2020/parse-number-f64-simple.html
@@ -314,9 +333,9 @@ class HighPrecisionDecimal {
public:
// num_string is assumed to be a string of numeric characters. It doesn't
// handle leading spaces.
- LIBC_INLINE
- HighPrecisionDecimal(
- const char *__restrict num_string,
+ template <typename CharType>
+ LIBC_INLINE HighPrecisionDecimal(
+ const CharType *__restrict num_string,
const size_t num_len = cpp::numeric_limits<size_t>::max()) {
bool saw_dot = false;
size_t num_cur = 0;
@@ -324,25 +343,26 @@ class HighPrecisionDecimal {
// them all.
uint32_t total_digits = 0;
while (num_cur < num_len &&
- (isdigit(num_string[num_cur]) || num_string[num_cur] == '.')) {
- if (num_string[num_cur] == '.') {
+ (isdigit(num_string[num_cur]) ||
+ num_string[num_cur] == constants<CharType>::DECIMAL_POINT)) {
+ if (num_string[num_cur] == constants<CharType>::DECIMAL_POINT) {
if (saw_dot) {
break;
}
this->decimal_point = static_cast<int32_t>(total_digits);
saw_dot = true;
} else {
- if (num_string[num_cur] == '0' && this->num_digits == 0) {
+ int digit = b36_char_to_int(num_string[num_cur]);
+ if (digit == 0 && this->num_digits == 0) {
--this->decimal_point;
++num_cur;
continue;
}
++total_digits;
if (this->num_digits < MAX_NUM_DIGITS) {
- this->digits[this->num_digits] = static_cast<uint8_t>(
- internal::b36_char_to_int(num_string[num_cur]));
+ this->digits[this->num_digits] = static_cast<uint8_t>(digit);
++this->num_digits;
- } else if (num_string[num_cur] != '0') {
+ } else if (digit != 0) {
this->truncated = true;
}
}
@@ -352,11 +372,10 @@ class HighPrecisionDecimal {
if (!saw_dot)
this->decimal_point = static_cast<int32_t>(total_digits);
- if (num_cur < num_len &&
- (num_string[num_cur] == 'e' || num_string[num_cur] == 'E')) {
+ if (num_cur < num_len && tolower(num_string[num_cur]) ==
+ constants<CharType>::DECIMAL_EXPONENT_MARKER) {
++num_cur;
- if (isdigit(num_string[num_cur]) || num_string[num_cur] == '+' ||
- num_string[num_cur] == '-') {
+ if (isdigit(num_string[num_cur]) || get_sign(num_string + num_cur) != 0) {
auto result =
strtointeger<int32_t>(num_string + num_cur, 10, num_len - num_cur);
if (result.has_error()) {
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index 3d35d8a30afff..3cf0d47e130b3 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -33,6 +33,7 @@
#include "src/__support/str_to_integer.h"
#include "src/__support/str_to_num_result.h"
#include "src/__support/uint128.h"
+#include "src/__support/wctype_utils.h"
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -334,9 +335,9 @@ constexpr int32_t NUM_POWERS_OF_TWO =
// the Eisel-Lemire algorithm fails, it's slower but more accurate. It's based
// on the Simple Decimal Conversion algorithm by Nigel Tao, described at this
// link: https://nigeltao.github.io/blog/2020/parse-number-f64-simple.html
-template <class T>
+template <typename T, typename CharType>
LIBC_INLINE FloatConvertReturn<T> simple_decimal_conversion(
- const char *__restrict numStart,
+ const CharType *__restrict numStart,
const size_t num_len = cpp::numeric_limits<size_t>::max(),
RoundDirection round = RoundDirection::Nearest) {
using FPBits = typename fputil::FPBits<T>;
@@ -676,12 +677,11 @@ template <> LIBC_INLINE constexpr int32_t get_lower_bound<double>() {
// Takes a mantissa and base 10 exponent and converts it into its closest
// floating point type T equivalient. First we try the Eisel-Lemire algorithm,
// then if that fails then we fall back to a more accurate algorithm for
-// accuracy. The resulting mantissa and exponent are placed in outputMantissa
-// and outputExp2.
-template <class T>
+// accuracy.
+template <typename T, typename CharType>
LIBC_INLINE FloatConvertReturn<T> decimal_exp_to_float(
ExpandedFloat<T> init_num, bool truncated, RoundDirection round,
- const char *__restrict numStart,
+ const CharType *__restrict numStart,
const size_t num_len = cpp::numeric_limits<size_t>::max()) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
@@ -860,36 +860,43 @@ LIBC_INLINE FloatConvertReturn<T> binary_exp_to_float(ExpandedFloat<T> init_num,
return output;
}
-// checks if the next 4 characters of the string pointer are the start of a
+// Checks if the first characters of the string pointer are the start of a
// hexadecimal floating point number. Does not advance the string pointer.
-LIBC_INLINE bool is_float_hex_start(const char *__restrict src,
- const char decimalPoint) {
- if (!(src[0] == '0' && tolower(src[1]) == 'x')) {
+template <typename CharType>
+LIBC_INLINE static bool is_float_hex_start(const CharType *__restrict src,
+ CharType decimal_point) {
+ if (!is_char_or_wchar(src[0], '0', L'0') ||
+ !is_char_or_wchar(tolower(src[1]), 'x', L'x')) {
return false;
}
size_t first_digit = 2;
- if (src[2] == decimalPoint) {
+ if (src[2] == decimal_point) {
++first_digit;
}
return isalnum(src[first_digit]) && b36_char_to_int(src[first_digit]) < 16;
}
-// Takes the start of a string representing a decimal float, as well as the
-// local decimalPoint. It returns if it suceeded in parsing any digits, and if
-// the return value is true then the outputs are pointer to the end of the
-// number, and the mantissa and exponent for the closest float T representation.
-// If the return value is false, then it is assumed that there is no number
-// here.
-template <class T>
-LIBC_INLINE StrToNumResult<ExpandedFloat<T>>
-decimal_string_to_float(const char *__restrict src, const char DECIMAL_POINT,
- RoundDirection round) {
+// Verifies that first prefix_len characters of str, when lowercased, match the
+// specified prefix.
+template <typename CharType>
+LIBC_INLINE static bool tolower_starts_with(const CharType *str,
+ size_t prefix_len,
+ const CharType *prefix) {
+ for (size_t i = 0; i < prefix_len; ++i) {
+ if (tolower(str[i]) != prefix[i])
+ return false;
+ }
+ return true;
+}
+
+// Attempts parsing a decimal floating point number at the start of the string.
+template <typename T, typename CharType>
+LIBC_INLINE static StrToNumResult<ExpandedFloat<T>>
+decimal_string_to_float(const CharType *__restrict src, RoundDirection round) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
constexpr uint32_t BASE = 10;
- constexpr char EXPONENT_MARKER = 'e';
-
bool truncated = false;
bool seen_digit = false;
bool after_decimal = false;
@@ -926,7 +933,7 @@ decimal_string_to_float(const char *__restrict src, const char DECIMAL_POINT,
++index;
continue;
}
- if (src[index] == DECIMAL_POINT) {
+ if (src[index] == constants<CharType>::DECIMAL_POINT) {
if (after_decimal) {
break; // this means that src[index] points to a second decimal point,
// ending the number.
@@ -943,13 +950,10 @@ decimal_string_to_float(const char *__restrict src, const char DECIMAL_POINT,
return output;
// TODO: When adding max length argument, handle the case of a trailing
- // EXPONENT MARKER, see scanf for more details.
- if (tolower(src[index]) == EXPONENT_MARKER) {
- bool has_sign = false;
- if (src[index + 1] == '+' || src[index + 1] == '-') {
- has_sign = true;
- }
- if (isdigit(src[index + 1 + static_cast<size_t>(has_sign)])) {
+ // exponent marker, see scanf for more details.
+ if (tolower(src[index]) == constants<CharType>::DECIMAL_EXPONENT_MARKER) {
+ int sign = get_sign(src + index + 1);
+ if (isdigit(src[index + 1 + static_cast<size_t>(sign != 0)])) {
++index;
auto result = strtointeger<int32_t>(src + index, 10);
if (result.has_error())
@@ -985,22 +989,16 @@ decimal_string_to_float(const char *__restrict src, const char DECIMAL_POINT,
return output;
}
-// Takes the start of a string representing a hexadecimal float, as well as the
-// local decimal point. It returns if it suceeded in parsing any digits, and if
-// the return value is true then the outputs are pointer to the end of the
-// number, and the mantissa and exponent for the closest float T representation.
-// If the return value is false, then it is assumed that there is no number
-// here.
-template <class T>
-LIBC_INLINE StrToNumResult<ExpandedFloat<T>>
-hexadecimal_string_to_float(const char *__restrict src,
- const char DECIMAL_POINT, RoundDirection round) {
+// Attempts parsing a hexadecimal floating point number at the start of the
+// string.
+template <typename T, typename CharType>
+LIBC_INLINE static StrToNumResult<ExpandedFloat<T>>
+hexadecimal_string_to_float(const CharType *__restrict src,
+ RoundDirection round) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
constexpr uint32_t BASE = 16;
- constexpr char EXPONENT_MARKER = 'p';
-
bool truncated = false;
bool seen_digit = false;
bool after_decimal = false;
@@ -1038,7 +1036,7 @@ hexadecimal_string_to_float(const char *__restrict src,
++index;
continue;
}
- if (src[index] == DECIMAL_POINT) {
+ if (src[index] == constants<CharType>::DECIMAL_POINT) {
if (after_decimal) {
break; // this means that src[index] points to a second decimal point,
// ending the number.
@@ -1057,12 +1055,9 @@ hexadecimal_string_to_float(const char *__restrict src,
// Convert the exponent from having a base of 16 to having a base of 2.
exponent *= 4;
- if (tolower(src[index]) == EXPONENT_MARKER) {
- bool has_sign = false;
- if (src[index + 1] == '+' || src[index + 1] == '-') {
- has_sign = true;
- }
- if (isdigit(src[index + 1 + static_cast<size_t>(has_sign)])) {
+ if (tolower(src[index]) == constants<CharType>::HEX_EXPONENT_MARKER) {
+ int sign = get_sign(src + index + 1);
+ if (isdigit(src[index + 1 + static_cast<size_t>(sign != 0)])) {
++index;
auto result = strtointeger<int32_t>(src + index, 10);
if (result.has_error())
@@ -1098,21 +1093,21 @@ hexadecimal_string_to_float(const char *__restrict src,
return output;
}
-template <class T>
+template <typename T, typename CharType>
LIBC_INLINE typename fputil::FPBits<T>::StorageType
-nan_mantissa_from_ncharseq(const cpp::string_view ncharseq) {
+nan_mantissa_from_ncharseq(const CharType *str, size_t len) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
StorageType nan_mantissa = 0;
- if (ncharseq.data() != nullptr && isdigit(ncharseq[0])) {
+ if (len > 0 && isdigit(str[0])) {
StrToNumResult<StorageType> strtoint_result =
- strtointeger<StorageType>(ncharseq.data(), 0);
+ strtointeger<StorageType>(str, 0, len);
if (!strtoint_result.has_error())
nan_mantissa = strtoint_result.value;
- if (strtoint_result.parsed_len != static_cast<ptrdiff_t>(ncharseq.size()))
+ if (strtoint_result.parsed_len != static_cast<ptrdiff_t>(len))
nan_mantissa = 0;
}
@@ -1123,59 +1118,44 @@ nan_mantissa_from_ncharseq(const cpp::string_view ncharseq) {
// is used as the backend for all of the string to float functions.
// TODO: Add src_len member to match strtointeger.
// TODO: Next, move from char* and length to string_view
-template <class T>
-LIBC_INLINE StrToNumResult<T> strtofloatingpoint(const char *__restrict src) {
+template <typename T, typename CharType>
+LIBC_INLINE StrToNumResult<T>
+strtofloatingpoint(const CharType *__restrict src) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
FPBits result = FPBits();
bool seen_digit = false;
- char sign = '+';
-
int error = 0;
size_t index = first_non_whitespace(src);
+ int sign = get_sign(src + index);
+ bool is_positive = (sign >= 0);
+ index += (sign != 0);
- if (src[index] == '+' || src[index] == '-') {
- sign = src[index];
- ++index;
- }
-
- if (sign == '-') {
+ if (sign < 0) {
result.set_sign(Sign::NEG);
}
- static constexpr char DECIMAL_POINT = '.';
- static const char *inf_string = "infinity";
- static const char *nan_string = "nan";
-
- if (isdigit(src[index]) || src[index] == DECIMAL_POINT) { // regular number
+ if (isdigit(src[index]) ||
+ src[index] == constants<CharType>::DECIMAL_POINT) { // regular number
int base = 10;
- if (is_float_hex_start(src + index, DECIMAL_POINT)) {
+ if (is_float_hex_start(src + index, constants<CharType>::DECIMAL_POINT)) {
base = 16;
index += 2;
seen_digit = true;
}
RoundDirection round_direction = RoundDirection::Nearest;
-
switch (fputil::quick_get_round()) {
case FE_TONEAREST:
round_direction = RoundDirection::Nearest;
break;
case FE_UPWARD:
- if (sign == '+') {
- round_direction = RoundDirection::Up;
- } else {
- round_direction = RoundDirection::Down;
- }
+ round_direction = is_positive ? RoundDirection::Up : RoundDirection::Down;
break;
case FE_DOWNWARD:
- if (sign == '+') {
- round_direction = RoundDirection::Down;
- } else {
- round_direction = RoundDirection::Up;
- }
+ round_direction = is_positive ? RoundDirection::Down : RoundDirection::Up;
break;
case FE_TOWARDZERO:
round_direction = RoundDirection::Down;
@@ -1184,58 +1164,53 @@ LIBC_INLINE StrToNumResult<T> strtofloatingpoint(const char *__restrict src) {
StrToNumResult<ExpandedFloat<T>> parse_result({0, 0});
if (base == 16) {
- parse_result = hexadecimal_string_to_float<T>(src + index, DECIMAL_POINT,
- round_direction);
+ parse_result =
+ hexadecimal_string_to_float<T>(src + index, round_direction);
} else { // base is 10
- parse_result = decimal_string_to_float<T>(src + index, DECIMAL_POINT,
- round_direction);
+ parse_result = decimal_string_to_float<T>(src + index, round_direction);
}
seen_digit = parse_result.parsed_len != 0;
result.set_mantissa(parse_result.value.mantissa);
result.set_biased_exponent(parse_result.value.exponent);
index += parse_result.parsed_len;
error = parse_result.error;
- } else if (tolower(src[index]) == 'n') { // NaN
- if (tolower(src[index + 1]) == nan_string[1] &&
- tolower(src[index + 2]) == nan_string[2]) {
- seen_digit = true;
- index += 3;
- StorageType nan_mantissa = 0;
- // this handles the case of `NaN(n-character-sequence)`, where the
- // n-character-sequence is made of 0 or more letters, numbers, or
- // underscore characters in any order.
- if (src[index] == '(') {
- size_t left_paren = index;
+ } else if (tolower_starts_with(src + index, 3,
+ constants<CharType>::NAN_STRING)) {
+ // NAN
+ seen_digit = true;
+ index += 3;
+ StorageType nan_mantissa = 0;
+ // this handles the case of `NaN(n-character-sequence)`, where the
+ // n-character-sequence is made of 0 or more letters, numbers, or
+ // underscore characters in any order.
+ if (is_char_or_wchar(src[index], '(', L'(')) {
+ size_t left_paren = index;
+ ++index;
+ while (isalnum(src[index]) || is_char_or_wchar(src[index], '_', L'_'))
++index;
- while (isalnum(src[index]) || src[index] == '_')
- ++index;
- if (src[index] == ')') {
- ++index;
- nan_mantissa = nan_mantissa_from_ncharseq<T>(
- cpp::string_view(src + (left_paren + 1), index - left_paren - 2));
- } else {
- index = left_paren;
- }
- }
- result = FPBits(result.quiet_nan(result.sign(), nan_mantissa));
- }
- } else if (tolower(src[index]) == 'i') { // INF
- if (tolower(src[index + 1]) == inf_string[1] &&
- tolower(src[index + 2]) == inf_string[2]) {
- seen_digit = true;
- result = FPBits(result.inf(result.sign()));
- if (tolower(src[index + 3]) == inf_string[3] &&
- tolower(src[index + 4]) == inf_string[4] &&
- tolower(src[index + 5]) == inf_string[5] &&
- tolower(src[index + 6]) == inf_string[6] &&
- tolower(src[index + 7]) == inf_string[7]) {
- // if the string is "INFINITY" then consume 8 characters.
- index += 8;
+ if (is_char_or_wchar(src[index], ')', L')')) {
+ ++index;
+ nan_mantissa = nan_mantissa_from_ncharseq<T>(src + (left_paren + 1),
+ index - left_paren - 2...
[truncated]
|
michaelrj-google
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, please check that this doesn't break libc++. Just running check-cxx should be sufficient.
Thanks, I've verified that libc++ builds successfully (and fails if I introduce an error) |
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/17271 Here is the relevant piece of the build log for reference: |
This change follows the pattern of 315dfe5 by making strtofloat also accept wchar_t* strings
(in addition to regular char*). It uses overloads from wctype_utils or specialized functions to ensure that comparisons with literal characters (or literal strings) pick the char or wchar_t variant based on the argument type.
The wcstof implementation is added, with unit test cases copied from the strtof test suite.