[libc] Use function overloads to make string parsing code more generic. #167417

vonosmas · 2025-11-10T23:49:17Z

ctype_utils/wctype_utils were chaged in 120689e and e7f7973, respectively to operate on char/wchar_t. Now we can switch to the overloaded names (e.g. have noth isspace(char and isspace(wchar_t)) to simplify the templatized strtointeger implementation from 315dfe5 and make it easier to potentially add templatized strtofloat implementation.

…t strings more generic. ctype_utils/wctype_utils were chaged in 120689e and e7f7973, respectively to operate on char/wchar_t. Now we can switch to the overloaded names (e.g. have noth `isspace(char` and `isspace(wchar_t)`) to simplify the templatized strtointeger implementation from 315dfe5 and make it easier to potentially add templatized strtofloat implementation.

llvmbot · 2025-11-10T23:49:53Z

@llvm/pr-subscribers-libc

Author: Alexey Samsonov (vonosmas)

Changes

ctype_utils/wctype_utils were chaged in 120689e and e7f7973, respectively to operate on char/wchar_t. Now we can switch to the overloaded names (e.g. have noth isspace(char and isspace(wchar_t)) to simplify the templatized strtointeger implementation from 315dfe5 and make it easier to potentially add templatized strtofloat implementation.

Full diff: https://github.com/llvm/llvm-project/pull/167417.diff

4 Files Affected:

(modified) libc/src/__support/ctype_utils.h (+7)
(modified) libc/src/__support/str_to_integer.h (+15-43)
(modified) libc/src/__support/wctype_utils.h (+17-10)
(modified) libc/src/wctype/iswalpha.cpp (+1-1)

diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h
index 61b7a0aeb5b67..b0f9ece5d1a25 100644
--- a/libc/src/__support/ctype_utils.h
+++ b/libc/src/__support/ctype_utils.h
@@ -578,6 +578,13 @@ LIBC_INLINE static constexpr bool isgraph(char ch) {
   return 0x20 < ch && ch < 0x7f;
 }
 
+// An overload which provides a way to compare input with specific character
+// values, when input can be of a regular or a wide character type.
+LIBC_INLINE static constexpr bool is_one_of(char ch, char c_value,
+                                            [[maybe_unused]] wchar_t) {
+  return (ch == c_value);
+}
+
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h
index ba3f49fa2f47b..f9154ffc90c7e 100644
--- a/libc/src/__support/str_to_integer.h
+++ b/libc/src/__support/str_to_integer.h
@@ -31,22 +31,14 @@ namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 // Returns the idx to the first character in src that is not a whitespace
-// character (as determined by isspace() / iswspace())
+// character (as determined by isspace())
 template <typename CharType>
 LIBC_INLINE size_t
 first_non_whitespace(const CharType *__restrict src,
                      size_t src_len = cpp::numeric_limits<size_t>::max()) {
   size_t src_cur = 0;
-  while (src_cur < src_len) {
-    if constexpr (cpp::is_same_v<CharType, char>) {
-      if (!internal::isspace(src[src_cur]))
-        break;
-    } else {
-      if (!internal::iswspace(src[src_cur]))
-        break;
-    }
-    ++src_cur;
-  }
+  for (; src_cur < src_len && internal::isspace(src[src_cur]); ++src_cur)
+    ;
   return src_cur;
 }
 
@@ -54,11 +46,11 @@ first_non_whitespace(const CharType *__restrict src,
 // plus sign, minus sign, or neither.
 template <typename CharType>
 LIBC_INLINE static int get_sign(const CharType *__restrict src) {
-  if constexpr (cpp::is_same_v<CharType, char>) {
-    return (src[0] == '+') ? 1 : (src[0] == '-' ? -1 : 0);
-  } else {
-    return (src[0] == L'+') ? 1 : (src[0] == L'-' ? -1 : 0);
-  }
+  if (is_one_of(src[0], '+', L'+'))
+    return 1;
+  if (is_one_of(src[0], '-', L'-'))
+    return -1;
+  return 0;
 }
 
 // checks if the next 3 characters of the string pointer are the start of a
@@ -68,13 +60,9 @@ LIBC_INLINE static bool is_hex_start(const CharType *__restrict src,
                                      size_t src_len) {
   if (src_len < 3)
     return false;
-  if constexpr (cpp::is_same_v<CharType, char>) {
-    return src[0] == '0' && tolower(src[1]) == 'x' && isalnum(src[2]) &&
-           b36_char_to_int(src[2]) < 16;
-  } else {
-    return src[0] == L'0' && towlower(src[1]) == L'x' && iswalnum(src[2]) &&
-           b36_wchar_to_int(src[2]) < 16;
-  }
+  return is_one_of(src[0], '0', L'0') &&
+         is_one_of(tolower(src[1]), 'x', L'x') && isalnum(src[2]) &&
+         b36_char_to_int(src[2]) < 16;
 }
 
 // Takes the address of the string pointer and parses the base from the start of
@@ -90,14 +78,8 @@ LIBC_INLINE static int infer_base(const CharType *__restrict src,
   // An octal number is defined as "the prefix 0 optionally followed by a
   // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any
   // number that starts with 0, including just 0, is an octal number.
-  if (src_len > 0) {
-    if constexpr (cpp::is_same_v<CharType, char>) {
-      if (src[0] == '0')
-        return 8;
-    } else {
-      if (src[0] == L'0')
-        return 8;
-    }
+  if (src_len > 0 && internal::is_one_of(src[0], '0', L'0')) {
+    return 8;
   }
   // A decimal number is defined as beginning "with a nonzero digit and
   // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1)
@@ -150,18 +132,8 @@ strtointeger(const CharType *__restrict src, int base,
   bool is_number = false;
   int error_val = 0;
   ResultType result = 0;
-  while (src_cur < src_len) {
-    int cur_digit;
-    if constexpr (cpp::is_same_v<CharType, char>) {
-      if (!isalnum(src[src_cur]))
-        break;
-      cur_digit = b36_char_to_int(src[src_cur]);
-    } else {
-      if (!iswalnum(src[src_cur]))
-        break;
-      cur_digit = b36_wchar_to_int(src[src_cur]);
-    }
-
+  while (src_cur < src_len && isalnum(src[src_cur])) {
+    int cur_digit = b36_char_to_int(src[src_cur]);
     if (cur_digit >= base)
       break;
 
diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h
index 60b6afb928475..f04efd21db595 100644
--- a/libc/src/__support/wctype_utils.h
+++ b/libc/src/__support/wctype_utils.h
@@ -31,7 +31,7 @@ namespace internal {
 
 // Similarly, do not change these fumarks to show your new solution is faster,
 // as well as a way to support non-Anctions to use case ranges. e.g.
-//  bool iswlower(wchar_t ch) {
+//  bool islower(wchar_t ch) {
 //    switch(ch) {
 //    case L'a'...L'z':
 //      return true;
@@ -41,7 +41,7 @@ namespace internal {
 // EBCDIC. Technically we could use some smaller ranges, but that's even harder
 // to read.
 
-LIBC_INLINE static constexpr bool iswlower(wchar_t wch) {
+LIBC_INLINE static constexpr bool islower(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -75,7 +75,7 @@ LIBC_INLINE static constexpr bool iswlower(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswupper(wchar_t wch) {
+LIBC_INLINE static constexpr bool isupper(wchar_t wch) {
   switch (wch) {
   case L'A':
   case L'B':
@@ -109,7 +109,7 @@ LIBC_INLINE static constexpr bool iswupper(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswdigit(wchar_t wch) {
+LIBC_INLINE static constexpr bool isdigit(wchar_t wch) {
   switch (wch) {
   case L'0':
   case L'1':
@@ -127,7 +127,7 @@ LIBC_INLINE static constexpr bool iswdigit(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wchar_t towlower(wchar_t wch) {
+LIBC_INLINE static constexpr wchar_t tolower(wchar_t wch) {
   switch (wch) {
   case L'A':
     return L'a';
@@ -186,7 +186,7 @@ LIBC_INLINE static constexpr wchar_t towlower(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wchar_t towupper(wchar_t wch) {
+LIBC_INLINE static constexpr wchar_t toupper(wchar_t wch) {
   switch (wch) {
   case L'a':
     return L'A';
@@ -245,7 +245,7 @@ LIBC_INLINE static constexpr wchar_t towupper(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswalpha(wchar_t wch) {
+LIBC_INLINE static constexpr bool isalpha(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -305,7 +305,7 @@ LIBC_INLINE static constexpr bool iswalpha(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswalnum(wchar_t wch) {
+LIBC_INLINE static constexpr bool isalnum(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -375,7 +375,7 @@ LIBC_INLINE static constexpr bool iswalnum(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr int b36_wchar_to_int(wchar_t wch) {
+LIBC_INLINE static constexpr int b36_char_to_int(wchar_t wch) {
   switch (wch) {
   case L'0':
     return 0;
@@ -563,7 +563,7 @@ LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswspace(wchar_t wch) {
+LIBC_INLINE static constexpr bool isspace(wchar_t wch) {
   switch (wch) {
   case L' ':
   case L'\t':
@@ -577,6 +577,13 @@ LIBC_INLINE static constexpr bool iswspace(wchar_t wch) {
   }
 }
 
+// An overload which provides a way to compare input with specific character
+// values, when input can be of a regular or a wide character type.
+LIBC_INLINE static constexpr bool is_one_of(wchar_t ch, [[maybe_unused]] char,
+                                            wchar_t wc_value) {
+  return (ch == wc_value);
+}
+
 // ------------------------------------------------------
 // Rationale: Since these classification functions are
 // called in other functions, we will avoid the overhead
diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp
index e151363b88d0b..01ceac8f68b23 100644
--- a/libc/src/wctype/iswalpha.cpp
+++ b/libc/src/wctype/iswalpha.cpp
@@ -15,7 +15,7 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) {
-  return internal::iswalpha(static_cast<wchar_t>(c));
+  return internal::isalpha(static_cast<wchar_t>(c));
 }
 
 } // namespace LIBC_NAMESPACE_DECL

michaelrj-google

LGTM, with a nit on naming

libc/src/__support/ctype_utils.h

vonosmas requested a review from michaelrj-google November 10, 2025 23:49

llvmbot added the libc label Nov 10, 2025

michaelrj-google approved these changes Nov 11, 2025

View reviewed changes

libc/src/__support/ctype_utils.h Outdated Show resolved Hide resolved

update function name

58eb09d

vonosmas merged commit 124bfdf into llvm:main Nov 12, 2025
20 checks passed

vonosmas deleted the strtointeger-fix branch November 12, 2025 06:52

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[libc] Use function overloads to make string parsing code more generic. #167417

[libc] Use function overloads to make string parsing code more generic. #167417

Uh oh!

vonosmas commented Nov 10, 2025

Uh oh!

llvmbot commented Nov 10, 2025

Uh oh!

michaelrj-google left a comment

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[libc] Use function overloads to make string parsing code more generic. #167417

[libc] Use function overloads to make string parsing code more generic. #167417

Uh oh!

Conversation

vonosmas commented Nov 10, 2025

Uh oh!

llvmbot commented Nov 10, 2025

Uh oh!

michaelrj-google left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants