Skip to content

Commit

Permalink
Improve handling of static assert messages.
Browse files Browse the repository at this point in the history
Instead of dumping the string literal (which
quotes it and escape every non-ascii symbol),
we can use the content of the string when it is a
8 byte string.

Wide, UTF-8/UTF-16/32 strings are still completely
escaped, until we clarify how these entities should
behave (cf https://wg21.link/p2361).

`FormatDiagnostic` is modified to escape
non printable characters and invalid UTF-8.

This ensures that unicode characters, spaces and new
lines are properly rendered in static messages.
This make clang more consistent with other implementation
and fixes this tweet
https://twitter.com/jfbastien/status/1298307325443231744 :)

Of note, `PaddingChecker` did print out new lines that were
later removed by the diagnostic printing code.
To be consistent with its tests, the new lines are removed
from the diagnostic.

Unicode tables updated to both use the Unicode definitions
and the Unicode 14.0 data.

U+00AD SOFT HYPHEN is still considered a print character
to match existing practices in terminals, in addition of
being considered a formatting character as per Unicode.

Reviewed By: aaron.ballman, #clang-language-wg

Differential Revision: https://reviews.llvm.org/D108469
  • Loading branch information
cor3ntin committed Jun 29, 2022
1 parent 62e907e commit 64ab2b1
Show file tree
Hide file tree
Showing 10 changed files with 347 additions and 218 deletions.
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Expand Up @@ -273,6 +273,8 @@ Improvements to Clang's diagnostics
- When using class templates without arguments, clang now tells developers
that template arguments are missing in certain contexts.
This fixes `Issue 55962 <https://github.com/llvm/llvm-project/issues/55962>`_.
- Printable Unicode characters within `static_assert` messages are no longer
escaped.

Non-comprehensive list of changes in this release
-------------------------------------------------
Expand Down
58 changes: 49 additions & 9 deletions clang/lib/Basic/Diagnostic.cpp
Expand Up @@ -25,8 +25,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Locale.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -803,6 +804,50 @@ FormatDiagnostic(SmallVectorImpl<char> &OutStr) const {
FormatDiagnostic(Diag.begin(), Diag.end(), OutStr);
}

/// pushEscapedString - Append Str to the diagnostic buffer,
/// escaping non-printable characters and ill-formed code unit sequences.
static void pushEscapedString(StringRef Str, SmallVectorImpl<char> &OutStr) {
OutStr.reserve(OutStr.size() + Str.size());
auto *Begin = reinterpret_cast<const unsigned char *>(Str.data());
llvm::raw_svector_ostream OutStream(OutStr);
const unsigned char *End = Begin + Str.size();
while (Begin != End) {
// ASCII case
if (isPrintable(*Begin) || isWhitespace(*Begin)) {
OutStream << *Begin;
++Begin;
continue;
}
if (llvm::isLegalUTF8Sequence(Begin, End)) {
llvm::UTF32 CodepointValue;
llvm::UTF32 *CpPtr = &CodepointValue;
const unsigned char *CodepointBegin = Begin;
const unsigned char *CodepointEnd =
Begin + llvm::getNumBytesForUTF8(*Begin);
llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
&Begin, CodepointEnd, &CpPtr, CpPtr + 1, llvm::strictConversion);
(void)Res;
assert(
llvm::conversionOK == Res &&
"the sequence is legal UTF-8 but we couldn't convert it to UTF-32");
assert(Begin == CodepointEnd &&
"we must be further along in the string now");
if (llvm::sys::unicode::isPrintable(CodepointValue) ||
llvm::sys::unicode::isFormatting(CodepointValue)) {
OutStr.append(CodepointBegin, CodepointEnd);
continue;
}
// Unprintable code point.
OutStream << "<U+" << llvm::format_hex_no_prefix(CodepointValue, 4, true)
<< ">";
continue;
}
// Invalid code unit.
OutStream << "<" << llvm::format_hex_no_prefix(*Begin, 2, true) << ">";
++Begin;
}
}

void Diagnostic::
FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
SmallVectorImpl<char> &OutStr) const {
Expand All @@ -813,11 +858,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
StringRef(DiagStr, DiagEnd - DiagStr).equals("%0") &&
getArgKind(0) == DiagnosticsEngine::ak_std_string) {
const std::string &S = getArgStdStr(0);
for (char c : S) {
if (llvm::sys::locale::isPrint(c) || c == '\t') {
OutStr.push_back(c);
}
}
pushEscapedString(S, OutStr);
return;
}

Expand Down Expand Up @@ -924,7 +965,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
case DiagnosticsEngine::ak_std_string: {
const std::string &S = getArgStdStr(ArgNo);
assert(ModifierLen == 0 && "No modifiers for strings yet");
OutStr.append(S.begin(), S.end());
pushEscapedString(S, OutStr);
break;
}
case DiagnosticsEngine::ak_c_string: {
Expand All @@ -934,8 +975,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
// Don't crash if get passed a null pointer by accident.
if (!S)
S = "(null)";

OutStr.append(S, S + strlen(S));
pushEscapedString(S, OutStr);
break;
}
// ---- INTEGERS ----
Expand Down
9 changes: 7 additions & 2 deletions clang/lib/Sema/SemaDeclCXX.cpp
Expand Up @@ -16592,8 +16592,13 @@ Decl *Sema::BuildStaticAssertDeclaration(SourceLocation StaticAssertLoc,
if (!Failed && !Cond) {
SmallString<256> MsgBuffer;
llvm::raw_svector_ostream Msg(MsgBuffer);
if (AssertMessage)
AssertMessage->printPretty(Msg, nullptr, getPrintingPolicy());
if (AssertMessage) {
const auto *MsgStr = cast<StringLiteral>(AssertMessage);
if (MsgStr->isAscii())
Msg << '"' << MsgStr->getString() << '"';
else
MsgStr->printPretty(Msg, nullptr, getPrintingPolicy());
}

Expr *InnerCond = nullptr;
std::string InnerCondDescription;
Expand Down
6 changes: 3 additions & 3 deletions clang/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
Expand Up @@ -332,10 +332,10 @@ class PaddingChecker : public Checker<check::ASTDecl<TranslationUnitDecl>> {
}

Os << " (" << BaselinePad.getQuantity() << " padding bytes, where "
<< OptimalPad.getQuantity() << " is optimal). \n"
<< "Optimal fields order: \n";
<< OptimalPad.getQuantity() << " is optimal). "
<< "Optimal fields order: ";
for (const auto *FD : OptimalFieldsOrder)
Os << FD->getName() << ", \n";
Os << FD->getName() << ", ";
Os << "consider reordering the fields or adding explicit padding "
"members.";

Expand Down
Binary file modified clang/test/Lexer/null-character-in-literal.c
Binary file not shown.
4 changes: 2 additions & 2 deletions clang/test/Misc/diag-special-chars.c
Expand Up @@ -5,7 +5,7 @@
// marker character for diagnostic printing. Ensure diagnostics involving
// this character does not cause problems with the diagnostic printer.
#error Hi  Bye
//expected-error@-1 {{Hi Bye}}
// expected-error@-1 {{Hi <U+007F> Bye}}

// CHECK: error: Hi Bye
// CHECK: error: Hi <U+007F> Bye
// CHECK: #error Hi <U+007F> Bye
4 changes: 2 additions & 2 deletions clang/test/Misc/wrong-encoding.c
@@ -1,4 +1,4 @@
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value %s 2>&1 | FileCheck -strict-whitespace %s
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value %s 2>&1 | FileCheck %s
// REQUIRES: asserts

void foo(void) {
Expand All @@ -13,7 +13,7 @@ void foo(void) {
// CHECK: {{^ \^~~~}}

"xxé¿¿¿d";
// CHECK: {{^ "xx<U\+9FFF><BF>d";}}
// CHECK: {{^ "xxé¿¿<BF>d";}}
// CHECK: {{^ \^~~~}}

"xxé¿bcd";
Expand Down
16 changes: 15 additions & 1 deletion clang/test/SemaCXX/static-assert.cpp
Expand Up @@ -30,11 +30,25 @@ S<int> s2;

static_assert(false, L"\xFFFFFFFF"); // expected-error {{static_assert failed L"\xFFFFFFFF"}}
static_assert(false, u"\U000317FF"); // expected-error {{static_assert failed u"\U000317FF"}}
// FIXME: render this as u8"\u03A9"

static_assert(false, u8"Ω"); // expected-error {{static_assert failed u8"\316\251"}}
static_assert(false, L"\u1234"); // expected-error {{static_assert failed L"\x1234"}}
static_assert(false, L"\x1ff" "0\x123" "fx\xfffff" "goop"); // expected-error {{static_assert failed L"\x1FF""0\x123""fx\xFFFFFgoop"}}

static_assert(false, R"(a
\tb
c
)"); // expected-error@-3 {{static_assert failed "a\n\tb\nc\n"}}

static_assert(false, "\u0080\u0081\u0082\u0083\u0099\u009A\u009B\u009C\u009D\u009E\u009F");
// expected-error@-1 {{static_assert failed "<U+0080><U+0081><U+0082><U+0083><U+0099><U+009A><U+009B><U+009C><U+009D><U+009E><U+009F>"}}

//! Contains RTL/LTR marks
static_assert(false, "\u200Eabc\u200Fdef\u200Fgh"); // expected-error {{static_assert failed "‎abc‏def‏gh"}}

//! Contains ZWJ/regional indicators
static_assert(false, "🏳️‍🌈 🏴󠁧󠁢󠁥󠁮󠁧󠁿 🇪🇺"); // expected-error {{static_assert failed "🏳️‍🌈 🏴󠁧󠁢󠁥󠁮󠁧󠁿 🇪🇺"}}

template<typename T> struct AlwaysFails {
// Only give one error here.
static_assert(false, ""); // expected-error {{static_assert failed}}
Expand Down
14 changes: 4 additions & 10 deletions llvm/include/llvm/Support/Unicode.h
Expand Up @@ -34,19 +34,13 @@ enum ColumnWidthErrors {
/// terminal, so we define the semantic that should be suitable for generic case
/// of a terminal capable to output Unicode characters.
///
/// All characters from the Unicode code point range are considered printable
/// except for:
/// * C0 and C1 control character ranges;
/// * default ignorable code points as per 5.21 of
/// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
/// except for U+00AD SOFT HYPHEN, as it's actually displayed on most
/// terminals;
/// * format characters (category = Cf);
/// * surrogates (category = Cs);
/// * unassigned characters (category = Cn).
/// Printable codepoints are those in the categories L, M, N, P, S and Zs
/// \return true if the character is considered printable.
bool isPrintable(int UCS);

// Formatting codepoints are codepoints in the Cf category.
bool isFormatting(int UCS);

/// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
/// when output on a terminal ("character width"). This depends on the
/// implementation of the terminal, and there's no standard definition of
Expand Down

0 comments on commit 64ab2b1

Please sign in to comment.