Skip to content

Commit

Permalink
[Clang] Add a warning on invalid UTF-8 in comments.
Browse files Browse the repository at this point in the history
Introduce an off-by default `-Winvalid-utf8` warning
that detects invalid UTF-8 code units sequences in comments.

Invalid UTF-8 in other places is already diagnosed,
as that cannot appear in identifiers and other grammar constructs.

The warning is off by default as its likely to be somewhat disruptive
otherwise.

This warning allows clang to conform to the yet-to be approved WG21
"P2295R5 Support for UTF-8 as a portable source file encoding"
paper.

Reviewed By: aaron.ballman, #clang-language-wg

Differential Revision: https://reviews.llvm.org/D128059
  • Loading branch information
cor3ntin committed Jul 6, 2022
1 parent 95e0882 commit e3dc568
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 18 deletions.
4 changes: 3 additions & 1 deletion clang/docs/ReleaseNotes.rst
Expand Up @@ -279,6 +279,8 @@ Improvements to Clang's diagnostics
unevaluated operands of a ``typeid`` expression, as they are now
modeled correctly in the CFG. This fixes
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
comments.

Non-comprehensive list of changes in this release
-------------------------------------------------
Expand Down Expand Up @@ -576,7 +578,7 @@ AST Matchers

- Added ``forEachTemplateArgument`` matcher which creates a match every
time a ``templateArgument`` matches the matcher supplied to it.

- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
literal expressions.

Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticLexKinds.td
Expand Up @@ -113,6 +113,8 @@ def warn_four_char_character_literal : Warning<
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
def warn_invalid_utf8_in_comment : Extension<
"invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<
Expand Down
113 changes: 96 additions & 17 deletions clang/lib/Lex/Lexer.cpp
Expand Up @@ -2392,13 +2392,37 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.

// C++23 [lex.phases] p1
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
// diagnostic only once per entire ill-formed subsequence to avoid
// emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
bool UnicodeDecodingAlreadyDiagnosed = false;

char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
while (C != 0 && // Potentially EOF.
C != '\n' && C != '\r') // Newline or DOS-style newline.
while (isASCII(C) && C != 0 && // Potentially EOF.
C != '\n' && C != '\r') { // Newline or DOS-style newline.
C = *++CurPtr;
UnicodeDecodingAlreadyDiagnosed = false;
}

if (!isASCII(C)) {
unsigned Length = llvm::getUTF8SequenceSize(
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
if (Length == 0) {
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
UnicodeDecodingAlreadyDiagnosed = true;
++CurPtr;
} else {
UnicodeDecodingAlreadyDiagnosed = false;
CurPtr += Length;
}
continue;
}

const char *NextLine = CurPtr;
if (C != 0) {
Expand Down Expand Up @@ -2665,6 +2689,12 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
if (C == '/')
C = *CurPtr++;

// C++23 [lex.phases] p1
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
// diagnostic only once per entire ill-formed subsequence to avoid
// emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
bool UnicodeDecodingAlreadyDiagnosed = false;

while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
Expand All @@ -2673,14 +2703,24 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
if (!isASCII(C)) {
CurPtr--;
goto MultiByteUTF8;
}
C = *CurPtr++;

}
if (C == '/') goto FoundSlash;

#ifdef __SSE2__
__m128i Slashes = _mm_set1_epi8('/');
while (CurPtr+16 <= BufferEnd) {
while (CurPtr + 16 < BufferEnd) {
int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
if (LLVM_UNLIKELY(Mask != 0)) {
CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
goto MultiByteUTF8;
}
// look for slashes
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
Slashes));
if (cmp != 0) {
Expand All @@ -2693,31 +2733,70 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
CurPtr += 16;
}
#elif __ALTIVEC__
__vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80};
__vector unsigned char Slashes = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
while (CurPtr + 16 <= BufferEnd &&
!vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
while (CurPtr + 16 < BufferEnd) {
if (LLVM_UNLIKELY(
vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
goto MultiByteUTF8;
if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
C = *CurPtr++;
break;
}
CurPtr += 16;
}

#else
// Scan for '/' quickly. Many block comments are very large.
while (CurPtr[0] != '/' &&
CurPtr[1] != '/' &&
CurPtr[2] != '/' &&
CurPtr[3] != '/' &&
CurPtr+4 < BufferEnd) {
CurPtr += 4;
while (CurPtr + 16 <= BufferEnd) {
bool HasNonASCII = false;
for (unsigned I = 0; I < 16; ++I) {
HasNonASCII |= !isASCII(CurPtr[I]);
}

if (LLVM_UNLIKELY(HasNonASCII))
goto MultiByteUTF8;

bool HasSlash = false;
for (unsigned I = 0; I < 16; ++I) {
HasSlash |= CurPtr[I] == '/';
}
if (HasSlash)
break;
CurPtr += 16;
}
#endif

// It has to be one of the bytes scanned, increment to it and read one.
C = *CurPtr++;
}

// Loop to scan the remainder.
while (C != '/' && C != '\0')
C = *CurPtr++;
// Loop to scan the remainder, warning on invalid UTF-8
// if the corresponding warning is enabled, emitting a diagnostic only once
// per sequence that cannot be decoded.
while (C != '/' && C != '\0') {
if (isASCII(C)) {
UnicodeDecodingAlreadyDiagnosed = false;
C = *CurPtr++;
continue;
}
MultiByteUTF8:
unsigned Length = llvm::getUTF8SequenceSize(
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
if (Length == 0) {
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
UnicodeDecodingAlreadyDiagnosed = true;
C = *CurPtr++;
continue;
}
UnicodeDecodingAlreadyDiagnosed = false;
C = *(CurPtr += Length - 1);
}

if (C == '/') {
FoundSlash:
Expand Down
27 changes: 27 additions & 0 deletions clang/test/Lexer/comment-invalid-utf8.c
@@ -0,0 +1,27 @@
// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
// nowarn-no-diagnostics

// This file is purposefully encoded as windows-1252
// be careful when modifying.

//€
// expected-warning@-1 {{invalid UTF-8 in comment}}

// € ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž
// expected-warning@-1 6{{invalid UTF-8 in comment}}

/*€*/
// expected-warning@-1 {{invalid UTF-8 in comment}}

/*€ ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž*/
// expected-warning@-1 6{{invalid UTF-8 in comment}}

/*
*/
// expected-warning@-2 {{invalid UTF-8 in comment}}

// abcd
// €abcd
// expected-warning@-1 {{invalid UTF-8 in comment}}
2 changes: 2 additions & 0 deletions llvm/include/llvm/Support/ConvertUTF.h
Expand Up @@ -181,6 +181,8 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);

unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);

unsigned getNumBytesForUTF8(UTF8 firstByte);

/*************************************************************************/
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Support/ConvertUTF.cpp
Expand Up @@ -417,6 +417,16 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
return isLegalUTF8(source, length);
}

/*
* Exported function to return the size of the first utf-8 code unit sequence,
* Or 0 if the sequence is not valid;
*/
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
int length = trailingBytesForUTF8[*source] + 1;
return (length > sourceEnd - source && isLegalUTF8(source, length)) ? length
: 0;
}

/* --------------------------------------------------------------------- */

static unsigned
Expand Down

0 comments on commit e3dc568

Please sign in to comment.