Skip to content

Commit

Permalink
Revert "[Clang] Add a warning on invalid UTF-8 in comments."
Browse files Browse the repository at this point in the history
It is probable that this change crashes on the powerpc bots.

This reverts commit 355532a.
  • Loading branch information
cor3ntin committed Jul 9, 2022
1 parent 7ac7837 commit 50416e5
Show file tree
Hide file tree
Showing 8 changed files with 19 additions and 160 deletions.
6 changes: 2 additions & 4 deletions clang/docs/ReleaseNotes.rst
Expand Up @@ -279,11 +279,9 @@ Improvements to Clang's diagnostics
unevaluated operands of a ``typeid`` expression, as they are now
modeled correctly in the CFG. This fixes
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will
- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will
suggest a fix if the decl being assigned is a parameter that shadows a data
member of the contained class.
- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
comments.

Non-comprehensive list of changes in this release
-------------------------------------------------
Expand Down Expand Up @@ -594,7 +592,7 @@ AST Matchers

- Added ``forEachTemplateArgument`` matcher which creates a match every
time a ``templateArgument`` matches the matcher supplied to it.

- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
literal expressions.

Expand Down
2 changes: 0 additions & 2 deletions clang/include/clang/Basic/DiagnosticLexKinds.td
Expand Up @@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
def warn_invalid_utf8_in_comment : Extension<
"invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<
Expand Down
110 changes: 16 additions & 94 deletions clang/lib/Lex/Lexer.cpp
Expand Up @@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.

// C++23 [lex.phases] p1
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
// diagnostic only once per entire ill-formed subsequence to avoid
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
bool UnicodeDecodingAlreadyDiagnosed = false;

char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
while (isASCII(C) && C != 0 && // Potentially EOF.
C != '\n' && C != '\r') { // Newline or DOS-style newline.
while (C != 0 && // Potentially EOF.
C != '\n' && C != '\r') // Newline or DOS-style newline.
C = *++CurPtr;
UnicodeDecodingAlreadyDiagnosed = false;
}

if (!isASCII(C)) {
unsigned Length = llvm::getUTF8SequenceSize(
(const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
if (Length == 0) {
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
UnicodeDecodingAlreadyDiagnosed = true;
++CurPtr;
} else {
UnicodeDecodingAlreadyDiagnosed = false;
CurPtr += Length;
}
continue;
}

const char *NextLine = CurPtr;
if (C != 0) {
Expand Down Expand Up @@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
if (C == '/')
C = *CurPtr++;

// C++23 [lex.phases] p1
// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
// diagnostic only once per entire ill-formed subsequence to avoid
// emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
bool UnicodeDecodingAlreadyDiagnosed = false;

while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
Expand All @@ -2703,22 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
if (!isASCII(C))
goto MultiByteUTF8;
while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
C = *CurPtr++;
}

if (C == '/') goto FoundSlash;

#ifdef __SSE2__
__m128i Slashes = _mm_set1_epi8('/');
while (CurPtr + 16 < BufferEnd) {
int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
if (LLVM_UNLIKELY(Mask != 0)) {
CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
goto MultiByteUTF8;
}
// look for slashes
while (CurPtr+16 <= BufferEnd) {
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
Slashes));
if (cmp != 0) {
Expand All @@ -2731,71 +2693,31 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
CurPtr += 16;
}
#elif __ALTIVEC__
__vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80};
__vector unsigned char Slashes = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
while (CurPtr + 16 < BufferEnd) {
if (LLVM_UNLIKELY(
vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
goto MultiByteUTF8;
if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
C = *CurPtr++;
break;
}
while (CurPtr + 16 <= BufferEnd &&
!vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
CurPtr += 16;
}

#else
while (CurPtr + 16 < BufferEnd) {
bool HasNonASCII = false;
for (unsigned I = 0; I < 16; ++I)
HasNonASCII |= !isASCII(CurPtr[I]);

if (LLVM_UNLIKELY(HasNonASCII))
goto MultiByteUTF8;

bool HasSlash = false;
for (unsigned I = 0; I < 16; ++I)
HasSlash |= CurPtr[I] == '/';
if (HasSlash)
break;
CurPtr += 16;
// Scan for '/' quickly. Many block comments are very large.
while (CurPtr[0] != '/' &&
CurPtr[1] != '/' &&
CurPtr[2] != '/' &&
CurPtr[3] != '/' &&
CurPtr+4 < BufferEnd) {
CurPtr += 4;
}
#endif

// It has to be one of the bytes scanned, increment to it and read one.
C = *CurPtr++;
}

// Loop to scan the remainder, warning on invalid UTF-8
// if the corresponding warning is enabled, emitting a diagnostic only once
// per sequence that cannot be decoded.
while (C != '/' && C != '\0') {
if (isASCII(C)) {
UnicodeDecodingAlreadyDiagnosed = false;
C = *CurPtr++;
continue;
}
MultiByteUTF8:
// CurPtr is 1 code unit past C, so to decode
// the codepoint, we need to read from the previous position.
unsigned Length = llvm::getUTF8SequenceSize(
(const llvm::UTF8 *)CurPtr-1, (const llvm::UTF8 *)BufferEnd);
if (Length == 0) {
if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
Diag(CurPtr-1, diag::warn_invalid_utf8_in_comment);
UnicodeDecodingAlreadyDiagnosed = true;
}
else {
UnicodeDecodingAlreadyDiagnosed = false;
CurPtr += Length - 1;
}
// Loop to scan the remainder.
while (C != '/' && C != '\0')
C = *CurPtr++;
}

if (C == '/') {
FoundSlash:
Expand Down
27 changes: 0 additions & 27 deletions clang/test/Lexer/comment-invalid-utf8.c

This file was deleted.

20 changes: 0 additions & 20 deletions clang/test/Lexer/comment-utf8.c

This file was deleted.

2 changes: 1 addition & 1 deletion clang/test/SemaCXX/static-assert.cpp
@@ -1,4 +1,4 @@
// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8
// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu

int f(); // expected-note {{declared here}}

Expand Down
2 changes: 0 additions & 2 deletions llvm/include/llvm/Support/ConvertUTF.h
Expand Up @@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);

unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);

unsigned getNumBytesForUTF8(UTF8 firstByte);

/*************************************************************************/
Expand Down
10 changes: 0 additions & 10 deletions llvm/lib/Support/ConvertUTF.cpp
Expand Up @@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
return isLegalUTF8(source, length);
}

/*
 * Exported function to return the size, in code units, of the first UTF-8
 * code unit sequence in [source, sourceEnd), or 0 if that sequence is not
 * valid.
 *
 * Returns 0 when the range is empty, when the sequence announced by the
 * lead byte would run past sourceEnd, or when isLegalUTF8 rejects it.
 */
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
  /* An empty range has no lead byte; do not dereference source at all. */
  if (source >= sourceEnd)
    return 0;

  /* The lead byte selects the expected total length of the sequence. */
  int length = trailingBytesForUTF8[*source] + 1;

  /* A sequence truncated by the end of the range cannot be decoded. */
  if (length > sourceEnd - source)
    return 0;

  return isLegalUTF8(source, length) ? length : 0;
}

/* --------------------------------------------------------------------- */

static unsigned
Expand Down

0 comments on commit 50416e5

Please sign in to comment.