From 11d2a9b7d3df66782d373c1446f945cebfb2ff74 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 16 Sep 2025 10:51:07 -0700 Subject: [PATCH] [flang] Fix UTF-8 minimality checks UTF-8 encodings are required to be minimal, but the checks for minimality of 3-byte and 4-byte sequences were incorrect. Fix. --- flang/lib/Parser/characters.cpp | 15 +++++++++------ flang/test/Parser/utf8-01.f90 | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 flang/test/Parser/utf8-01.f90 diff --git a/flang/lib/Parser/characters.cpp b/flang/lib/Parser/characters.cpp index 1a00b16eefe9d..69b6d2ed5fafb 100644 --- a/flang/lib/Parser/characters.cpp +++ b/flang/lib/Parser/characters.cpp @@ -158,21 +158,24 @@ DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>( const char *cp, std::size_t bytes) { auto p{reinterpret_cast<const std::uint8_t *>(cp)}; char32_t ch{*p}; - if (ch <= 0x7f) { + // Valid UTF-8 encodings must be minimal. + if (ch <= 0x7f) { // 1 byte: 7 bits of payload return {ch, 1}; - } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && - ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) { + } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && + ((p[1] | p[2] | p[3]) & 0xc0) == 0x80 && (ch > 0xf0 || p[1] > 0x8f)) { + // 4 bytes: 3+6+6+6=21 bits of payload ch = ((ch & 7) << 6) | (p[1] & 0x3f); ch = (ch << 6) | (p[2] & 0x3f); ch = (ch << 6) | (p[3] & 0x3f); return {ch, 4}; - } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && - ((p[1] | p[2]) & 0xc0) == 0x80) { + } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && + ((p[1] | p[2]) & 0xc0) == 0x80 && (ch > 0xe0 || p[1] > 0x9f)) { + // 3 bytes: 4+6+6=16 bits of payload ch = ((ch & 0xf) << 6) | (p[1] & 0x3f); ch = (ch << 6) | (p[2] & 0x3f); return {ch, 3}; } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && - (p[1] & 0xc0) == 0x80) { + (p[1] & 0xc0) == 0x80) { // 2 bytes: 5+6=11 bits of payload ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f); return {ch, 2}; } else { diff --git a/flang/test/Parser/utf8-01.f90 
b/flang/test/Parser/utf8-01.f90 new file mode 100644 index 0000000000000..3a3745524d807 --- /dev/null +++ b/flang/test/Parser/utf8-01.f90 @@ -0,0 +1,15 @@ +!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s + +character(kind=4), parameter :: c(2) = [character(kind=4) :: & +4_'🍌', 4_'水' ] +print *, '🍌' +print *, 4_'🍌' +print *, '水' +print *, 4_'水' +end + +!CHECK: CHARACTER(KIND=4_4), PARAMETER :: c(2_4) = [CHARACTER(KIND=4,LEN=1)::4_"\360\237\215\214",4_"\346\260\264"] +!CHECK: PRINT *, "\360\237\215\214" +!CHECK: PRINT *, 4_"\360\237\215\214" +!CHECK: PRINT *, "\346\260\264" +!CHECK: PRINT *, 4_"\346\260\264"