diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 16eb67f2e27c8..02ccd51dcb686 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -322,7 +322,8 @@ end ### Extensions supported when enabled by options * C-style backslash escape sequences in quoted CHARACTER literals - (but not Hollerith) [-fbackslash] + (but not Hollerith) [-fbackslash], including Unicode escapes + with `\U`. * Logical abbreviations `.T.`, `.F.`, `.N.`, `.A.`, `.O.`, and `.X.` [-flogical-abbreviations] * `.XOR.` as a synonym for `.NEQV.` [-fxor-operator] diff --git a/flang/include/flang/Parser/characters.h b/flang/include/flang/Parser/characters.h index b3b82a4f0b9f7..dae0d3e2a0cff 100644 --- a/flang/include/flang/Parser/characters.h +++ b/flang/include/flang/Parser/characters.h @@ -237,6 +237,23 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert, }}; if (ch <= 0x7f) { emitOneByte(ch); + } else if (useHexadecimalEscapeSequences) { + insert('\\'); + insert('u'); + if (ch > 0xffff) { + unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf}, + c4{(ch >> 16) & 0xf}; + insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1); + insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2); + insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3); + insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4); + } + unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf}, + c4{ch & 0xf}; + insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1); + insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2); + insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3); + insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4); } else { EncodedCharacter encoded{EncodeCharacter(encoding, ch)}; for (int j{0}; j < encoded.bytes; ++j) { diff --git a/flang/lib/Evaluate/character.h b/flang/lib/Evaluate/character.h index ca24dd8e9413d..2d6747741161b 100644 --- a/flang/lib/Evaluate/character.h +++ b/flang/lib/Evaluate/character.h @@ -13,9 +13,7 @@ #include // Provides implementations of intrinsic functions operating on character -// scalars. No assumption is made regarding character encodings other than they -// must be compatible with ASCII (else, NEW_LINE, ACHAR and IACHAR need to be -// adapted). +// scalars. namespace Fortran::evaluate { @@ -34,13 +32,8 @@ template class CharacterUtils { // contain ASCII static std::int64_t ICHAR(const Character &c) { CHECK(c.length() == 1); - if constexpr (std::is_same_v) { - // char may be signed, so cast it first to unsigned to avoid having - // ichar(char(128_4)) returning -128 - return static_cast(c[0]); - } else { - return c[0]; - } + // Convert first to an unsigned integer type to avoid sign extension + return static_cast>(c[0]); } static Character NEW_LINE() { return Character{{NewLine()}}; } diff --git a/flang/lib/Parser/characters.cpp b/flang/lib/Parser/characters.cpp index dce20a4e5fe47..f6ac777ea874c 100644 --- a/flang/lib/Parser/characters.cpp +++ b/flang/lib/Parser/characters.cpp @@ -235,7 +235,30 @@ template DecodedCharacter DecodeCharacter( const char *cp, std::size_t bytes, bool backslashEscapes) { if (backslashEscapes && bytes >= 2 && *cp == '\\') { - return DecodeEscapedCharacters(cp, bytes); + if (ENCODING == Encoding::UTF_8 && bytes >= 6 && + ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) && + IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) && + IsHexadecimalDigit(cp[5])) { + char32_t ch{ + static_cast(4096 * HexadecimalDigitValue(cp[2]) + + 256 * HexadecimalDigitValue(cp[3]) + + 16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])), + }; + if (bytes >= 10 && IsHexadecimalDigit(cp[6]) && + IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) && + IsHexadecimalDigit(cp[9])) { + return {(ch << 16) | + (4096 * HexadecimalDigitValue(cp[6]) + + 256 * HexadecimalDigitValue(cp[7]) + + 16 * HexadecimalDigitValue(cp[8]) + + HexadecimalDigitValue(cp[9])), + 10}; + } else { + return {ch, 6}; + } + } else { + return DecodeEscapedCharacters(cp, bytes); + } } else { return DecodeRawCharacter(cp, bytes); } diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 6d4fa588cbf60..71e7f4edbd0e1 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -976,7 +976,12 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x, if (skipping) { --skipChars; } else if (auto ucs{DecodeUTF8(input)}) { - *x++ = *ucs; + if ((sizeof *x == 1 && *ucs > 0xff) || + (sizeof *x == 2 && *ucs > 0xffff)) { + *x++ = '?'; + } else { + *x++ = *ucs; + } --lengthChars; } else if (chunkBytes == 0) { // error recovery: skip bad encoding @@ -990,7 +995,12 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x, } else { char32_t buffer{0}; std::memcpy(&buffer, input, chunkBytes); - *x++ = buffer; + if ((sizeof *x == 1 && buffer > 0xff) || + (sizeof *x == 2 && buffer > 0xffff)) { + *x++ = '?'; + } else { + *x++ = buffer; + } --lengthChars; } } else if constexpr (sizeof *x > 1) { diff --git a/flang/test/Semantics/modfile60.f90 b/flang/test/Semantics/modfile60.f90 new file mode 100644 index 0000000000000..fdb0f8930fe06 --- /dev/null +++ b/flang/test/Semantics/modfile60.f90 @@ -0,0 +1,19 @@ +! RUN: %python %S/test_modfile.py %s %flang_fc1 -fbackslash +! Test Unicode escape sequences +module m + integer, parameter :: wide = 4 + character(kind=wide, len=20), parameter :: ch = wide_"\u1234 \u56789abc" + integer, parameter :: check(2) = [ iachar(ch(1:1)), iachar(ch(3:3)) ] + logical, parameter :: valid = all(check == [int(z'1234'), int(z'56789abc')]) +end + +!Expect: m.mod +!module m +!integer(4),parameter::wide=4_4 +!character(20_4,4),parameter::ch=4_"\341\210\264 \375\226\236\211\252\274 " +!integer(4),parameter::check(1_8:2_8)=[INTEGER(4)::4660_4,1450744508_4] +!intrinsic::iachar +!logical(4),parameter::valid=.true._4 +!intrinsic::all +!intrinsic::int +!end