diff --git a/flang/include/flang/Runtime/iostat.h b/flang/include/flang/Runtime/iostat.h index 0c0b3f4b3f7f3..d0e8ea7d65747 100644 --- a/flang/include/flang/Runtime/iostat.h +++ b/flang/include/flang/Runtime/iostat.h @@ -66,6 +66,7 @@ enum Iostat { IostatShortRead, IostatMissingTerminator, IostatBadUnformattedRecord, + IostatUTF8Decoding, }; const char *IostatErrorString(int); diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt index 6a80b65ba0342..62f251f7dbbb4 100644 --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -82,6 +82,7 @@ add_flang_library(FortranRuntime type-info.cpp unit.cpp unit-map.cpp + utf.cpp LINK_LIBS FortranDecimal diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h index 7e098d8cfca99..1ca659a39a53a 100644 --- a/flang/runtime/descriptor-io.h +++ b/flang/runtime/descriptor-io.h @@ -168,17 +168,17 @@ inline bool FormattedCharacterIO( for (std::size_t j{0}; j < numElements; ++j) { A *x{&ExtractElement(io, descriptor, subscripts)}; if (listOutput) { - if (!ListDirectedDefaultCharacterOutput(io, *listOutput, x, length)) { + if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) { return false; } } else if (auto edit{io.GetNextDataEdit()}) { if constexpr (DIR == Direction::Output) { - if (!EditDefaultCharacterOutput(io, *edit, x, length)) { + if (!EditCharacterOutput(io, *edit, x, length)) { return false; } } else { if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditDefaultCharacterInput(io, *edit, x, length)) { + if (EditCharacterInput(io, *edit, x, length)) { anyInput = true; } else { return anyInput && edit->IsNamelist(); @@ -456,7 +456,10 @@ static bool DescriptorIO(IoStatementState &io, const Descriptor &descriptor) { switch (kind) { case 1: return FormattedCharacterIO(io, descriptor); - // TODO cases 2, 4 + case 2: + return FormattedCharacterIO(io, descriptor); + case 4: + return FormattedCharacterIO(io, descriptor); default: handler.Crash( 
"DescriptorIO: Unimplemented CHARACTER kind (%d) in descriptor", diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index ee35bd4c76cde..aabe5df30f6d9 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -8,6 +8,7 @@ #include "edit-input.h" #include "namelist.h" +#include "utf.h" #include "flang/Common/real.h" #include "flang/Common/uint128.h" #include @@ -61,7 +62,6 @@ static bool ScanNumericPrefix(IoStatementState &io, const DataEdit &edit, if (next) { negative = *next == '-'; if (negative || *next == '+') { - io.GotChar(); io.SkipSpaces(remaining); next = io.NextInField(remaining, edit); } @@ -88,8 +88,7 @@ bool EditIntegerInput( case 'Z': return EditBOZInput(io, edit, n, 16, kind << 3); case 'A': // legacy extension - return EditDefaultCharacterInput( - io, edit, reinterpret_cast(n), kind); + return EditCharacterInput(io, edit, reinterpret_cast(n), kind); default: io.GetIoErrorHandler().SignalError(IostatErrorInFormat, "Data edit descriptor '%c' may not be used with an INTEGER data item", @@ -260,9 +259,10 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io, next = io.NextInField(remaining, edit); } if (!next) { // NextInField fails on separators like ')' - next = io.GetCurrentChar(); + std::size_t byteCount{0}; + next = io.GetCurrentChar(byteCount); if (next && *next == ')') { - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); } } } else if (remaining) { @@ -427,8 +427,7 @@ bool EditRealInput(IoStatementState &io, const DataEdit &edit, void *n) { return EditBOZInput( io, edit, n, 16, common::BitsForBinaryPrecision(binaryPrecision)); case 'A': // legacy extension - return EditDefaultCharacterInput( - io, edit, reinterpret_cast(n), KIND); + return EditCharacterInput(io, edit, reinterpret_cast(n), KIND); default: io.GetIoErrorHandler().SignalError(IostatErrorInFormat, "Data edit descriptor '%c' may not be used for REAL input", @@ -487,11 +486,13 @@ bool 
EditLogicalInput(IoStatementState &io, const DataEdit &edit, bool &x) { } // See 13.10.3.1 paragraphs 7-9 in Fortran 2018 +template static bool EditDelimitedCharacterInput( - IoStatementState &io, char *x, std::size_t length, char32_t delimiter) { + IoStatementState &io, CHAR *x, std::size_t length, char32_t delimiter) { bool result{true}; while (true) { - auto ch{io.GetCurrentChar()}; + std::size_t byteCount{0}; + auto ch{io.GetCurrentChar(byteCount)}; if (!ch) { if (io.AdvanceRecord()) { continue; @@ -500,12 +501,12 @@ static bool EditDelimitedCharacterInput( break; } } - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); if (*ch == delimiter) { - auto next{io.GetCurrentChar()}; + auto next{io.GetCurrentChar(byteCount)}; if (next && *next == delimiter) { // Repeated delimiter: use as character value - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); } else { break; // closing delimiter } @@ -519,19 +520,23 @@ static bool EditDelimitedCharacterInput( return result; } -static bool EditListDirectedDefaultCharacterInput( - IoStatementState &io, char *x, std::size_t length, const DataEdit &edit) { - auto ch{io.GetCurrentChar()}; +template +static bool EditListDirectedCharacterInput( + IoStatementState &io, CHAR *x, std::size_t length, const DataEdit &edit) { + std::size_t byteCount{0}; + auto ch{io.GetCurrentChar(byteCount)}; if (ch && (*ch == '\'' || *ch == '"')) { - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); return EditDelimitedCharacterInput(io, x, length, *ch); } if (IsNamelistName(io) || io.GetConnectionState().IsAtEOF()) { return false; } // Undelimited list-directed character input: stop at a value separator - // or the end of the current record. - std::optional remaining{length}; + // or the end of the current record. Subtlety: the "remaining" count + // here is a dummy that's used to avoid the interpretation of separators + // in NextInField. 
+ std::optional remaining{maxUTF8Bytes}; while (std::optional next{io.NextInField(remaining, edit)}) { switch (*next) { case ' ': @@ -544,17 +549,19 @@ static bool EditListDirectedDefaultCharacterInput( default: *x++ = *next; --length; + remaining = maxUTF8Bytes; } } std::fill_n(x, length, ' '); return true; } -bool EditDefaultCharacterInput( - IoStatementState &io, const DataEdit &edit, char *x, std::size_t length) { +template +bool EditCharacterInput( + IoStatementState &io, const DataEdit &edit, CHAR *x, std::size_t length) { switch (edit.descriptor) { case DataEdit::ListDirected: - return EditListDirectedDefaultCharacterInput(io, x, length, edit); + return EditListDirectedCharacterInput(io, x, length, edit); case 'A': case 'G': break; @@ -564,7 +571,8 @@ bool EditDefaultCharacterInput( edit.descriptor); return false; } - if (io.GetConnectionState().IsAtEOF()) { + const ConnectionState &connection{io.GetConnectionState()}; + if (connection.IsAtEOF()) { return false; } std::size_t remaining{length}; @@ -577,26 +585,9 @@ bool EditDefaultCharacterInput( const char *input{nullptr}; std::size_t ready{0}; bool hitEnd{false}; - if (remaining > length) { - // Discard leading bytes. - // These bytes don't count towards INQUIRE(IOLENGTH=). - std::size_t skip{remaining - length}; - do { - if (ready == 0) { - ready = io.GetNextInputBytes(input); - if (ready == 0) { - hitEnd = true; - break; - } - } - std::size_t chunk{std::min(skip, ready)}; - io.HandleRelativePosition(chunk); - ready -= chunk; - input += chunk; - skip -= chunk; - } while (skip > 0); - remaining = length; - } + // Skip leading bytes. + // These bytes don't count towards INQUIRE(IOLENGTH=). + std::size_t skip{remaining > length ? remaining - length : 0}; // Transfer payload bytes; these do count. 
while (remaining > 0) { if (ready == 0) { @@ -606,18 +597,41 @@ bool EditDefaultCharacterInput( break; } } - std::size_t chunk{std::min(remaining, ready)}; - std::memcpy(x, input, chunk); - x += chunk; + std::size_t chunk; + bool skipping{skip > 0}; + if (connection.isUTF8) { + chunk = MeasureUTF8Bytes(*input); + if (skipping) { + --skip; + } else if (auto ucs{DecodeUTF8(input)}) { + *x++ = *ucs; + --length; + } else if (chunk == 0) { + // error recovery: skip bad encoding + chunk = 1; + } + --remaining; + } else { + if (skipping) { + chunk = std::min(skip, ready); + skip -= chunk; + } else { + chunk = std::min(remaining, ready); + std::memcpy(x, input, chunk); + x += chunk; + length -= chunk; + } + remaining -= chunk; + } input += chunk; - io.GotChar(chunk); + if (!skipping) { + io.GotChar(chunk); + } io.HandleRelativePosition(chunk); ready -= chunk; - remaining -= chunk; - length -= chunk; } // Pad the remainder of the input variable, if any. - std::memset(x, ' ', length); + std::fill_n(x, length, ' '); if (hitEnd) { io.CheckForEndOfRecord(); // signal any needed error } @@ -631,4 +645,12 @@ template bool EditRealInput<8>(IoStatementState &, const DataEdit &, void *); template bool EditRealInput<10>(IoStatementState &, const DataEdit &, void *); // TODO: double/double template bool EditRealInput<16>(IoStatementState &, const DataEdit &, void *); + +template bool EditCharacterInput( + IoStatementState &, const DataEdit &, char *, std::size_t); +template bool EditCharacterInput( + IoStatementState &, const DataEdit &, char16_t *, std::size_t); +template bool EditCharacterInput( + IoStatementState &, const DataEdit &, char32_t *, std::size_t); + } // namespace Fortran::runtime::io diff --git a/flang/runtime/edit-input.h b/flang/runtime/edit-input.h index a8b0e76cfefd4..61844a1199a74 100644 --- a/flang/runtime/edit-input.h +++ b/flang/runtime/edit-input.h @@ -21,8 +21,10 @@ template bool EditRealInput(IoStatementState &, const DataEdit &, void *); bool 
EditLogicalInput(IoStatementState &, const DataEdit &, bool &); -bool EditDefaultCharacterInput( - IoStatementState &, const DataEdit &, char *, std::size_t); + +template +bool EditCharacterInput( + IoStatementState &, const DataEdit &, CHAR *, std::size_t); extern template bool EditRealInput<2>( IoStatementState &, const DataEdit &, void *); @@ -37,5 +39,13 @@ extern template bool EditRealInput<10>( // TODO: double/double extern template bool EditRealInput<16>( IoStatementState &, const DataEdit &, void *); + +extern template bool EditCharacterInput( + IoStatementState &, const DataEdit &, char *, std::size_t); +extern template bool EditCharacterInput( + IoStatementState &, const DataEdit &, char16_t *, std::size_t); +extern template bool EditCharacterInput( + IoStatementState &, const DataEdit &, char32_t *, std::size_t); + } // namespace Fortran::runtime::io #endif // FORTRAN_RUNTIME_EDIT_INPUT_H_ diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp index aa5ef489d22e7..e3bb5abb2bb98 100644 --- a/flang/runtime/edit-output.cpp +++ b/flang/runtime/edit-output.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "edit-output.h" +#include "utf.h" #include "flang/Common/uint128.h" #include @@ -53,7 +54,7 @@ bool EditIntegerOutput(IoStatementState &io, const DataEdit &edit, } break; case 'A': // legacy extension - return EditDefaultCharacterOutput( + return EditCharacterOutput( io, edit, reinterpret_cast(&n), sizeof n); default: io.GetIoErrorHandler().Crash( @@ -434,7 +435,7 @@ template bool RealOutputEditing::Edit(const DataEdit &edit) { case 'G': return Edit(EditForGOutput(edit)); case 'A': // legacy extension - return EditDefaultCharacterOutput( + return EditCharacterOutput( io_, edit, reinterpret_cast(&x_), sizeof x_); default: if (edit.IsListDirected()) { @@ -467,8 +468,9 @@ bool EditLogicalOutput(IoStatementState &io, const DataEdit &edit, bool truth) { } } -bool 
ListDirectedDefaultCharacterOutput(IoStatementState &io, - ListDirectedStatementState &list, const char *x, +template +bool ListDirectedCharacterOutput(IoStatementState &io, + ListDirectedStatementState &list, const CHAR *x, std::size_t length) { bool ok{true}; MutableModes &modes{io.mutableModes()}; @@ -477,11 +479,11 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io, ok = ok && list.EmitLeadingSpaceOrAdvance(io); // Value is delimited with ' or " marks, and interior // instances of that character are doubled. - auto EmitOne{[&](char ch) { + auto EmitOne{[&](CHAR ch) { if (connection.NeedAdvance(1)) { ok = ok && io.AdvanceRecord(); } - ok = ok && io.Emit(&ch, 1); + ok = ok && io.EmitEncoded(&ch, 1); }}; EmitOne(modes.delim); for (std::size_t j{0}; j < length; ++j) { @@ -494,7 +496,7 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io, // the same thing when tested with this case. // This runtime splits the doubled delimiters across // two records for lack of a better alternative. - if (x[j] == modes.delim) { + if (x[j] == static_cast(modes.delim)) { EmitOne(x[j]); } EmitOne(x[j]); @@ -504,12 +506,15 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io, // Undelimited list-directed output ok = ok && list.EmitLeadingSpaceOrAdvance(io, length > 0 ? 1 : 0, true); std::size_t put{0}; + std::size_t oneIfUTF8{connection.isUTF8 ? 
1 : length}; while (ok && put < length) { - auto chunk{std::min(length - put, connection.RemainingSpaceInRecord())}; - ok = ok && io.Emit(x + put, chunk); - put += chunk; - if (put < length) { - ok = ok && io.AdvanceRecord() && io.Emit(" ", 1); + if (std::size_t chunk{std::min( + std::min(length - put, oneIfUTF8), + connection.RemainingSpaceInRecord())}) { + ok = io.EmitEncoded(x + put, chunk); + put += chunk; + } else { + ok = io.AdvanceRecord() && io.Emit(" ", 1); } } list.set_lastWasUndelimitedCharacter(true); @@ -517,8 +522,9 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io, return ok; } -bool EditDefaultCharacterOutput(IoStatementState &io, const DataEdit &edit, - const char *x, std::size_t length) { +template +bool EditCharacterOutput(IoStatementState &io, const DataEdit &edit, + const CHAR *x, std::size_t length) { switch (edit.descriptor) { case 'A': case 'G': @@ -532,7 +538,7 @@ bool EditDefaultCharacterOutput(IoStatementState &io, const DataEdit &edit, int len{static_cast(length)}; int width{edit.width.value_or(len)}; return io.EmitRepeated(' ', std::max(0, width - len)) && - io.Emit(x, std::min(width, len)); + io.EmitEncoded(x, std::min(width, len)); } template bool EditIntegerOutput<1>( @@ -553,4 +559,22 @@ template class RealOutputEditing<8>; template class RealOutputEditing<10>; // TODO: double/double template class RealOutputEditing<16>; + +template bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const char *, + std::size_t chars); +template bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const char16_t *, + std::size_t chars); +template bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const char32_t *, + std::size_t chars); + +template bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const char *, std::size_t chars); +template bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const char16_t *, 
std::size_t chars); +template bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars); + } // namespace Fortran::runtime::io diff --git a/flang/runtime/edit-output.h b/flang/runtime/edit-output.h index bcb6fb0b6bfa7..bd1377e3a18c4 100644 --- a/flang/runtime/edit-output.h +++ b/flang/runtime/edit-output.h @@ -94,10 +94,30 @@ template class RealOutputEditing : public RealOutputEditingBase { bool ListDirectedLogicalOutput( IoStatementState &, ListDirectedStatementState &, bool); bool EditLogicalOutput(IoStatementState &, const DataEdit &, bool); -bool ListDirectedDefaultCharacterOutput(IoStatementState &, - ListDirectedStatementState &, const char *, std::size_t); -bool EditDefaultCharacterOutput( - IoStatementState &, const DataEdit &, const char *, std::size_t); + +template +bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const CHAR *, + std::size_t chars); +extern template bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const char *, + std::size_t chars); +extern template bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const char16_t *, + std::size_t chars); +extern template bool ListDirectedCharacterOutput(IoStatementState &, + ListDirectedStatementState &, const char32_t *, + std::size_t chars); + +template +bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const CHAR *, std::size_t chars); +extern template bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const char *, std::size_t chars); +extern template bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars); +extern template bool EditCharacterOutput( + IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars); extern template bool EditIntegerOutput<1>( IoStatementState &, const DataEdit &, std::int8_t); diff --git a/flang/runtime/environment.cpp 
b/flang/runtime/environment.cpp index 53af239facea2..7ecbdce6bf961 100644 --- a/flang/runtime/environment.cpp +++ b/flang/runtime/environment.cpp @@ -78,6 +78,17 @@ void ExecutionEnvironment::Configure( } } + if (auto *x{std::getenv("DEFAULT_UTF8")}) { + char *end; + auto n{std::strtol(x, &end, 10)}; + if (n >= 0 && n <= 1 && *end == '\0') { + defaultUTF8 = n != 0; + } else { + std::fprintf( + stderr, "Fortran runtime: DEFAULT_UTF8=%s is invalid; ignored\n", x); + } + } + // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment } diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h index 7db6cf3f5723b..b6223a88446ce 100644 --- a/flang/runtime/environment.h +++ b/flang/runtime/environment.h @@ -30,19 +30,23 @@ enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap }; std::optional GetConvertFromString(const char *, std::size_t); struct ExecutionEnvironment { + constexpr ExecutionEnvironment(){}; void Configure(int argc, const char *argv[], const char *envp[]); const char *GetEnv( const char *name, std::size_t name_length, const Terminator &terminator); - int argc; - const char **argv; - const char **envp; + int argc{0}; + const char **argv{nullptr}; + const char **envp{nullptr}; - int listDirectedOutputLineLengthLimit; // FORT_FMT_RECL - enum decimal::FortranRounding defaultOutputRoundingMode; - Convert conversion; // FORT_CONVERT - bool noStopMessage; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP" + int listDirectedOutputLineLengthLimit{79}; // FORT_FMT_RECL + enum decimal::FortranRounding defaultOutputRoundingMode{ + decimal::FortranRounding::RoundNearest}; // RP(==PN) + Convert conversion{Convert::Unknown}; // FORT_CONVERT + bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP" + bool defaultUTF8{false}; // DEFAULT_UTF8 }; + extern ExecutionEnvironment executionEnvironment; } // namespace Fortran::runtime diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp index 
0c833ba548ec7..39a8e4b2c9c4e 100644 --- a/flang/runtime/internal-unit.cpp +++ b/flang/runtime/internal-unit.cpp @@ -102,21 +102,6 @@ std::size_t InternalDescriptorUnit::GetNextInputBytes( } } -template -std::optional InternalDescriptorUnit::GetCurrentChar( - IoErrorHandler &handler) { - const char *p{nullptr}; - std::size_t bytes{GetNextInputBytes(p, handler)}; - if (bytes == 0) { - return std::nullopt; - } else { - if (isUTF8) { - // TODO: UTF-8 decoding - } - return *p; - } -} - template bool InternalDescriptorUnit::AdvanceRecord(IoErrorHandler &handler) { if (currentRecordNumber >= endfileRecordNumber.value_or(0)) { diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h index ad52cc761de53..e59866013188c 100644 --- a/flang/runtime/internal-unit.h +++ b/flang/runtime/internal-unit.h @@ -32,7 +32,6 @@ template class InternalDescriptorUnit : public ConnectionState { bool Emit(const char *, std::size_t, IoErrorHandler &); std::size_t GetNextInputBytes(const char *&, IoErrorHandler &); - std::optional GetCurrentChar(IoErrorHandler &); bool AdvanceRecord(IoErrorHandler &); void BackspaceRecord(IoErrorHandler &); diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index 1a8b06068802d..ec824d9b3cdff 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -11,11 +11,13 @@ #include "format.h" #include "tools.h" #include "unit.h" +#include "utf.h" #include "flang/Runtime/memory.h" #include #include #include #include +#include namespace Fortran::runtime::io { @@ -357,7 +359,6 @@ bool ExternalIoStatementState::Emit( Crash( "ExternalIoStatementState::Emit(char16_t) called for input statement"); } - // TODO: UTF-8 encoding return unit().Emit(reinterpret_cast(data), chars * sizeof *data, sizeof *data, *this); } @@ -369,7 +370,6 @@ bool ExternalIoStatementState::Emit( Crash( "ExternalIoStatementState::Emit(char32_t) called for input statement"); } - // TODO: UTF-8 encoding return unit().Emit(reinterpret_cast(data), chars * 
sizeof *data, sizeof *data, *this); } @@ -472,6 +472,30 @@ bool IoStatementState::Emit(const char32_t *data, std::size_t chars) { return std::visit([=](auto &x) { return x.get().Emit(data, chars); }, u_); } +template +bool IoStatementState::EmitEncoded(const CHAR *data0, std::size_t chars) { + // Don't allow sign extension + using UnsignedChar = std::make_unsigned_t; + const UnsignedChar *data{reinterpret_cast(data0)}; + if (GetConnectionState().isUTF8) { + char buffer[256]; + std::size_t at{0}; + while (chars-- > 0) { + auto len{EncodeUTF8(buffer + at, *data++)}; + at += len; + if (at + maxUTF8Bytes > sizeof buffer) { + if (!Emit(buffer, at)) { + return false; + } + at = 0; + } + } + return at == 0 || Emit(buffer, at); + } else { + return Emit(data0, chars); + } +} + bool IoStatementState::Receive( char *data, std::size_t n, std::size_t elementBytes) { return std::visit( @@ -533,6 +557,30 @@ ExternalFileUnit *IoStatementState::GetExternalFileUnit() const { return std::visit([](auto &x) { return x.get().GetExternalFileUnit(); }, u_); } +std::optional IoStatementState::GetCurrentChar( + std::size_t &byteCount) { + const char *p{nullptr}; + std::size_t bytes{GetNextInputBytes(p)}; + if (bytes == 0) { + byteCount = 0; + return std::nullopt; + } else { + if (GetConnectionState().isUTF8) { + std::size_t length{MeasureUTF8Bytes(*p)}; + if (length <= bytes) { + if (auto result{DecodeUTF8(p)}) { + byteCount = length; + return result; + } + } + GetIoErrorHandler().SignalError(IostatUTF8Decoding); + // Error recovery: return the next byte + } + byteCount = 1; + return *p; + } +} + bool IoStatementState::EmitRepeated(char ch, std::size_t n) { return std::visit( [=](auto &x) { @@ -561,8 +609,9 @@ bool IoStatementState::EmitField( std::optional IoStatementState::NextInField( std::optional &remaining, const DataEdit &edit) { + std::size_t byteCount{0}; if (!remaining) { // Stream, list-directed, or NAMELIST - if (auto next{GetCurrentChar()}) { + if (auto 
next{GetCurrentChar(byteCount)}) { if (edit.IsListDirected()) { // list-directed or NAMELIST: check for separators switch (*next) { @@ -587,15 +636,18 @@ std::optional IoStatementState::NextInField( break; } } - HandleRelativePosition(1); - GotChar(); + HandleRelativePosition(byteCount); + GotChar(byteCount); return next; } } else if (*remaining > 0) { - if (auto next{GetCurrentChar()}) { - --*remaining; - HandleRelativePosition(1); - GotChar(); + if (auto next{GetCurrentChar(byteCount)}) { + if (byteCount > static_cast(*remaining)) { + return std::nullopt; + } + *remaining -= byteCount; + HandleRelativePosition(byteCount); + GotChar(byteCount); return next; } if (CheckForEndOfRecord()) { // do padding @@ -708,12 +760,13 @@ ListDirectedStatementState::GetNextDataEdit( if (edit.modes.editingFlags & decimalComma) { comma = ';'; } + std::size_t byteCount{0}; if (remaining_ > 0 && !realPart_) { // "r*c" repetition in progress RUNTIME_CHECK(io.GetIoErrorHandler(), repeatPosition_.has_value()); repeatPosition_.reset(); // restores the saved position if (!imaginaryPart_) { edit.repeat = std::min(remaining_, maxRepeat); - auto ch{io.GetCurrentChar()}; + auto ch{io.GetCurrentChar(byteCount)}; if (!ch || *ch == ' ' || *ch == '\t' || *ch == comma) { // "r*" repeated null edit.descriptor = DataEdit::ListDirectedNullValue; @@ -733,14 +786,14 @@ ListDirectedStatementState::GetNextDataEdit( imaginaryPart_ = true; edit.descriptor = DataEdit::ListDirectedImaginaryPart; } - auto ch{io.GetNextNonBlank()}; + auto ch{io.GetNextNonBlank(byteCount)}; if (ch && *ch == comma && eatComma_) { // Consume comma & whitespace after previous item. // This includes the comma between real and imaginary components // in list-directed/NAMELIST complex input. // (When DECIMAL='COMMA', the comma is actually a semicolon.) 
- io.HandleRelativePosition(1); - ch = io.GetNextNonBlank(); + io.HandleRelativePosition(byteCount); + ch = io.GetNextNonBlank(byteCount); } eatComma_ = true; if (!ch) { @@ -768,12 +821,12 @@ ListDirectedStatementState::GetNextDataEdit( break; } r = 10 * r + (*ch - '0'); - io.HandleRelativePosition(1); - ch = io.GetCurrentChar(); + io.HandleRelativePosition(byteCount); + ch = io.GetCurrentChar(byteCount); } while (ch && *ch >= '0' && *ch <= '9'); if (r > 0 && ch && *ch == '*') { // subtle: r must be nonzero - io.HandleRelativePosition(1); - ch = io.GetCurrentChar(); + io.HandleRelativePosition(byteCount); + ch = io.GetCurrentChar(byteCount); if (ch && *ch == '/') { // r*/ hitSlash_ = true; edit.descriptor = DataEdit::ListDirectedNullValue; @@ -793,7 +846,7 @@ ListDirectedStatementState::GetNextDataEdit( } if (!imaginaryPart_ && ch && *ch == '(') { realPart_ = true; - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); edit.descriptor = DataEdit::ListDirectedRealPart; } return edit; @@ -1445,4 +1498,10 @@ int ErroneousIoStatementState::EndIoStatement() { return IoStatementBase::EndIoStatement(); } +template bool IoStatementState::EmitEncoded(const char *, std::size_t); +template bool IoStatementState::EmitEncoded( + const char16_t *, std::size_t); +template bool IoStatementState::EmitEncoded( + const char32_t *, std::size_t); + } // namespace Fortran::runtime::io diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h index 2c43151296b8a..0ed14e5ad6a4d 100644 --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -90,6 +90,7 @@ class IoStatementState { bool Emit(const char *, std::size_t); bool Emit(const char16_t *, std::size_t chars); bool Emit(const char32_t *, std::size_t chars); + template bool EmitEncoded(const CHAR *, std::size_t); bool Receive(char *, std::size_t, std::size_t elementBytes = 0); std::size_t GetNextInputBytes(const char *&); bool AdvanceRecord(int = 1); @@ -123,16 +124,7 @@ class IoStatementState { } // Vacant 
after the end of the current record - std::optional GetCurrentChar() { - const char *p{nullptr}; - std::size_t bytes{GetNextInputBytes(p)}; - if (bytes == 0) { - return std::nullopt; - } else { - // TODO: UTF-8 decoding; may have to get more bytes in a loop - return *p; - } - } + std::optional GetCurrentChar(std::size_t &byteCount); bool EmitRepeated(char, std::size_t); bool EmitField(const char *, std::size_t length, std::size_t width); @@ -144,7 +136,8 @@ class IoStatementState { const DataEdit &edit, std::optional &remaining) { remaining.reset(); if (edit.descriptor == DataEdit::ListDirected) { - GetNextNonBlank(); + std::size_t byteCount{0}; + GetNextNonBlank(byteCount); } else { if (edit.width.value_or(0) > 0) { remaining = *edit.width; @@ -156,15 +149,19 @@ class IoStatementState { std::optional SkipSpaces(std::optional &remaining) { while (!remaining || *remaining > 0) { - if (auto ch{GetCurrentChar()}) { + std::size_t byteCount{0}; + if (auto ch{GetCurrentChar(byteCount)}) { if (*ch != ' ' && *ch != '\t') { return ch; } - HandleRelativePosition(1); if (remaining) { - GotChar(); - --*remaining; + if (static_cast(*remaining) < byteCount) { + break; + } + GotChar(byteCount); + *remaining -= byteCount; } + HandleRelativePosition(byteCount); } else { break; } @@ -182,16 +179,16 @@ class IoStatementState { bool CheckForEndOfRecord(); // Skips spaces, advances records, and ignores NAMELIST comments - std::optional GetNextNonBlank() { - auto ch{GetCurrentChar()}; + std::optional GetNextNonBlank(std::size_t &byteCount) { + auto ch{GetCurrentChar(byteCount)}; bool inNamelist{mutableModes().inNamelist}; while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) { if (ch && (*ch == ' ' || *ch == '\t')) { - HandleRelativePosition(1); + HandleRelativePosition(byteCount); } else if (!AdvanceRecord()) { return std::nullopt; } - ch = GetCurrentChar(); + ch = GetCurrentChar(byteCount); } return ch; } @@ -721,5 +718,12 @@ class ErroneousIoStatementState : public 
IoStatementBase { ConnectionState connection_; }; +extern template bool IoStatementState::EmitEncoded( + const char *, std::size_t); +extern template bool IoStatementState::EmitEncoded( + const char16_t *, std::size_t); +extern template bool IoStatementState::EmitEncoded( + const char32_t *, std::size_t); + } // namespace Fortran::runtime::io #endif // FORTRAN_RUNTIME_IO_STMT_H_ diff --git a/flang/runtime/iostat.cpp b/flang/runtime/iostat.cpp index f6305eaca6559..73cf2b4e58002 100644 --- a/flang/runtime/iostat.cpp +++ b/flang/runtime/iostat.cpp @@ -75,6 +75,8 @@ const char *IostatErrorString(int iostat) { return "Sequential record missing its terminator"; case IostatBadUnformattedRecord: return "Erroneous unformatted sequential file record structure"; + case IostatUTF8Decoding: + return "UTF-8 decoding error"; default: return nullptr; } diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index 762b885b56b3b..3e2c7a012bada 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -86,13 +86,14 @@ static constexpr char NormalizeIdChar(char32_t ch) { static bool GetLowerCaseName( IoStatementState &io, char buffer[], std::size_t maxLength) { - if (auto ch{io.GetNextNonBlank()}) { + std::size_t byteLength{0}; + if (auto ch{io.GetNextNonBlank(byteLength)}) { if (IsLegalIdStart(*ch)) { std::size_t j{0}; do { buffer[j] = NormalizeIdChar(*ch); - io.HandleRelativePosition(1); - ch = io.GetCurrentChar(); + io.HandleRelativePosition(byteLength); + ch = io.GetCurrentChar(byteLength); } while (++j < maxLength && ch && IsLegalIdChar(*ch)); buffer[j++] = '\0'; if (j <= maxLength) { @@ -107,19 +108,20 @@ static bool GetLowerCaseName( static std::optional GetSubscriptValue(IoStatementState &io) { std::optional value; - std::optional ch{io.GetCurrentChar()}; + std::size_t byteCount{0}; + std::optional ch{io.GetCurrentChar(byteCount)}; bool negate{ch && *ch == '-'}; if ((ch && *ch == '+') || negate) { - io.HandleRelativePosition(1); - ch = 
io.GetCurrentChar(); + io.HandleRelativePosition(byteCount); + ch = io.GetCurrentChar(byteCount); } bool overflow{false}; while (ch && *ch >= '0' && *ch <= '9') { SubscriptValue was{value.value_or(0)}; overflow |= was >= std::numeric_limits::max() / 10; value = 10 * was + *ch - '0'; - io.HandleRelativePosition(1); - ch = io.GetCurrentChar(); + io.HandleRelativePosition(byteCount); + ch = io.GetCurrentChar(byteCount); } if (overflow) { io.GetIoErrorHandler().SignalError( @@ -130,7 +132,7 @@ static std::optional GetSubscriptValue(IoStatementState &io) { if (value) { return -*value; } else { - io.HandleRelativePosition(-1); // give back '-' with no digits + io.HandleRelativePosition(-byteCount); // give back '-' with no digits } } return value; @@ -146,7 +148,8 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, int j{0}; std::size_t contiguousStride{source.ElementBytes()}; bool ok{true}; - std::optional ch{io.GetNextNonBlank()}; + std::size_t byteCount{0}; + std::optional ch{io.GetNextNonBlank(byteCount)}; char32_t comma{GetComma(io)}; for (; ch && *ch != ')'; ++j) { SubscriptValue dimLower{0}, dimUpper{0}, dimStride{0}; @@ -176,11 +179,11 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, } else { dimLower = *low; } - ch = io.GetNextNonBlank(); + ch = io.GetNextNonBlank(byteCount); } if (ch && *ch == ':') { - io.HandleRelativePosition(1); - ch = io.GetNextNonBlank(); + io.HandleRelativePosition(byteCount); + ch = io.GetNextNonBlank(byteCount); if (auto high{GetSubscriptValue(io)}) { if (*high > dimUpper) { if (ok) { @@ -194,14 +197,14 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, } else { dimUpper = *high; } - ch = io.GetNextNonBlank(); + ch = io.GetNextNonBlank(byteCount); } if (ch && *ch == ':') { - io.HandleRelativePosition(1); - ch = io.GetNextNonBlank(); + io.HandleRelativePosition(byteCount); + ch = io.GetNextNonBlank(byteCount); if (auto str{GetSubscriptValue(io)}) { dimStride = *str; - ch = 
io.GetNextNonBlank(); + ch = io.GetNextNonBlank(byteCount); } } } else { // scalar @@ -209,8 +212,8 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, dimStride = 0; } if (ch && *ch == comma) { - io.HandleRelativePosition(1); - ch = io.GetNextNonBlank(); + io.HandleRelativePosition(byteCount); + ch = io.GetNextNonBlank(byteCount); } if (ok) { lower[j] = dimLower; @@ -220,7 +223,7 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, } if (ok) { if (ch && *ch == ')') { - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); if (desc.EstablishPointerSection(source, lower, upper, stride)) { return true; } else { @@ -250,29 +253,30 @@ static bool HandleSubstring( // ambiguous within the parentheses. io.HandleRelativePosition(1); // skip '(' std::optional lower, upper; - std::optional ch{io.GetNextNonBlank()}; + std::size_t byteCount{0}; + std::optional ch{io.GetNextNonBlank(byteCount)}; if (ch) { if (*ch == ':') { lower = 1; } else { lower = GetSubscriptValue(io); - ch = io.GetNextNonBlank(); + ch = io.GetNextNonBlank(byteCount); } } if (ch && ch == ':') { - io.HandleRelativePosition(1); - ch = io.GetNextNonBlank(); + io.HandleRelativePosition(byteCount); + ch = io.GetNextNonBlank(byteCount); if (ch) { if (*ch == ')') { upper = chars; } else { upper = GetSubscriptValue(io); - ch = io.GetNextNonBlank(); + ch = io.GetNextNonBlank(byteCount); } } } if (ch && *ch == ')') { - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); if (lower && upper) { if (*lower > *upper) { // An empty substring, whatever the values are @@ -335,16 +339,17 @@ static bool HandleComponent(IoStatementState &io, Descriptor &desc, // Advance to the terminal '/' of a namelist group. 
static void SkipNamelistGroup(IoStatementState &io) { - while (auto ch{io.GetNextNonBlank()}) { - io.HandleRelativePosition(1); + std::size_t byteCount{0}; + while (auto ch{io.GetNextNonBlank(byteCount)}) { + io.HandleRelativePosition(byteCount); if (*ch == '/') { break; } else if (*ch == '\'' || *ch == '"') { // Skip quoted character literal char32_t quote{*ch}; while (true) { - if ((ch = io.GetCurrentChar())) { - io.HandleRelativePosition(1); + if ((ch = io.GetCurrentChar(byteCount))) { + io.HandleRelativePosition(byteCount); if (*ch == quote) { break; } @@ -369,14 +374,15 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { char name[nameBufferSize]; RUNTIME_CHECK(handler, group.groupName != nullptr); char32_t comma{GetComma(io)}; + std::size_t byteCount{0}; while (true) { - next = io.GetNextNonBlank(); + next = io.GetNextNonBlank(byteCount); while (next && *next != '&') { // Extension: comment lines without ! before namelist groups if (!io.AdvanceRecord()) { next.reset(); } else { - next = io.GetNextNonBlank(); + next = io.GetNextNonBlank(byteCount); } } if (!next || *next != '&') { @@ -384,7 +390,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { "NAMELIST input group does not begin with '&' (at '%lc')", *next); return false; } - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); if (!GetLowerCaseName(io, name, sizeof name)) { handler.SignalError("NAMELIST input group has no name"); return false; @@ -396,7 +402,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { } // Read the group's items while (true) { - next = io.GetNextNonBlank(); + next = io.GetNextNonBlank(byteCount); if (!next || *next == '/') { break; } @@ -423,7 +429,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { const Descriptor *useDescriptor{&itemDescriptor}; StaticDescriptor staticDesc[2]; int whichStaticDesc{0}; - next = io.GetCurrentChar(); + next = 
io.GetCurrentChar(byteCount); bool hadSubscripts{false}; bool hadSubstring{false}; if (next && (*next == '(' || *next == '%')) { @@ -456,25 +462,25 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { hadSubstring = false; } useDescriptor = &mutableDescriptor; - next = io.GetCurrentChar(); + next = io.GetCurrentChar(byteCount); } while (next && (*next == '(' || *next == '%')); } // Skip the '=' - next = io.GetNextNonBlank(); + next = io.GetNextNonBlank(byteCount); if (!next || *next != '=') { handler.SignalError("No '=' found after item '%s' in NAMELIST group '%s'", name, group.groupName); return false; } - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); // Read the values into the descriptor. An array can be short. listInput->ResetForNextNamelistItem(); if (!descr::DescriptorIO(io, *useDescriptor)) { return false; } - next = io.GetNextNonBlank(); + next = io.GetNextNonBlank(byteCount); if (next && *next == comma) { - io.HandleRelativePosition(1); + io.HandleRelativePosition(byteCount); } } if (!next || *next != '/') { @@ -490,13 +496,14 @@ bool IsNamelistName(IoStatementState &io) { if (io.get_if>()) { if (io.mutableModes().inNamelist) { SavedPosition savedPosition{io}; - if (auto ch{io.GetNextNonBlank()}) { + std::size_t byteCount{0}; + if (auto ch{io.GetNextNonBlank(byteCount)}) { if (IsLegalIdStart(*ch)) { do { - io.HandleRelativePosition(1); - ch = io.GetCurrentChar(); + io.HandleRelativePosition(byteCount); + ch = io.GetCurrentChar(byteCount); } while (ch && IsLegalIdChar(*ch)); - ch = io.GetNextNonBlank(); + ch = io.GetNextNonBlank(byteCount); // TODO: how to deal with NaN(...) ambiguity? 
return ch && (*ch == '=' || *ch == '(' || *ch == '%'); } diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp index 23e5b6292621b..2ba4faf23dc3f 100644 --- a/flang/runtime/unit.cpp +++ b/flang/runtime/unit.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "unit.h" -#include "environment.h" #include "io-error.h" #include "lock.h" #include "unit-map.h" @@ -233,7 +232,6 @@ UnitMap &ExternalFileUnit::GetUnitMap() { error.isUnformatted = false; errorOutput = &error; - // TODO: Set UTF-8 mode from the environment unitMap = newUnitMap; return *unitMap; } @@ -374,18 +372,6 @@ std::size_t ExternalFileUnit::GetNextInputBytes( return p ? length : 0; } -std::optional ExternalFileUnit::GetCurrentChar( - IoErrorHandler &handler) { - const char *p{nullptr}; - std::size_t bytes{GetNextInputBytes(p, handler)}; - if (bytes == 0) { - return std::nullopt; - } else { - // TODO: UTF-8 decoding; may have to get more bytes in a loop - return *p; - } -} - const char *ExternalFileUnit::FrameNextInput( IoErrorHandler &handler, std::size_t bytes) { RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted); diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h index 7be5e2f387f8d..6e1a5ffbac7d8 100644 --- a/flang/runtime/unit.h +++ b/flang/runtime/unit.h @@ -13,6 +13,7 @@ #include "buffer.h" #include "connection.h" +#include "environment.h" #include "file.h" #include "format.h" #include "io-error.h" @@ -34,7 +35,9 @@ class ExternalFileUnit : public ConnectionState, public OpenFile, public FileFrame { public: - explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {} + explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} { + isUTF8 = executionEnvironment.defaultUTF8; + } ~ExternalFileUnit() {} int unitNumber() const { return unitNumber_; } @@ -80,7 +83,6 @@ class ExternalFileUnit : public ConnectionState, const char *, std::size_t, std::size_t elementBytes, IoErrorHandler &); bool 
Receive(char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
   std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
-  std::optional<char32_t> GetCurrentChar(IoErrorHandler &);
   void SetLeftTabLimit();
   bool BeginReadingRecord(IoErrorHandler &);
   void FinishReadingRecord(IoErrorHandler &);
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
new file mode 100644
index 0000000000000..8f59ddbb19663
--- /dev/null
+++ b/flang/runtime/utf.cpp
@@ -0,0 +1,120 @@
+//===-- runtime/utf.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "utf.h"
+
+namespace Fortran::runtime {
+
+// Indexed by the first byte of an encoding: the total number of bytes in the
+// sequence that it begins, or zero when the byte cannot begin a sequence.
+// clang-format off
+const std::uint8_t UTF8FirstByteTable[256]{
+  /* 00 - 7F: 7 bit payload in single byte */
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  /* 80 - BF: invalid first byte, valid later byte */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  /* C0 - DF: 11 bit payload */
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  /* E0 - EF: 16 bit payload */
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
+  /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
+  /* FC - FD: 31 bit payload */ 6, 6,
+  /* FE: 32 bit payload */ 7,
+  /* FF: invalid */ 0
+};
+// clang-format on
+
+// Decodes the one UTF-8 sequence that begins at p0.  The result is empty
+// when the first byte cannot begin a sequence, when a later byte is outside
+// [80 .. BF], or when the decoded value needs more than 32 bits.
+// Non-minimal encodings are accepted.
+std::optional<char32_t> DecodeUTF8(const char *p0) {
+  const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
+  std::size_t bytes{MeasureUTF8Bytes(*p0)};
+  if (bytes == 1) {
+    return char32_t{*p};
+  } else if (bytes > 1) {
+    // The first byte holds its payload in its low-order (7 - bytes) bits.
+    std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
+    for (std::size_t j{1}; j < bytes; ++j) {
+      std::uint8_t next{p[j]};
+      if (next < 0x80 || next > 0xbf) {
+        return std::nullopt;
+      }
+      result = (result << 6) | (next & 0x3f);
+    }
+    if (result <= 0xffffffff) {
+      return static_cast<char32_t>(result);
+    }
+  }
+  return std::nullopt;
+}
+
+// Stores the UTF-8 encoding of ucs at p0 and returns its length in bytes
+// (1 to 7), always choosing the shortest form that can hold the value.
+std::size_t EncodeUTF8(char *p0, char32_t ucs) {
+  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
+  if (ucs <= 0x7f) {
+    p[0] = ucs;
+    return 1;
+  } else if (ucs <= 0x7ff) {
+    p[0] = 0xc0 | (ucs >> 6);
+    p[1] = 0x80 | (ucs & 0x3f);
+    return 2;
+  } else if (ucs <= 0xffff) {
+    p[0] = 0xe0 | (ucs >> 12);
+    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[2] = 0x80 | (ucs & 0x3f);
+    return 3;
+  } else if (ucs <= 0x1fffff) {
+    p[0] = 0xf0 | (ucs >> 18);
+    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[3] = 0x80 | (ucs & 0x3f);
+    return 4;
+  } else if (ucs <= 0x3ffffff) {
+    p[0] = 0xf8 | (ucs >> 24);
+    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[4] = 0x80 | (ucs & 0x3f);
+    return 5;
+  } else if (ucs <= 0x7fffffff) {
+    // Six bytes: first byte 1111110x holds bit 30, the rest hold 30 more.
+    p[0] = 0xfc | (ucs >> 30);
+    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[5] = 0x80 | (ucs & 0x3f);
+    return 6;
+  } else {
+    p[0] = 0xfe;
+    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[6] = 0x80 | (ucs & 0x3f);
+    return 7;
+  }
+}
+
+} // namespace
Fortran::runtime
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
new file mode 100644
index 0000000000000..6d9943bb6b8a2
--- /dev/null
+++ b/flang/runtime/utf.h
@@ -0,0 +1,68 @@
+//===-- runtime/utf.h -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UTF-8 is the variable-width standard encoding of Unicode (ISO 10646)
+// code points.
+//
+// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
+// 7-bit ASCII is also valid UTF-8.
+//
+// Larger values are encoded with a start byte in [C0 .. FE] that carries
+// the length of the encoding and some of the upper bits of the value, followed
+// by one or more bytes in the range [80 .. BF].
+//
+// Specifically, the first byte holds two or more uppermost set bits,
+// a zero bit, and some payload; the second and later bytes each start with
+// their uppermost bit set, the next bit clear, and six bits of payload.
+// Payload parcels are in big-endian order. All bytes must be present in a
+// valid sequence; i.e., low-order zero bits must be explicit. UTF-8 is
+// self-synchronizing on input because no byte value can be both a valid
+// first byte and a valid trailing byte.
+//
+//   0xxxxxxx - 7-bit ASCII
+//   110xxxxx 10xxxxxx - 11-bit value
+//   1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
+//   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
+//   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
+//   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
+//   11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
+//
+// Canonical UTF-8 sequences should be minimal, and our output is so, but
+// we do not reject non-minimal sequences on input.
Unicode only defines
+// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
+// standard maximum. However, we support extended forms up to 32 bits so that
+// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
+
+#ifndef FORTRAN_RUNTIME_UTF_H_
+#define FORTRAN_RUNTIME_UTF_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+
+namespace Fortran::runtime {
+
+// Derive the length of a UTF-8 character encoding from its first byte.
+// A zero result signifies an invalid encoding.
+extern const std::uint8_t UTF8FirstByteTable[256];
+static inline std::size_t MeasureUTF8Bytes(char first) {
+  return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
+}
+
+static constexpr std::size_t maxUTF8Bytes{7};
+
+// Ensure that all bytes are present in sequence in the input buffer
+// before calling; use MeasureUTF8Bytes(first byte) to count them.
+std::optional<char32_t> DecodeUTF8(const char *);
+
+// Ensure that at least maxUTF8Bytes remain in the output
+// buffer before calling.
+std::size_t EncodeUTF8(char *, char32_t);
+
+} // namespace Fortran::runtime
+#endif // FORTRAN_RUNTIME_UTF_H_
diff --git a/flang/unittests/Runtime/ExternalIOTest.cpp b/flang/unittests/Runtime/ExternalIOTest.cpp
index fe88144bcff99..d88a0e11d87d0 100644
--- a/flang/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang/unittests/Runtime/ExternalIOTest.cpp
@@ -553,6 +553,10 @@ TEST(ExternalIOTests, TestNonAvancingInput) {
         << "Input-item value after non advancing read " << j;
     j++;
   }
+  // CLOSE(UNIT=unit)
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for Close";
 }
 
 TEST(ExternalIOTests, TestWriteAfterNonAvancingInput) {
@@ -645,9 +649,12 @@ TEST(ExternalIOTests, TestWriteAfterNonAvancingInput) {
       << "InputAscii() ";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
       << "EndIoStatement() for Read ";
-
   ASSERT_EQ(resultRecord, expectedRecord)
       << "Record after non advancing read followed by write";
+  // CLOSE(UNIT=unit)
+  io =
IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for Close";
 }
 
 TEST(ExternalIOTests, TestWriteAfterEndfile) {
@@ -707,4 +714,191 @@ TEST(ExternalIOTests, TestWriteAfterEndfile) {
   ASSERT_FALSE(IONAME(InputInteger)(io, eof)) << "InputInteger(eof)";
   ASSERT_EQ(eof, -1) << "READ(eof)";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatEnd) << "EndIoStatement for READ";
+  // CLOSE(UNIT=unit)
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for Close";
+}
+
+// Round trip of default CHARACTER data through a unit opened with
+// ENCODING='UTF-8': bytes above 0x7F come back unchanged through that unit,
+// and appear as two-byte UTF-8 sequences once the file is reopened without
+// an ENCODING= specifier.
+TEST(ExternalIOTests, TestUTF8Encoding) {
+  // OPEN(FILE="utf8test",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='REPLACE',ENCODING='UTF-8')
+  auto *io{IONAME(BeginOpenNewUnit)(__FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "utf8test", 8)) << "SetFile(utf8test)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "REPLACE", 7)) << "SetStatus(REPLACE)";
+  ASSERT_TRUE(IONAME(SetEncoding)(io, "UTF-8", 5)) << "SetEncoding(UTF-8)";
+  int unit{-1};
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first OPEN";
+  char buffer[12];
+  std::memcpy(buffer,
+      "abc\x80\xff"
+      "de\0\0\0\0\0",
+      12);
+  // WRITE(unit, *) buffer
+  io = IONAME(BeginExternalListOutput)(unit, __FILE__, __LINE__);
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  desc.Establish(TypeCode{CFI_type_char}, 7, buffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for WRITE";
+  // REWIND(unit)
+  io = IONAME(BeginRewind)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement for REWIND";
+  // READ(unit, *) buffer
+  desc.Establish(TypeCode{CFI_type_char}, sizeof buffer, buffer, 0);
+  desc.Check();
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first READ";
+  ASSERT_EQ(std::memcmp(buffer,
+                "abc\x80\xff"
+                "de     ", // 7 bytes of data, blank-padded to 12
+                12),
+      0);
+  // CLOSE(UNIT=unit,STATUS='KEEP')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "KEEP", 4)) << "SetStatus(KEEP)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first CLOSE";
+  // OPEN(FILE="utf8test",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='OLD')
+  io = IONAME(BeginOpenNewUnit)(__FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "utf8test", 8)) << "SetFile(utf8test)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "OLD", 3)) << "SetStatus(OLD)";
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second OPEN";
+  // READ(unit, *) buffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second READ";
+  ASSERT_EQ(std::memcmp(buffer,
+                "abc\xc2\x80\xc3\xbf"
+                "de   ", // 9 raw bytes, blank-padded to 12
+                12),
+      0);
+  // CLOSE(UNIT=unit,STATUS='DELETE')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "DELETE", 6)) << "SetStatus(DELETE)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second CLOSE";
+}
+
+// The same round trip for char32_t data: U+0080 and U+FFFF are written
+// through a UTF-8 unit, read back as char32_t, and then read as the raw
+// bytes C2 80 and EF BF BF once the file is reopened without ENCODING=.
+TEST(ExternalIOTests, TestUCS) {
+  // OPEN(FILE="ucstest",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='REPLACE',ENCODING='UTF-8')
+  auto *io{IONAME(BeginOpenNewUnit)(__FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "ucstest", 7)) << "SetFile(ucstest)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "REPLACE", 7)) << "SetStatus(REPLACE)";
+  ASSERT_TRUE(IONAME(SetEncoding)(io, "UTF-8", 5)) << "SetEncoding(UTF-8)";
+  int unit{-1};
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first OPEN";
+  char32_t wbuffer[8]{U"abc\u0080\uffff"
+                      "de"};
+  // WRITE(unit, *) wbuffer
+  io = IONAME(BeginExternalListOutput)(unit, __FILE__, __LINE__);
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  desc.Establish(TypeCode{CFI_type_char32_t}, sizeof wbuffer - sizeof(char32_t),
+      wbuffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for WRITE";
+  // REWIND(unit)
+  io = IONAME(BeginRewind)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement for REWIND";
+  // READ(unit, *) wbuffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  desc.Establish(TypeCode{CFI_type_char32_t}, sizeof wbuffer, wbuffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first READ";
+  char dump[80];
+  dump[0] = '\0';
+  for (int j{0}; j < 8; ++j) {
+    std::size_t dumpLen{std::strlen(dump)};
+    std::snprintf(
+        dump + dumpLen, sizeof dump - dumpLen, " %x", (unsigned)wbuffer[j]);
+  }
+  EXPECT_EQ(wbuffer[0], U'a') << dump;
+  EXPECT_EQ(wbuffer[1], U'b') << dump;
+  EXPECT_EQ(wbuffer[2], U'c') << dump;
+  EXPECT_EQ(wbuffer[3], U'\u0080') << dump;
+  EXPECT_EQ(wbuffer[4], U'\uffff') << dump;
+  EXPECT_EQ(wbuffer[5], U'd') << dump;
+  EXPECT_EQ(wbuffer[6], U'e') << dump;
+  EXPECT_EQ(wbuffer[7], U' ') << dump;
+  // CLOSE(UNIT=unit,STATUS='KEEP')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "KEEP", 4)) << "SetStatus(KEEP)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first CLOSE";
+  // OPEN(FILE="ucstest",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='OLD')
+  io = IONAME(BeginOpenNewUnit)(__FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "ucstest", 7)) << "SetFile(ucstest)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "OLD", 3)) << "SetStatus(OLD)";
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second OPEN";
+  char buffer[12];
+  // READ(unit, *) buffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  desc.Establish(TypeCode{CFI_type_char}, sizeof buffer, buffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second READ";
+  dump[0] = '\0';
+  for (int j{0}; j < 12; ++j) {
+    std::size_t dumpLen{std::strlen(dump)};
+    std::snprintf(dump + dumpLen, sizeof dump - dumpLen, " %x",
+        (unsigned)(unsigned char)buffer[j]);
+  }
+  EXPECT_EQ(std::memcmp(buffer,
+                "abc\xc2\x80\xef\xbf\xbf"
+                "de  ", // 10 raw bytes, blank-padded to 12
+                12),
+      0)
+      << dump;
+  // CLOSE(UNIT=unit,STATUS='DELETE')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "DELETE", 6)) << "SetStatus(DELETE)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second CLOSE";
 }