From 2883820e5bd4a0ff62fad46c8e6c4721d4151f83 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Sun, 10 Mar 2019 21:40:27 -0700 Subject: [PATCH] Refactor Encoding to split fast-path and fallback logic (dotnet/coreclr#23098) This refactoring is limited to ASCIIEncoding at the moment, but it can easily be applied to UTF-8 / UTF-16 / UTF-32. High-level changes: - Fallback logic has been split from the fast-path, improving performance of GetBytes and similar routines. - All of the plumbing of when to invoke the fallback logic and how to manage leftover data has been moved into the base class. - Almost all of the logic except for the fast-path is now written in terms of verifiable code (Span and ReadOnlySpan). - Minor bug fixes in EncoderNLS.Convert (see https://github.com/dotnet/coreclr/issues/23020). Commit migrated from https://github.com/dotnet/coreclr/commit/43a5159d39bd52195c5095da4006183f791c696b --- src/coreclr/tests/CoreFX/CoreFX.issues.json | 22 + .../System.Private.CoreLib.Shared.projitems | 2 + .../src/System/String.cs | 4 +- .../src/System/Text/ASCIIEncoding.cs | 1128 +++++++-------- .../src/System/Text/ASCIIUtility.cs | 76 + .../src/System/Text/DecoderFallback.cs | 104 ++ .../src/System/Text/DecoderNLS.cs | 198 ++- .../src/System/Text/EncoderFallback.cs | 185 ++- .../src/System/Text/EncoderNLS.cs | 167 ++- .../src/System/Text/Encoding.Internal.cs | 1277 +++++++++++++++++ .../src/System/Text/Encoding.cs | 50 +- .../src/System/Text/EncodingNLS.cs | 2 + .../src/System/ThrowHelper.cs | 27 + 13 files changed, 2584 insertions(+), 658 deletions(-) create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/Encoding.Internal.cs diff --git a/src/coreclr/tests/CoreFX/CoreFX.issues.json b/src/coreclr/tests/CoreFX/CoreFX.issues.json index 4489b51ecc18a..3db01ef4e1b35 100644 --- a/src/coreclr/tests/CoreFX/CoreFX.issues.json +++ 
b/src/coreclr/tests/CoreFX/CoreFX.issues.json @@ -861,6 +861,28 @@ ] } }, + { + "name": "System.Text.Encoding.Tests", + "enabled": true, + "exclusions": { + "namespaces": null, + "classes": null, + "methods": [ + { + "name": "System.Text.Tests.EncoderConvert2.EncoderASCIIConvertMixedASCIIUnicodeCharArrayPartial", + "reason": "https://github.com/dotnet/coreclr/issues/23020" + }, + { + "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertMixedASCIIUnicodeCharArrayPartial", + "reason": "https://github.com/dotnet/coreclr/issues/23020" + }, + { + "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertUnicodeCharArrayPartial", + "reason": "https://github.com/dotnet/coreclr/issues/23020" + } + ] + } + }, { "name": "System.Text.RegularExpressions.Tests", "enabled": true, diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 71459faf790e8..ce0f1025874bd 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -761,6 +761,7 @@ + @@ -776,6 +777,7 @@ + diff --git a/src/libraries/System.Private.CoreLib/src/System/String.cs b/src/libraries/System.Private.CoreLib/src/System/String.cs index 22f830a0e4c9a..49afbc8c8c3d2 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.cs @@ -480,7 +480,7 @@ public static bool IsNullOrWhiteSpace(string value) Debug.Assert(byteLength >= 0); // Get our string length - int stringLength = encoding.GetCharCount(bytes, byteLength, null); + int stringLength = encoding.GetCharCount(bytes, byteLength); Debug.Assert(stringLength >= 0, "stringLength >= 0"); // They gave us an empty string if they needed one @@ -491,7 +491,7 @@ public static bool IsNullOrWhiteSpace(string value) string s = 
FastAllocateString(stringLength); fixed (char* pTempChars = &s._firstChar) { - int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength, null); + int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength); Debug.Assert(stringLength == doubleCheck, "Expected encoding.GetChars to return same length as encoding.GetCharCount"); } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs index 217d93467767e..8cf1f57ccb10d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs @@ -2,8 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; +using System.Buffers; using System.Diagnostics; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; namespace System.Text @@ -18,10 +19,30 @@ namespace System.Text // Note: IsAlwaysNormalized remains false because 1/2 the code points are unassigned, so they'd // use fallbacks, and we cannot guarantee that fallbacks are normalized. - public class ASCIIEncoding : Encoding + public partial class ASCIIEncoding : Encoding { - // Allow for devirtualization (see https://github.com/dotnet/coreclr/pull/9230) - internal sealed class ASCIIEncodingSealed : ASCIIEncoding { } + // This specialized sealed type has two benefits: + // 1) it allows for devirtualization (see https://github.com/dotnet/coreclr/pull/9230), and + // 2) it allows us to provide highly optimized implementations of certain routines because + // we can make assumptions about the fallback mechanisms in use (in particular, always + // replace with "?"). 
+ // + // (We don't take advantage of #2 yet, but we can do so in the future because the implementation + // of cloning below allows us to make assumptions about the behaviors of the sealed type.) + internal sealed class ASCIIEncodingSealed : ASCIIEncoding + { + public override object Clone() + { + // The base implementation of Encoding.Clone calls object.MemberwiseClone and marks the new object mutable. + // We don't want to do this because it violates the invariants we have set for the sealed type. + // Instead, we'll create a new instance of the base ASCIIEncoding type and mark it mutable. + + return new ASCIIEncoding() + { + IsReadOnly = false + }; + } + } // Used by Encoding.ASCII for lazy initialization // The initialization code will not be run until a static member of the class is referenced @@ -58,22 +79,26 @@ internal sealed override void SetDefaultFallbacks() public override unsafe int GetByteCount(char[] chars, int index, int count) { // Validate input parameters - if (chars == null) - throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array); - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array); + } - if (chars.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer); + if ((index | count) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? 
ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input, return 0, avoid fixed empty array problem - if (count == 0) - return 0; + if (chars.Length - index < count) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call the pointer version fixed (char* pChars = chars) - return GetByteCount(pChars + index, count, null); + { + return GetByteCountCommon(pChars + index, count); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -83,12 +108,17 @@ public override unsafe int GetByteCount(char[] chars, int index, int count) public override unsafe int GetByteCount(string chars) { - // Validate input - if (chars==null) - throw new ArgumentNullException(nameof(chars)); + // Validate input parameters + + if (chars is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars); + } fixed (char* pChars = chars) - return GetByteCount(pChars, chars.Length, null); + { + return GetByteCountCommon(pChars, chars.Length); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -99,22 +129,81 @@ public override unsafe int GetByteCount(string chars) public override unsafe int GetByteCount(char* chars, int count) { // Validate Parameters + if (chars == null) - throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array); + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars); + } if (count < 0) - throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // Call it with empty encoder - return GetByteCount(chars, count, null); + return GetByteCountCommon(chars, count); } public override unsafe int 
GetByteCount(ReadOnlySpan chars) { - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) + // It's ok for us to pass null pointers down to the workhorse below. + + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + { + return GetByteCountCommon(charsPtr, chars.Length); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetByteCountCommon(char* pChars, int charCount) + { + // Common helper method for all non-EncoderNLS entry points to GetByteCount. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. + + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + + // First call into the fast path. + + int totalByteCount = GetByteCountFast(pChars, charCount, EncoderFallback, out int charsConsumed); + + if (charsConsumed != charCount) + { + // If there's still data remaining in the source buffer, go down the fallback path. + // We need to check for integer overflow since the fallback could change the required + // output count in unexpected ways. + + totalByteCount += GetByteCountWithFallback(pChars, charCount, charsConsumed); + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + } + + return totalByteCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon + private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback fallback, out int charsConsumed) + { + // First: Can we short-circuit the entire calculation? + // If an EncoderReplacementFallback is in use, all non-ASCII chars + // (including surrogate halves) are replaced with the default string. + // If the default string consists of a single ASCII value, then we + // know there's a 1:1 char->byte transcoding in all cases. 
+ + int byteCount = charsLength; + + if (!(fallback is EncoderReplacementFallback replacementFallback + && replacementFallback.MaxCharCount == 1 + && replacementFallback.DefaultString[0] <= 0x7F)) { - return GetByteCount(charsPtr, chars.Length, encoder: null); + // Unrecognized fallback mechanism - count chars manually. + + byteCount = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pChars, (uint)charsLength); } + + charsConsumed = byteCount; + return byteCount; } // Parent method is safe. @@ -125,22 +214,37 @@ public override unsafe int GetByteCount(ReadOnlySpan chars) public override unsafe int GetBytes(string chars, int charIndex, int charCount, byte[] bytes, int byteIndex) { - if (chars == null || bytes == null) - throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array); + // Validate Parameters - if (charIndex < 0 || charCount < 0) - throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars is null || bytes is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - if (chars.Length - charIndex < charCount) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCount); + if ((charIndex | charCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charIndex < 0) ? 
ExceptionArgument.charIndex : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - if (byteIndex < 0 || byteIndex > bytes.Length) - throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index); + if (chars.Length - charIndex < charCount) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount); + } - int byteCount = bytes.Length - byteIndex; + if ((uint)byteIndex > bytes.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (char* pChars = chars) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span)bytes)) - return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + fixed (char* pChars = chars) + fixed (byte* pBytes = bytes) + { + return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex); + } } // Encodes a range of characters in a character array into a range of bytes @@ -161,28 +265,36 @@ public override unsafe int GetByteCount(ReadOnlySpan chars) byte[] bytes, int byteIndex) { // Validate parameters - if (chars == null || bytes == null) - throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array); - - if (charIndex < 0 || charCount < 0) - throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - if (chars.Length - charIndex < charCount) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer); + if (chars is null || bytes is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? 
ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - if (byteIndex < 0 || byteIndex > bytes.Length) - throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index); + if ((charIndex | charCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If nothing to encode return 0 - if (charCount == 0) - return 0; + if (chars.Length - charIndex < charCount) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount); + } - // Just call pointer version - int byteCount = bytes.Length - byteIndex; + if ((uint)byteIndex > bytes.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (char* pChars = chars) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span)bytes)) - // Remember that byteCount is # to decode, not size of array. - return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + fixed (char* pChars = chars) + fixed (byte* pBytes = bytes) + { + return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -193,21 +305,123 @@ public override unsafe int GetByteCount(ReadOnlySpan chars) public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - if (charCount < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((charCount < 0 ? 
nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars == null || bytes == null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - return GetBytes(chars, charCount, bytes, byteCount, null); + if ((charCount | byteCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charCount < 0) ? ExceptionArgument.charCount : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } + + return GetBytesCommon(chars, charCount, bytes, byteCount); } public override unsafe int GetBytes(ReadOnlySpan chars, Span bytes) { - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) + // It's ok for us to operate on null / empty spans. + + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) { - return GetBytes(charsPtr, chars.Length, bytesPtr, bytes.Length, encoder: null); + return GetBytesCommon(charsPtr, chars.Length, bytesPtr, bytes.Length); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount) + { + // Common helper method for all non-EncoderNLS entry points to GetBytes. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. 
+ + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + + // First call into the fast path. + + int bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out int charsConsumed); + + if (charsConsumed == charCount) + { + // All elements converted - return immediately. + + return bytesWritten; + } + else + { + // Simple narrowing conversion couldn't operate on entire buffer - invoke fallback. + + return GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon + private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed) + { + int bytesWritten = (int)ASCIIUtility.NarrowUtf16ToAscii(pChars, pBytes, (uint)Math.Min(charsLength, bytesLength)); + + charsConsumed = bytesWritten; + return bytesWritten; + } + + private protected sealed override unsafe int GetBytesWithFallback(ReadOnlySpan chars, int originalCharsLength, Span bytes, int originalBytesLength, EncoderNLS encoder) + { + // We special-case EncoderReplacementFallback if it's telling us to write a single ASCII char, + // since we believe this to be relatively common and we can handle it more efficiently than + // the base implementation. + + if (((encoder is null) ? 
this.EncoderFallback : encoder.Fallback) is EncoderReplacementFallback replacementFallback + && replacementFallback.MaxCharCount == 1 + && replacementFallback.DefaultString[0] <= 0x7F) + { + byte replacementByte = (byte)replacementFallback.DefaultString[0]; + + int numElementsToConvert = Math.Min(chars.Length, bytes.Length); + int idx = 0; + + fixed (char* pChars = &MemoryMarshal.GetReference(chars)) + fixed (byte* pBytes = &MemoryMarshal.GetReference(bytes)) + { + // In a loop, replace the non-convertible data, then bulk-convert as much as we can. + + while (idx < numElementsToConvert) + { + pBytes[idx++] = replacementByte; + + if (idx < numElementsToConvert) + { + idx += (int)ASCIIUtility.NarrowUtf16ToAscii(&pChars[idx], &pBytes[idx], (uint)(numElementsToConvert - idx)); + } + + Debug.Assert(idx <= numElementsToConvert, "Somehow went beyond bounds of source or destination buffer?"); + } + } + + // Slice off how much we consumed / wrote. + + chars = chars.Slice(numElementsToConvert); + bytes = bytes.Slice(numElementsToConvert); + } + + // If we couldn't go through our fast fallback mechanism, or if we still have leftover + // data because we couldn't consume everything in the loop above, we need to go down the + // slow fallback path. + + if (chars.IsEmpty) + { + return originalBytesLength - bytes.Length; // total number of bytes written + } + else + { + return base.GetBytesWithFallback(chars, originalCharsLength, bytes, originalBytesLength, encoder); } } @@ -222,22 +436,26 @@ public override unsafe int GetBytes(ReadOnlySpan chars, Span bytes) public override unsafe int GetCharCount(byte[] bytes, int index, int count) { // Validate Parameters - if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? 
nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (bytes is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); + } - if (bytes.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + if ((index | count) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input just return 0, fixed doesn't like 0 length arrays - if (count == 0) - return 0; + if (bytes.Length - index < count) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call pointer version fixed (byte* pBytes = bytes) - return GetCharCount(pBytes + index, count, null); + { + return GetCharCountCommon(pBytes + index, count); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -248,673 +466,367 @@ public override unsafe int GetCharCount(byte[] bytes, int index, int count) public override unsafe int GetCharCount(byte* bytes, int count) { // Validate Parameters + if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); + } if (count < 0) - throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - return GetCharCount(bytes, count, null); + return GetCharCountCommon(bytes, count); } public override unsafe int GetCharCount(ReadOnlySpan bytes) { - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) + // It's ok for us to pass null pointers down to the 
workhorse routine. + + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) { - return GetCharCount(bytesPtr, bytes.Length, decoder: null); + return GetCharCountCommon(bytesPtr, bytes.Length); } } - // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) - // So if you fix this, fix the others. Currently those include: - // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding - // parent method is safe - - public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount, - char[] chars, int charIndex) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetCharCountCommon(byte* pBytes, int byteCount) { - // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); + // Common helper method for all non-DecoderNLS entry points to GetCharCount. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. - if (byteIndex < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); - if ( bytes.Length - byteIndex < byteCount) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + // First call into the fast path. 
- if (charIndex < 0 || charIndex > chars.Length) - throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index); + int totalCharCount = GetCharCountFast(pBytes, byteCount, DecoderFallback, out int bytesConsumed); - // If no input, return 0 & avoid fixed problem - if (byteCount == 0) - return 0; + if (bytesConsumed != byteCount) + { + // If there's still data remaining in the source buffer, go down the fallback path. + // We need to check for integer overflow since the fallback could change the required + // output count in unexpected ways. - // Just call pointer version - int charCount = chars.Length - charIndex; + totalCharCount += GetCharCountWithFallback(pBytes, byteCount, bytesConsumed); + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + } - fixed (byte* pBytes = bytes) fixed (char* pChars = &MemoryMarshal.GetReference((Span)chars)) - // Remember that charCount is # to decode, not size of array - return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null); + return totalCharCount; } - // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) - // So if you fix this, fix the others. Currently those include: - // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding - - [CLSCompliant(false)] - public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon + private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback fallback, out int bytesConsumed) { - // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); + // First: Can we short-circuit the entire calculation? 
+ // If a DecoderReplacementFallback is in use, all non-ASCII bytes are replaced with + // the default string. If the default string consists of a single BMP value, then we + // know there's a 1:1 byte->char transcoding in all cases. - if (charCount < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + int charCount = bytesLength; - return GetChars(bytes, byteCount, chars, charCount, null); - } - - public override unsafe int GetChars(ReadOnlySpan bytes, Span chars) - { - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) + if (!(fallback is DecoderReplacementFallback replacementFallback) || replacementFallback.MaxCharCount != 1) { - return GetChars(bytesPtr, bytes.Length, charsPtr, chars.Length, decoder: null); + // Unrecognized fallback mechanism - count bytes manually. + + charCount = (int)ASCIIUtility.GetIndexOfFirstNonAsciiByte(pBytes, (uint)bytesLength); } + + bytesConsumed = charCount; + return charCount; } - // Returns a string containing the decoded representation of a range of - // bytes in a byte array. - // // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) // So if you fix this, fix the others. Currently those include: // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding // parent method is safe - public override unsafe string GetString(byte[] bytes, int byteIndex, int byteCount) + public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount, + char[] chars, int charIndex) { // Validate Parameters - if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); - if (byteIndex < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((byteIndex < 0 ? 
nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (bytes is null || chars is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars, + resource: ExceptionResource.ArgumentNull_Array); + } + if ((byteIndex | byteCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteIndex < 0) ? ExceptionArgument.byteIndex : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } if (bytes.Length - byteIndex < byteCount) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Avoid problems with empty input buffer - if (byteCount == 0) return string.Empty; + if ((uint)charIndex > (uint)chars.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index); + } fixed (byte* pBytes = bytes) - return string.CreateStringFromEncoding( - pBytes + byteIndex, byteCount, this); + fixed (char* pChars = chars) + { + return GetCharsCommon(pBytes + byteIndex, byteCount, pChars + charIndex, chars.Length - charIndex); + } } - // - // End of standard methods copied from EncodingNLS.cs - // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding - // GetByteCount - // Note: We start by assuming that the output will be the same as count. 
Having - // an encoder or fallback may change that assumption - internal sealed override unsafe int GetByteCount(char* chars, int charCount, EncoderNLS encoder) + [CLSCompliant(false)] + public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(charCount >= 0, "[ASCIIEncoding.GetByteCount]count is negative"); - Debug.Assert(chars != null, "[ASCIIEncoding.GetByteCount]chars is null"); - - // Assert because we shouldn't be able to have a null encoder. - Debug.Assert(encoderFallback != null, "[ASCIIEncoding.GetByteCount]Attempting to use null fallback encoder"); - - char charLeftOver = (char)0; - EncoderReplacementFallback fallback = null; - - // Start by assuming default count, then +/- for fallback characters - char* charEnd = chars + charCount; - - // For fallback we may need a fallback buffer, we know we aren't default fallback. - EncoderFallbackBuffer fallbackBuffer = null; - char* charsForFallback; - - if (encoder != null) - { - charLeftOver = encoder._charLeftOver; - Debug.Assert(charLeftOver == 0 || char.IsHighSurrogate(charLeftOver), - "[ASCIIEncoding.GetByteCount]leftover character should be high surrogate"); - - fallback = encoder.Fallback as EncoderReplacementFallback; - - // We mustn't have left over fallback data when counting - if (encoder.InternalHasFallbackBuffer) - { - // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary - fallbackBuffer = encoder.FallbackBuffer; - if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow) - throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); - - // Set our internal fallback interesting things. 
- fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false); - } + // Validate Parameters - // Verify that we have no fallbackbuffer, for ASCII its always empty, so just assert - Debug.Assert(!encoder._throwOnOverflow || !encoder.InternalHasFallbackBuffer || - encoder.FallbackBuffer.Remaining == 0, - "[ASCIICodePageEncoding.GetByteCount]Expected empty fallback buffer"); - } - else + if (bytes is null || chars is null) { - fallback = this.EncoderFallback as EncoderReplacementFallback; + ThrowHelper.ThrowArgumentNullException( + argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars, + resource: ExceptionResource.ArgumentNull_Array); } - // If we have an encoder AND we aren't using default fallback, - // then we may have a complicated count. - if (fallback != null && fallback.MaxCharCount == 1) + if ((byteCount | charCount) < 0) { - // Replacement fallback encodes surrogate pairs as two ?? (or two whatever), so return size is always - // same as input size. - // Note that no existing SBCS code pages map code points to supplimentary characters, so this is easy. - - // We could however have 1 extra byte if the last call had an encoder and a funky fallback and - // if we don't use the funky fallback this time. - - // Do we have an extra char left over from last time? - if (charLeftOver > 0) - charCount++; - - return (charCount); + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteCount < 0) ? ExceptionArgument.byteCount : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); } - // Count is more complicated if you have a funky fallback - // For fallback we may need a fallback buffer, we know we're not default fallback - int byteCount = 0; - - // We may have a left over character from last time, try and process it. 
- if (charLeftOver > 0) - { - Debug.Assert(char.IsHighSurrogate(charLeftOver), "[ASCIIEncoding.GetByteCount]leftover character should be high surrogate"); - Debug.Assert(encoder != null, "[ASCIIEncoding.GetByteCount]Expected encoder"); - - // Since left over char was a surrogate, it'll have to be fallen back. - // Get Fallback - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false); - - // This will fallback a pair if *chars is a low surrogate - charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered - fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); - chars = charsForFallback; - } + return GetCharsCommon(bytes, byteCount, chars, charCount); + } - // Now we may have fallback char[] already from the encoder + public override unsafe int GetChars(ReadOnlySpan bytes, Span chars) + { + // It's ok for us to pass null pointers down to the workhorse below. - // Go ahead and do it, including the fallback. - char ch; - while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || - chars < charEnd) + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) { - // First unwind any fallback - if (ch == 0) - { - // No fallback, just get next char - ch = *chars; - chars++; - } - - // Check for fallback, this'll catch surrogate pairs too. - // no chars >= 0x80 are allowed. 
- if (ch > 0x7f) - { - if (fallbackBuffer == null) - { - // Initialize the buffer - if (encoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, false); - } - - // Get Fallback - charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered - fallbackBuffer.InternalFallback(ch, ref charsForFallback); - chars = charsForFallback; - continue; - } - - // We'll use this one - byteCount++; + return GetCharsCommon(bytesPtr, bytes.Length, charsPtr, chars.Length); } - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[ASCIIEncoding.GetByteCount]Expected Empty fallback buffer"); - - return byteCount; } - internal sealed override unsafe int GetBytes( - char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS encoder) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount) { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[ASCIIEncoding.GetBytes]bytes is null"); - Debug.Assert(byteCount >= 0, "[ASCIIEncoding.GetBytes]byteCount is negative"); - Debug.Assert(chars != null, "[ASCIIEncoding.GetBytes]chars is null"); - Debug.Assert(charCount >= 0, "[ASCIIEncoding.GetBytes]charCount is negative"); + // Common helper method for all non-DecoderNLS entry points to GetChars. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. - // Assert because we shouldn't be able to have a null encoder. 
- Debug.Assert(encoderFallback != null, "[ASCIIEncoding.GetBytes]Attempting to use null encoder fallback"); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); - // Get any left over characters - char charLeftOver = (char)0; - EncoderReplacementFallback fallback = null; + // First call into the fast path. - // For fallback we may need a fallback buffer, we know we aren't default fallback. - EncoderFallbackBuffer fallbackBuffer = null; - char* charsForFallback; + int charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out int bytesConsumed); - // prepare our end - char* charEnd = chars + charCount; - byte* byteStart = bytes; - char* charStart = chars; - - if (encoder != null) + if (bytesConsumed == byteCount) { - charLeftOver = encoder._charLeftOver; - fallback = encoder.Fallback as EncoderReplacementFallback; - - // We mustn't have left over fallback data when counting - if (encoder.InternalHasFallbackBuffer) - { - // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary - fallbackBuffer = encoder.FallbackBuffer; - if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow) - throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); - } + // All elements converted - return immediately. 
- Debug.Assert(charLeftOver == 0 || char.IsHighSurrogate(charLeftOver), - "[ASCIIEncoding.GetBytes]leftover character should be high surrogate"); - - // Verify that we have no fallbackbuffer, for ASCII its always empty, so just assert - Debug.Assert(!encoder._throwOnOverflow || !encoder.InternalHasFallbackBuffer || - encoder.FallbackBuffer.Remaining == 0, - "[ASCIICodePageEncoding.GetBytes]Expected empty fallback buffer"); + return charsWritten; } else { - fallback = this.EncoderFallback as EncoderReplacementFallback; + // Simple narrowing conversion couldn't operate on entire buffer - invoke fallback. + + return GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten); } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon + private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed) + { + int charsWritten = (int)ASCIIUtility.WidenAsciiToUtf16(pBytes, pChars, (uint)Math.Min(bytesLength, charsLength)); + + bytesConsumed = charsWritten; + return charsWritten; + } - // See if we do the fast default or slightly slower fallback - if (fallback != null && fallback.MaxCharCount == 1) + private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan bytes, int originalBytesLength, Span chars, int originalCharsLength, DecoderNLS decoder) + { + // We special-case DecoderReplacementFallback if it's telling us to write a single BMP char, + // since we believe this to be relatively common and we can handle it more efficiently than + // the base implementation. + + if (((decoder is null) ? 
this.DecoderFallback: decoder.Fallback) is DecoderReplacementFallback replacementFallback + && replacementFallback.MaxCharCount == 1) { - // Fast version - char cReplacement = fallback.DefaultString[0]; + char replacementChar = replacementFallback.DefaultString[0]; + + int numElementsToConvert = Math.Min( bytes.Length, chars.Length); + int idx = 0; - // Check for replacements in range, otherwise fall back to slow version. - if (cReplacement <= (char)0x7f) + fixed (byte* pBytes = &MemoryMarshal.GetReference(bytes)) + fixed (char* pChars = &MemoryMarshal.GetReference(chars)) { - // We should have exactly as many output bytes as input bytes, unless there's a left - // over character, in which case we may need one more. - // If we had a left over character will have to add a ? (This happens if they had a funky - // fallback last time, but not this time.) (We can't spit any out though - // because with fallback encoder each surrogate is treated as a seperate code point) - if (charLeftOver > 0) - { - // Have to have room - // Throw even if doing no throw version because this is just 1 char, - // so buffer will never be big enough - if (byteCount == 0) - ThrowBytesOverflow(encoder, true); - - // This'll make sure we still have more room and also make sure our return value is correct. - *(bytes++) = (byte)cReplacement; - byteCount--; // We used one of the ones we were counting. - } + // In a loop, replace the non-convertible data, then bulk-convert as much as we can. - // This keeps us from overrunning our output buffer - if (byteCount < charCount) + while (idx < numElementsToConvert) { - // Throw or make buffer smaller? 
- ThrowBytesOverflow(encoder, byteCount < 1); + pChars[idx++] = replacementChar; - // Just use what we can - charEnd = chars + byteCount; - } - - // We just do a quick copy - while (chars < charEnd) - { - char ch2 = *(chars++); - if (ch2 >= 0x0080) *(bytes++) = (byte)cReplacement; - else *(bytes++) = unchecked((byte)(ch2)); - } + if (idx < numElementsToConvert) + { + idx += (int)ASCIIUtility.WidenAsciiToUtf16(&pBytes[idx], &pChars[idx], (uint)(numElementsToConvert - idx)); + } - // Clear encoder - if (encoder != null) - { - encoder._charLeftOver = (char)0; - encoder._charsUsed = (int)(chars - charStart); + Debug.Assert(idx <= numElementsToConvert, "Somehow went beyond bounds of source or destination buffer?"); } - - return (int)(bytes - byteStart); } - } - - // Slower version, have to do real fallback. - // prepare our end - byte* byteEnd = bytes + byteCount; + // Slice off how much we consumed / wrote. - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - // Initialize the buffer - Debug.Assert(encoder != null, - "[ASCIIEncoding.GetBytes]Expected non null encoder if we have surrogate left over"); - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(chars, charEnd, encoder, true); - - // Since left over char was a surrogate, it'll have to be fallen back. - // Get Fallback - // This will fallback a pair if *chars is a low surrogate - charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered - fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); - chars = charsForFallback; + bytes = bytes.Slice(numElementsToConvert); + chars = chars.Slice(numElementsToConvert); } - // Now we may have fallback char[] already from the encoder + // If we couldn't go through our fast fallback mechanism, or if we still have leftover + // data because we couldn't consume everything in the loop above, we need to go down the + // slow fallback path. 
- // Go ahead and do it, including the fallback. - char ch; - while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || - chars < charEnd) + if (bytes.IsEmpty) { - // First unwind any fallback - if (ch == 0) - { - // No fallback, just get next char - ch = *chars; - chars++; - } - - // Check for fallback, this'll catch surrogate pairs too. - // All characters >= 0x80 must fall back. - if (ch > 0x7f) - { - // Initialize the buffer - if (fallbackBuffer == null) - { - if (encoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, true); - } - - // Get Fallback - charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered - fallbackBuffer.InternalFallback(ch, ref charsForFallback); - chars = charsForFallback; - - // Go ahead & continue (& do the fallback) - continue; - } - - // We'll use this one - // Bounds check - if (bytes >= byteEnd) - { - // didn't use this char, we'll throw or use buffer - if (fallbackBuffer == null || fallbackBuffer.bFallingBack == false) - { - Debug.Assert(chars > charStart || bytes == byteStart, - "[ASCIIEncoding.GetBytes]Expected chars to have advanced already."); - chars--; // don't use last char - } - else - fallbackBuffer.MovePrevious(); - - // Are we throwing or using buffer? - ThrowBytesOverflow(encoder, bytes == byteStart); // throw? 
- break; // don't throw, stop - } - - // Go ahead and add it - *bytes = unchecked((byte)ch); - bytes++; + return originalCharsLength - chars.Length; // total number of chars written } - - // Need to do encoder stuff - if (encoder != null) + else { - // Fallback stuck it in encoder if necessary, but we have to clear MustFlush cases - if (fallbackBuffer != null && !fallbackBuffer.bUsedEncoder) - // Clear it in case of MustFlush - encoder._charLeftOver = (char)0; - - // Set our chars used count - encoder._charsUsed = (int)(chars - charStart); + return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder); } - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 || - (encoder != null && !encoder._throwOnOverflow), - "[ASCIIEncoding.GetBytes]Expected Empty fallback buffer at end"); - - return (int)(bytes - byteStart); } - // This is internal and called by something else, - internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS decoder) - { - // Just assert, we're called internally so these should be safe, checked already - Debug.Assert(bytes != null, "[ASCIIEncoding.GetCharCount]bytes is null"); - Debug.Assert(count >= 0, "[ASCIIEncoding.GetCharCount]byteCount is negative"); + // Returns a string containing the decoded representation of a range of + // bytes in a byte array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe - // ASCII doesn't do best fit, so don't have to check for it, find out which decoder fallback we're using - DecoderReplacementFallback fallback = null; + public override unsafe string GetString(byte[] bytes, int byteIndex, int byteCount) + { + // Validate Parameters - if (decoder == null) - fallback = this.DecoderFallback as DecoderReplacementFallback; - else + if (bytes is null) { - fallback = decoder.Fallback as DecoderReplacementFallback; - Debug.Assert(!decoder._throwOnOverflow || !decoder.InternalHasFallbackBuffer || - decoder.FallbackBuffer.Remaining == 0, - "[ASCIICodePageEncoding.GetCharCount]Expected empty fallback buffer"); + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); } - if (fallback != null && fallback.MaxCharCount == 1) + if ((byteIndex | byteCount) < 0) { - // Just return length, SBCS stay the same length because they don't map to surrogate - // pairs and we don't have a decoder fallback. - - return count; + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteIndex < 0) ? ExceptionArgument.byteIndex : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); } - // Only need decoder fallback buffer if not using default replacement fallback, no best fit for ASCII - DecoderFallbackBuffer fallbackBuffer = null; - - // Have to do it the hard way. 
- // Assume charCount will be == count - int charCount = count; - byte[] byteBuffer = new byte[1]; - - // Do it our fast way - byte* byteEnd = bytes + count; - - // Quick loop - while (bytes < byteEnd) + if (bytes.Length - byteIndex < byteCount) { - // Faster if don't use *bytes++; - byte b = *bytes; - bytes++; - - // If unknown we have to do fallback count - if (b >= 0x80) - { - if (fallbackBuffer == null) - { - if (decoder == null) - fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - count, null); - } - - // Use fallback buffer - byteBuffer[0] = b; - charCount--; // Have to unreserve the one we already allocated for b - charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); - } + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); } - // Fallback buffer must be empty - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[ASCIIEncoding.GetCharCount]Expected Empty fallback buffer"); + // Avoid problems with empty input buffer + if (byteCount == 0) + return string.Empty; - // Converted sequence is same length as input - return charCount; + fixed (byte* pBytes = bytes) + { + return string.CreateStringFromEncoding(pBytes + byteIndex, byteCount, this); + } } - internal sealed override unsafe int GetChars( - byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS decoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[ASCIIEncoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[ASCIIEncoding.GetChars]byteCount is negative"); - Debug.Assert(chars != null, "[ASCIIEncoding.GetChars]chars is null"); - Debug.Assert(charCount >= 0, "[ASCIIEncoding.GetChars]charCount is negative"); - - // Do it fast way if using ? 
replacement fallback - byte* byteEnd = bytes + byteCount; - byte* byteStart = bytes; - char* charStart = chars; + // + // End of standard methods copied from EncodingNLS.cs + // - // Note: ASCII doesn't do best fit, but we have to fallback if they use something > 0x7f - // Only need decoder fallback buffer if not using ? fallback. - // ASCII doesn't do best fit, so don't have to check for it, find out which decoder fallback we're using - DecoderReplacementFallback fallback = null; - char* charsForFallback; + // + // Beginning of methods used by shared fallback logic. + // - if (decoder == null) - fallback = this.DecoderFallback as DecoderReplacementFallback; + internal sealed override bool TryGetByteCount(Rune value, out int byteCount) + { + if (value.IsAscii) + { + byteCount = 1; + return true; + } else { - fallback = decoder.Fallback as DecoderReplacementFallback; - Debug.Assert(!decoder._throwOnOverflow || !decoder.InternalHasFallbackBuffer || - decoder.FallbackBuffer.Remaining == 0, - "[ASCIICodePageEncoding.GetChars]Expected empty fallback buffer"); + byteCount = default; + return false; } + } - if (fallback != null && fallback.MaxCharCount == 1) + internal sealed override OperationStatus EncodeRune(Rune value, Span bytes, out int bytesWritten) + { + if (value.IsAscii) { - // Try it the fast way - char replacementChar = fallback.DefaultString[0]; - - // Need byteCount chars, otherwise too small buffer - if (charCount < byteCount) + if (!bytes.IsEmpty) { - // Need at least 1 output byte, throw if must throw - ThrowCharsOverflow(decoder, charCount < 1); - - // Not throwing, use what we can - byteEnd = bytes + charCount; + bytes[0] = (byte)value.Value; + bytesWritten = 1; + return OperationStatus.Done; } - - // Quick loop, just do '?' replacement because we don't have fallbacks for decodings. - while (bytes < byteEnd) + else { - byte b = *(bytes++); - if (b >= 0x80) - // This is an invalid byte in the ASCII encoding. 
- *(chars++) = replacementChar; - else - *(chars++) = unchecked((char)b); + bytesWritten = 0; + return OperationStatus.DestinationTooSmall; } - - // bytes & chars used are the same - if (decoder != null) - decoder._bytesUsed = (int)(bytes - byteStart); - return (int)(chars - charStart); } - - // Slower way's going to need a fallback buffer - DecoderFallbackBuffer fallbackBuffer = null; - byte[] byteBuffer = new byte[1]; - char* charEnd = chars + charCount; - - // Not quite so fast loop - while (bytes < byteEnd) + else { - // Faster if don't use *bytes++; - byte b = *(bytes); - bytes++; + bytesWritten = 0; + return OperationStatus.InvalidData; + } + } - if (b >= 0x80) + internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan bytes, out Rune value, out int bytesConsumed) + { + if (!bytes.IsEmpty) + { + byte b = bytes[0]; + if (b <= 0x7F) { - // This is an invalid byte in the ASCII encoding. - if (fallbackBuffer == null) - { - if (decoder == null) - fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - byteCount, charEnd); - } - - // Use fallback buffer - byteBuffer[0] = b; - - // Note that chars won't get updated unless this succeeds - charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered - bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback); - chars = charsForFallback; + // ASCII byte - if (!fallbackResult) - { - // May or may not throw, but we didn't get this byte - Debug.Assert(bytes > byteStart || chars == charStart, - "[ASCIIEncoding.GetChars]Expected bytes to have advanced already (fallback case)"); - bytes--; // unused byte - fallbackBuffer.InternalReset(); // Didn't fall this back - ThrowCharsOverflow(decoder, chars == charStart); // throw? 
- break; // don't throw, but stop loop - } + value = new Rune(b); + bytesConsumed = 1; + return OperationStatus.Done; } else { - // Make sure we have buffer space - if (chars >= charEnd) - { - Debug.Assert(bytes > byteStart || chars == charStart, - "[ASCIIEncoding.GetChars]Expected bytes to have advanced already (normal case)"); - bytes--; // unused byte - ThrowCharsOverflow(decoder, chars == charStart); // throw? - break; // don't throw, but stop loop - } + // Non-ASCII byte - *(chars) = unchecked((char)b); - chars++; + value = Rune.ReplacementChar; + bytesConsumed = 1; + return OperationStatus.InvalidData; } } + else + { + // No data to decode - // Might have had decoder fallback stuff. - if (decoder != null) - decoder._bytesUsed = (int)(bytes - byteStart); - - // Expect Empty fallback buffer for GetChars - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[ASCIIEncoding.GetChars]Expected Empty fallback buffer"); - - return (int)(chars - charStart); + value = Rune.ReplacementChar; + bytesConsumed = 0; + return OperationStatus.NeedMoreData; + } } + // + // End of methods used by shared fallback logic. + // public override int GetMaxByteCount(int charCount) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs new file mode 100644 index 0000000000000..5bc80c35f5b1d --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -0,0 +1,76 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; + +namespace System.Text +{ + /* + * Contains naive unoptimized (non-SIMD) implementations of ASCII transcoding + * operations. Vectorized methods can be substituted here as a drop-in replacement. 
+ */ + + internal unsafe static class ASCIIUtility + { + [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks + public static uint GetIndexOfFirstNonAsciiByte(byte* pBytes, uint byteCount) + { + uint idx = 0; + for (; idx < byteCount; idx++) + { + if ((sbyte)pBytes[idx] < 0) + { + break; + } + } + return idx; + } + + [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks + public static uint GetIndexOfFirstNonAsciiChar(char* pChars, uint charCount) + { + uint idx = 0; + for (; idx < charCount; idx++) + { + if (pChars[idx] > 0x7Fu) + { + break; + } + } + return idx; + } + + [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks + public static uint NarrowUtf16ToAscii(char* pChars, byte* pBytes, uint elementCount) + { + uint idx = 0; + for (; idx < elementCount; idx++) + { + uint ch = pChars[idx]; + if (ch > 0x7Fu) + { + break; + } + pBytes[idx] = (byte)ch; + } + return idx; + } + + [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks + public static uint WidenAsciiToUtf16(byte* pBytes, char* pChars, uint elementCount) + { + uint idx = 0; + for (; idx < elementCount; idx++) + { + byte b = pBytes[idx]; + if (b > 0x7F) + { + break; + } + pChars[idx] = (char)b; + } + return idx; + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/DecoderFallback.cs b/src/libraries/System.Private.CoreLib/src/System/Text/DecoderFallback.cs index fff8ad1d7bb3d..2eb03d8089d40 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/DecoderFallback.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/DecoderFallback.cs @@ -67,6 +67,10 @@ public virtual void Reset() internal unsafe 
byte* byteStart; internal unsafe char* charEnd; + internal Encoding _encoding; + internal DecoderNLS _decoder; + private int _originalByteCount; + // Internal Reset internal unsafe void InternalReset() { @@ -82,6 +86,22 @@ internal unsafe void InternalInitialize(byte* byteStart, char* charEnd) this.charEnd = charEnd; } + internal static DecoderFallbackBuffer CreateAndInitialize(Encoding encoding, DecoderNLS decoder, int originalByteCount) + { + // The original byte count is only used for keeping track of what 'index' value needs + // to be passed to the abstract Fallback method. The index value is calculated by subtracting + // 'bytes.Length' (where bytes is expected to be the entire remaining input buffer) + // from the 'originalByteCount' value specified here. + + DecoderFallbackBuffer fallbackBuffer = (decoder is null) ? encoding.DecoderFallback.CreateFallbackBuffer() : decoder.FallbackBuffer; + + fallbackBuffer._encoding = encoding; + fallbackBuffer._decoder = decoder; + fallbackBuffer._originalByteCount = originalByteCount; + + return fallbackBuffer; + } + // Fallback the current byte by sticking it into the remaining char buffer. // This can only be called by our encodings (other have to use the public fallback methods), so // we can use our DecoderNLS here too (except we don't). @@ -191,6 +211,90 @@ internal unsafe virtual int InternalFallback(byte[] bytes, byte* pBytes) return 0; } + internal int InternalFallbackGetCharCount(ReadOnlySpan remainingBytes, int fallbackLength) + { + return (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length)) + ? 
DrainRemainingDataForGetCharCount() + : 0; + } + + internal bool TryInternalFallbackGetChars(ReadOnlySpan remainingBytes, int fallbackLength, Span chars, out int charsWritten) + { + if (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length)) + { + return TryDrainRemainingDataForGetChars(chars, out charsWritten); + } + else + { + // Return true because we weren't asked to write anything, so this is a "success" in the sense that + // the output buffer was large enough to hold the desired 0 chars of output. + + charsWritten = 0; + return true; + } + } + + private Rune GetNextRune() + { + // Call GetNextChar() and try treating it as a non-surrogate character. + // If that fails, call GetNextChar() again and attempt to treat the two chars + // as a surrogate pair. If that still fails, throw an exception since the fallback + // mechanism is giving us a bad replacement character. + + Rune rune; + char ch = GetNextChar(); + if (!Rune.TryCreate(ch, out rune) && !Rune.TryCreate(ch, GetNextChar(), out rune)) + { + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex); + } + + return rune; + } + + internal int DrainRemainingDataForGetCharCount() + { + int totalCharCount = 0; + + Rune thisRune; + while ((thisRune = GetNextRune()).Value != 0) + { + // We need to check for overflow while tallying the fallback char count. 
+ + totalCharCount += thisRune.Utf16SequenceLength; + if (totalCharCount < 0) + { + InternalReset(); + Encoding.ThrowConversionOverflow(); + } + } + + return totalCharCount; + } + + internal bool TryDrainRemainingDataForGetChars(Span chars, out int charsWritten) + { + int originalCharCount = chars.Length; + + Rune thisRune; + while ((thisRune = GetNextRune()).Value != 0) + { + if (thisRune.TryEncode(chars, out int charsWrittenJustNow)) + { + chars = chars.Slice(charsWrittenJustNow); + continue; + } + else + { + InternalReset(); + charsWritten = default; + return false; + } + } + + charsWritten = originalCharCount - chars.Length; + return true; + } + // private helper methods internal void ThrowLastBytesRecursive(byte[] bytesUnknown) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/DecoderNLS.cs b/src/libraries/System.Private.CoreLib/src/System/Text/DecoderNLS.cs index 8af4dc3a55c0e..597d362bf7811 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/DecoderNLS.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/DecoderNLS.cs @@ -2,9 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
-using System.Runtime.Serialization;
-using System.Text;
-using System;
+using System.Buffers;
+using System.Diagnostics;
 using System.Runtime.InteropServices;
 
 namespace System.Text
@@ -27,6 +26,8 @@ internal class DecoderNLS : Decoder
         private bool _mustFlush;
         internal bool _throwOnOverflow;
         internal int _bytesUsed;
+        private int _leftoverBytes; // leftover data from a previous invocation of GetChars (up to 4 bytes)
+        private int _leftoverByteCount; // number of bytes of actual data in _leftoverBytes
 
         internal DecoderNLS(Encoding encoding)
         {
@@ -44,6 +45,7 @@ internal DecoderNLS()
 
         public override void Reset()
         {
+            ClearLeftoverData();
             _fallbackBuffer?.Reset();
         }
 
@@ -238,5 +240,228 @@ internal void ClearMustFlush()
         {
             _mustFlush = false;
         }
+
+        /// <summary>
+        /// Returns a view over the first <see cref="_leftoverByteCount"/> bytes stored in <see cref="_leftoverBytes"/>.
+        /// </summary>
+        internal ReadOnlySpan<byte> GetLeftoverData()
+        {
+            return MemoryMarshal.AsBytes(new ReadOnlySpan<int>(ref _leftoverBytes, 1)).Slice(0, _leftoverByteCount);
+        }
+
+        /// <summary>
+        /// Stores <paramref name="bytes"/> as the leftover data. The backing store is a single
+        /// <see cref="int"/>, so the copy fails if more than 4 bytes are provided.
+        /// </summary>
+        internal void SetLeftoverData(ReadOnlySpan<byte> bytes)
+        {
+            bytes.CopyTo(MemoryMarshal.AsBytes(new Span<int>(ref _leftoverBytes, 1)));
+            _leftoverByteCount = bytes.Length;
+        }
+
+        /// <summary>
+        /// States whether this instance currently holds any leftover bytes.
+        /// </summary>
+        internal bool HasLeftoverData => _leftoverByteCount != 0;
+
+        /// <summary>
+        /// Discards the leftover bytes by resetting the leftover byte count to zero.
+        /// </summary>
+        internal void ClearLeftoverData()
+        {
+            _leftoverByteCount = 0;
+        }
+
+        /// <summary>
+        /// Counts the chars produced by decoding the leftover bytes combined with the start of
+        /// <paramref name="bytes"/>. The leftover bytes stored on this instance are not modified.
+        /// </summary>
+        /// <param name="bytes">The new incoming data that follows the leftover bytes.</param>
+        /// <param name="bytesConsumed">The number of elements of <paramref name="bytes"/> that were consumed.</param>
+        /// <returns>The number of chars that decoding (or falling back) the combined data yields.</returns>
+        internal int DrainLeftoverDataForGetCharCount(ReadOnlySpan<byte> bytes, out int bytesConsumed)
+        {
+            // Quick check: we _should not_ have leftover fallback data from a previous invocation,
+            // as we'd end up consuming any such data and would corrupt whatever Convert call happens
+            // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
+
+            Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
+
+            // Copy the existing leftover data plus as many bytes as possible of the new incoming data
+            // into a temporary concatenated buffer, then get its char count by decoding it.
+
+            Span<byte> combinedBuffer = stackalloc byte[4];
+            combinedBuffer = combinedBuffer.Slice(0, ConcatInto(GetLeftoverData(), bytes, combinedBuffer));
+            int charCount = 0;
+
+            switch (_encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed))
+            {
+                case OperationStatus.Done:
+                    charCount = value.Utf16SequenceLength;
+                    goto Finish; // successfully transcoded bytes -> chars
+
+                case OperationStatus.NeedMoreData:
+                    if (MustFlush)
+                    {
+                        goto case OperationStatus.InvalidData; // treat as equivalent to bad data
+                    }
+                    else
+                    {
+                        goto Finish; // consumed some bytes, output 0 chars
+                    }
+
+                case OperationStatus.InvalidData:
+                    break;
+
+                default:
+                    Debug.Fail("Unexpected OperationStatus return value.");
+                    break;
+            }
+
+            // Couldn't decode the buffer. Fallback the buffer instead.
+
+            if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: 0))
+            {
+                charCount = _fallbackBuffer.DrainRemainingDataForGetCharCount();
+                Debug.Assert(charCount >= 0, "Fallback buffer shouldn't have returned a negative char count.");
+            }
+
+        Finish:
+
+            bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now
+            return charCount;
+        }
+
+        /// <summary>
+        /// Decodes the leftover bytes combined with the start of <paramref name="bytes"/> into
+        /// <paramref name="chars"/>, then persists or clears the leftover bytes stored on this instance.
+        /// </summary>
+        /// <param name="bytes">The new incoming data that follows the leftover bytes.</param>
+        /// <param name="chars">The destination buffer.</param>
+        /// <param name="bytesConsumed">The number of elements of <paramref name="bytes"/> that were consumed.</param>
+        /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
+        internal int DrainLeftoverDataForGetChars(ReadOnlySpan<byte> bytes, Span<char> chars, out int bytesConsumed)
+        {
+            // Quick check: we _should not_ have leftover fallback data from a previous invocation,
+            // as we'd end up consuming any such data and would corrupt whatever Convert call happens
+            // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
+
+            Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
+
+            // Copy the existing leftover data plus as many bytes as possible of the new incoming data
+            // into a temporary concatenated buffer, then transcode it from bytes to chars.
+
+            Span<byte> combinedBuffer = stackalloc byte[4];
+            combinedBuffer = combinedBuffer.Slice(0, ConcatInto(GetLeftoverData(), bytes, combinedBuffer));
+            int charsWritten = 0;
+
+            bool persistNewCombinedBuffer = false;
+
+            switch (_encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed))
+            {
+                case OperationStatus.Done:
+                    if (value.TryEncode(chars, out charsWritten))
+                    {
+                        goto Finish; // successfully transcoded bytes -> chars
+                    }
+                    else
+                    {
+                        goto DestinationTooSmall;
+                    }
+
+                case OperationStatus.NeedMoreData:
+                    if (MustFlush)
+                    {
+                        goto case OperationStatus.InvalidData; // treat as equivalent to bad data
+                    }
+                    else
+                    {
+                        persistNewCombinedBuffer = true;
+                        goto Finish; // successfully consumed some bytes, output no chars
+                    }
+
+                case OperationStatus.InvalidData:
+                    break;
+
+                default:
+                    Debug.Fail("Unexpected OperationStatus return value.");
+                    break;
+            }
+
+            // Couldn't decode the buffer. Fallback the buffer instead.
+
+            if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: 0)
+                && !_fallbackBuffer.TryDrainRemainingDataForGetChars(chars, out charsWritten))
+            {
+                goto DestinationTooSmall;
+            }
+
+        Finish:
+
+            // Report back the number of bytes (from the new incoming span) we consumed just now: the bytes
+            // consumed from the combined buffer minus the leftover bytes we started with. This must happen
+            // before SetLeftoverData / ClearLeftoverData below, since both overwrite _leftoverByteCount.
+
+            bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now
+
+            if (persistNewCombinedBuffer)
+            {
+                Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer.");
+                SetLeftoverData(combinedBuffer); // the buffer still only contains partial data; a future call to Convert will need it
+            }
+            else
+            {
+                ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths
+            }
+
+            return charsWritten;
+
+        DestinationTooSmall:
+
+            // If we got to this point, we're trying to write chars to the output buffer, but we're unable to do
+            // so. Unlike EncoderNLS, this type does not allow partial writes to the output buffer. Since we know
+            // draining leftover data is the first operation performed by any DecoderNLS API, there was no
+            // opportunity for any code before us to make forward progress, so we must fail immediately.
+
+            _encoding.ThrowCharsOverflow(this, nothingDecoded: true);
+            throw null; // will never reach this point
+        }
+
+        /// <summary>
+        /// Given a byte buffer <paramref name="dest"/>, concatenates as much of <paramref name="srcLeft"/> followed
+        /// by <paramref name="srcRight"/> into it as will fit, then returns the total number of bytes copied.
+        /// </summary>
+        private static int ConcatInto(ReadOnlySpan<byte> srcLeft, ReadOnlySpan<byte> srcRight, Span<byte> dest)
+        {
+            int total = 0;
+
+            for (int i = 0; i < srcLeft.Length; i++)
+            {
+                if ((uint)total >= (uint)dest.Length)
+                {
+                    goto Finish;
+                }
+                else
+                {
+                    dest[total++] = srcLeft[i];
+                }
+            }
+
+            for (int i = 0; i < srcRight.Length; i++)
+            {
+                if ((uint)total >= (uint)dest.Length)
+                {
+                    goto Finish;
+                }
+                else
+                {
+                    dest[total++] = srcRight[i];
+                }
+            }
+
+        Finish:
+
+            return total;
+        }
     }
 }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/EncoderFallback.cs b/src/libraries/System.Private.CoreLib/src/System/Text/EncoderFallback.cs
index f98b15e078667..ff895d6788797 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/EncoderFallback.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/EncoderFallback.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Buffers;
 using System.Diagnostics;
 using System.Threading;
 
@@ -86,12 +87,14 @@ public virtual void Reset()
         // These help us with our performance and messages internally
         internal unsafe char* charStart;
         internal unsafe char* charEnd;
-        internal EncoderNLS encoder;
+        internal EncoderNLS encoder; // TODO: MAKE ME PRIVATE
         internal bool setEncoder;
         internal bool bUsedEncoder;
         internal bool bFallingBack = false;
         internal int iRecursionCount = 0;
         private const int iMaxRecursion = 250;
+        private Encoding encoding;
+        private int originalCharCount;
 
         // Internal Reset
         // For example, what if someone fails a conversion and wants to reset one of our fallback buffers?
@@ -116,6 +119,22 @@ internal unsafe void InternalInitialize(char* charStart, char* charEnd, EncoderN
             this.iRecursionCount = 0;
         }
 
+        internal static EncoderFallbackBuffer CreateAndInitialize(Encoding encoding, EncoderNLS encoder, int originalCharCount)
+        {
+            // The original char count is only used for keeping track of what 'index' value needs
+            // to be passed to the abstract Fallback method. The index value is calculated by subtracting
+            // 'chars.Length' (where chars is expected to be the entire remaining input buffer)
+            // from the 'originalCharCount' value specified here.
+
+            EncoderFallbackBuffer fallbackBuffer = (encoder is null) ? encoding.EncoderFallback.CreateFallbackBuffer() : encoder.FallbackBuffer;
+
+            fallbackBuffer.encoding = encoding;
+            fallbackBuffer.encoder = encoder;
+            fallbackBuffer.originalCharCount = originalCharCount;
+
+            return fallbackBuffer;
+        }
+
         internal char InternalGetNextChar()
         {
             char ch = GetNextChar();
@@ -124,6 +143,187 @@ internal char InternalGetNextChar()
             return ch;
         }
 
+        /// <summary>
+        /// Hands the invalid data at the start of <paramref name="chars"/> to the abstract Fallback method:
+        /// both chars if the first two form a surrogate pair, otherwise the first char alone.
+        /// </summary>
+        /// <param name="chars">The remaining input buffer; must not be empty.</param>
+        /// <param name="charsConsumed">The number of chars (1 or 2) that were passed to the Fallback method.</param>
+        /// <returns>The value returned by the invoked Fallback overload.</returns>
+        private bool InternalFallback(ReadOnlySpan<char> chars, out int charsConsumed)
+        {
+            Debug.Assert(!chars.IsEmpty, "Caller shouldn't invoke this if there's no data to fall back.");
+
+            // First, try falling back a single BMP character or a standalone low surrogate.
+            // If the first char is a high surrogate, we'll try to combine it with the next
+            // char in the input sequence.
+
+            char firstChar = chars[0];
+            char secondChar = (1 < (uint)chars.Length) ? chars[1] : default;
+
+            // Ask the subclassed type to initiate fallback logic.
+
+            int index = originalCharCount - chars.Length;
+
+            if (!char.IsSurrogatePair(firstChar, secondChar))
+            {
+                // This code path is also used when 'firstChar' is a standalone surrogate or
+                // if it's a high surrogate at the end of the input buffer.
+
+                charsConsumed = 1;
+                return Fallback(firstChar, index);
+            }
+            else
+            {
+                charsConsumed = 2;
+                return Fallback(firstChar, secondChar, index);
+            }
+        }
+
+        /// <summary>
+        /// Falls back the invalid data at the start of <paramref name="chars"/> and returns the byte count
+        /// of the data drained from this fallback buffer (0 if the fallback reported no data).
+        /// </summary>
+        internal int InternalFallbackGetByteCount(ReadOnlySpan<char> chars, out int charsConsumed)
+        {
+            int bytesWritten = 0;
+
+            if (InternalFallback(chars, out charsConsumed))
+            {
+                // There's data in the fallback buffer - pull it out now.
+
+                bytesWritten = DrainRemainingDataForGetByteCount();
+            }
+
+            return bytesWritten;
+        }
+
+        /// <summary>
+        /// Falls back the invalid data at the start of <paramref name="chars"/> and encodes the data
+        /// drained from this fallback buffer into <paramref name="bytes"/>.
+        /// </summary>
+        /// <returns><see langword="false"/> if the destination buffer was too small; otherwise <see langword="true"/>.</returns>
+        internal bool TryInternalFallbackGetBytes(ReadOnlySpan<char> chars, Span<byte> bytes, out int charsConsumed, out int bytesWritten)
+        {
+            if (InternalFallback(chars, out charsConsumed))
+            {
+                // There's data in the fallback buffer - pull it out now.
+
+                return TryDrainRemainingDataForGetBytes(bytes, out bytesWritten);
+            }
+            else
+            {
+                // There's no data in the fallback buffer.
+
+                bytesWritten = 0;
+                return true; // true = didn't run out of space in destination buffer
+            }
+        }
+
+        /// <summary>
+        /// Encodes the data remaining in this fallback buffer into <paramref name="bytes"/>.
+        /// </summary>
+        /// <returns><see langword="false"/> if the destination buffer was too small; otherwise <see langword="true"/>.</returns>
+        internal bool TryDrainRemainingDataForGetBytes(Span<byte> bytes, out int bytesWritten)
+        {
+            int originalBytesLength = bytes.Length;
+
+            Rune thisRune;
+            while ((thisRune = GetNextRune()).Value != 0)
+            {
+                switch (encoding.EncodeRune(thisRune, bytes, out int bytesWrittenJustNow))
+                {
+                    case OperationStatus.Done:
+
+                        bytes = bytes.Slice(bytesWrittenJustNow);
+                        continue;
+
+                    case OperationStatus.DestinationTooSmall:
+
+                        // Since we're not consuming the Rune we just read, back up as many chars as necessary
+                        // to undo the read we just performed, then report to our caller that we ran out of space.
+
+                        for (int i = 0; i < thisRune.Utf16SequenceLength; i++)
+                        {
+                            MovePrevious();
+                        }
+
+                        bytesWritten = originalBytesLength - bytes.Length;
+                        return false; // ran out of destination buffer
+
+                    case OperationStatus.InvalidData:
+
+                        // We can't fallback the fallback. We can't make forward progress, so report to our caller
+                        // that something went terribly wrong. The error message contains the fallback char that
+                        // couldn't be converted. (Ideally we'd provide the first char that originally triggered
+                        // the fallback, but it's complicated to keep this state around, and a fallback producing
+                        // invalid data should be a very rare occurrence.)
+
+                        ThrowLastCharRecursive(thisRune.Value);
+                        break; // will never be hit; call above throws
+
+                    default:
+
+                        Debug.Fail("Unexpected return value.");
+                        break;
+                }
+            }
+
+            bytesWritten = originalBytesLength - bytes.Length;
+            return true; // finished successfully
+        }
+
+        /// <summary>
+        /// Consumes the data remaining in this fallback buffer and returns the number of bytes it would encode to.
+        /// </summary>
+        internal int DrainRemainingDataForGetByteCount()
+        {
+            int totalByteCount = 0;
+
+            Rune thisRune;
+            while ((thisRune = GetNextRune()).Value != 0)
+            {
+                if (!encoding.TryGetByteCount(thisRune, out int byteCountThisIteration))
+                {
+                    // We can't fallback the fallback. We can't make forward progress, so report to our caller
+                    // that something went terribly wrong. The error message contains the fallback char that
+                    // couldn't be converted. (Ideally we'd provide the first char that originally triggered
+                    // the fallback, but it's complicated to keep this state around, and a fallback producing
+                    // invalid data should be a very rare occurrence.)
+
+                    ThrowLastCharRecursive(thisRune.Value);
+                }
+
+                Debug.Assert(byteCountThisIteration >= 0, "Encoding shouldn't have returned a negative byte count.");
+
+                // We need to check for overflow while tallying the fallback byte count.
+
+                totalByteCount += byteCountThisIteration;
+                if (totalByteCount < 0)
+                {
+                    InternalReset();
+                    Encoding.ThrowConversionOverflow();
+                }
+            }
+
+            return totalByteCount;
+        }
+
+        /// <summary>
+        /// Reads the next scalar value from this fallback buffer, reading a second char if the first alone
+        /// isn't a scalar value. Throws <see cref="ArgumentException"/> if no valid scalar value can be formed.
+        /// </summary>
+        private Rune GetNextRune()
+        {
+            char firstChar = GetNextChar();
+            if (Rune.TryCreate(firstChar, out Rune value) || Rune.TryCreate(firstChar, GetNextChar(), out value))
+            {
+                return value;
+            }
+
+            throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
+        }
+
         // Fallback the current character using the remaining buffer and encoder if necessary
         // This can only be called by our encodings (other have to use the public fallback methods), so
         // we can use our EncoderNLS here too.
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/EncoderNLS.cs b/src/libraries/System.Private.CoreLib/src/System/Text/EncoderNLS.cs
index e83666f7a3f13..2901fc37b9bfe 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/EncoderNLS.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/EncoderNLS.cs
@@ -2,8 +2,8 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-using System.Text; -using System; +using System.Buffers; +using System.Diagnostics; using System.Runtime.InteropServices; namespace System.Text @@ -197,9 +197,13 @@ public unsafe override int GetBytes(char* chars, int charCount, byte* bytes, int bytesUsed = _encoding.GetBytes(chars, charCount, bytes, byteCount, this); charsUsed = _charsUsed; - // Its completed if they've used what they wanted AND if they didn't want flush or if we are flushed - completed = (charsUsed == charCount) && (!flush || !this.HasState) && - (_fallbackBuffer == null || _fallbackBuffer.Remaining == 0); + // Per MSDN, "The completed output parameter indicates whether all the data in the input + // buffer was converted and stored in the output buffer." That means we've successfully + // consumed all the input _and_ there's no pending state or fallback data remaining to be output. + + completed = (charsUsed == charCount) + && !this.HasState + && (_fallbackBuffer is null || _fallbackBuffer.Remaining == 0); // Our data thingys are now full, we can return } @@ -220,6 +224,10 @@ public bool MustFlush } } + /// + /// States whether a call to must first drain data on this instance. + /// + internal bool HasLeftoverData => _charLeftOver != default || (_fallbackBuffer != null && _fallbackBuffer.Remaining > 0); // Anything left in our encoder? internal virtual bool HasState @@ -235,5 +243,154 @@ internal void ClearMustFlush() { _mustFlush = false; } + + internal int DrainLeftoverDataForGetByteCount(ReadOnlySpan chars, out int charsConsumed) + { + // Quick check: we _should not_ have leftover fallback data from a previous invocation, + // as we'd end up consuming any such data and would corrupt whatever Convert call happens + // to be in progress. 
+ + if (_fallbackBuffer != null && _fallbackBuffer.Remaining > 0) + { + throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, Encoding.EncodingName, _fallbackBuffer.GetType())); + } + + // If we have a leftover high surrogate from a previous operation, consume it now. + // We won't clear the _charLeftOver field since GetByteCount is supposed to be + // a non-mutating operation, and we need the field to retain its value for the + // next call to Convert. + + charsConsumed = 0; // could be incorrect, will fix up later in the method + + if (_charLeftOver == default) + { + return 0; // no leftover high surrogate char - short-circuit and finish + } + else + { + char secondChar = default; + + if (chars.IsEmpty) + { + // If the input buffer is empty and we're not being asked to flush, no-op and return + // success to our caller. If we're being asked to flush, the leftover high surrogate from + // the previous operation will go through the fallback mechanism by itself. + + if (!MustFlush) + { + return 0; // no-op = success + } + } + else + { + secondChar = chars[0]; + } + + // If we have to fallback the chars we're reading immediately below, populate the + // fallback buffer with the invalid data. We'll just fall through to the "consume + // fallback buffer" logic at the end of the method. + + bool didFallback; + + if (Rune.TryCreate(_charLeftOver, secondChar, out Rune rune)) + { + charsConsumed = 1; // consumed the leftover high surrogate + the first char in the input buffer + + if (_encoding.TryGetByteCount(rune, out int byteCount)) + { + Debug.Assert(byteCount >= 0, "Encoding shouldn't have returned a negative byte count."); + return byteCount; + } + else + { + didFallback = FallbackBuffer.Fallback(_charLeftOver, secondChar, index: 0); + } + } + else + { + didFallback = FallbackBuffer.Fallback(_charLeftOver, index: 0); + } + + // Now tally the number of bytes that would've been emitted as part of fallback. 
+ + return _fallbackBuffer.DrainRemainingDataForGetByteCount(); + } + } + + internal bool TryDrainLeftoverDataForGetBytes(ReadOnlySpan chars, Span bytes, out int charsConsumed, out int bytesWritten) + { + // We may have a leftover high surrogate data from a previous invocation, or we may have leftover + // data in the fallback buffer, or we may have neither, but we will never have both. Check for these + // conditions and handle them now. + + charsConsumed = 0; // could be incorrect, will fix up later in the method + bytesWritten = 0; // could be incorrect, will fix up later in the method + + if (_charLeftOver != default) + { + char secondChar = default; + + if (chars.IsEmpty) + { + // If the input buffer is empty and we're not being asked to flush, no-op and return + // success to our caller. If we're being asked to flush, the leftover high surrogate from + // the previous operation will go through the fallback mechanism by itself. + + if (!MustFlush) + { + charsConsumed = 0; + bytesWritten = 0; + return true; // no-op = success + } + } + else + { + secondChar = chars[0]; + } + + // If we have to fallback the chars we're reading immediately below, populate the + // fallback buffer with the invalid data. We'll just fall through to the "consume + // fallback buffer" logic at the end of the method. 
+ + if (Rune.TryCreate(_charLeftOver, secondChar, out Rune rune)) + { + charsConsumed = 1; // at the very least, we consumed 1 char from the input + switch (_encoding.EncodeRune(rune, bytes, out bytesWritten)) + { + case OperationStatus.Done: + _charLeftOver = default; // we just consumed this char + return true; // that's all - we've handled the leftover data + + case OperationStatus.DestinationTooSmall: + _charLeftOver = default; // we just consumed this char + _encoding.ThrowBytesOverflow(this, nothingEncoded: true); // will throw + break; + + case OperationStatus.InvalidData: + FallbackBuffer.Fallback(_charLeftOver, secondChar, index: 0); + break; + + default: + Debug.Fail("Unknown return value."); + break; + } + } + else + { + FallbackBuffer.Fallback(_charLeftOver, index: 0); + } + } + + // Now check the fallback buffer for any remaining data. + + if (_fallbackBuffer != null && _fallbackBuffer.Remaining > 0) + { + return _fallbackBuffer.TryDrainRemainingDataForGetBytes(bytes, out bytesWritten); + } + + // And we're done! + + return true; // success + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.Internal.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.Internal.cs new file mode 100644 index 0000000000000..09044afefe60e --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.Internal.cs @@ -0,0 +1,1277 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using Internal.Runtime.CompilerServices; + +namespace System.Text +{ + public partial class Encoding + { + /* + * This file contains infrastructure code that supports a simplified way of writing + * internally-implemented Encoding types. 
In this system, the individual Encoding types + * are no longer responsible for handling anything related to the EncoderNLS / DecoderNLS + * infrastructure, nor are they responsible for implementing anything related to fallback + * buffers logic. + * + * Instead, subclassed types are responsible only for transcoding of individual scalar values + * to and from the encoding's byte representation (see the two methods immediately below). + * They can optionally implement fast-path logic to perform bulk transcoding up until the + * first segment of data that cannot be transcoded. They can special-case certain fallback + * mechanisms if desired. + * + * Most of the fast-path code is written using raw pointers as the exchange types, just as + * in the standard Encoding infrastructure. Since the fallback logic is more complex, most + * of it is written using type-safe constructs like Span, with some amount of glue to + * allow it to work correctly with pointer-based fast-path code. + * + * A typical call graph for GetBytes is represented below, using ASCIIEncoding as an example. + * + * ASCIIEncoding.GetBytes(...) 
[non-EncoderNLS path, public virtual override] + * `- + * - ASCIIEncoding.GetBytesCommon [private helper method per derived type, inlined] + * `- ASCIIEncoding.GetBytesFast [overridden fast-path implementation, inlined] + * - + * - + * `- Encoding.GetBytesWithFallback [non-virtual stub method to call main GetBytesWithFallback worker] + * `- Encoding.GetBytesWithFallback [virtual method whose base implementation contains slow fallback logic] + * `- + * - + * - + * `- + * - + * - + * - + * - + * - + * + * The call graph for GetBytes(..., EncoderNLS) is similar: + * + * Encoding.GetBytes(..., EncoderNLS) [base implementation] + * `- + * - + * - + * `- Encoding.GetBytesWithFallback [non-virtual stub method] + * `- + * - + * - + * - + * `- Encoding.GetBytesWithFallback [virtual method as described above] + * + * There are different considerations in each call graph for things like error handling, + * since the error conditions will be different depending on whether or not an EncoderNLS + * instance is available and what values its properties have. + */ + + /* + * THESE TWO METHODS MUST BE OVERRIDDEN BY A SUBCLASSED TYPE + */ + + internal virtual OperationStatus DecodeFirstRune(ReadOnlySpan bytes, out Rune value, out int bytesConsumed) + { + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; + } + + internal virtual OperationStatus EncodeRune(Rune value, Span bytes, out int bytesWritten) + { + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; + } + + /* + * ALL OTHER LOGIC CAN BE IMPLEMENTED IN TERMS OF THE TWO METHODS ABOVE. + * FOR IMPROVED PERFORMANCE, SUBCLASSED TYPES MAY WANT TO OVERRIDE ONE OR MORE VIRTUAL METHODS BELOW. + */ + + /* + * GETBYTECOUNT FAMILY OF FUNCTIONS + */ + + /// + /// Given a , determines its byte count under the current . + /// Returns if the cannot be represented in the + /// current . 
+ /// + internal virtual bool TryGetByteCount(Rune value, out int byteCount) + { + // Any production-quality type would override this method and provide a real + // implementation, so we won't provide a base implementation. However, a + // non-shipping slow reference implementation is provided below for convenience. + +#if false + Span bytes = stackalloc byte[4]; // max 4 bytes per input scalar + + OperationStatus opStatus = EncodeRune(value, bytes, out byteCount); + Debug.Assert(opStatus == OperationStatus.Done || opStatus == OperationStatus.InvalidData, "Unexpected return value."); + + return (opStatus == OperationStatus.Done); +#else + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; +#endif + } + + /// + /// Entry point from . + /// + internal virtual unsafe int GetByteCount(char* pChars, int charCount, EncoderNLS encoder) + { + Debug.Assert(encoder != null, "This code path should only be called from EncoderNLS."); + Debug.Assert(charCount >= 0, "Caller should've checked this condition."); + Debug.Assert(pChars != null || charCount == 0, "Cannot provide a null pointer and a non-zero count."); + + // We're going to try to stay on the fast-path as much as we can. That means that we have + // no leftover data to drain and the entire source buffer can be consumed in a single + // fast-path invocation. If either of these doesn't hold, we'll go down the slow path of + // creating spans, draining the EncoderNLS instance, and falling back. + + int totalByteCount = 0; + int charsConsumed = 0; + + if (!encoder.HasLeftoverData) + { + totalByteCount = GetByteCountFast(pChars, charCount, encoder.Fallback, out charsConsumed); + if (charsConsumed == charCount) + { + return totalByteCount; + } + } + + // We had leftover data, or we couldn't consume the entire input buffer. + // Let's go down the draining + fallback mechanisms. 
+ + totalByteCount += GetByteCountWithFallback(pChars, charCount, charsConsumed, encoder); + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + + return totalByteCount; + } + + /// + /// Counts the number of s that would result from transcoding the source + /// data, exiting when the source buffer is consumed or when the first unreadable data is encountered. + /// The implementation may inspect to short-circuit any counting + /// operation, but it should not attempt to call . + /// + /// + /// Via , the number of elements from which + /// were consumed; and returns the transcoded byte count up to this point. + /// + /// + /// If the byte count would be greater than . + /// (Implementation should call .) + /// + /// + /// The implementation should not attempt to perform any sort of fallback behavior. + /// If custom fallback behavior is necessary, override . + /// + private protected virtual unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback fallback, out int charsConsumed) + { + // Any production-quality type would override this method and provide a real + // implementation, so we won't provide a base implementation. However, a + // non-shipping slow reference implementation is provided below for convenience. 
+ +#if false + ReadOnlySpan chars = new ReadOnlySpan(pChars, charsLength); + int totalByteCount = 0; + + while (!chars.IsEmpty) + { + if (Rune.DecodeUtf16(chars, out Rune scalarValue, out int charsConsumedThisIteration) != OperationStatus.Done + || !TryGetByteCount(scalarValue, out int byteCountThisIteration)) + { + // Invalid UTF-16 data, or not convertible to target encoding + + break; + } + + chars = chars.Slice(charsConsumedThisIteration); + + totalByteCount += byteCountThisIteration; + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + } + + charsConsumed = charsLength - chars.Length; // number of chars consumed across all loop iterations above + return totalByteCount; +#else + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; +#endif + } + + /// + /// Counts the number of bytes that would result from transcoding the provided chars, + /// with no associated . The first two arguments are based on the + /// original input before invoking this method; and + /// signals where in the provided buffer the fallback loop should begin operating. + /// + /// + /// The byte count resulting from transcoding the input data. + /// + /// + /// If the resulting byte count is greater than . + /// (Implementation should call .) + /// + [MethodImpl(MethodImplOptions.NoInlining)] // don't stack spill spans into our caller + private protected unsafe int GetByteCountWithFallback(char* pCharsOriginal, int originalCharCount, int charsConsumedSoFar) + { + // This is a stub method that's marked "no-inlining" so that it we don't stack-spill spans + // into our immediate caller. Doing so increases the method prolog in what's supposed to + // be a very fast path. 
+ + Debug.Assert(0 <= charsConsumedSoFar && charsConsumedSoFar < originalCharCount, "Invalid arguments provided to method."); + + return GetByteCountWithFallback( + chars: new ReadOnlySpan(pCharsOriginal, originalCharCount).Slice(charsConsumedSoFar), + originalCharsLength: originalCharCount, + encoder: null); + } + + /// + /// Gets the number of s that would result from transcoding the provided + /// input data, with an associated . The first two arguments are + /// based on the original input before invoking this method; and + /// signals where in the provided source buffer the fallback loop should begin operating. + /// The behavior of this method is to consume (non-destructively) any leftover data in the + /// instance, then to invoke the virtual method + /// after data has been drained, then to call . + /// + /// + /// The total number of bytes that would result from transcoding the remaining portion of the source buffer. + /// + /// + /// If the return value would exceed . + /// (The implementation should call .) + /// + private unsafe int GetByteCountWithFallback(char* pOriginalChars, int originalCharCount, int charsConsumedSoFar, EncoderNLS encoder) + { + Debug.Assert(encoder != null, "This code path should only be called from EncoderNLS."); + Debug.Assert(0 <= charsConsumedSoFar && charsConsumedSoFar < originalCharCount, "Caller should've checked this condition."); + + // First, try draining any data that already exists on the encoder instance. If we can't complete + // that operation, there's no point to continuing down to the main workhorse methods. + + ReadOnlySpan chars = new ReadOnlySpan(pOriginalChars, originalCharCount).Slice(charsConsumedSoFar); + + int totalByteCount = encoder.DrainLeftoverDataForGetByteCount(chars, out int charsConsumedJustNow); + chars = chars.Slice(charsConsumedJustNow); + + // Now try invoking the "fast path" (no fallback) implementation. 
+ // We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers). + + totalByteCount += GetByteCountFast( + pChars: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(chars)), + charsLength: chars.Length, + fallback: encoder.Fallback, + charsConsumed: out charsConsumedJustNow); + + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + + chars = chars.Slice(charsConsumedJustNow); + + // If there's still data remaining in the source buffer, go down the fallback path. + // Otherwise we're finished. + + if (!chars.IsEmpty) + { + totalByteCount += GetByteCountWithFallback(chars, originalCharCount, encoder); + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + } + + return totalByteCount; + } + + /// + /// Counts the number of bytes that would result from transcoding the provided chars, + /// using the provided if necessary. + /// + /// + /// The byte count resulting from transcoding the input data. + /// + /// + /// If the resulting byte count is greater than . + /// (Implementation should call .) + /// + private protected virtual unsafe int GetByteCountWithFallback(ReadOnlySpan chars, int originalCharsLength, EncoderNLS encoder) + { + Debug.Assert(!chars.IsEmpty, "Caller shouldn't invoke this method with an empty input buffer."); + Debug.Assert(originalCharsLength >= 0, "Caller provided invalid parameter."); + + // Since we're using Unsafe.AsPointer in our central loop, we want to ensure everything is pinned. + + fixed (char* _pChars_Unused = &MemoryMarshal.GetReference(chars)) + { + EncoderFallbackBuffer fallbackBuffer = EncoderFallbackBuffer.CreateAndInitialize(this, encoder, originalCharsLength); + int totalByteCount = 0; + + do + { + // There's still data in the source buffer; why wasn't the previous fast-path able to consume it fully? + // There are two scenarios: (a) the source buffer contained invalid / incomplete UTF-16 data; + // or (b) the encoding can't translate this scalar value. 
+ + if (Rune.DecodeUtf16(chars, out Rune firstScalarValue, out int charsConsumedThisIteration) == OperationStatus.NeedMoreData + && encoder != null + && !encoder.MustFlush) + { + // We saw a standalone high surrogate at the end of the buffer, and the + // active EncoderNLS instance isn't asking us to flush. Since a call to + // GetBytes would've consumed this char by storing it in EncoderNLS._charLeftOver, + // we'll "consume" it by ignoring it. The next call to GetBytes will + // pick it up correctly. + + goto Finish; + } + + // We saw invalid UTF-16 data, or we saw a high surrogate that we need to flush (and + // thus treat as invalid), or we saw valid UTF-16 data that this encoder doesn't support. + // In any case we'll run it through the fallback mechanism. + + int byteCountThisIteration = fallbackBuffer.InternalFallbackGetByteCount(chars, out charsConsumedThisIteration); + + Debug.Assert(byteCountThisIteration >= 0, "Fallback shouldn't have returned a negative value."); + Debug.Assert(charsConsumedThisIteration >= 0, "Fallback shouldn't have returned a negative value."); + + totalByteCount += byteCountThisIteration; + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + + chars = chars.Slice(charsConsumedThisIteration); + + if (!chars.IsEmpty) + { + // Still data remaining - run it through the fast-path to find the next data to fallback. + // While building up the tally we need to continually check for integer overflow + // since fallbacks can change the total byte count in unexpected ways. 
+ + byteCountThisIteration = GetByteCountFast( + pChars: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(chars)), + charsLength: chars.Length, + fallback: null, // already tried this earlier and we still fell down the common path, so skip from now on + charsConsumed: out charsConsumedThisIteration); + + Debug.Assert(byteCountThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + Debug.Assert(charsConsumedThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + + totalByteCount += byteCountThisIteration; + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + + chars = chars.Slice(charsConsumedThisIteration); + } + } while (!chars.IsEmpty); + + Finish: + + Debug.Assert(fallbackBuffer.Remaining == 0, "There should be no data in the fallback buffer after GetByteCount."); + + return totalByteCount; + } + } + + /* + * GETBYTES FAMILY OF FUNCTIONS + */ + + /// + /// Entry point from EncoderNLS. Tries the fast path first and only then drains leftover encoder data and falls back. + /// + internal virtual unsafe int GetBytes(char* pChars, int charCount, byte* pBytes, int byteCount, EncoderNLS encoder) + { + Debug.Assert(encoder != null, "This code path should only be called from EncoderNLS."); + Debug.Assert(charCount >= 0, "Caller should've checked this condition."); + Debug.Assert(pChars != null || charCount == 0, "Cannot provide a null pointer and a non-zero count."); + Debug.Assert(byteCount >= 0, "Caller should've checked this condition."); + Debug.Assert(pBytes != null || byteCount == 0, "Cannot provide a null pointer and a non-zero count."); + + // We're going to try to stay on the fast-path as much as we can. That means that we have + // no leftover data to drain and the entire source buffer can be transcoded in a single + // fast-path invocation. If either of these doesn't hold, we'll go down the slow path of + // creating spans, draining the EncoderNLS instance, and falling back.
+ + int bytesWritten = 0; + int charsConsumed = 0; + + if (!encoder.HasLeftoverData) + { + bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out charsConsumed); + if (charsConsumed == charCount) + { + encoder._charsUsed = charCount; + return bytesWritten; + } + } + + // We had leftover data, or we couldn't consume the entire input buffer. + // Let's go down the draining + fallback mechanisms. + + return GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten, encoder); + } + + /// + /// Transcodes chars to bytes, exiting when the source or destination + /// buffer is consumed or when the first unreadable data is encountered. + /// + /// + /// Via charsConsumed, the number of elements from pChars which + /// were consumed; and returns the number of elements written to pBytes. + /// + /// + /// The implementation should not attempt to perform any sort of fallback behavior. + /// If custom fallback behavior is necessary, override GetBytesWithFallback. + /// + private protected virtual unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed) + { + // Any production-quality type would override this method and provide a real + // implementation, so we won't provide a base implementation. However, a + // non-shipping slow reference implementation is provided below for convenience.
+ +#if false + ReadOnlySpan chars = new ReadOnlySpan(pChars, charsLength); + Span bytes = new Span(pBytes, bytesLength); + + while (!chars.IsEmpty) + { + if (Rune.DecodeUtf16(chars, out Rune scalarValue, out int charsConsumedJustNow) != OperationStatus.Done + || EncodeRune(scalarValue, bytes, out int bytesWrittenJustNow) != OperationStatus.Done) + { + // Invalid UTF-16 data, or not convertible to target encoding, or destination buffer too small to contain encoded value + + break; + } + + chars = chars.Slice(charsConsumedJustNow); + bytes = bytes.Slice(bytesWrittenJustNow); + } + + charsConsumed = charsLength - chars.Length; // number of chars consumed across all loop iterations above + return bytesLength - bytes.Length; // number of bytes written across all loop iterations above +#else + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; +#endif + } + + /// + /// Transcodes chars to bytes, with no associated EncoderNLS. The first four arguments are + /// based on the original input before invoking this method; and charsConsumedSoFar + /// and bytesWrittenSoFar signal where in the provided buffers the fallback loop + /// should begin operating. The behavior of this method is to call the GetBytesWithFallback + /// virtual method as overridden by the specific type, and failing that go down the shared fallback path. + /// + /// + /// The total number of bytes written to pOriginalBytes, including bytesWrittenSoFar. + /// + /// + /// If the destination buffer is not large enough to hold the entirety of the transcoded data. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private protected unsafe int GetBytesWithFallback(char* pOriginalChars, int originalCharCount, byte* pOriginalBytes, int originalByteCount, int charsConsumedSoFar, int bytesWrittenSoFar) + { + // This is a stub method that's marked "no-inlining" so that we don't stack-spill spans + // into our immediate caller. Doing so increases the method prolog in what's supposed to + // be a very fast path.
+ + Debug.Assert(0 <= charsConsumedSoFar && charsConsumedSoFar < originalCharCount, "Invalid arguments provided to method."); + Debug.Assert(0 <= bytesWrittenSoFar && bytesWrittenSoFar <= originalByteCount, "Invalid arguments provided to method."); + + return GetBytesWithFallback( + chars: new ReadOnlySpan(pOriginalChars, originalCharCount).Slice(charsConsumedSoFar), + originalCharsLength: originalCharCount, + bytes: new Span(pOriginalBytes, originalByteCount).Slice(bytesWrittenSoFar), + originalBytesLength: originalByteCount, + encoder: null); + } + + /// + /// Transcodes chars to bytes, with an associated EncoderNLS. The first four arguments are + /// based on the original input before invoking this method; and charsConsumedSoFar + /// and bytesWrittenSoFar signal where in the provided buffers the fallback loop + /// should begin operating. The behavior of this method is to drain any leftover data in the + /// EncoderNLS instance, then to invoke the GetBytesFast virtual method + /// after data has been drained, then to call GetBytesWithFallback. + /// + /// + /// The total number of bytes written to pOriginalBytes, including bytesWrittenSoFar. + /// + /// + /// If the destination buffer is too small to make any forward progress at all, or if the destination buffer is + /// too small to contain the entirety of the transcoded data and the EncoderNLS instance disallows + /// partial transcoding. + /// + private unsafe int GetBytesWithFallback(char* pOriginalChars, int originalCharCount, byte* pOriginalBytes, int originalByteCount, int charsConsumedSoFar, int bytesWrittenSoFar, EncoderNLS encoder) + { + Debug.Assert(encoder != null, "This code path should only be called from EncoderNLS."); + Debug.Assert(0 <= charsConsumedSoFar && charsConsumedSoFar < originalCharCount, "Caller should've checked this condition."); + Debug.Assert(0 <= bytesWrittenSoFar && bytesWrittenSoFar <= originalByteCount, "Caller should've checked this condition."); + + // First, try draining any data that already exists on the encoder instance.
If we can't complete + // that operation, there's no point to continuing down to the main workhorse methods. + + ReadOnlySpan chars = new ReadOnlySpan(pOriginalChars, originalCharCount).Slice(charsConsumedSoFar); + Span bytes = new Span(pOriginalBytes, originalByteCount).Slice(bytesWrittenSoFar); + + bool drainFinishedSuccessfully = encoder.TryDrainLeftoverDataForGetBytes(chars, bytes, out int charsConsumedJustNow, out int bytesWrittenJustNow); + + chars = chars.Slice(charsConsumedJustNow); // whether or not the drain finished, we may have made some progress + bytes = bytes.Slice(bytesWrittenJustNow); + + if (!drainFinishedSuccessfully) + { + ThrowBytesOverflow(encoder, nothingEncoded: bytes.Length == originalByteCount); // might not throw if we wrote at least one byte + } + else + { + // Now try invoking the "fast path" (no fallback) implementation. + // We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers). + + bytesWrittenJustNow = GetBytesFast( + pChars: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(chars)), + charsLength: chars.Length, + pBytes: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(bytes)), + bytesLength: bytes.Length, + charsConsumed: out charsConsumedJustNow); + + chars = chars.Slice(charsConsumedJustNow); + bytes = bytes.Slice(bytesWrittenJustNow); + + // If there's still data remaining in the source buffer, go down the fallback path. + // Otherwise we're finished. + + if (!chars.IsEmpty) + { + // We'll optimistically tell the encoder that we're using everything; the + // GetBytesWithFallback method will overwrite this field if necessary.
+ + encoder._charsUsed = originalCharCount; + return GetBytesWithFallback(chars, originalCharCount, bytes, originalByteCount, encoder); + } + } + + encoder._charsUsed = originalCharCount - chars.Length; // total number of characters consumed up until now + return originalByteCount - bytes.Length; // total number of bytes written up until now + } + + /// + /// Transcodes chars to bytes, using an EncoderFallbackBuffer (created from this encoding and the possibly-null encoder) if needed. + /// + /// + /// The total number of bytes written to bytes (based on originalBytesLength). + /// + /// + /// The derived class should override this method if it might be able to provide a more optimized fallback + /// implementation, deferring to the base implementation if needed. This method calls + /// ThrowBytesOverflow if necessary. + /// + private protected virtual unsafe int GetBytesWithFallback(ReadOnlySpan chars, int originalCharsLength, Span bytes, int originalBytesLength, EncoderNLS encoder) + { + Debug.Assert(!chars.IsEmpty, "Caller shouldn't invoke this method with an empty input buffer."); + Debug.Assert(originalCharsLength >= 0, "Caller provided invalid parameter."); + Debug.Assert(originalBytesLength >= 0, "Caller provided invalid parameter."); + + // Since we're using Unsafe.AsPointer in our central loop, we want to ensure everything is pinned. + + fixed (char* _pChars_Unused = &MemoryMarshal.GetReference(chars)) + fixed (byte* _pBytes_Unused = &MemoryMarshal.GetReference(bytes)) + { + EncoderFallbackBuffer fallbackBuffer = EncoderFallbackBuffer.CreateAndInitialize(this, encoder, originalCharsLength); + + do + { + // There's still data in the source buffer; why wasn't the previous fast-path able to consume it fully? + // There are two scenarios: (a) the source buffer contained invalid / incomplete UTF-16 data; + // or (b) the encoding can't translate this scalar value.
+ + switch (Rune.DecodeUtf16(chars, out Rune firstScalarValue, out int charsConsumedThisIteration)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(charsConsumedThisIteration == chars.Length, "If returning NeedMoreData, should out the entire buffer length as chars consumed."); + if (encoder is null || encoder.MustFlush) + { + goto case OperationStatus.InvalidData; // see comment in GetByteCountWithFallback + } + else + { + encoder._charLeftOver = chars[0]; // squirrel away remaining high surrogate char and finish + chars = ReadOnlySpan.Empty; + goto Finish; + } + + case OperationStatus.InvalidData: + break; + + default: + if (EncodeRune(firstScalarValue, bytes, out _) == OperationStatus.DestinationTooSmall) + { + goto Finish; // source buffer contained valid UTF-16 but encoder ran out of space in destination buffer + } + break; // source buffer contained valid UTF-16 but encoder doesn't support this scalar value + } + + // Now we know the reason for failure was that the original input was invalid + // for the encoding in use. Run it through the fallback mechanism. + + bool fallbackFinished = fallbackBuffer.TryInternalFallbackGetBytes(chars, bytes, out charsConsumedThisIteration, out int bytesWrittenThisIteration); + + // Regardless of whether the fallback finished, it did consume some number of + // chars, and it may have written some number of bytes. + + chars = chars.Slice(charsConsumedThisIteration); + bytes = bytes.Slice(bytesWrittenThisIteration); + + if (!fallbackFinished) + { + goto Finish; // fallback has pending state - it'll get written out on the next GetBytes call + } + + if (!chars.IsEmpty) + { + // Still data remaining - run it through the fast-path to find the next data to fallback.
+ + bytesWrittenThisIteration = GetBytesFast( + pChars: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(chars)), + charsLength: chars.Length, + pBytes: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(bytes)), + bytesLength: bytes.Length, + charsConsumed: out charsConsumedThisIteration); + + Debug.Assert(bytesWrittenThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + Debug.Assert(charsConsumedThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + + chars = chars.Slice(charsConsumedThisIteration); + bytes = bytes.Slice(bytesWrittenThisIteration); + } + } while (!chars.IsEmpty); + + Finish: + + // We reach this point when we deplete the source or destination buffer. There are a few + // cases to consider now. If the source buffer has been fully consumed and there's no + // leftover data in the EncoderNLS or the fallback buffer, we've completed transcoding. + // If the source buffer isn't empty or there's leftover data in the fallback buffer, + // it means we ran out of space in the destination buffer. This is an unrecoverable error + // if no EncoderNLS is in use (because only EncoderNLS can handle partial success), and + // even if an EncoderNLS is in use this is only recoverable if the EncoderNLS instance + // allows partial completion. Let's check all of these conditions now. + + if (!chars.IsEmpty || fallbackBuffer.Remaining > 0) + { + // The line below will also throw if the encoder couldn't make any progress at all + // because the output buffer wasn't large enough to contain the result of even + // a single scalar conversion or fallback. + + ThrowBytesOverflow(encoder, nothingEncoded: bytes.Length == originalBytesLength); + } + + // If an EncoderNLS instance is active, update its "total consumed character count" value.
+ + if (encoder != null) + { + Debug.Assert(originalCharsLength >= chars.Length, "About to report a negative number of chars used?"); + encoder._charsUsed = originalCharsLength - chars.Length; // number of chars consumed + } + + Debug.Assert(fallbackBuffer.Remaining == 0 || encoder != null, "Shouldn't have any leftover data in fallback buffer unless an EncoderNLS is in use."); + + return originalBytesLength - bytes.Length; + } + } + + /* + * GETCHARCOUNT FAMILY OF FUNCTIONS + */ + + /// + /// Entry point from DecoderNLS. Tries the fast path first and only then accounts for leftover decoder data and falls back. + /// + internal virtual unsafe int GetCharCount(byte* pBytes, int byteCount, DecoderNLS decoder) + { + Debug.Assert(decoder != null, "This code path should only be called from DecoderNLS."); + Debug.Assert(byteCount >= 0, "Caller should've checked this condition."); + Debug.Assert(pBytes != null || byteCount == 0, "Cannot provide a null pointer and a non-zero count."); + + // We're going to try to stay on the fast-path as much as we can. That means that we have + // no leftover data to drain and the entire source buffer can be consumed in a single + // fast-path invocation. If either of these doesn't hold, we'll go down the slow path of + // creating spans, draining the DecoderNLS instance, and falling back. + + Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, "Fallback buffer can't hold data between GetChars invocations."); + + int totalCharCount = 0; + int bytesConsumed = 0; + + if (!decoder.HasLeftoverData) + { + totalCharCount = GetCharCountFast(pBytes, byteCount, decoder.Fallback, out bytesConsumed); + if (bytesConsumed == byteCount) + { + return totalCharCount; + } + } + + // We had leftover data, or we couldn't consume the entire input buffer. + // Let's go down the draining + fallback mechanisms.
+ + totalCharCount += GetCharCountWithFallback(pBytes, byteCount, bytesConsumed, decoder); + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + + return totalCharCount; + } + + /// + /// Counts the number of chars that would result from transcoding the source + /// data, exiting when the source buffer is consumed or when the first unreadable data is encountered. + /// The implementation may inspect fallback to short-circuit any counting + /// operation, but it should not attempt to run the fallback itself. + /// + /// + /// Via bytesConsumed, the number of elements from pBytes which + /// were consumed; and returns the transcoded char count up to this point. + /// + /// + /// If the char count would be greater than int.MaxValue. + /// (Implementation should call ThrowConversionOverflow.) + /// + /// + /// The implementation should not attempt to perform any sort of fallback behavior. + /// If custom fallback behavior is necessary, override the fallback-aware counterpart (presumably GetCharCountWithFallback - TODO confirm). + /// + private protected virtual unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback fallback, out int bytesConsumed) + { + // Any production-quality type would override this method and provide a real + // implementation, so we won't provide a base implementation. However, a + // non-shipping slow reference implementation is provided below for convenience. + +#if false + ReadOnlySpan bytes = new ReadOnlySpan(pBytes, bytesLength); + int totalCharCount = 0; + + while (!bytes.IsEmpty) + { + // We don't care about statuses other than Done. The fallback mechanism will handle those.
+ + if (DecodeFirstRune(bytes, out Rune value, out int bytesConsumedJustNow) != OperationStatus.Done) + { + break; + } + + totalCharCount += value.Utf16SequenceLength; + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + + bytes = bytes.Slice(bytesConsumedJustNow); + } + + bytesConsumed = bytesLength - bytes.Length; // number of bytes consumed across all loop iterations above + return totalCharCount; +#else + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; +#endif + } + + /// + /// Counts the number of chars that would result from transcoding the provided bytes, + /// with no associated DecoderNLS. The first two arguments are based on the + /// original input before invoking this method; and bytesConsumedSoFar + /// signals where in the provided buffer the fallback loop should begin operating. + /// + /// + /// The char count resulting from transcoding the input data. + /// + /// + /// If the resulting char count is greater than int.MaxValue. + /// (Implementation should call ThrowConversionOverflow.) + /// + [MethodImpl(MethodImplOptions.NoInlining)] // don't stack spill spans into our caller + private protected unsafe int GetCharCountWithFallback(byte* pBytesOriginal, int originalByteCount, int bytesConsumedSoFar) + { + // This is a stub method that's marked "no-inlining" so that we don't stack-spill spans + // into our immediate caller. Doing so increases the method prolog in what's supposed to + // be a very fast path. + + Debug.Assert(0 <= bytesConsumedSoFar && bytesConsumedSoFar < originalByteCount, "Invalid arguments provided to method."); + + return GetCharCountWithFallback( + bytes: new ReadOnlySpan(pBytesOriginal, originalByteCount).Slice(bytesConsumedSoFar), + originalBytesLength: originalByteCount, + decoder: null); + } + + /// + /// Gets the number of chars that would result from transcoding the provided + /// input data, with an associated DecoderNLS.
The first two arguments are + /// based on the original input before invoking this method; and bytesConsumedSoFar + /// signals where in the provided source buffer the fallback loop should begin operating. + /// The behavior of this method is to consume (non-destructively) any leftover data in the + /// DecoderNLS instance, then to invoke the GetCharCountFast virtual method + /// after data has been drained, then to call GetCharCountWithFallback. + /// + /// + /// The total number of chars that would result from transcoding the remaining portion of the source buffer. + /// + /// + /// If the return value would exceed int.MaxValue. + /// (The implementation should call ThrowConversionOverflow.) + /// + private unsafe int GetCharCountWithFallback(byte* pOriginalBytes, int originalByteCount, int bytesConsumedSoFar, DecoderNLS decoder) + { + Debug.Assert(decoder != null, "This code path should only be called from DecoderNLS."); + Debug.Assert(0 <= bytesConsumedSoFar && bytesConsumedSoFar < originalByteCount, "Caller should've checked this condition."); + + // First, try draining any data that already exists on the decoder instance. If we can't complete + // that operation, there's no point to continuing down to the main workhorse methods. + + ReadOnlySpan bytes = new ReadOnlySpan(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar); + + int totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out int bytesConsumedJustNow); + bytes = bytes.Slice(bytesConsumedJustNow); + + // Now try invoking the "fast path" (no fallback) implementation. + // We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers). + + totalCharCount += GetCharCountFast( + pBytes: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(bytes)), + bytesLength: bytes.Length, + fallback: decoder.Fallback, + bytesConsumed: out bytesConsumedJustNow); + + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + + bytes = bytes.Slice(bytesConsumedJustNow); + + // If there's still data remaining in the source buffer, go down the fallback path.
+ // Otherwise we're finished. + + if (!bytes.IsEmpty) + { + totalCharCount += GetCharCountWithFallback(bytes, originalByteCount, decoder); + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + } + + return totalCharCount; + } + + /// + /// Counts the number of chars that would result from transcoding the provided bytes, + /// using a DecoderFallbackBuffer (created from this encoding and the provided decoder) if necessary. + /// + /// + /// The char count resulting from transcoding the input data. + /// + /// + /// If the resulting char count is greater than int.MaxValue. + /// (Implementation should call ThrowConversionOverflow.) + /// + private unsafe int GetCharCountWithFallback(ReadOnlySpan bytes, int originalBytesLength, DecoderNLS decoder) + { + Debug.Assert(!bytes.IsEmpty, "Caller shouldn't invoke this method with an empty input buffer."); + Debug.Assert(originalBytesLength >= 0, "Caller provided invalid parameter."); + + // Since we're using Unsafe.AsPointer in our central loop, we want to ensure everything is pinned. + + fixed (byte* _pBytes_Unused = &MemoryMarshal.GetReference(bytes)) + { + DecoderFallbackBuffer fallbackBuffer = DecoderFallbackBuffer.CreateAndInitialize(this, decoder, originalBytesLength); + int totalCharCount = 0; + + do + { + // There's still data in the source buffer; why wasn't the previous fast-path able to consume it fully? + // There are two scenarios: (a) the source buffer contained invalid data; or (b) it contained incomplete data. + + if (DecodeFirstRune(bytes, out Rune firstScalarValue, out int bytesConsumedThisIteration) == OperationStatus.NeedMoreData + && decoder != null + && !decoder.MustFlush) + { + // We saw incomplete data at the end of the buffer, and the active DecoderNLS instance + // isn't asking us to flush. Since a call to GetChars would've consumed this data by + // storing it in the DecoderNLS instance, we'll "consume" it by ignoring it. + // The next call to GetChars will pick it up correctly.
+ + goto Finish; + } + + // We saw invalid binary data, or we saw incomplete data that we need to flush (and thus + // treat as invalid). In any case we'll run through the fallback mechanism. + + int charCountThisIteration = fallbackBuffer.InternalFallbackGetCharCount(bytes, bytesConsumedThisIteration); + + Debug.Assert(charCountThisIteration >= 0, "Fallback shouldn't have returned a negative value."); + + totalCharCount += charCountThisIteration; + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + + bytes = bytes.Slice(bytesConsumedThisIteration); + + if (!bytes.IsEmpty) + { + // Still data remaining - run it through the fast-path to find the next data to fallback. + // While building up the tally we need to continually check for integer overflow + // since fallbacks can change the total char count in unexpected ways. + + charCountThisIteration = GetCharCountFast( + pBytes: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(bytes)), + bytesLength: bytes.Length, + fallback: null, // wasn't able to be short-circuited by our caller; don't bother trying again + bytesConsumed: out bytesConsumedThisIteration); + + Debug.Assert(charCountThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + Debug.Assert(bytesConsumedThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + + totalCharCount += charCountThisIteration; + if (totalCharCount < 0) + { + ThrowConversionOverflow(); + } + + bytes = bytes.Slice(bytesConsumedThisIteration); + } + } while (!bytes.IsEmpty); + + Finish: + + Debug.Assert(fallbackBuffer.Remaining == 0, "There should be no data in the fallback buffer after GetCharCount."); + + return totalCharCount; + } + } + + /* + * GETCHARS FAMILY OF FUNCTIONS + */ + + /// + /// Entry point from DecoderNLS. Tries the fast path first and only then drains leftover decoder data and falls back.
+ /// + internal virtual unsafe int GetChars(byte* pBytes, int byteCount, char* pChars, int charCount, DecoderNLS decoder) + { + Debug.Assert(decoder != null, "This code path should only be called from DecoderNLS."); + Debug.Assert(byteCount >= 0, "Caller should've checked this condition."); + Debug.Assert(pBytes != null || byteCount == 0, "Cannot provide a null pointer and a non-zero count."); + Debug.Assert(charCount >= 0, "Caller should've checked this condition."); + Debug.Assert(pChars != null || charCount == 0, "Cannot provide a null pointer and a non-zero count."); + + // We're going to try to stay on the fast-path as much as we can. That means that we have + // no leftover data to drain and the entire source buffer can be transcoded in a single + // fast-path invocation. If either of these doesn't hold, we'll go down the slow path of + // creating spans, draining the DecoderNLS instance, and falling back. + + int charsWritten = 0; + int bytesConsumed = 0; + + if (!decoder.HasLeftoverData) + { + charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out bytesConsumed); + if (bytesConsumed == byteCount) + { + decoder._bytesUsed = byteCount; + return charsWritten; + } + } + + // We had leftover data, or we couldn't consume the entire input buffer. + // Let's go down the draining + fallback mechanisms. + + return GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten, decoder); + } + + /// + /// Transcodes bytes to chars, exiting when the source or destination + /// buffer is consumed or when the first unreadable data is encountered. + /// + /// + /// Via bytesConsumed, the number of elements from pBytes which + /// were consumed; and returns the number of elements written to pChars. + /// + /// + /// The implementation should not attempt to perform any sort of fallback behavior. + /// If custom fallback behavior is necessary, override GetCharsWithFallback.
+ /// + private protected virtual unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed) + { + // Any production-quality type would override this method and provide a real + // implementation, so we won't provide a base implementation. However, a + // non-shipping slow reference implementation is provided below for convenience. + +#if false + ReadOnlySpan bytes = new ReadOnlySpan(pBytes, bytesLength); + Span chars = new Span(pChars, charsLength); + + while (!bytes.IsEmpty) + { + if ((DecodeFirstRune(bytes, out Rune firstScalarValue, out int bytesConsumedJustNow) != OperationStatus.Done) + || !firstScalarValue.TryEncode(chars, out int charsWrittenJustNow)) + { + // Invalid or incomplete binary data, or destination buffer too small to contain decoded value + + break; + } + + bytes = bytes.Slice(bytesConsumedJustNow); + chars = chars.Slice(charsWrittenJustNow); + } + + bytesConsumed = bytesLength - bytes.Length; // number of bytes consumed across all loop iterations above + return charsLength - chars.Length; // number of chars written across all loop iterations above +#else + Debug.Fail("This should be overridden by a subclassed type."); + throw NotImplemented.ByDesign; +#endif + } + + /// + /// Transcodes bytes to chars, with no associated DecoderNLS. The first four arguments are + /// based on the original input before invoking this method; and bytesConsumedSoFar + /// and charsWrittenSoFar signal where in the provided buffers the fallback loop + /// should begin operating. The behavior of this method is to call the GetCharsWithFallback + /// virtual method as overridden by the specific type, and failing that go down the shared fallback path. + /// + /// + /// The total number of chars written to pOriginalChars, including charsWrittenSoFar. + /// + /// + /// If the destination buffer is not large enough to hold the entirety of the transcoded data.
+ /// + [MethodImpl(MethodImplOptions.NoInlining)] + private protected unsafe int GetCharsWithFallback(byte* pOriginalBytes, int originalByteCount, char* pOriginalChars, int originalCharCount, int bytesConsumedSoFar, int charsWrittenSoFar) + { + // This is a stub method that's marked "no-inlining" so that we don't stack-spill spans + // into our immediate caller. Doing so increases the method prolog in what's supposed to + // be a very fast path. + + Debug.Assert(0 <= bytesConsumedSoFar && bytesConsumedSoFar < originalByteCount, "Invalid arguments provided to method."); + Debug.Assert(0 <= charsWrittenSoFar && charsWrittenSoFar <= originalCharCount, "Invalid arguments provided to method."); + + return GetCharsWithFallback( + bytes: new ReadOnlySpan(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar), + originalBytesLength: originalByteCount, + chars: new Span(pOriginalChars, originalCharCount).Slice(charsWrittenSoFar), + originalCharsLength: originalCharCount, + decoder: null); + } + + /// + /// Transcodes bytes to chars, with an associated DecoderNLS. The first four arguments are + /// based on the original input before invoking this method; and bytesConsumedSoFar + /// and charsWrittenSoFar signal where in the provided buffers the fallback loop + /// should begin operating. The behavior of this method is to drain any leftover data in the + /// DecoderNLS instance, then to invoke the GetCharsFast virtual method + /// after data has been drained, then to call GetCharsWithFallback. + /// + /// + /// The total number of chars written to pOriginalChars, including charsWrittenSoFar. + /// + /// + /// If the destination buffer is too small to make any forward progress at all, or if the destination buffer is + /// too small to contain the entirety of the transcoded data and the DecoderNLS instance disallows + /// partial transcoding.
+ /// + private protected unsafe int GetCharsWithFallback(byte* pOriginalBytes, int originalByteCount, char* pOriginalChars, int originalCharCount, int bytesConsumedSoFar, int charsWrittenSoFar, DecoderNLS decoder) + { + Debug.Assert(decoder != null, "This code path should only be called from DecoderNLS."); + Debug.Assert(0 <= bytesConsumedSoFar && bytesConsumedSoFar < originalByteCount, "Caller should've checked this condition."); + Debug.Assert(0 <= charsWrittenSoFar && charsWrittenSoFar <= originalCharCount, "Caller should've checked this condition."); + + // First, try draining any data that already exists on the decoder instance. If we can't complete + // that operation, there's no point to continuing down to the main workhorse methods. + // + // Like GetBytes, there may be leftover data in the DecoderNLS instance. But unlike GetBytes, + // the bytes -> chars conversion doesn't allow leftover data in the fallback buffer. This means + // that the drain operation below will either succeed fully or fail; there's no partial success + // condition as with the chars -> bytes conversion. The drain method will throw if there's not + // enough space in the destination buffer. + + ReadOnlySpan bytes = new ReadOnlySpan(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar); + Span chars = new Span(pOriginalChars, originalCharCount).Slice(charsWrittenSoFar); + + int charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out int bytesConsumedJustNow); + + bytes = bytes.Slice(bytesConsumedJustNow); + chars = chars.Slice(charsWrittenJustNow); + + Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, "Should be no remaining fallback data at this point."); + + // Now try invoking the "fast path" (no fallback buffer) implementation. + // We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers).
+ + charsWrittenJustNow = GetCharsFast( + pBytes: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(bytes)), + bytesLength: bytes.Length, + pChars: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(chars)), + charsLength: chars.Length, + bytesConsumed: out bytesConsumedJustNow); + + bytes = bytes.Slice(bytesConsumedJustNow); + chars = chars.Slice(charsWrittenJustNow); + + // We'll optimistically tell the decoder that we're using everything; the + // GetCharsWithFallback method will overwrite this field if necessary. + + decoder._bytesUsed = originalByteCount; + + if (bytes.IsEmpty) + { + return originalCharCount - chars.Length; // total number of chars written + } + else + { + return GetCharsWithFallback(bytes, originalByteCount, chars, originalCharCount, decoder); + } + } + + /// + /// Transcodes bytes to chars, using or if needed. + /// + /// + /// The total number of chars written to (based on ). + /// + /// + /// The derived class should override this method if it might be able to provide a more optimized fallback + /// implementation, deferring to the base implementation if needed. This method calls + /// if necessary. + /// + private protected virtual unsafe int GetCharsWithFallback(ReadOnlySpan bytes, int originalBytesLength, Span chars, int originalCharsLength, DecoderNLS decoder) + { + Debug.Assert(!bytes.IsEmpty, "Caller shouldn't invoke this method with an empty input buffer."); + Debug.Assert(originalBytesLength >= 0, "Caller provided invalid parameter."); + Debug.Assert(originalCharsLength >= 0, "Caller provided invalid parameter."); + + // Since we're using Unsafe.AsPointer in our central loop, we want to ensure everything is pinned.
+ + fixed (byte* _pBytes_Unused = &MemoryMarshal.GetReference(bytes)) + fixed (char* _pChars_Unused = &MemoryMarshal.GetReference(chars)) + { + DecoderFallbackBuffer fallbackBuffer = DecoderFallbackBuffer.CreateAndInitialize(this, decoder, originalBytesLength); + + do + { + // There's still data in the source buffer; why wasn't the previous fast-path able to consume it fully? + // There are three scenarios: (a) the source buffer contained invalid data, (b) it contained incomplete data, or (c) the destination buffer was too small. + + int charsWrittenThisIteration; + + switch (DecodeFirstRune(bytes, out _, out int bytesConsumedThisIteration)) + { + case OperationStatus.NeedMoreData: + Debug.Assert(bytesConsumedThisIteration == bytes.Length, "If returning NeedMoreData, should out the entire buffer length as bytes consumed."); + if (decoder is null || decoder.MustFlush) + { + goto case OperationStatus.InvalidData; // see comment in GetCharCountWithFallback + } + else + { + decoder.SetLeftoverData(bytes); // squirrel away remaining data and finish + bytes = ReadOnlySpan.Empty; + goto Finish; + } + + case OperationStatus.InvalidData: + if (fallbackBuffer.TryInternalFallbackGetChars(bytes, bytesConsumedThisIteration, chars, out charsWrittenThisIteration)) + { + // We successfully consumed some bytes, sent it through the fallback, and wrote some chars. + + Debug.Assert(charsWrittenThisIteration >= 0, "Fallback shouldn't have returned a negative value."); + break; + } + else + { + // We generated fallback data, but the destination buffer wasn't large enough to hold it. + // Don't mark any of the bytes we ran through the fallback as consumed, and terminate + // the loop now and let our caller handle this condition.
+ + goto Finish; + } + + default: + goto Finish; // no error on input, so destination must have been too small + } + + bytes = bytes.Slice(bytesConsumedThisIteration); + chars = chars.Slice(charsWrittenThisIteration); + + if (!bytes.IsEmpty) + { + // Still data remaining - run it through the fast-path to find the next data to fallback. + // We need to figure out why we weren't able to make progress. + + charsWrittenThisIteration = GetCharsFast( + pBytes: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(bytes)), + bytesLength: bytes.Length, + pChars: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(chars)), + charsLength: chars.Length, + bytesConsumed: out bytesConsumedThisIteration); + + Debug.Assert(charsWrittenThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + Debug.Assert(bytesConsumedThisIteration >= 0, "Workhorse shouldn't have returned a negative value."); + + bytes = bytes.Slice(bytesConsumedThisIteration); + chars = chars.Slice(charsWrittenThisIteration); + } + } while (!bytes.IsEmpty); + + Finish: + + // We reach this point when we deplete the source or destination buffer. See main comment + // at the end of GetBytesWithFallback for how the below logic works; the primary difference + // here is that GetChars disallows leftover data in the fallback buffer between calls. + + Debug.Assert(fallbackBuffer.Remaining == 0); + + if (!bytes.IsEmpty) + { + // The line below will also throw if the decoder couldn't make any progress at all + // because the output buffer wasn't large enough to contain the result of even + // a single scalar conversion or fallback. + + ThrowCharsOverflow(decoder, nothingDecoded: chars.Length == originalCharsLength); + } + + // If a DecoderNLS instance is active, update its "total consumed byte count" value.
+ + if (decoder != null) + { + Debug.Assert(originalBytesLength >= bytes.Length, "About to report a negative number of bytes used?"); + decoder._bytesUsed = originalBytesLength - bytes.Length; // number of bytes consumed + } + + return originalCharsLength - chars.Length; // total number of chars written + } + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.cs index 175e5442fd4e3..8947b7fca0eb3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Encoding.cs @@ -3,11 +3,8 @@ // See the LICENSE file in the project root for more information. using System.Diagnostics; -using System.Globalization; -using System.Threading; using System.Runtime.InteropServices; using System.Runtime.Serialization; -using System.Diagnostics.CodeAnalysis; namespace System.Text { @@ -74,7 +71,7 @@ namespace System.Text // generally executes faster. // - public abstract class Encoding : ICloneable + public abstract partial class Encoding : ICloneable { // For netcore we use UTF8 as default encoding since ANSI isn't available private static readonly UTF8Encoding.UTF8EncodingSealed s_defaultEncoding = new UTF8Encoding.UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: false); @@ -559,13 +556,16 @@ public virtual object Clone() return newEncoding; } - public bool IsReadOnly { get { return (_isReadOnly); } + private protected set + { + _isReadOnly = value; + } } // Returns an encoding for the ASCII character set. The returned encoding @@ -666,16 +666,6 @@ public virtual unsafe int GetByteCount(ReadOnlySpan chars) } } - // For NLS Encodings, workhorse takes an encoder (may be null) - // Always validate parameters before calling internal version, which will only assert. 
- internal virtual unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder) - { - Debug.Assert(chars != null); - Debug.Assert(count >= 0); - - return GetByteCount(chars, count); - } - // Returns a byte array containing the encoded representation of the given // character array. // @@ -772,14 +762,6 @@ public byte[] GetBytes(string s, int index, int count) return GetBytes(s.ToCharArray(), charIndex, charCount, bytes, byteIndex); } - // This is our internal workhorse - // Always validate parameters before calling internal version, which will only assert. - internal virtual unsafe int GetBytes(char* chars, int charCount, - byte* bytes, int byteCount, EncoderNLS encoder) - { - return GetBytes(chars, charCount, bytes, byteCount); - } - // We expect this to be the workhorse for NLS Encodings, but for existing // ones we need a working (if slow) default implementation) // @@ -898,13 +880,6 @@ public virtual unsafe int GetCharCount(ReadOnlySpan bytes) } } - // This is our internal workhorse - // Always validate parameters before calling internal version, which will only assert. - internal virtual unsafe int GetCharCount(byte* bytes, int count, DecoderNLS decoder) - { - return GetCharCount(bytes, count); - } - // Returns a character array containing the decoded representation of a // given byte array. // @@ -1011,15 +986,6 @@ public virtual unsafe int GetChars(ReadOnlySpan bytes, Span chars) } } - // This is our internal workhorse - // Always validate parameters before calling internal version, which will only assert. 
- internal virtual unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS decoder) - { - return GetChars(bytes, byteCount, chars, charCount); - } - - [CLSCompliant(false)] public unsafe string GetString(byte* bytes, int byteCount) { @@ -1238,6 +1204,12 @@ internal void ThrowBytesOverflow(EncoderNLS encoder, bool nothingEncoded) encoder.ClearMustFlush(); } + [StackTraceHidden] + internal static void ThrowConversionOverflow() + { + throw new ArgumentException(SR.Argument_ConversionOverflow); + } + internal void ThrowCharsOverflow() { // Special message to include fallback type in case fallback's GetMaxCharCount is broken diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/EncodingNLS.cs b/src/libraries/System.Private.CoreLib/src/System/Text/EncodingNLS.cs index e6fa0627d3e0a..51d0e66044bd1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/EncodingNLS.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/EncodingNLS.cs @@ -4,6 +4,7 @@ using System; using System.Collections; +using System.Diagnostics; using System.Globalization; using System.Runtime.InteropServices; using System.Threading; @@ -27,6 +28,7 @@ internal abstract class EncodingNLS : Encoding { protected EncodingNLS(int codePage) : base(codePage) { + Debug.Assert(GetType() == typeof(Latin1Encoding), "Should be no instantiations of this type except via Latin1Encoding."); } // Returns the number of bytes required to encode a range of characters in diff --git a/src/libraries/System.Private.CoreLib/src/System/ThrowHelper.cs b/src/libraries/System.Private.CoreLib/src/System/ThrowHelper.cs index c3c91d8f0bbab..06b3ce41a6416 100644 --- a/src/libraries/System.Private.CoreLib/src/System/ThrowHelper.cs +++ b/src/libraries/System.Private.CoreLib/src/System/ThrowHelper.cs @@ -452,8 +452,20 @@ private static string GetArgumentName(ExceptionArgument argument) return "startIndex"; case ExceptionArgument.task: return "task"; + case 
ExceptionArgument.bytes: + return "bytes"; + case ExceptionArgument.byteIndex: + return "byteIndex"; + case ExceptionArgument.byteCount: + return "byteCount"; case ExceptionArgument.ch: return "ch"; + case ExceptionArgument.chars: + return "chars"; + case ExceptionArgument.charIndex: + return "charIndex"; + case ExceptionArgument.charCount: + return "charCount"; case ExceptionArgument.s: return "s"; case ExceptionArgument.input: @@ -612,6 +624,10 @@ private static string GetResourceString(ExceptionResource resource) { case ExceptionResource.ArgumentOutOfRange_Index: return SR.ArgumentOutOfRange_Index; + case ExceptionResource.ArgumentOutOfRange_IndexCount: + return SR.ArgumentOutOfRange_IndexCount; + case ExceptionResource.ArgumentOutOfRange_IndexCountBuffer: + return SR.ArgumentOutOfRange_IndexCountBuffer; case ExceptionResource.ArgumentOutOfRange_Count: return SR.ArgumentOutOfRange_Count; case ExceptionResource.Arg_ArrayPlusOffTooSmall: @@ -694,6 +710,8 @@ private static string GetResourceString(ExceptionResource resource) return SR.Task_WaitMulti_NullTask; case ExceptionResource.ArgumentException_OtherNotArrayOfCorrectLength: return SR.ArgumentException_OtherNotArrayOfCorrectLength; + case ExceptionResource.ArgumentNull_Array: + return SR.ArgumentNull_Array; case ExceptionResource.ArgumentNull_SafeHandle: return SR.ArgumentNull_SafeHandle; case ExceptionResource.ArgumentOutOfRange_EndIndexStartIndex: @@ -752,7 +770,13 @@ internal enum ExceptionArgument value, startIndex, task, + bytes, + byteIndex, + byteCount, ch, + chars, + charIndex, + charCount, s, input, ownedMemory, @@ -828,6 +852,8 @@ internal enum ExceptionArgument internal enum ExceptionResource { ArgumentOutOfRange_Index, + ArgumentOutOfRange_IndexCount, + ArgumentOutOfRange_IndexCountBuffer, ArgumentOutOfRange_Count, Arg_ArrayPlusOffTooSmall, NotSupported_ReadOnlyCollection, @@ -869,6 +895,7 @@ internal enum ExceptionResource Task_ThrowIfDisposed, Task_WaitMulti_NullTask, 
ArgumentException_OtherNotArrayOfCorrectLength, + ArgumentNull_Array, ArgumentNull_SafeHandle, ArgumentOutOfRange_EndIndexStartIndex, ArgumentOutOfRange_Enum,