Refactor Encoding to split fast-path and fallback logic (dotnet/corec…

…lr#23098) This refactoring is limited to ASCIIEncoding at the moment, but it can easily be applied to UTF-8 / UTF-16 / UTF-32. High-level changes: - Fallback logic has been split from the fast-path, improving performance of GetBytes and similar routines. - All of the plumbing of when to invoke the fallback logic and how to manage leftover data has been moved into the base class. - Almost all of the logic except for the fast-path is now written in terms of verifiable code (Span and ReadOnlySpan). - Minor bug fixes in EncoderNLS.Convert (see https://github.com/dotnet/coreclr/issues/23020). Commit migrated from dotnet/coreclr@43a5159
picenka21 · Mar 11, 2019 · 2883820 · 2883820
1 parent 8d27a06
commit 2883820
Show file tree

Hide file tree

Showing 13 changed files with 2,584 additions and 658 deletions.
diff --git a/src/coreclr/tests/CoreFX/CoreFX.issues.json b/src/coreclr/tests/CoreFX/CoreFX.issues.json
@@ -861,6 +861,28 @@
             ]
         }
     },
+    {
+        "name": "System.Text.Encoding.Tests",
+        "enabled": true,
+        "exclusions": {
+            "namespaces": null,
+            "classes": null,
+            "methods": [
+                {
+                    "name": "System.Text.Tests.EncoderConvert2.EncoderASCIIConvertMixedASCIIUnicodeCharArrayPartial",
+                    "reason": "https://github.com/dotnet/coreclr/issues/23020"
+                },
+                {
+                    "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertMixedASCIIUnicodeCharArrayPartial",
+                    "reason": "https://github.com/dotnet/coreclr/issues/23020"
+                },
+                {
+                    "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertUnicodeCharArrayPartial",
+                    "reason": "https://github.com/dotnet/coreclr/issues/23020"
+                }
+            ]
+        }
+    },
     {
         "name": "System.Text.RegularExpressions.Tests",
         "enabled": true,

diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
@@ -761,6 +761,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)System\StringSplitOptions.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
@@ -776,6 +777,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\EncoderFallback.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\EncoderReplacementFallback.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Encoding.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Encoding.Internal.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingData.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingInfo.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingNLS.cs" />

diff --git a/src/libraries/System.Private.CoreLib/src/System/String.cs b/src/libraries/System.Private.CoreLib/src/System/String.cs
@@ -480,7 +480,7 @@ public static bool IsNullOrWhiteSpace(string value)
             Debug.Assert(byteLength >= 0);
 
             // Get our string length
-            int stringLength = encoding.GetCharCount(bytes, byteLength, null);
+            int stringLength = encoding.GetCharCount(bytes, byteLength);
             Debug.Assert(stringLength >= 0, "stringLength >= 0");
 
             // They gave us an empty string if they needed one
@@ -491,7 +491,7 @@ public static bool IsNullOrWhiteSpace(string value)
             string s = FastAllocateString(stringLength);
             fixed (char* pTempChars = &s._firstChar)
             {
-                int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength, null);
+                int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength);
                 Debug.Assert(stringLength == doubleCheck,
                     "Expected encoding.GetChars to return same length as encoding.GetCharCount");
             }

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
@@ -0,0 +1,76 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+
+namespace System.Text
+{
+    /*
+     * Contains naive unoptimized (non-SIMD) implementations of ASCII transcoding
+     * operations. Vectorized methods can be substituted here as a drop-in replacement.
+     */
+
+    internal unsafe static class ASCIIUtility
+    {
+        [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
+        public static uint GetIndexOfFirstNonAsciiByte(byte* pBytes, uint byteCount)
+        {
+            uint idx = 0;
+            for (; idx < byteCount; idx++)
+            {
+                if ((sbyte)pBytes[idx] < 0)
+                {
+                    break;
+                }
+            }
+            return idx;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
+        public static uint GetIndexOfFirstNonAsciiChar(char* pChars, uint charCount)
+        {
+            uint idx = 0;
+            for (; idx < charCount; idx++)
+            {
+                if (pChars[idx] > 0x7Fu)
+                {
+                    break;
+                }
+            }
+            return idx;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
+        public static uint NarrowUtf16ToAscii(char* pChars, byte* pBytes, uint elementCount)
+        {
+            uint idx = 0;
+            for (; idx < elementCount; idx++)
+            {
+                uint ch = pChars[idx];
+                if (ch > 0x7Fu)
+                {
+                    break;
+                }
+                pBytes[idx] = (byte)ch;
+            }
+            return idx;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
+        public static uint WidenAsciiToUtf16(byte* pBytes, char* pChars, uint elementCount)
+        {
+            uint idx = 0;
+            for (; idx < elementCount; idx++)
+            {
+                byte b = pBytes[idx];
+                if (b > 0x7F)
+                {
+                    break;
+                }
+                pChars[idx] = (char)b;
+            }
+            return idx;
+        }
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/DecoderFallback.cs b/src/libraries/System.Private.CoreLib/src/System/Text/DecoderFallback.cs
@@ -67,6 +67,10 @@ public virtual void Reset()
         internal unsafe byte* byteStart;
         internal unsafe char* charEnd;
 
+        internal Encoding _encoding;
+        internal DecoderNLS _decoder;
+        private int _originalByteCount;
+
         // Internal Reset
         internal unsafe void InternalReset()
         {
@@ -82,6 +86,22 @@ internal unsafe void InternalInitialize(byte* byteStart, char* charEnd)
             this.charEnd = charEnd;
         }
 
+        internal static DecoderFallbackBuffer CreateAndInitialize(Encoding encoding, DecoderNLS decoder, int originalByteCount)
+        {
+            // The original byte count is only used for keeping track of what 'index' value needs
+            // to be passed to the abstract Fallback method. The index value is calculated by subtracting
+            // 'bytes.Length' (where bytes is expected to be the entire remaining input buffer)
+            // from the 'originalByteCount' value specified here.
+
+            DecoderFallbackBuffer fallbackBuffer = (decoder is null) ? encoding.DecoderFallback.CreateFallbackBuffer() : decoder.FallbackBuffer;
+
+            fallbackBuffer._encoding = encoding;
+            fallbackBuffer._decoder = decoder;
+            fallbackBuffer._originalByteCount = originalByteCount;
+
+            return fallbackBuffer;
+        }
+
         // Fallback the current byte by sticking it into the remaining char buffer.
         // This can only be called by our encodings (other have to use the public fallback methods), so
         // we can use our DecoderNLS here too (except we don't).
@@ -191,6 +211,90 @@ internal unsafe virtual int InternalFallback(byte[] bytes, byte* pBytes)
             return 0;
         }
 
+        internal int InternalFallbackGetCharCount(ReadOnlySpan<byte> remainingBytes, int fallbackLength)
+        {
+            return (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length))
+                ? DrainRemainingDataForGetCharCount()
+                : 0;
+        }
+
+        internal bool TryInternalFallbackGetChars(ReadOnlySpan<byte> remainingBytes, int fallbackLength, Span<char> chars, out int charsWritten)
+        {
+            if (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length))
+            {
+                return TryDrainRemainingDataForGetChars(chars, out charsWritten);
+            }
+            else
+            {
+                // Return true because we weren't asked to write anything, so this is a "success" in the sense that
+                // the output buffer was large enough to hold the desired 0 chars of output.
+
+                charsWritten = 0;
+                return true;
+            }
+        }
+
+        private Rune GetNextRune()
+        {
+            // Call GetNextChar() and try treating it as a non-surrogate character.
+            // If that fails, call GetNextChar() again and attempt to treat the two chars
+            // as a surrogate pair. If that still fails, throw an exception since the fallback
+            // mechanism is giving us a bad replacement character.
+
+            Rune rune;
+            char ch = GetNextChar();
+            if (!Rune.TryCreate(ch, out rune) && !Rune.TryCreate(ch, GetNextChar(), out rune))
+            {
+                throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
+            }
+
+            return rune;
+        }
+
+        internal int DrainRemainingDataForGetCharCount()
+        {
+            int totalCharCount = 0;
+
+            Rune thisRune;
+            while ((thisRune = GetNextRune()).Value != 0)
+            {
+                // We need to check for overflow while tallying the fallback char count.
+
+                totalCharCount += thisRune.Utf16SequenceLength;
+                if (totalCharCount < 0)
+                {
+                    InternalReset();
+                    Encoding.ThrowConversionOverflow();
+                }
+            }
+
+            return totalCharCount;
+        }
+
+        internal bool TryDrainRemainingDataForGetChars(Span<char> chars, out int charsWritten)
+        {
+            int originalCharCount = chars.Length;
+
+            Rune thisRune;
+            while ((thisRune = GetNextRune()).Value != 0)
+            {
+                if (thisRune.TryEncode(chars, out int charsWrittenJustNow))
+                {
+                    chars = chars.Slice(charsWrittenJustNow);
+                    continue;
+                }
+                else
+                {
+                    InternalReset();
+                    charsWritten = default;
+                    return false;
+                }
+            }
+
+            charsWritten = originalCharCount - chars.Length;
+            return true;
+        }
+
         // private helper methods
         internal void ThrowLastBytesRecursive(byte[] bytesUnknown)
         {