From 8b8aeb3f5672bd00518f27d69709eba8f2a54a50 Mon Sep 17 00:00:00 2001 From: RealityProgrammer Date: Tue, 14 Mar 2023 19:17:19 +0700 Subject: [PATCH] Fix source validator's code reports, remove 95% of AdvSimd implementation to prevent future consequences --- X10D/src/Collections/BoolListExtensions.cs | 2 +- X10D/src/Collections/ByteExtensions.cs | 2 +- X10D/src/Collections/Int32Extensions.cs | 20 +-- X10D/src/Core/IntrinsicExtensions.cs | 3 +- X10D/src/Core/IntrinsicUtility.cs | 176 +++++++++++---------- X10D/src/Core/SpanExtensions.cs | 44 ++---- 6 files changed, 125 insertions(+), 122 deletions(-) diff --git a/X10D/src/Collections/BoolListExtensions.cs b/X10D/src/Collections/BoolListExtensions.cs index 65f31b5ee..ee7005020 100644 --- a/X10D/src/Collections/BoolListExtensions.cs +++ b/X10D/src/Collections/BoolListExtensions.cs @@ -26,7 +26,7 @@ public static byte PackByte(this IReadOnlyList source) throw new ArgumentNullException(nameof(source)); } #endif - + if (source.Count > 8) { throw new ArgumentException("Source cannot contain more than than 8 elements.", nameof(source)); diff --git a/X10D/src/Collections/ByteExtensions.cs b/X10D/src/Collections/ByteExtensions.cs index 35403f8ca..6fde587b0 100644 --- a/X10D/src/Collections/ByteExtensions.cs +++ b/X10D/src/Collections/ByteExtensions.cs @@ -47,7 +47,7 @@ public static void Unpack(this byte value, Span destination) return; } #endif - + FallbackImplementation(value, destination); #if NETCOREAPP3_0_OR_GREATER diff --git a/X10D/src/Collections/Int32Extensions.cs b/X10D/src/Collections/Int32Extensions.cs index f226adf9d..f6e8fd771 100644 --- a/X10D/src/Collections/Int32Extensions.cs +++ b/X10D/src/Collections/Int32Extensions.cs @@ -42,7 +42,7 @@ public static void Unpack(this int value, Span destination) #if NETCOREAPP3_0_OR_GREATER // TODO: AdvSimd support. 
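// (Illustrative sketch, not part of this commit: one possible NEON path that mirrors the
// Ssse3Implementation below. The helper name and masks are hypothetical, it would need an
// AdvSimd.Arm64.IsSupported guard for VectorTableLookup, and it has not been validated on
// hardware, so it is left as a comment.)
//
// unsafe static void AdvSimdImplementation(int value, Span<bool> destination)
// {
//     fixed (bool* pDestination = destination)
//     {
//         var vec = AdvSimd.DuplicateToVector128(value).AsByte();
//         var mask1Lo = Vector128.Create((byte)0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
//         var mask1Hi = Vector128.Create((byte)2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
//         var mask2 = Vector128.Create((byte)0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
//             0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
//         var one = Vector128.Create((byte)0x01);
//
//         var lo = AdvSimd.Arm64.VectorTableLookup(vec, mask1Lo);
//         var hi = AdvSimd.Arm64.VectorTableLookup(vec, mask1Hi);
//
//         // CompareTest (vtst) sets a lane to all-ones when (element & mask) != 0.
//         AdvSimd.Store((byte*)pDestination, AdvSimd.And(AdvSimd.CompareTest(lo, mask2), one));
//         AdvSimd.Store((byte*)pDestination + 16, AdvSimd.And(AdvSimd.CompareTest(hi, mask2), one));
//     }
// }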
- + // https://stackoverflow.com/questions/24225786/fastest-way-to-unpack-32-bits-to-a-32-byte-simd-vector if (Avx2.IsSupported) { @@ -64,15 +64,15 @@ unsafe static void Avx2Implementation(int value, Span destination) fixed (bool* pDestination = destination) { var mask1 = Vector256.Create( - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 ).AsByte(); var mask2 = Vector256.Create( - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 ); @@ -81,7 +81,7 @@ unsafe static void Avx2Implementation(int value, Span destination) var and = Avx2.AndNot(shuffle, mask2); var cmp = Avx2.CompareEqual(and, Vector256.Zero); var correctness = Avx2.And(cmp, Vector256.Create((byte)0x01)); - + Avx.Store((byte*)pDestination, correctness); } } @@ -103,9 +103,9 @@ unsafe static void Ssse3Implementation(int value, Span destination) var and = Sse2.AndNot(shuffle, mask2); var cmp = Sse2.CompareEqual(and, Vector128.Zero); var correctness = Sse2.And(cmp, one); - + Sse2.Store((byte*)pDestination, correctness); - + shuffle = Ssse3.Shuffle(vec, mask1Hi); and = Sse2.AndNot(shuffle, mask2); cmp = Sse2.CompareEqual(and, Vector128.Zero); diff --git a/X10D/src/Core/IntrinsicExtensions.cs b/X10D/src/Core/IntrinsicExtensions.cs index 9e78dd1d3..11283606a 100644 --- a/X10D/src/Core/IntrinsicExtensions.cs +++ b/X10D/src/Core/IntrinsicExtensions.cs @@ -5,7 +5,8 @@ namespace X10D.Core; /// -/// Extension methods for SIMD vectors, namely , and . +/// Extension methods for SIMD vectors, namely , and +/// . /// public static class IntrinsicExtensions { diff --git a/X10D/src/Core/IntrinsicUtility.cs b/X10D/src/Core/IntrinsicUtility.cs index 7493001cf..07dd852ad 100644 --- a/X10D/src/Core/IntrinsicUtility.cs +++ b/X10D/src/Core/IntrinsicUtility.cs @@ -18,7 +18,9 @@ public static class IntrinsicUtility // FOR API CONSISTENCY. /// - ///
Correcting <see cref="Vector64{T}"/> of <see cref="byte"/> into 0 and 1 depend on their boolean truthiness.
+ /// <para>
+ /// Correcting <see cref="Vector64{T}"/> of <see cref="byte"/> into 0 and 1 depending on their boolean truthiness.
+ /// </para>
/// Operation (raw):
/// /// for (int i = 0; i < 8; i++) { @@ -33,19 +35,15 @@ public static class IntrinsicUtility /// ///
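/// <example>
/// Illustrative usage (hypothetical values; the result follows the operation above):
/// <code>
/// var raw = Vector64.Create((byte)0, 1, 0, 255, 3, 0, 128, 0);
/// var bits = IntrinsicUtility.CorrectBoolean(raw);
/// // bits = <0, 1, 0, 1, 1, 0, 1, 0>
/// </code>
/// </example>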
/// Vector of byte to correct. - /// A of which remapped back to 0 and 1 based on boolean truthiness. + /// + /// A of which remapped back to 0 and 1 based on boolean truthiness. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector64 CorrectBoolean(Vector64 vector) { - if (AdvSimd.IsSupported) - { - // Haven't tested since March 6th 2023 (Reason: Unavailable hardware). - var cmp = AdvSimd.CompareEqual(vector, Vector64.Zero); - var result = AdvSimd.BitwiseSelect(cmp, vector, Vector64.Zero); - - return result; - } + // TODO: AdvSimd implementation. + // TODO: WasmSimd implementation. (?) var output = GetUninitializedVector64(); @@ -64,7 +62,9 @@ public static Vector64 CorrectBoolean(Vector64 vector) } /// - ///
Correcting <see cref="Vector128{T}"/> of <see cref="byte"/> into 0 and 1 depend on their boolean truthiness.
+ /// <para>
+ /// Correcting <see cref="Vector128{T}"/> of <see cref="byte"/> into 0 and 1 depending on their boolean truthiness.
+ /// </para>
/// Operation (raw):
/// /// for (int i = 0; i < 16; i++) { @@ -79,7 +79,9 @@ public static Vector64 CorrectBoolean(Vector64 vector) /// ///
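/// <example>
/// Illustrative usage (hypothetical values): any non-zero byte becomes 1.
/// <code>
/// var raw = Vector128.Create((byte)200).WithElement(0, (byte)0);
/// var bits = IntrinsicUtility.CorrectBoolean(raw);
/// // bits = <0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>
/// </code>
/// </example>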
/// Vector of byte to correct. - /// A of which remapped back to 0 and 1 based on boolean truthiness. + /// + /// A of which remapped back to 0 and 1 based on boolean truthiness. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector128 CorrectBoolean(Vector128 vector) @@ -91,33 +93,25 @@ public static Vector128 CorrectBoolean(Vector128 vector) return result; } - if (AdvSimd.IsSupported) - { - // Haven't tested since March 6th 2023 (Reason: Unavailable hardware). - var cmp = AdvSimd.CompareEqual(vector, Vector128.Zero); - var result = AdvSimd.BitwiseSelect(cmp, vector, Vector128.Zero); - return result; - } + // TODO: AdvSimd implementation. + // TODO: WasmSimd implementation. var output = GetUninitializedVector128(); for (int i = 0; i < Vector128.Count; i++) { - ref var writeElement = ref Unsafe.Add(ref Unsafe.As, byte>(ref output), i); -#if NET7_0_OR_GREATER - writeElement = vector[i] == 0 ? (byte)0 : (byte)1; -#else - var element = Unsafe.Add(ref Unsafe.As, byte>(ref vector), i); - writeElement = element == 0 ? (byte)0 : (byte)1; -#endif + Unsafe.Add(ref Unsafe.As, byte>(ref output), i) = + Unsafe.Add(ref Unsafe.As, byte>(ref vector), i) == 0 ? (byte)0 : (byte)1; } return output; } /// - ///
Correcting <see cref="Vector256{T}"/> of <see cref="byte"/> into 0 and 1 depend on their boolean truthiness.
+ /// <para>
+ /// Correcting <see cref="Vector256{T}"/> of <see cref="byte"/> into 0 and 1 depending on their boolean truthiness.
+ /// </para>
/// Operation (raw):
/// /// for (int i = 0; i < 32; i++) { @@ -132,7 +126,9 @@ public static Vector128 CorrectBoolean(Vector128 vector) /// ///
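/// <example>
/// Illustrative usage (hypothetical values):
/// <code>
/// var raw = Vector256.Create((byte)7).WithElement(31, (byte)0);
/// var bits = IntrinsicUtility.CorrectBoolean(raw);
/// // bits = <1, 1, ..., 1, 0> (only the zeroed last element maps to 0)
/// </code>
/// </example>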
/// Vector of byte to correct. - /// A of which remapped back to 0 and 1 based on boolean truthiness. + /// + /// A of which remapped back to 0 and 1 based on boolean truthiness. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector256 CorrectBoolean(Vector256 vector) @@ -149,20 +145,17 @@ public static Vector256 CorrectBoolean(Vector256 vector) for (int i = 0; i < Vector256.Count; i++) { - ref var writeElement = ref Unsafe.Add(ref Unsafe.As, byte>(ref output), i); -#if NET7_0_OR_GREATER - writeElement = vector[i] == 0 ? (byte)0 : (byte)1; -#else - var element = Unsafe.Add(ref Unsafe.As, byte>(ref vector), i); - writeElement = element == 0 ? (byte)0 : (byte)1; -#endif + Unsafe.Add(ref Unsafe.As, byte>(ref output), i) = + Unsafe.Add(ref Unsafe.As, byte>(ref vector), i) == 0 ? (byte)0 : (byte)1; } return output; } /// - ///
Multiply packed 64-bit unsigned integer elements in a and b and truncate the results to 64-bit integer.
+ /// <para>
+ /// Multiply packed 64-bit unsigned integer elements in lhs and rhs and truncate the results to 64-bit integers.
+ /// </para>
/// Operation:
/// /// dest[0] = lhs[0] * rhs[0]; @@ -171,7 +164,9 @@ public static Vector256 CorrectBoolean(Vector256 vector) ///
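/// <example>
/// Illustrative usage (hypothetical values); note the wrapping, truncating semantics:
/// <code>
/// var lhs = Vector128.Create(2UL, 3UL);
/// var rhs = Vector128.Create(10UL, ulong.MaxValue);
/// var product = IntrinsicUtility.Multiply(lhs, rhs);
/// // product = <20, 18446744073709551613> (3 * ulong.MaxValue wraps to ulong.MaxValue - 2)
/// </code>
/// </example>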
/// Left vector. /// Right vector. - /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// + /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// [Pure] [CLSCompliant(false)] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] @@ -191,32 +186,26 @@ public static Vector128 Multiply(Vector128 lhs, Vector128 r return Sse2.Add(high, ac); } - if (AdvSimd.IsSupported) - { - // https://stackoverflow.com/questions/60236627/facing-problem-in-implementing-multiplication-of-64-bit-variables-using-arm-neon - // Hasn't been tested since March 7th 2023 (Reason: Unavailable hardware) - var a = AdvSimd.ExtractNarrowingLower(lhs); - var b = AdvSimd.ExtractNarrowingLower(rhs); - - var mul = AdvSimd.Multiply(rhs.AsUInt32(), AdvSimd.ReverseElement32(lhs).AsUInt32()); - - return AdvSimd.MultiplyWideningLowerAndAdd(AdvSimd.ShiftLeftLogical(mul.AsUInt64(), 32), a, b); - } + // TODO: AdvSimd implementation. + // TODO: WasmSimd implementation. var output = GetUninitializedVector128(); Unsafe.As, ulong>(ref output) = Unsafe.As, ulong>(ref lhs) * Unsafe.As, ulong>(ref rhs); - Unsafe.Add(ref Unsafe.As, ulong>(ref output), 1) = - Unsafe.Add(ref Unsafe.As, ulong>(ref lhs), 1) * Unsafe.Add(ref Unsafe.As, ulong>(ref rhs), 1); + Unsafe.Add(ref Unsafe.As, ulong>(ref output), 1) = + Unsafe.Add(ref Unsafe.As, ulong>(ref lhs), 1) * + Unsafe.Add(ref Unsafe.As, ulong>(ref rhs), 1); return output; } /// - ///
Multiply packed 64-bit unsigned integer elements in a and b and truncate the results to 64-bit integer.
+ /// <para>
+ /// Multiply packed 64-bit unsigned integer elements in lhs and rhs and truncate the results to 64-bit integers.
+ /// </para>
/// Operation:
/// /// dest[0] = lhs[0] * rhs[0]; @@ -227,7 +216,9 @@ public static Vector128 Multiply(Vector128 lhs, Vector128 r ///
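/// <example>
/// Illustrative usage (hypothetical values):
/// <code>
/// var lhs = Vector256.Create(1UL, 2UL, 3UL, 4UL);
/// var rhs = Vector256.Create(10UL, 10UL, 10UL, 10UL);
/// // IntrinsicUtility.Multiply(lhs, rhs) = <10, 20, 30, 40>
/// </code>
/// </example>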
/// Left vector. /// Right vector. - /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// + /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// [Pure] [CLSCompliant(false)] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] @@ -253,14 +244,17 @@ public static Vector256 Multiply(Vector256 lhs, Vector256 r for (int i = 0; i < Vector256.Count; i++) { Unsafe.Add(ref Unsafe.As, ulong>(ref output), i) = - Unsafe.Add(ref Unsafe.As, ulong>(ref lhs), i) * Unsafe.Add(ref Unsafe.As, ulong>(ref rhs), i); + Unsafe.Add(ref Unsafe.As, ulong>(ref lhs), i) * + Unsafe.Add(ref Unsafe.As, ulong>(ref rhs), i); } return output; } /// - ///
Multiply packed 64-bit signed integer elements in a and b and truncate the results to 64-bit integer.
+ /// <para>
+ /// Multiply packed 64-bit signed integer elements in lhs and rhs and truncate the results to 64-bit integers.
+ /// </para>
/// Operation:
/// /// dest[0] = lhs[0] * rhs[0]; @@ -269,7 +263,9 @@ public static Vector256 Multiply(Vector256 lhs, Vector256 r ///
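/// <example>
/// Illustrative usage (hypothetical values); two's-complement wrap-around applies on overflow:
/// <code>
/// var lhs = Vector128.Create(-2L, 3L);
/// var rhs = Vector128.Create(5L, -7L);
/// // IntrinsicUtility.Multiply(lhs, rhs) = <-10, -21>
/// </code>
/// </example>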
/// Left vector. /// Right vector. - /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// + /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector128 Multiply(Vector128 lhs, Vector128 rhs) @@ -278,7 +274,9 @@ public static Vector128 Multiply(Vector128 lhs, Vector128 rhs) } /// - ///
Multiply packed 64-bit signed integer elements in a and b and truncate the results to 64-bit integer.
+ /// <para>
+ /// Multiply packed 64-bit signed integer elements in lhs and rhs and truncate the results to 64-bit integers.
+ /// </para>
/// Operation:
/// /// dest[0] = lhs[0] * rhs[0]; @@ -289,7 +287,9 @@ public static Vector128 Multiply(Vector128 lhs, Vector128 rhs) ///
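/// <example>
/// Illustrative usage (hypothetical values):
/// <code>
/// var lhs = Vector256.Create(-1L, 2L, -3L, 4L);
/// var rhs = Vector256.Create(8L, 8L, 8L, 8L);
/// // IntrinsicUtility.Multiply(lhs, rhs) = <-8, 16, -24, 32>
/// </code>
/// </example>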
/// Left vector. /// Right vector. - /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// + /// A of whose elements is 64-bit truncated product of lhs and rhs. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector256 Multiply(Vector256 lhs, Vector256 rhs) @@ -298,7 +298,10 @@ public static Vector256 Multiply(Vector256 lhs, Vector256 rhs) } /// - ///
Horizontally apply OR operation on adjacent pairs of single-precision (32-bit) floating-point elements in lhs and rhs.
+ /// <para>
+ /// Horizontally apply the OR operation on adjacent pairs of single-precision (32-bit) floating-point elements in
+ /// lhs and rhs.
+ /// </para>
/// Operation:
/// Left vector. /// Right vector. - /// A of with all elements is result of OR operation on adjacent pairs of elements in lhs and rhs. + /// + /// A of with all elements is result of OR operation on adjacent pairs of + /// elements in lhs and rhs. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector128 HorizontalOr(Vector128 lhs, Vector128 rhs) @@ -321,34 +327,35 @@ public static Vector128 HorizontalOr(Vector128 lhs, Vector128 output = GetUninitializedVector128(); - Unsafe.As, uint>(ref output) = - Unsafe.As, uint>(ref lhs) | Unsafe.Add(ref Unsafe.As, uint>(ref lhs), 1); + Unsafe.As, uint>(ref output) = + Unsafe.As, uint>(ref lhs) | + Unsafe.Add(ref Unsafe.As, uint>(ref lhs), 1); Unsafe.Add(ref Unsafe.As, uint>(ref output), 1) = - Unsafe.Add(ref Unsafe.As, uint>(ref lhs), 2) | Unsafe.Add(ref Unsafe.As, uint>(ref lhs), 3); + Unsafe.Add(ref Unsafe.As, uint>(ref lhs), 2) | + Unsafe.Add(ref Unsafe.As, uint>(ref lhs), 3); Unsafe.Add(ref Unsafe.As, uint>(ref output), 2) = - Unsafe.As, uint>(ref rhs) | Unsafe.Add(ref Unsafe.As, uint>(ref rhs), 1); + Unsafe.As, uint>(ref rhs) | + Unsafe.Add(ref Unsafe.As, uint>(ref rhs), 1); Unsafe.Add(ref Unsafe.As, uint>(ref output), 3) = - Unsafe.Add(ref Unsafe.As, uint>(ref rhs), 2) | Unsafe.Add(ref Unsafe.As, uint>(ref rhs), 3); + Unsafe.Add(ref Unsafe.As, uint>(ref rhs), 2) | + Unsafe.Add(ref Unsafe.As, uint>(ref rhs), 3); return output; } /// - ///
Horizontally apply OR operation on adjacent pairs of 32-bit integer elements in lhs and rhs.
+ /// <para>
+ /// Horizontally apply the OR operation on adjacent pairs of 32-bit integer elements in lhs and rhs.
+ /// </para>
/// Operation:
/// /// dest[0] = lhs[0] | lhs[1]; @@ -359,9 +366,10 @@ public static Vector128 HorizontalOr(Vector128 lhs, Vector128 /// Left vector. /// Right vector. - /// A of with all elements is result of OR operation on adjacent pairs of elements in lhs and rhs. - /// API avaliable on SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, ARM64 NEON (untested) hardwares. - /// Hardware doesn't support ARM64 NEON or SSE instruction set. + /// + /// A of with all elements is result of OR operation on adjacent pairs of + /// elements in lhs and rhs. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static Vector128 HorizontalOr(Vector128 lhs, Vector128 rhs) @@ -370,7 +378,9 @@ public static Vector128 HorizontalOr(Vector128 lhs, Vector128 rhs } /// - ///
Horizontally apply OR operation on adjacent pairs of 32-bit unsigned integer elements in lhs and rhs.
+ /// <para>
+ /// Horizontally apply the OR operation on adjacent pairs of 32-bit unsigned integer elements in lhs and rhs.
+ /// </para>
/// Operation:
/// /// dest[0] = lhs[0] | lhs[1]; @@ -381,9 +391,10 @@ public static Vector128 HorizontalOr(Vector128 lhs, Vector128 rhs ///
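/// <example>
/// Illustrative usage (hypothetical values; pairs within each operand are OR'd together):
/// <code>
/// var lhs = Vector128.Create(0x1u, 0x2u, 0x4u, 0x8u);
/// var rhs = Vector128.Create(0x10u, 0x20u, 0x40u, 0x80u);
/// // IntrinsicUtility.HorizontalOr(lhs, rhs) = <0x3, 0xC, 0x30, 0xC0>
/// </code>
/// </example>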
/// Left vector. /// Right vector. - /// A of with all elements is result of OR operation on adjacent pairs of elements in lhs and rhs. - /// API avaliable on SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, ARM64 NEON (untested) hardwares. - /// Hardware doesn't support ARM64 NEON or SSE2 instruction set. + /// + /// A of with all elements is result of OR operation on adjacent pairs of + /// elements in lhs and rhs. + /// [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] [CLSCompliant(false)] @@ -402,9 +413,10 @@ public static Vector128 HorizontalOr(Vector128 lhs, Vector128 /// ///
/// Input vector. - /// A of with elements the same as input vector except their positions/indices are reversed. - /// API available on SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 hardwares. - /// Hardware doesn't support SSE2 instruction set. + /// + /// A of with elements the same as input vector except their positions + /// (or indices) are reversed. + /// [Pure] [CLSCompliant(false)] [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] diff --git a/X10D/src/Core/SpanExtensions.cs b/X10D/src/Core/SpanExtensions.cs index 68bd05af7..e4ecf0799 100644 --- a/X10D/src/Core/SpanExtensions.cs +++ b/X10D/src/Core/SpanExtensions.cs @@ -72,13 +72,14 @@ private static Vector256 IntegerPackingMagicV256 public static bool Contains(this ReadOnlySpan span, T value) where T : struct, Enum { #if NET6_0_OR_GREATER - // Use MemoryMarshal.CreateSpan instead of using creating new Span instance from pointer will trim down a lot of instructions - // on Release mode. - // https://sharplab.io/#v2:EYLgxg9gTgpgtADwGwBYA0AXEBDAzgWwB8ABABgAJiBGAOgCUBXAOwwEt8YaBJFmKCAA4BlPgDdWYGLgDcAWABQZSrUYt2nAMIR8A1gBs+IqOMkyFxAExVzFIQAtsUAQBlsweszYc588wGZyGCYGfHIAFSkMAFFg0JByVhZyAG8FcnTyAEE0cgAhHI0cgBE0BQBfBX9KC3INFLSMgG0AKVYMAHEgvgkACgwATwEYCAAzHojcaNiASmmAXQb0xoBZGAw7CAATLh09HtX1rZ2BPQB5ATYIJlwaTIBzO9hcXFZRGB49RMS78kJyA4221250u11uDyeLzeIPYrAAXthQfNFpQAtQkORmLhsCMYORgBAIHp/mtAVQADxhAB8PSEAmwTEpVPIuHpTByYXIomwegYMGm5AA7nY+HjOfEYiF6vIMrLyLARgkkkEQrhyABeeUwRUAVWuOM4mVwlJyiQwNIVJPw0H6y0cuAcehonQwdG1oqYkh6rIZsx8coyxAA7FabXaoA6eTQNLBETA6QyepaVfhcDkfUwaM4gnd1tNo1cMNhErgenrsbjbsawqaWBbtVyeXy/SiKjKMiiWm1OkxumA+oNhmMJlMQrMFu2lgCjrt9qSZycYVcbvdHlIoe8mJ8mN9fiTDkDFxdWMvwWvnq8YDD8PDESemMjJ6jlBisQb8YTidPNhYmbS2UyLJshyja8vyQoirA4TkBKsTSgG6TBuQvaCuQCaMmaNLlgaVYAAoQGafBJg2qzWlAtr2o6zprG6uKwJ6MDemyszpmyWY5nmBYsMW1xlvqlZGiaSrmsRircmBLZPm2ZRAA=== + // Use MemoryMarshal.CreateSpan instead of using creating new Span instance from pointer will trim down a lot of + // instructions on Release mode. - // Also use reference instead of MemoryMarshal.Cast to remove boundary check (or something, it just result in something like that). + // Also use reference instead of MemoryMarshal.Cast to remove boundary check (or something, it just result in something + // like that). - // TODO: Figure out some kind of way to directly pass the Span directly into Contains call, which make method smaller and more prone to inlining... + // TODO: Figure out some kind of way to directly pass the Span directly into Contains call, which make method smaller and + // more prone to inlining... unsafe { #pragma warning disable CS8500 // This takes the address of, gets the size of, or declares a pointer to a managed type @@ -176,6 +177,10 @@ public static unsafe byte PackByte(this ReadOnlySpan source) return unchecked((byte)(IntegerPackingMagic * correct.AsUInt64().GetElement(0) >> 56)); } + + // Probably should remove this piece of code because it is untested, but I see no reason why it should fail + // unless vld1_u8 reverse positions of 8 bytes for some reason. + if (AdvSimd.IsSupported) { // Hasn't been tested since March 6th 2023 (Reason: Unavailable hardware). @@ -240,12 +245,12 @@ public static unsafe short PackInt16(this ReadOnlySpan source) goto default; } - fixed (bool* pSource = source) - { - // TODO: .NET 8.0 Wasm support. - // TODO: Implement a replacement for UInt64 vector multiplication (there are no instruction for this built-in). + // TODO: AdvSimd implementation. + // TODO: WasmSimd implementation. 
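// For reference, a sketch of the NEON path removed below; it mirrors the SSE2 branch and
// could be restored once it can be validated on real hardware:
//
// if (AdvSimd.IsSupported)
// {
//     fixed (bool* pSource = source)
//     {
//         var load = AdvSimd.LoadVector128((byte*)pSource);
//         var correct = IntrinsicUtility.CorrectBoolean(load).AsUInt64();
//         var multiply = IntrinsicUtility.Multiply(IntegerPackingMagicV128, correct);
//         var shift = AdvSimd.ShiftRightLogical(multiply, 56);
//
//         return (short)(shift.GetElement(0) | (shift.GetElement(1) << 8));
//     }
// }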
- if (Sse2.IsSupported) + if (Sse2.IsSupported) + { + fixed (bool* pSource = source) { var load = Sse2.LoadVector128((byte*)pSource); var correct = IntrinsicUtility.CorrectBoolean(load).AsUInt64(); @@ -254,21 +259,9 @@ public static unsafe short PackInt16(this ReadOnlySpan source) return (short)(shift.GetElement(0) | (shift.GetElement(1) << 8)); } - if (AdvSimd.IsSupported) - { - // Hasn't been tested since March 6th 2023 (Reason: Unavailable hardware). - var load = AdvSimd.LoadVector128((byte*)pSource); - var correct = IntrinsicUtility.CorrectBoolean(load).AsUInt64(); - var multiply = IntrinsicUtility.Multiply(IntegerPackingMagicV128, correct); - var shift = AdvSimd.ShiftRightLogical(multiply, 56); - - return (short)(shift.GetElement(0) | (shift.GetElement(1) << 8)); - } - else - { - goto default; - } } + + goto default; #endif default: @@ -324,9 +317,6 @@ public static unsafe int PackInt32(this ReadOnlySpan source) fixed (bool* pSource = source) { - // TODO: .NET 8.0 Wasm support. - // TODO: Implement a replacement for UInt64 vector multiplication (there are no instruction for this built-in). - if (Avx2.IsSupported) { var load = Avx.LoadVector256((byte*)pSource);