From b6d0eec3de0ac75ce81784251b4648a1cef7f628 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 8 Dec 2021 16:44:21 -0800
Subject: [PATCH 01/18] Wrap bitshifts in ops.rs

For all other operators, we use wrapping logic where applicable.
This is another case it applies. Per rust-lang/rust#91237, we may
wish to specify this as the natural behavior of `simd_{shl,shr}`.
---
 crates/core_simd/src/ops.rs | 168 +++++++++++++++++++++++-------------
 1 file changed, 109 insertions(+), 59 deletions(-)
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 3582c57870b9e..2ebcef3d82976 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -32,14 +32,115 @@ where
     }
 }
 
-/// Checks if the right-hand side argument of a left- or right-shift would cause overflow.
-fn invalid_shift_rhs<T>(rhs: T) -> bool
-where
-    T: Default + PartialOrd + core::convert::TryFrom<usize>,
-    <T as core::convert::TryFrom<usize>>::Error: core::fmt::Debug,
-{
-    let bits_in_type = T::try_from(8 * core::mem::size_of::<T>()).unwrap();
-    rhs < T::default() || rhs >= bits_in_type
+/// SAFETY: This macro should not be used for anything except Shl or Shr, and passed the appropriate shift intrinsic.
+/// It handles performing a bitand in addition to calling the shift operator, so that the result
+/// is well-defined: LLVM can return a poison value if you shl, lshr, or ashr if rhs >= <Int>::BITS
+/// At worst, this will maybe add another instruction and cycle,
+/// at best, it may open up more optimization opportunities,
+/// or simply be elided entirely, especially for SIMD ISAs which default to this.
+///
+// FIXME: Consider implementing this in cg_llvm instead?
+// cg_clif defaults to this, and scalar MIR shifts also default to wrapping
+macro_rules! wrap_bitshift_inner {
+    (impl<const LANES: usize> $op:ident for Simd<$int:ty, LANES> {
+        fn $call:ident(self, rhs: Self) -> Self::Output {
+            unsafe { $simd_call:ident }
+        }
+    }) => {
+        impl<const LANES: usize> $op for Simd<$int, LANES>
+        where
+            $int: SimdElement,
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            type Output = Self;
+
+            #[inline]
+            #[must_use = "operator returns a new vector without mutating the inputs"]
+            fn $call(self, rhs: Self) -> Self::Output {
+                unsafe {
+                    $crate::intrinsics::$simd_call(self, rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)))
+                }
+            }
+        }
+    };
+}
+
+macro_rules! wrap_bitshifts {
+    ($(impl<const LANES: usize> ShiftOps for Simd<$int:ty, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+     })*) => {
+        $(
+            wrap_bitshift_inner! {
+                impl<const LANES: usize> Shl for Simd<$int, LANES> {
+                    fn shl(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_shl }
+                    }
+                }
+            }
+            wrap_bitshift_inner! {
+                impl<const LANES: usize> Shr for Simd<$int, LANES> {
+                    fn shr(self, rhs: Self) -> Self::Output {
+                        // This automatically monomorphizes to lshr or ashr, depending,
+                        // so it's fine to use it for both UInts and SInts.
+                        unsafe { simd_shr }
+                    }
+                }
+            }
+        )*
+    };
+}
+
+wrap_bitshifts! {
+    impl<const LANES: usize> ShiftOps for Simd<i8, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<i16, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<i32, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<i64, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<isize, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<u8, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<u16, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<u32, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<u64, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> ShiftOps for Simd<usize, LANES> {
+        fn shl(self, rhs: Self) -> Self::Output;
+        fn shr(self, rhs: Self) -> Self::Output;
+    }
 }
 
 /// Automatically implements operators over references in addition to the provided operator.
@@ -85,12 +186,6 @@ macro_rules! impl_op {
     { impl Rem for $scalar:ty } => {
         impl_op! { @binary $scalar, Rem::rem, simd_rem }
     };
-    { impl Shl for $scalar:ty } => {
-        impl_op! { @binary $scalar, Shl::shl, simd_shl }
-    };
-    { impl Shr for $scalar:ty } => {
-        impl_op! { @binary $scalar, Shr::shr, simd_shr }
-    };
     { impl BitAnd for $scalar:ty } => {
         impl_op! { @binary $scalar, BitAnd::bitand, simd_and }
     };
@@ -202,51 +297,6 @@ macro_rules! impl_unsigned_int_ops {
                     }
                 }
             }
-
-            // shifts panic on overflow
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Shl<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn shl(self, rhs: Self) -> Self::Output {
-                        // TODO there is probably a better way of doing this
-                        if rhs.as_array()
-                            .iter()
-                            .copied()
-                            .any(invalid_shift_rhs)
-                        {
-                            panic!("attempt to shift left with overflow");
-                        }
-                        unsafe { intrinsics::simd_shl(self, rhs) }
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Shr<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn shr(self, rhs: Self) -> Self::Output {
-                        // TODO there is probably a better way of doing this
-                        if rhs.as_array()
-                            .iter()
-                            .copied()
-                            .any(invalid_shift_rhs)
-                        {
-                            panic!("attempt to shift with overflow");
-                        }
-                        unsafe { intrinsics::simd_shr(self, rhs) }
-                    }
-                }
-            }
         )*
     };
 }

From 8aef340b8b0658e34b54fdea59e5ffc5ec581106 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 8 Dec 2021 17:23:54 -0800
Subject: [PATCH 02/18] Refactor bitops with `#[must_use]`

---
 crates/core_simd/src/ops.rs | 131 +++++++++++++++++++++++++++---------
 1 file changed, 98 insertions(+), 33 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 2ebcef3d82976..5e775d6ca138b 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -32,6 +32,29 @@ where
     }
 }
 
+macro_rules! unsafe_base_op {
+    ($(impl<const LANES: usize> $op:ident for Simd<$scalar:ty, LANES> {
+        fn $call:ident(self, rhs: Self) -> Self::Output {
+            unsafe{ $simd_call:ident }
+        }
+    })*) => {
+        $(impl<const LANES: usize> $op for Simd<$scalar, LANES>
+            where
+                $scalar: SimdElement,
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = Self;
+
+                #[inline]
+                #[must_use = "operator returns a new vector without mutating the inputs"]
+                fn $call(self, rhs: Self) -> Self::Output {
+                    unsafe { $crate::intrinsics::$simd_call(self, rhs) }
+                }
+            }
+        )*
+    }
+}
+
 /// SAFETY: This macro should not be used for anything except Shl or Shr, and passed the appropriate shift intrinsic.
 /// It handles performing a bitand in addition to calling the shift operator, so that the result
 /// is well-defined: LLVM can return a poison value if you shl, lshr, or ashr if rhs >= <Int>::BITS
@@ -41,13 +64,13 @@ where
 ///
 // FIXME: Consider implementing this in cg_llvm instead?
 // cg_clif defaults to this, and scalar MIR shifts also default to wrapping
-macro_rules! wrap_bitshift_inner {
-    (impl<const LANES: usize> $op:ident for Simd<$int:ty, LANES> {
+macro_rules! wrap_bitshift {
+    ($(impl<const LANES: usize> $op:ident for Simd<$int:ty, LANES> {
         fn $call:ident(self, rhs: Self) -> Self::Output {
             unsafe { $simd_call:ident }
         }
-    }) => {
-        impl<const LANES: usize> $op for Simd<$int, LANES>
+    })*) => {
+        $(impl<const LANES: usize> $op for Simd<$int, LANES>
         where
             $int: SimdElement,
             LaneCount<LANES>: SupportedLaneCount,
@@ -61,24 +84,45 @@ macro_rules! wrap_bitshift_inner {
                     $crate::intrinsics::$simd_call(self, rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)))
                 }
             }
-        }
+        })*
     };
 }
 
-macro_rules! wrap_bitshifts {
-    ($(impl<const LANES: usize> ShiftOps for Simd<$int:ty, LANES> {
+macro_rules! bitops {
+    ($(impl<const LANES: usize> BitOps for Simd<$int:ty, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
      })*) => {
         $(
-            wrap_bitshift_inner! {
+            unsafe_base_op!{
+                impl<const LANES: usize> BitAnd for Simd<$int, LANES> {
+                    fn bitand(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_and }
+                    }
+                }
+
+                impl<const LANES: usize> BitOr for Simd<$int, LANES> {
+                    fn bitor(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_or }
+                    }
+                }
+
+                impl<const LANES: usize> BitXor for Simd<$int, LANES> {
+                    fn bitxor(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_xor }
+                    }
+                }
+            }
+            wrap_bitshift! {
                 impl<const LANES: usize> Shl for Simd<$int, LANES> {
                     fn shl(self, rhs: Self) -> Self::Output {
                         unsafe { simd_shl }
                     }
                 }
-            }
-            wrap_bitshift_inner! {
+
                 impl<const LANES: usize> Shr for Simd<$int, LANES> {
                     fn shr(self, rhs: Self) -> Self::Output {
                         // This automatically monomorphizes to lshr or ashr, depending,
@@ -91,53 +135,86 @@ macro_rules! wrap_bitshifts {
     };
 }
 
-wrap_bitshifts! {
-    impl<const LANES: usize> ShiftOps for Simd<i8, LANES> {
+// Integers can always accept bitand, bitor, and bitxor.
+// The only question is how to handle shifts >= <Int>::BITS?
+// Our current solution uses wrapping logic.
+bitops! {
+    impl<const LANES: usize> BitOps for Simd<i8, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<i16, LANES> {
+    impl<const LANES: usize> BitOps for Simd<i16, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<i32, LANES> {
+    impl<const LANES: usize> BitOps for Simd<i32, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<i64, LANES> {
+    impl<const LANES: usize> BitOps for Simd<i64, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<isize, LANES> {
+    impl<const LANES: usize> BitOps for Simd<isize, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<u8, LANES> {
+    impl<const LANES: usize> BitOps for Simd<u8, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<u16, LANES> {
+    impl<const LANES: usize> BitOps for Simd<u16, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<u32, LANES> {
+    impl<const LANES: usize> BitOps for Simd<u32, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<u64, LANES> {
+    impl<const LANES: usize> BitOps for Simd<u64, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
 
-    impl<const LANES: usize> ShiftOps for Simd<usize, LANES> {
+    impl<const LANES: usize> BitOps for Simd<usize, LANES> {
+        fn bitand(self, rhs: Self) -> Self::Output;
+        fn bitor(self, rhs: Self) -> Self::Output;
+        fn bitxor(self, rhs: Self) -> Self::Output;
         fn shl(self, rhs: Self) -> Self::Output;
         fn shr(self, rhs: Self) -> Self::Output;
     }
@@ -186,15 +263,6 @@ macro_rules! impl_op {
     { impl Rem for $scalar:ty } => {
         impl_op! { @binary $scalar, Rem::rem, simd_rem }
     };
-    { impl BitAnd for $scalar:ty } => {
-        impl_op! { @binary $scalar, BitAnd::bitand, simd_and }
-    };
-    { impl BitOr for $scalar:ty } => {
-        impl_op! { @binary $scalar, BitOr::bitor, simd_or }
-    };
-    { impl BitXor for $scalar:ty } => {
-        impl_op! { @binary $scalar, BitXor::bitxor, simd_xor }
-    };
 
     // generic binary op with assignment when output is `Self`
     { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $intrinsic:ident } => {
@@ -236,9 +304,6 @@ macro_rules! impl_unsigned_int_ops {
             impl_op! { impl Add for $scalar }
             impl_op! { impl Sub for $scalar }
             impl_op! { impl Mul for $scalar }
-            impl_op! { impl BitAnd for $scalar }
-            impl_op! { impl BitOr  for $scalar }
-            impl_op! { impl BitXor for $scalar }
 
             // Integers panic on divide by 0
             impl_ref_ops! {

From 049e8ca7f7fc42501b98afcb9c32fd51080bd75a Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 8 Dec 2021 17:31:19 -0800
Subject: [PATCH 03/18] Refactor float arith with `#[must_use]`

---
 crates/core_simd/src/ops.rs | 78 ++++++++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 14 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 5e775d6ca138b..65b461d39818b 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -220,6 +220,70 @@ bitops! {
     }
 }
 
+macro_rules! float_arith {
+    ($(impl<const LANES: usize> FloatArith for Simd<$float:ty, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+     })*) => {
+        $(
+            unsafe_base_op!{
+                impl<const LANES: usize> Add for Simd<$float, LANES> {
+                    fn add(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_add }
+                    }
+                }
+
+                impl<const LANES: usize> Mul for Simd<$float, LANES> {
+                    fn mul(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_mul }
+                    }
+                }
+
+                impl<const LANES: usize> Sub for Simd<$float, LANES> {
+                    fn sub(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_sub }
+                    }
+                }
+
+                impl<const LANES: usize> Div for Simd<$float, LANES> {
+                    fn div(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_div }
+                    }
+                }
+
+                impl<const LANES: usize> Rem for Simd<$float, LANES> {
+                    fn rem(self, rhs: Self) -> Self::Output {
+                        unsafe { simd_rem }
+                    }
+                }
+            }
+        )*
+    };
+}
+
+// We don't need any special precautions here:
+// Floats always accept arithmetic ops, but may become NaN.
+float_arith! {
+    impl<const LANES: usize> FloatArith for Simd<f32, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> FloatArith for Simd<f64, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+}
+
 /// Automatically implements operators over references in addition to the provided operator.
 macro_rules! impl_ref_ops {
     // binary op
@@ -284,19 +348,6 @@ macro_rules! impl_op {
     };
 }
 
-/// Implements floating-point operators for the provided types.
-macro_rules! impl_float_ops {
-    { $($scalar:ty),* } => {
-        $(
-            impl_op! { impl Add for $scalar }
-            impl_op! { impl Sub for $scalar }
-            impl_op! { impl Mul for $scalar }
-            impl_op! { impl Div for $scalar }
-            impl_op! { impl Rem for $scalar }
-        )*
-    };
-}
-
 /// Implements unsigned integer operators for the provided types.
 macro_rules! impl_unsigned_int_ops {
     { $($scalar:ty),* } => {
@@ -375,4 +426,3 @@ macro_rules! impl_signed_int_ops {
 
 impl_unsigned_int_ops! { u8, u16, u32, u64, usize }
 impl_signed_int_ops! { i8, i16, i32, i64, isize }
-impl_float_ops! { f32, f64 }

From 5dcd397f47a17aec3b049af2d7541530b859e47b Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 8 Dec 2021 18:42:06 -0800
Subject: [PATCH 04/18] Finish refactoring ints in ops.rs

This should perform a SIMD check for whether or not we can div/rem,
so that we can panic several times faster!
---
 crates/core_simd/src/ops.rs | 271 +++++++++++++++++++-----------------
 1 file changed, 147 insertions(+), 124 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 65b461d39818b..e6d7e695391cf 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -1,5 +1,4 @@
-use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use crate::simd::{LaneCount, Mask, Simd, SimdElement, SupportedLaneCount};
 use core::ops::{Add, Mul};
 use core::ops::{BitAnd, BitOr, BitXor};
 use core::ops::{Div, Rem, Sub};
@@ -284,145 +283,169 @@ float_arith! {
     }
 }
 
-/// Automatically implements operators over references in addition to the provided operator.
-macro_rules! impl_ref_ops {
-    // binary op
-    {
-        impl<const $lanes:ident: usize> core::ops::$trait:ident<$rhs:ty> for $type:ty
-        where
-            LaneCount<$lanes2:ident>: SupportedLaneCount,
-        {
-            type Output = $output:ty;
-
-            $(#[$attrs:meta])*
-            fn $fn:ident($self_tok:ident, $rhs_arg:ident: $rhs_arg_ty:ty) -> Self::Output $body:tt
+// Division by zero is poison, according to LLVM.
+// So is dividing the MIN value of a signed integer by -1,
+// since that would return MAX + 1.
+// FIXME: Rust allows <SInt>::MIN / -1,
+// so we should probably figure out how to make that safe.
+macro_rules! int_divrem_guard {
+    ($(impl<const LANES: usize> $op:ident for Simd<$sint:ty, LANES> {
+        const PANIC_ZERO: &'static str = $zero:literal;
+        const PANIC_OVERFLOW: &'static str = $overflow:literal;
+        fn $call:ident {
+            unsafe { $simd_call:ident }
         }
-    } => {
-        impl<const $lanes: usize> core::ops::$trait<$rhs> for $type
+    })*) => {
+        $(impl<const LANES: usize> $op for Simd<$sint, LANES>
         where
-            LaneCount<$lanes2>: SupportedLaneCount,
+            $sint: SimdElement,
+            LaneCount<LANES>: SupportedLaneCount,
         {
-            type Output = $output;
-
-            $(#[$attrs])*
-            fn $fn($self_tok, $rhs_arg: $rhs_arg_ty) -> Self::Output $body
-        }
+            type Output = Self;
+            #[inline]
+            #[must_use = "operator returns a new vector without mutating the inputs"]
+            fn $call(self, rhs: Self) -> Self::Output {
+                if rhs.lanes_eq(Simd::splat(0)).any() {
+                    panic!("attempt to calculate the remainder with a divisor of zero");
+                } else if <$sint>::MIN != 0 && self.lanes_eq(Simd::splat(<$sint>::MIN)) & rhs.lanes_eq(Simd::splat(-1 as _))
+                    != Mask::splat(false)
+                 {
+                    panic!("attempt to calculate the remainder with overflow");
+                } else {
+                    unsafe { $crate::intrinsics::$simd_call(self, rhs) }
+                 }
+             }
+        })*
     };
 }
 
-/// Automatically implements operators over vectors and scalars for a particular vector.
-macro_rules! impl_op {
-    { impl Add for $scalar:ty } => {
-        impl_op! { @binary $scalar, Add::add, simd_add }
-    };
-    { impl Sub for $scalar:ty } => {
-        impl_op! { @binary $scalar, Sub::sub, simd_sub }
-    };
-    { impl Mul for $scalar:ty } => {
-        impl_op! { @binary $scalar, Mul::mul, simd_mul }
-    };
-    { impl Div for $scalar:ty } => {
-        impl_op! { @binary $scalar, Div::div, simd_div }
-    };
-    { impl Rem for $scalar:ty } => {
-        impl_op! { @binary $scalar, Rem::rem, simd_rem }
-    };
+macro_rules! int_arith {
+    ($(impl<const LANES: usize> IntArith for Simd<$sint:ty, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    })*) => {
+        $(
+        unsafe_base_op!{
+            impl<const LANES: usize> Add for Simd<$sint, LANES> {
+                fn add(self, rhs: Self) -> Self::Output {
+                    unsafe { simd_add }
+                }
+            }
 
-    // generic binary op with assignment when output is `Self`
-    { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $intrinsic:ident } => {
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::$trait<Self> for Simd<$scalar, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                type Output = Self;
+            impl<const LANES: usize> Mul for Simd<$sint, LANES> {
+                fn mul(self, rhs: Self) -> Self::Output {
+                    unsafe { simd_mul }
+                }
+            }
 
-                #[inline]
-                fn $trait_fn(self, rhs: Self) -> Self::Output {
-                    unsafe {
-                        intrinsics::$intrinsic(self, rhs)
-                    }
+            impl<const LANES: usize> Sub for Simd<$sint, LANES> {
+                fn sub(self, rhs: Self) -> Self::Output {
+                    unsafe { simd_sub }
                 }
             }
         }
-    };
-}
 
-/// Implements unsigned integer operators for the provided types.
-macro_rules! impl_unsigned_int_ops {
-    { $($scalar:ty),* } => {
-        $(
-            impl_op! { impl Add for $scalar }
-            impl_op! { impl Sub for $scalar }
-            impl_op! { impl Mul for $scalar }
-
-            // Integers panic on divide by 0
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Div<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn div(self, rhs: Self) -> Self::Output {
-                        if rhs.as_array()
-                            .iter()
-                            .any(|x| *x == 0)
-                        {
-                            panic!("attempt to divide by zero");
-                        }
-
-                        // Guards for div(MIN, -1),
-                        // this check only applies to signed ints
-                        if <$scalar>::MIN != 0 && self.as_array().iter()
-                                .zip(rhs.as_array().iter())
-                                .any(|(x,y)| *x == <$scalar>::MIN && *y == -1 as _) {
-                            panic!("attempt to divide with overflow");
-                        }
-                        unsafe { intrinsics::simd_div(self, rhs) }
-                    }
+        int_divrem_guard!{
+            impl<const LANES: usize> Div for Simd<$sint, LANES> {
+                const PANIC_ZERO: &'static str = "attempt to divide by zero";
+                const PANIC_OVERFLOW: &'static str = "attempt to divide with overflow";
+                fn div {
+                    unsafe { simd_div }
                 }
             }
 
-            // remainder panics on zero divisor
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Rem<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn rem(self, rhs: Self) -> Self::Output {
-                        if rhs.as_array()
-                            .iter()
-                            .any(|x| *x == 0)
-                        {
-                            panic!("attempt to calculate the remainder with a divisor of zero");
-                        }
-
-                        // Guards for rem(MIN, -1)
-                        // this branch applies the check only to signed ints
-                        if <$scalar>::MIN != 0 && self.as_array().iter()
-                                .zip(rhs.as_array().iter())
-                                .any(|(x,y)| *x == <$scalar>::MIN && *y == -1 as _) {
-                            panic!("attempt to calculate the remainder with overflow");
-                        }
-                        unsafe { intrinsics::simd_rem(self, rhs) }
-                    }
+            impl<const LANES: usize> Rem for Simd<$sint, LANES> {
+                const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
+                const PANIC_OVERFLOW: &'static str = "attempt to calculate the remainder with overflow";
+                fn rem {
+                    unsafe { simd_rem }
                 }
             }
-        )*
-    };
+        })*
+    }
 }
 
-/// Implements unsigned integer operators for the provided types.
-macro_rules! impl_signed_int_ops {
-    { $($scalar:ty),* } => {
-        impl_unsigned_int_ops! { $($scalar),* }
-    };
-}
+int_arith! {
+    impl<const LANES: usize> IntArith for Simd<i8, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
 
-impl_unsigned_int_ops! { u8, u16, u32, u64, usize }
-impl_signed_int_ops! { i8, i16, i32, i64, isize }
+    impl<const LANES: usize> IntArith for Simd<i16, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<i32, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<i64, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<isize, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<u8, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<u16, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<u32, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<u64, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+
+    impl<const LANES: usize> IntArith for Simd<usize, LANES> {
+        fn add(self, rhs: Self) -> Self::Output;
+        fn mul(self, rhs: Self) -> Self::Output;
+        fn sub(self, rhs: Self) -> Self::Output;
+        fn div(self, rhs: Self) -> Self::Output;
+        fn rem(self, rhs: Self) -> Self::Output;
+    }
+}

From bc326a2bbccdccb321328e7a1cde3ad3734a5953 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 21 Dec 2021 18:28:57 -0800
Subject: [PATCH 05/18] Refactor ops.rs with a recursive macro

This approaches reducing macro nesting in a slightly different way.
Instead of just flattening details, make one macro apply another.
This allows specifying all details up-front in the first macro
invocation, making it easier to audit and refactor in the future.
---
 crates/core_simd/src/ops.rs | 508 +++++++++++-------------------------
 1 file changed, 147 insertions(+), 361 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index e6d7e695391cf..6cfc8f80b53c7 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -31,27 +31,10 @@ where
     }
 }
 
-macro_rules! unsafe_base_op {
-    ($(impl<const LANES: usize> $op:ident for Simd<$scalar:ty, LANES> {
-        fn $call:ident(self, rhs: Self) -> Self::Output {
-            unsafe{ $simd_call:ident }
-        }
-    })*) => {
-        $(impl<const LANES: usize> $op for Simd<$scalar, LANES>
-            where
-                $scalar: SimdElement,
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                type Output = Self;
-
-                #[inline]
-                #[must_use = "operator returns a new vector without mutating the inputs"]
-                fn $call(self, rhs: Self) -> Self::Output {
-                    unsafe { $crate::intrinsics::$simd_call(self, rhs) }
-                }
-            }
-        )*
-    }
+macro_rules! unsafe_base {
+    ($lhs:ident, $rhs:ident, {$simd_call:ident}, $($_:tt)*) => {
+        unsafe { $crate::intrinsics::$simd_call($lhs, $rhs) }
+    };
 }
 
 /// SAFETY: This macro should not be used for anything except Shl or Shr, and passed the appropriate shift intrinsic.
@@ -64,388 +47,191 @@ macro_rules! unsafe_base_op {
 // FIXME: Consider implementing this in cg_llvm instead?
 // cg_clif defaults to this, and scalar MIR shifts also default to wrapping
 macro_rules! wrap_bitshift {
-    ($(impl<const LANES: usize> $op:ident for Simd<$int:ty, LANES> {
-        fn $call:ident(self, rhs: Self) -> Self::Output {
-            unsafe { $simd_call:ident }
+    ($lhs:ident, $rhs:ident, {$simd_call:ident}, $int:ident) => {
+        unsafe {
+            $crate::intrinsics::$simd_call($lhs, $rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)))
         }
-    })*) => {
-        $(impl<const LANES: usize> $op for Simd<$int, LANES>
-        where
-            $int: SimdElement,
-            LaneCount<LANES>: SupportedLaneCount,
-        {
-            type Output = Self;
-
-            #[inline]
-            #[must_use = "operator returns a new vector without mutating the inputs"]
-            fn $call(self, rhs: Self) -> Self::Output {
-                unsafe {
-                    $crate::intrinsics::$simd_call(self, rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)))
-                }
-            }
-        })*
     };
 }
 
-macro_rules! bitops {
-    ($(impl<const LANES: usize> BitOps for Simd<$int:ty, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
-     })*) => {
-        $(
-            unsafe_base_op!{
-                impl<const LANES: usize> BitAnd for Simd<$int, LANES> {
-                    fn bitand(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_and }
-                    }
-                }
-
-                impl<const LANES: usize> BitOr for Simd<$int, LANES> {
-                    fn bitor(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_or }
-                    }
-                }
-
-                impl<const LANES: usize> BitXor for Simd<$int, LANES> {
-                    fn bitxor(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_xor }
-                    }
-                }
-            }
-            wrap_bitshift! {
-                impl<const LANES: usize> Shl for Simd<$int, LANES> {
-                    fn shl(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_shl }
-                    }
-                }
-
-                impl<const LANES: usize> Shr for Simd<$int, LANES> {
-                    fn shr(self, rhs: Self) -> Self::Output {
-                        // This automatically monomorphizes to lshr or ashr, depending,
-                        // so it's fine to use it for both UInts and SInts.
-                        unsafe { simd_shr }
-                    }
-                }
-            }
-        )*
+// Division by zero is poison, according to LLVM.
+// So is dividing the MIN value of a signed integer by -1,
+// since that would return MAX + 1.
+// FIXME: Rust allows <SInt>::MIN / -1,
+// so we should probably figure out how to make that safe.
+macro_rules! int_divrem_guard {
+    (   $lhs:ident,
+        $rhs:ident,
+        {   const PANIC_ZERO: &'static str = $zero:literal;
+            const PANIC_OVERFLOW: &'static str = $overflow:literal;
+            $simd_call:ident
+        },
+        $int:ident ) => {
+        if $rhs.lanes_eq(Simd::splat(0)).any() {
+            panic!($zero);
+        } else if <$int>::MIN != 0
+            && $lhs.lanes_eq(Simd::splat(<$int>::MIN)) & $rhs.lanes_eq(Simd::splat(-1 as _))
+                != Mask::splat(false)
+        {
+            panic!($overflow);
+        } else {
+            unsafe { $crate::intrinsics::$simd_call($lhs, $rhs) }
+        }
     };
 }
 
-// Integers can always accept bitand, bitor, and bitxor.
-// The only question is how to handle shifts >= <Int>::BITS?
-// Our current solution uses wrapping logic.
-bitops! {
-    impl<const LANES: usize> BitOps for Simd<i8, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+macro_rules! for_base_types {
+    (   T = ($($scalar:ident),*);
+        type Lhs = Simd<T, N>;
+        type Rhs = Simd<T, N>;
+        type Output = $out:ty;
+
+        impl $op:ident::$call:ident {
+            $macro_impl:ident $inner:tt
+        }) => {
+            $(
+                impl<const N: usize> $op<Self> for Simd<$scalar, N>
+                where
+                    $scalar: SimdElement,
+                    LaneCount<N>: SupportedLaneCount,
+                {
+                    type Output = $out;
+
+                    #[inline]
+                    #[must_use = "operator returns a new vector without mutating the inputs"]
+                    fn $call(self, rhs: Self) -> Self::Output {
+                        $macro_impl!(self, rhs, $inner, $scalar)
+                    }
+                })*
     }
+}
 
-    impl<const LANES: usize> BitOps for Simd<i16, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+// A "TokenTree muncher": takes a set of scalar types `T = {};`
+// type parameters for the ops it implements, `Op::fn` names,
+// and a macro that expands into an expr, substituting in an intrinsic.
+// It passes that to for_base_types, which expands an impl for the types,
+// using the expanded expr in the function, and recurses with itself.
+//
+// tl;dr impls a set of ops::{Traits} for a set of types
+macro_rules! for_base_ops {
+    (
+        T = $types:tt;
+        type Lhs = Simd<T, N>;
+        type Rhs = Simd<T, N>;
+        type Output = $out:ident;
+        impl $op:ident::$call:ident
+            $inner:tt
+        $($rest:tt)*
+    ) => {
+        for_base_types! {
+            T = $types;
+            type Lhs = Simd<T, N>;
+            type Rhs = Simd<T, N>;
+            type Output = $out;
+            impl $op::$call
+                $inner
+        }
+        for_base_ops! {
+            T = $types;
+            type Lhs = Simd<T, N>;
+            type Rhs = Simd<T, N>;
+            type Output = $out;
+            $($rest)*
+        }
+    };
+    ($($done:tt)*) => {
+        // Done.
     }
+}
 
-    impl<const LANES: usize> BitOps for Simd<i32, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
-    }
+// Integers can always accept add, mul, sub, bitand, bitor, and bitxor.
+// For all of these operations, simd_* intrinsics apply wrapping logic.
+for_base_ops! {
+    T = (i8, i16, i32, i64, isize, u8, u16, u32, u64, usize);
+    type Lhs = Simd<T, N>;
+    type Rhs = Simd<T, N>;
+    type Output = Self;
 
-    impl<const LANES: usize> BitOps for Simd<i64, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl Add::add {
+        unsafe_base { simd_add }
     }
 
-    impl<const LANES: usize> BitOps for Simd<isize, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl Mul::mul {
+        unsafe_base { simd_mul }
     }
 
-    impl<const LANES: usize> BitOps for Simd<u8, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl Sub::sub {
+        unsafe_base { simd_sub }
     }
 
-    impl<const LANES: usize> BitOps for Simd<u16, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl BitAnd::bitand {
+        unsafe_base { simd_and }
     }
 
-    impl<const LANES: usize> BitOps for Simd<u32, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl BitOr::bitor {
+        unsafe_base { simd_or }
     }
 
-    impl<const LANES: usize> BitOps for Simd<u64, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl BitXor::bitxor {
+        unsafe_base { simd_xor }
     }
 
-    impl<const LANES: usize> BitOps for Simd<usize, LANES> {
-        fn bitand(self, rhs: Self) -> Self::Output;
-        fn bitor(self, rhs: Self) -> Self::Output;
-        fn bitxor(self, rhs: Self) -> Self::Output;
-        fn shl(self, rhs: Self) -> Self::Output;
-        fn shr(self, rhs: Self) -> Self::Output;
+    impl Div::div {
+        int_divrem_guard {
+            const PANIC_ZERO: &'static str = "attempt to divide by zero";
+            const PANIC_OVERFLOW: &'static str = "attempt to divide with overflow";
+            simd_div
+        }
     }
-}
-
-macro_rules! float_arith {
-    ($(impl<const LANES: usize> FloatArith for Simd<$float:ty, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-     })*) => {
-        $(
-            unsafe_base_op!{
-                impl<const LANES: usize> Add for Simd<$float, LANES> {
-                    fn add(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_add }
-                    }
-                }
-
-                impl<const LANES: usize> Mul for Simd<$float, LANES> {
-                    fn mul(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_mul }
-                    }
-                }
-
-                impl<const LANES: usize> Sub for Simd<$float, LANES> {
-                    fn sub(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_sub }
-                    }
-                }
 
-                impl<const LANES: usize> Div for Simd<$float, LANES> {
-                    fn div(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_div }
-                    }
-                }
-
-                impl<const LANES: usize> Rem for Simd<$float, LANES> {
-                    fn rem(self, rhs: Self) -> Self::Output {
-                        unsafe { simd_rem }
-                    }
-                }
-            }
-        )*
-    };
-}
-
-// We don't need any special precautions here:
-// Floats always accept arithmetic ops, but may become NaN.
-float_arith! {
-    impl<const LANES: usize> FloatArith for Simd<f32, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    impl Rem::rem {
+        int_divrem_guard {
+            const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
+            const PANIC_OVERFLOW: &'static str = "attempt to calculate the remainder with overflow";
+            simd_rem
+        }
     }
 
-    impl<const LANES: usize> FloatArith for Simd<f64, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    // The only question is how to handle shifts >= <Int>::BITS?
+    // Our current solution uses wrapping logic.
+    impl Shl::shl {
+        wrap_bitshift { simd_shl }
     }
-}
-
-// Division by zero is poison, according to LLVM.
-// So is dividing the MIN value of a signed integer by -1,
-// since that would return MAX + 1.
-// FIXME: Rust allows <SInt>::MIN / -1,
-// so we should probably figure out how to make that safe.
-macro_rules! int_divrem_guard {
-    ($(impl<const LANES: usize> $op:ident for Simd<$sint:ty, LANES> {
-        const PANIC_ZERO: &'static str = $zero:literal;
-        const PANIC_OVERFLOW: &'static str = $overflow:literal;
-        fn $call:ident {
-            unsafe { $simd_call:ident }
-        }
-    })*) => {
-        $(impl<const LANES: usize> $op for Simd<$sint, LANES>
-        where
-            $sint: SimdElement,
-            LaneCount<LANES>: SupportedLaneCount,
-        {
-            type Output = Self;
-            #[inline]
-            #[must_use = "operator returns a new vector without mutating the inputs"]
-            fn $call(self, rhs: Self) -> Self::Output {
-                if rhs.lanes_eq(Simd::splat(0)).any() {
-                    panic!("attempt to calculate the remainder with a divisor of zero");
-                } else if <$sint>::MIN != 0 && self.lanes_eq(Simd::splat(<$sint>::MIN)) & rhs.lanes_eq(Simd::splat(-1 as _))
-                    != Mask::splat(false)
-                 {
-                    panic!("attempt to calculate the remainder with overflow");
-                } else {
-                    unsafe { $crate::intrinsics::$simd_call(self, rhs) }
-                 }
-             }
-        })*
-    };
-}
-
-macro_rules! int_arith {
-    ($(impl<const LANES: usize> IntArith for Simd<$sint:ty, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-    })*) => {
-        $(
-        unsafe_base_op!{
-            impl<const LANES: usize> Add for Simd<$sint, LANES> {
-                fn add(self, rhs: Self) -> Self::Output {
-                    unsafe { simd_add }
-                }
-            }
-
-            impl<const LANES: usize> Mul for Simd<$sint, LANES> {
-                fn mul(self, rhs: Self) -> Self::Output {
-                    unsafe { simd_mul }
-                }
-            }
 
-            impl<const LANES: usize> Sub for Simd<$sint, LANES> {
-                fn sub(self, rhs: Self) -> Self::Output {
-                    unsafe { simd_sub }
-                }
-            }
+    impl Shr::shr {
+        wrap_bitshift {
+            // This automatically monomorphizes to lshr or ashr, depending,
+            // so it's fine to use it for both UInts and SInts.
+            simd_shr
         }
-
-        int_divrem_guard!{
-            impl<const LANES: usize> Div for Simd<$sint, LANES> {
-                const PANIC_ZERO: &'static str = "attempt to divide by zero";
-                const PANIC_OVERFLOW: &'static str = "attempt to divide with overflow";
-                fn div {
-                    unsafe { simd_div }
-                }
-            }
-
-            impl<const LANES: usize> Rem for Simd<$sint, LANES> {
-                const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
-                const PANIC_OVERFLOW: &'static str = "attempt to calculate the remainder with overflow";
-                fn rem {
-                    unsafe { simd_rem }
-                }
-            }
-        })*
     }
 }
 
-int_arith! {
-    impl<const LANES: usize> IntArith for Simd<i8, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-    }
-
-    impl<const LANES: usize> IntArith for Simd<i16, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-    }
-
-    impl<const LANES: usize> IntArith for Simd<i32, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-    }
-
-    impl<const LANES: usize> IntArith for Simd<i64, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-    }
-
-    impl<const LANES: usize> IntArith for Simd<isize, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
-    }
+// We don't need any special precautions here:
+// Floats always accept arithmetic ops, but may become NaN.
+for_base_ops! {
+    T = (f32, f64);
+    type Lhs = Simd<T, N>;
+    type Rhs = Simd<T, N>;
+    type Output = Self;
 
-    impl<const LANES: usize> IntArith for Simd<u8, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    impl Add::add {
+        unsafe_base { simd_add }
     }
 
-    impl<const LANES: usize> IntArith for Simd<u16, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    impl Mul::mul {
+        unsafe_base { simd_mul }
     }
 
-    impl<const LANES: usize> IntArith for Simd<u32, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    impl Sub::sub {
+        unsafe_base { simd_sub }
     }
 
-    impl<const LANES: usize> IntArith for Simd<u64, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    impl Div::div {
+        unsafe_base { simd_div }
     }
 
-    impl<const LANES: usize> IntArith for Simd<usize, LANES> {
-        fn add(self, rhs: Self) -> Self::Output;
-        fn mul(self, rhs: Self) -> Self::Output;
-        fn sub(self, rhs: Self) -> Self::Output;
-        fn div(self, rhs: Self) -> Self::Output;
-        fn rem(self, rhs: Self) -> Self::Output;
+    impl Rem::rem {
+        unsafe_base { simd_rem }
     }
 }

From a42420583bdb6ea788c2f7ec0a0360d99934f2a7 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Thu, 23 Dec 2021 23:14:13 -0800
Subject: [PATCH 06/18] Use Mask::any in div check

---
 crates/core_simd/src/ops.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 6cfc8f80b53c7..82b007aa6966d 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -1,4 +1,4 @@
-use crate::simd::{LaneCount, Mask, Simd, SimdElement, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::ops::{Add, Mul};
 use core::ops::{BitAnd, BitOr, BitXor};
 use core::ops::{Div, Rem, Sub};
@@ -70,8 +70,7 @@ macro_rules! int_divrem_guard {
         if $rhs.lanes_eq(Simd::splat(0)).any() {
             panic!($zero);
         } else if <$int>::MIN != 0
-            && $lhs.lanes_eq(Simd::splat(<$int>::MIN)) & $rhs.lanes_eq(Simd::splat(-1 as _))
-                != Mask::splat(false)
+            && ($lhs.lanes_eq(Simd::splat(<$int>::MIN)) & $rhs.lanes_eq(Simd::splat(-1 as _))).any()
         {
             panic!($overflow);
         } else {

From ecc00efee0f4bf950f6fa8ee00d88fefa73a8c0b Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 21 Dec 2021 15:00:35 -0800
Subject: [PATCH 07/18] impl std::simd::StdFloat

While consulting with Simulacrum on how to make available the float
functions that currently require runtime support for `Simd<f32, N>` and
`Simd<f64, N>`, we realized breaking coherence with the classic approach
of lang items was, since `{core,std}::simd::Simd` is a `ty::Adt`, likely
to be quite a bit nasty. The project group has a long-term plan for how
to get around this kind of issue and move the associated functions into
libcore, but that will likely take time as well. Since all routes
forward are temporally costly, we probably will skip the lang item
approach entirely and go the "proper" route, but in the interests of
having something this year for people to play around with, this
extension trait was whipped up.

For now, while it involves a lot of fairly internal details most users
shouldn't have to care about, I went ahead and fully documented the
situation for any passerby to read on the trait, as the situation is
quite unusual and puzzling to begin with.
---
 Cargo.toml                  |   1 +
 crates/std_float/Cargo.toml |  13 +++
 crates/std_float/src/lib.rs | 165 ++++++++++++++++++++++++++++++++++++
 3 files changed, 179 insertions(+)
 create mode 100644 crates/std_float/Cargo.toml
 create mode 100644 crates/std_float/src/lib.rs

diff --git a/Cargo.toml b/Cargo.toml
index 3f1abd73519bd..9802386e4566d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,5 +2,6 @@
 
 members = [
     "crates/core_simd",
+    "crates/std_float",
     "crates/test_helpers",
 ]
diff --git a/crates/std_float/Cargo.toml b/crates/std_float/Cargo.toml
new file mode 100644
index 0000000000000..82f66b8dcb749
--- /dev/null
+++ b/crates/std_float/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "std_float"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+core_simd = { path = "../core_simd" }
+
+[features]
+default = ["as_crate"]
+as_crate = []
diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs
new file mode 100644
index 0000000000000..4bd4d4c05e3b9
--- /dev/null
+++ b/crates/std_float/src/lib.rs
@@ -0,0 +1,165 @@
+#![cfg_attr(feature = "as_crate", no_std)] // We are std!
+#![cfg_attr(
+    feature = "as_crate",
+    feature(platform_intrinsics),
+    feature(portable_simd)
+)]
+#[cfg(not(feature = "as_crate"))]
+use core::simd;
+#[cfg(feature = "as_crate")]
+use core_simd::simd;
+
+use simd::{LaneCount, Simd, SupportedLaneCount};
+
+#[cfg(feature = "as_crate")]
+mod experimental {
+    pub trait Sealed {}
+}
+
+#[cfg(feature = "as_crate")]
+use experimental as sealed;
+
+use crate::sealed::Sealed;
+
+// "platform intrinsics" are essentially "codegen intrinsics"
+// each of these may be scalarized and lowered to a libm call
+extern "platform-intrinsic" {
+    // ceil
+    fn simd_ceil<T>(x: T) -> T;
+
+    // floor
+    fn simd_floor<T>(x: T) -> T;
+
+    // round
+    fn simd_round<T>(x: T) -> T;
+
+    // trunc
+    fn simd_trunc<T>(x: T) -> T;
+
+    // fsqrt
+    fn simd_fsqrt<T>(x: T) -> T;
+
+    // fma
+    fn simd_fma<T>(x: T, y: T, z: T) -> T;
+}
+
+/// This trait provides a possibly-temporary implementation of float functions
+/// that may, in the absence of hardware support, canonicalize to calling an
+/// operating system's `math.h` dynamically-loaded library (also known as a
+/// shared object). As these conditionally require runtime support, they
+/// should only appear in binaries built assuming OS support: `std`.
+///
+/// However, there is no reason SIMD types, in general, need OS support,
+/// as for many architectures an embedded binary may simply configure that
+/// support itself. This means these types must be visible in `core`
+/// but have these functions available in `std`.
+///
+/// [`f32`] and [`f64`] achieve a similar trick by using "lang items", but
+/// due to compiler limitations, it is harder to implement this approach for
+/// abstract data types like [`Simd`]. From that need, this trait is born.
+///
+/// It is possible this trait will be replaced in some manner in the future,
+/// when either the compiler or its supporting runtime functions are improved.
+/// For now this trait is available to permit experimentation with SIMD float
+/// operations that may lack hardware support, such as `mul_add`.
+pub trait StdFloat: Sealed + Sized {
+    /// Fused multiply-add.  Computes `(self * a) + b` with only one rounding error,
+    /// yielding a more accurate result than an unfused multiply-add.
+    ///
+    /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target
+    /// architecture has a dedicated `fma` CPU instruction.  However, this is not always
+    /// true, and will be heavily dependent on designing algorithms with specific target
+    /// hardware in mind.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn mul_add(self, a: Self, b: Self) -> Self {
+        unsafe { simd_fma(self, a, b) }
+    }
+
+    /// Produces a vector where every lane has the square root value
+    /// of the equivalently-indexed lane in `self`
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn sqrt(self) -> Self {
+        unsafe { simd_fsqrt(self) }
+    }
+
+    /// Returns the smallest integer greater than or equal to each lane.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn ceil(self) -> Self {
+        unsafe { simd_ceil(self) }
+    }
+
+    /// Returns the largest integer value less than or equal to each lane.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn floor(self) -> Self {
+        unsafe { simd_floor(self) }
+    }
+
+    /// Rounds to the nearest integer value. Ties round toward zero.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn round(self) -> Self {
+        unsafe { simd_round(self) }
+    }
+
+    /// Returns the floating point's integer value, with its fractional part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn trunc(self) -> Self {
+        unsafe { simd_trunc(self) }
+    }
+
+    /// Returns the floating point's fractional value, with its integer part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn fract(self) -> Self;
+}
+
+impl<const N: usize> Sealed for Simd<f32, N> where LaneCount<N>: SupportedLaneCount {}
+impl<const N: usize> Sealed for Simd<f64, N> where LaneCount<N>: SupportedLaneCount {}
+
+// We can safely just use all the defaults.
+impl<const N: usize> StdFloat for Simd<f32, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Returns the floating point's fractional value, with its integer part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn fract(self) -> Self {
+        self - self.trunc()
+    }
+}
+
+impl<const N: usize> StdFloat for Simd<f64, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Returns the floating point's fractional value, with its integer part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn fract(self) -> Self {
+        self - self.trunc()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use simd::*;
+
+    #[test]
+    fn everything_works() {
+        let x = f32x4::from_array([0.1, 0.5, 0.6, -1.5]);
+        let x2 = x + x;
+        let _xc = x.ceil();
+        let _xf = x.floor();
+        let _xr = x.round();
+        let _xt = x.trunc();
+        let _xfma = x.mul_add(x, x);
+        let _xsqrt = x.sqrt();
+        let _ = x2.abs() * x2;
+    }
+}

From af26e3b9fd5c21492eb603ec57bc72aee8e7f84b Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 21 Dec 2021 15:29:29 -0800
Subject: [PATCH 08/18] Tear down and rewrite support for float testing

---
 crates/core_simd/Cargo.toml          |  3 ++
 crates/core_simd/examples/nbody.rs   | 10 +++----
 crates/core_simd/src/intrinsics.rs   | 26 ------------------
 crates/core_simd/src/round.rs        | 41 ----------------------------
 crates/core_simd/src/vector/float.rs | 23 ----------------
 crates/core_simd/tests/ops_macros.rs |  2 ++
 crates/core_simd/tests/round.rs      |  2 ++
 7 files changed, 12 insertions(+), 95 deletions(-)

diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index a103ef115a586..d2ff5f3b1b195 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -26,3 +26,6 @@ features = ["alloc"]
 
 [dev-dependencies.test_helpers]
 path = "../test_helpers"
+
+[dev-dependencies]
+std_float = { path = "../std_float/", features = ["as_crate"] }
diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs
index 43280feebbd67..7b1e6840f6424 100644
--- a/crates/core_simd/examples/nbody.rs
+++ b/crates/core_simd/examples/nbody.rs
@@ -1,11 +1,13 @@
-#![cfg_attr(feature = "std", feature(portable_simd))]
+#![feature(portable_simd)]
+extern crate std_float;
 
 /// Benchmarks game nbody code
 /// Taken from the `packed_simd` crate
 /// Run this benchmark with `cargo test --example nbody`
-#[cfg(feature = "std")]
 mod nbody {
-    use core_simd::*;
+    use core_simd::simd::*;
+    #[allow(unused)] // False positive?
+    use std_float::StdFloat;
 
     use std::f64::consts::PI;
     const SOLAR_MASS: f64 = 4.0 * PI * PI;
@@ -167,7 +169,6 @@ mod nbody {
     }
 }
 
-#[cfg(feature = "std")]
 #[cfg(test)]
 mod tests {
     // Good enough for demonstration purposes, not going for strictness here.
@@ -184,7 +185,6 @@ mod tests {
 }
 
 fn main() {
-    #[cfg(feature = "std")]
     {
         let (energy_before, energy_after) = nbody::run(1000);
         println!("Energy before: {}", energy_before);
diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 6a6d26d10a7f2..0bc241af1f1cd 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -87,29 +87,3 @@ extern "platform-intrinsic" {
     #[allow(unused)]
     pub(crate) fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
 }
-
-#[cfg(feature = "std")]
-mod std {
-    extern "platform-intrinsic" {
-        // ceil
-        pub(crate) fn simd_ceil<T>(x: T) -> T;
-
-        // floor
-        pub(crate) fn simd_floor<T>(x: T) -> T;
-
-        // round
-        pub(crate) fn simd_round<T>(x: T) -> T;
-
-        // trunc
-        pub(crate) fn simd_trunc<T>(x: T) -> T;
-
-        // fsqrt
-        pub(crate) fn simd_fsqrt<T>(x: T) -> T;
-
-        // fma
-        pub(crate) fn simd_fma<T>(x: T, y: T, z: T) -> T;
-    }
-}
-
-#[cfg(feature = "std")]
-pub(crate) use crate::simd::intrinsics::std::*;
diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs
index 09789e1149206..06ccab3ec494c 100644
--- a/crates/core_simd/src/round.rs
+++ b/crates/core_simd/src/round.rs
@@ -5,47 +5,6 @@ macro_rules! implement {
     {
         $type:ty, $int_type:ty
     } => {
-        #[cfg(feature = "std")]
-        impl<const LANES: usize> Simd<$type, LANES>
-        where
-            LaneCount<LANES>: SupportedLaneCount,
-        {
-            /// Returns the smallest integer greater than or equal to each lane.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn ceil(self) -> Self {
-                unsafe { intrinsics::simd_ceil(self) }
-            }
-
-            /// Returns the largest integer value less than or equal to each lane.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn floor(self) -> Self {
-                unsafe { intrinsics::simd_floor(self) }
-            }
-
-            /// Rounds to the nearest integer value. Ties round toward zero.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn round(self) -> Self {
-                unsafe { intrinsics::simd_round(self) }
-            }
-
-            /// Returns the floating point's integer value, with its fractional part removed.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn trunc(self) -> Self {
-                unsafe { intrinsics::simd_trunc(self) }
-            }
-
-            /// Returns the floating point's fractional value, with its integer part removed.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn fract(self) -> Self {
-                self - self.trunc()
-            }
-        }
-
         impl<const LANES: usize> Simd<$type, LANES>
         where
             LaneCount<LANES>: SupportedLaneCount,
diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
index 4a4b23238c4ab..3528a420351cf 100644
--- a/crates/core_simd/src/vector/float.rs
+++ b/crates/core_simd/src/vector/float.rs
@@ -38,29 +38,6 @@ macro_rules! impl_float_vector {
                 unsafe { intrinsics::simd_fabs(self) }
             }
 
-            /// Fused multiply-add.  Computes `(self * a) + b` with only one rounding error,
-            /// yielding a more accurate result than an unfused multiply-add.
-            ///
-            /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target
-            /// architecture has a dedicated `fma` CPU instruction.  However, this is not always
-            /// true, and will be heavily dependent on designing algorithms with specific target
-            /// hardware in mind.
-            #[cfg(feature = "std")]
-            #[inline]
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            pub fn mul_add(self, a: Self, b: Self) -> Self {
-                unsafe { intrinsics::simd_fma(self, a, b) }
-            }
-
-            /// Produces a vector where every lane has the square root value
-            /// of the equivalently-indexed lane in `self`
-            #[inline]
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[cfg(feature = "std")]
-            pub fn sqrt(self) -> Self {
-                unsafe { intrinsics::simd_fsqrt(self) }
-            }
-
             /// Takes the reciprocal (inverse) of each lane, `1/x`.
             #[inline]
             #[must_use = "method returns a new vector and does not mutate the original value"]
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 43ddde4c55e01..4fb9de198ee15 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -546,6 +546,8 @@ macro_rules! impl_float_tests {
 
             #[cfg(feature = "std")]
             mod std {
+                use std_float::StdFloat;
+
                 use super::*;
                 test_helpers::test_lanes! {
                     fn sqrt<const LANES: usize>() {
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 11d617a6c2c56..1a1bc9ebca76a 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -3,6 +3,8 @@
 macro_rules! float_rounding_test {
     { $scalar:tt, $int_scalar:tt } => {
         mod $scalar {
+            use std_float::StdFloat;
+
             type Vector<const LANES: usize> = core_simd::Simd<$scalar, LANES>;
             type Scalar = $scalar;
             type IntScalar = $int_scalar;

From 65cb2c90a0688c110d983a2dbb9932900cd6b5d9 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 9 Jan 2022 13:12:22 -0500
Subject: [PATCH 09/18] Fix mask alias

---
 crates/core_simd/src/masks.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index 191e96903133f..c1ffcaf911669 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -516,7 +516,7 @@ pub type mask16x8 = Mask<i16, 8>;
 pub type mask16x16 = Mask<i16, 16>;
 
 /// Vector of 32 16-bit masks
-pub type mask16x32 = Mask<i32, 32>;
+pub type mask16x32 = Mask<i16, 32>;
 
 /// Vector of two 32-bit masks
 pub type mask32x2 = Mask<i32, 2>;

From 138b9cf4bf8f483c60e4454f0a7e64973474ca07 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 13 Jan 2022 17:59:55 -0500
Subject: [PATCH 10/18] Use intrinsic for min/max

---
 crates/core_simd/src/intrinsics.rs   |  4 ++++
 crates/core_simd/src/vector/float.rs | 12 ++----------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 0bc241af1f1cd..70f1d47c08b37 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -46,6 +46,10 @@ extern "platform-intrinsic" {
     /// fabs
     pub(crate) fn simd_fabs<T>(x: T) -> T;
 
+    // minnum/maxnum
+    pub(crate) fn simd_fmin<T>(x: T, y: T) -> T;
+    pub(crate) fn simd_fmax<T>(x: T, y: T) -> T;
+
     pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
     pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
     pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
index 3528a420351cf..0e179d6fa76bb 100644
--- a/crates/core_simd/src/vector/float.rs
+++ b/crates/core_simd/src/vector/float.rs
@@ -141,11 +141,7 @@ macro_rules! impl_float_vector {
             #[inline]
             #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn min(self, other: Self) -> Self {
-                // TODO consider using an intrinsic
-                self.is_nan().select(
-                    other,
-                    self.lanes_ge(other).select(other, self)
-                )
+                unsafe { intrinsics::simd_fmin(self, other) }
             }
 
             /// Returns the maximum of each lane.
@@ -154,11 +150,7 @@ macro_rules! impl_float_vector {
             #[inline]
             #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn max(self, other: Self) -> Self {
-                // TODO consider using an intrinsic
-                self.is_nan().select(
-                    other,
-                    self.lanes_le(other).select(other, self)
-                )
+                unsafe { intrinsics::simd_fmax(self, other) }
             }
 
             /// Restrict each lane to a certain interval unless it is NaN.

From a4f5f01b8aa92780e695d471e72e699ef10abe30 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Sat, 13 Nov 2021 15:06:48 -0800
Subject: [PATCH 11/18] Use intrinsics for Mask::{to,from}_array

This significantly simplifies codegen and should improve mask perf.

Co-authored-by: Jacob Lifshay <programmerjake@gmail.com>
---
 crates/core_simd/src/masks.rs | 40 +++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index c1ffcaf911669..ae1fef53da88e 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -12,9 +12,10 @@
 )]
 mod mask_impl;
 
+use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::cmp::Ordering;
-use core::fmt;
+use core::{fmt, mem};
 
 mod sealed {
     use super::*;
@@ -105,22 +106,39 @@ where
         Self(mask_impl::Mask::splat(value))
     }
 
-    /// Converts an array to a SIMD vector.
+    /// Converts an array of bools to a SIMD mask.
     pub fn from_array(array: [bool; LANES]) -> Self {
-        let mut vector = Self::splat(false);
-        for (i, v) in array.iter().enumerate() {
-            vector.set(i, *v);
+        // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of
+        //     true:    0b_0000_0001
+        //     false:   0b_0000_0000
+        // Thus, an array of bools is also a valid array of bytes: [u8; N]
+        // This would be hypothetically valid as an "in-place" transmute,
+        // but these are "dependently-sized" types, so copy elision it is!
+        unsafe {
+            let bytes: [u8; LANES] = mem::transmute_copy(&array);
+            let bools: Simd<i8, LANES> =
+                intrinsics::simd_ne(Simd::from_array(bytes), Simd::splat(0u8));
+            Mask::from_int_unchecked(intrinsics::simd_cast(bools))
         }
-        vector
     }
 
-    /// Converts a SIMD vector to an array.
+    /// Converts a SIMD mask to an array of bools.
     pub fn to_array(self) -> [bool; LANES] {
-        let mut array = [false; LANES];
-        for (i, v) in array.iter_mut().enumerate() {
-            *v = self.test(i);
+        // This follows mostly the same logic as from_array.
+        // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of
+        //     true:    0b_0000_0001
+        //     false:   0b_0000_0000
+        // Thus, an array of bools is also a valid array of bytes: [u8; N]
+        // Since our masks are equal to integers where all bits are set,
+        // we can simply convert them to i8s, and then bitand them by the
+        // bitpattern for Rust's "true" bool.
+        // This would be hypothetically valid as an "in-place" transmute,
+        // but these are "dependently-sized" types, so copy elision it is!
+        unsafe {
+            let mut bytes: Simd<i8, LANES> = intrinsics::simd_cast(self.to_int());
+            bytes &= Simd::splat(1i8);
+            mem::transmute_copy(&bytes)
         }
-        array
     }
 
     /// Converts a vector of integers to a mask, where 0 represents `false` and -1

From 56566d816deda17b2ddaf3e3e603f2af16e26653 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Thu, 20 Jan 2022 15:48:46 -0800
Subject: [PATCH 12/18] Annotate signed type in int_divrem_guard

The way the macro expands, it may sometimes infer
"this is a uint, but doesn't impl Neg???"
Also, I made the "wrong path for intrinsics" error.
These fixes allow integration into libcore.
---
 crates/core_simd/src/ops.rs | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 82b007aa6966d..b65038933bf33 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -33,7 +33,7 @@ where
 
 macro_rules! unsafe_base {
     ($lhs:ident, $rhs:ident, {$simd_call:ident}, $($_:tt)*) => {
-        unsafe { $crate::intrinsics::$simd_call($lhs, $rhs) }
+        unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) }
     };
 }
 
@@ -49,7 +49,10 @@ macro_rules! unsafe_base {
 macro_rules! wrap_bitshift {
     ($lhs:ident, $rhs:ident, {$simd_call:ident}, $int:ident) => {
         unsafe {
-            $crate::intrinsics::$simd_call($lhs, $rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)))
+            $crate::simd::intrinsics::$simd_call(
+                $lhs,
+                $rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)),
+            )
         }
     };
 }
@@ -70,11 +73,13 @@ macro_rules! int_divrem_guard {
         if $rhs.lanes_eq(Simd::splat(0)).any() {
             panic!($zero);
         } else if <$int>::MIN != 0
-            && ($lhs.lanes_eq(Simd::splat(<$int>::MIN)) & $rhs.lanes_eq(Simd::splat(-1 as _))).any()
+            && ($lhs.lanes_eq(Simd::splat(<$int>::MIN))
+                // type inference can break here, so cut an SInt to size
+                & $rhs.lanes_eq(Simd::splat(-1i64 as _))).any()
         {
             panic!($overflow);
         } else {
-            unsafe { $crate::intrinsics::$simd_call($lhs, $rhs) }
+            unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) }
         }
     };
 }

From 4fc62c20829487a87fe161d8f0ea4789df6c1a01 Mon Sep 17 00:00:00 2001
From: Alec Goncharow <alec@alecgoncharow.com>
Date: Sun, 23 Jan 2022 16:42:57 -0500
Subject: [PATCH 13/18] fix documentation typo

Remove redundant "neither" in the documentation comment.
---
 crates/core_simd/src/vector/float.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
index 0e179d6fa76bb..ae900ff32a031 100644
--- a/crates/core_simd/src/vector/float.rs
+++ b/crates/core_simd/src/vector/float.rs
@@ -105,7 +105,7 @@ macro_rules! impl_float_vector {
                 self.abs().lanes_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(<$type>::INFINITY).to_bits()).lanes_eq(Simd::splat(0))
             }
 
-            /// Returns true for each lane if its value is neither neither zero, infinite,
+            /// Returns true for each lane if its value is neither zero, infinite,
             /// subnormal, or `NaN`.
             #[inline]
             #[must_use = "method returns a new mask and does not mutate the original value"]

From 36cca22f16e1e67076ac4490cddd6002b3ddea2b Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 24 Jan 2022 20:11:17 -0500
Subject: [PATCH 14/18] Update crates/core_simd/src/vector/float.rs

Co-authored-by: Alexander Ronald Altman <alexanderaltman@me.com>
---
 crates/core_simd/src/vector/float.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
index ae900ff32a031..fcc7f6d8d1c50 100644
--- a/crates/core_simd/src/vector/float.rs
+++ b/crates/core_simd/src/vector/float.rs
@@ -106,7 +106,7 @@ macro_rules! impl_float_vector {
             }
 
             /// Returns true for each lane if its value is neither zero, infinite,
-            /// subnormal, or `NaN`.
+            /// subnormal, nor `NaN`.
             #[inline]
             #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_normal(self) -> Mask<$mask_ty, LANES> {

From a991d48e95911c0e94f47bda10cbb50200852ec2 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 26 Jan 2022 16:58:38 -0800
Subject: [PATCH 15/18] Add Simd::cast

---
 crates/core_simd/src/intrinsics.rs |  3 +++
 crates/core_simd/src/vector.rs     | 29 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 70f1d47c08b37..2291400537c94 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -39,6 +39,9 @@ extern "platform-intrinsic" {
 
     /// fptoui/fptosi/uitofp/sitofp
     pub(crate) fn simd_cast<T, U>(x: T) -> U;
+    /// follows Rust's `T as U` semantics, including saturating float casts
+    /// which amounts to the same as `simd_cast` for many cases
+    pub(crate) fn simd_as<T, U>(x: T) -> U;
 
     /// neg/fneg
     pub(crate) fn simd_neg<T>(x: T) -> T;
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 7c5ec2bc314c4..a9e99a18c2db5 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -75,6 +75,35 @@ where
         Self(array)
     }
 
+    /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type.
+    /// This follows the semantics of Rust's `as` conversion for casting
+    /// integers to unsigned integers (interpreting as the other type, so `-1` to `MAX`),
+    /// and from floats to integers (truncating, or saturating at the limits) for each lane,
+    /// or vice versa.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Simd;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+    /// let floats: Simd<f32, 4> = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]);
+    /// let ints = floats.cast::<i32>();
+    /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0]));
+    ///
+    /// // Formally equivalent, but `Simd::cast` can optimize better.
+    /// assert_eq!(ints, Simd::from_array(floats.to_array().map(|x| x as i32)));
+    ///
+    /// // The float conversion does not round-trip.
+    /// let floats_again = ints.cast();
+    /// assert_ne!(floats, floats_again);
+    /// assert_eq!(floats_again, Simd::from_array([1.0, -4.0, 2147483647.0, 0.0]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn cast<U: SimdElement>(self) -> Simd<U, LANES> {
+        unsafe { intrinsics::simd_as(self) }
+    }
+
     /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
     /// If an index is out-of-bounds, the lane is instead selected from the `or` vector.
     ///

From 0031b02cee0c7679120c7a942c378cd13bfb5021 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 26 Jan 2022 20:54:05 -0800
Subject: [PATCH 16/18] Add core_simd/tests/cast.rs

---
 crates/core_simd/tests/cast.rs | 37 ++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 crates/core_simd/tests/cast.rs

diff --git a/crates/core_simd/tests/cast.rs b/crates/core_simd/tests/cast.rs
new file mode 100644
index 0000000000000..ab5650f071323
--- /dev/null
+++ b/crates/core_simd/tests/cast.rs
@@ -0,0 +1,37 @@
+#![feature(portable_simd)]
+macro_rules! cast_types {
+    ($start:ident, $($target:ident),*) => {
+        mod $start {
+            use core_simd::simd::Simd;
+            type Vector<const N: usize> = Simd<$start, N>;
+            $(
+                mod $target {
+                    use super::*;
+                    test_helpers::test_lanes! {
+                        fn cast_as<const N: usize>() {
+                            test_helpers::test_unary_elementwise(
+                                &Vector::<N>::cast::<$target>,
+                                &|x| x as $target,
+                                &|_| true,
+                            )
+                        }
+                    }
+                }
+            )*
+        }
+    };
+}
+
+// The hypothesis is that widening conversions aren't terribly interesting.
+cast_types!(f32, f64, i8, u8, usize, isize);
+cast_types!(f64, f32, i8, u8, usize, isize);
+cast_types!(i8, u8, f32);
+cast_types!(u8, i8, f32);
+cast_types!(i16, u16, i8, u8, f32);
+cast_types!(u16, i16, i8, u8, f32);
+cast_types!(i32, u32, i8, u8, f32, f64);
+cast_types!(u32, i32, i8, u8, f32, f64);
+cast_types!(i64, u64, i8, u8, isize, usize, f32, f64);
+cast_types!(u64, i64, i8, u8, isize, usize, f32, f64);
+cast_types!(isize, usize, i8, u8, f32, f64);
+cast_types!(usize, isize, i8, u8, f32, f64);

From 03f6fbb21e6050da2a05b3ce8f480c020b384916 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Thu, 27 Jan 2022 09:07:15 -0800
Subject: [PATCH 17/18] Omit Simd::cast during bootstrap

---
 crates/core_simd/src/intrinsics.rs | 1 +
 crates/core_simd/src/vector.rs     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 2291400537c94..233657202f7e8 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -41,6 +41,7 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_cast<T, U>(x: T) -> U;
     /// follows Rust's `T as U` semantics, including saturating float casts
     /// which amounts to the same as `simd_cast` for many cases
+    #[cfg(not(bootstrap))]
     pub(crate) fn simd_as<T, U>(x: T) -> U;
 
     /// neg/fneg
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index a9e99a18c2db5..b7ef7a56c7319 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -100,6 +100,7 @@ where
     /// ```
     #[must_use]
     #[inline]
+    #[cfg(not(bootstrap))]
     pub fn cast<U: SimdElement>(self) -> Simd<U, LANES> {
         unsafe { intrinsics::simd_as(self) }
     }

From e96159e9af4e55070481a7c071e61e0adf337807 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Thu, 27 Jan 2022 11:50:58 -0800
Subject: [PATCH 18/18] pub use std::simd::StdFloat;

Make available the remaining float intrinsics that require runtime support
from a platform's libm, and thus cannot be included in a no-deps libcore,
by exposing them through a sealed trait, `std::simd::StdFloat`.

We might use the trait approach a bit more in the future, or maybe not.
Ideally, this trait doesn't stick around, even if so.
If we don't need to intermesh it with std, it can be used as a crate,
but currently that is somewhat uncertain.
---
 library/std/src/lib.rs                        | 22 ++++++++++++++++--
 src/test/ui/simd/libm_std_can_float.rs        | 23 +++++++++++++++++++
 .../simd/portable-intrinsics-arent-exposed.rs |  3 ++-
 .../portable-intrinsics-arent-exposed.stderr  | 23 ++++++++++---------
 4 files changed, 57 insertions(+), 14 deletions(-)
 create mode 100644 src/test/ui/simd/libm_std_can_float.rs

diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs
index 17fe0011569b0..159b5027455ae 100644
--- a/library/std/src/lib.rs
+++ b/library/std/src/lib.rs
@@ -313,6 +313,7 @@
 #![feature(panic_internals)]
 #![feature(panic_unwind)]
 #![feature(pin_static_ref)]
+#![feature(platform_intrinsics)]
 #![feature(portable_simd)]
 #![feature(prelude_import)]
 #![feature(ptr_as_uninit)]
@@ -465,8 +466,6 @@ pub use core::pin;
 pub use core::ptr;
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::result;
-#[unstable(feature = "portable_simd", issue = "86656")]
-pub use core::simd;
 #[unstable(feature = "async_stream", issue = "79024")]
 pub use core::stream;
 #[stable(feature = "i128", since = "1.26.0")]
@@ -513,6 +512,25 @@ pub mod time;
 #[unstable(feature = "once_cell", issue = "74465")]
 pub mod lazy;
 
+// Pull in `std_float` crate  into libstd. The contents of
+// `std_float` are in a different repository: rust-lang/portable-simd.
+#[path = "../../portable-simd/crates/std_float/src/lib.rs"]
+#[allow(missing_debug_implementations, dead_code, unsafe_op_in_unsafe_fn, unused_unsafe)]
+#[allow(rustdoc::bare_urls)]
+#[unstable(feature = "portable_simd", issue = "86656")]
+#[cfg(not(all(miri, doctest)))] // Miri does not support all SIMD intrinsics
+mod std_float;
+
+#[cfg(not(all(miri, doctest)))] // Miri does not support all SIMD intrinsics
+#[doc = include_str!("../../portable-simd/crates/core_simd/src/core_simd_docs.md")]
+#[unstable(feature = "portable_simd", issue = "86656")]
+pub mod simd {
+    #[doc(inline)]
+    pub use crate::std_float::StdFloat;
+    #[doc(inline)]
+    pub use core::simd::*;
+}
+
 #[stable(feature = "futures_api", since = "1.36.0")]
 pub mod task {
     //! Types and Traits for working with asynchronous tasks.
diff --git a/src/test/ui/simd/libm_std_can_float.rs b/src/test/ui/simd/libm_std_can_float.rs
new file mode 100644
index 0000000000000..6a844e7120e79
--- /dev/null
+++ b/src/test/ui/simd/libm_std_can_float.rs
@@ -0,0 +1,23 @@
+// run-pass
+
+// This is the converse of the other libm test.
+#![feature(portable_simd)]
+use std::simd::f32x4;
+use std::simd::StdFloat;
+
+// For SIMD float ops, the LLIR version which is used to implement the portable
+// forms of them may become calls to math.h AKA libm. So, we can't guarantee
+// we can compile them for #![no_std] crates.
+//
+// However, we can expose some of these ops via an extension trait.
+fn main() {
+    let x = f32x4::from_array([0.1, 0.5, 0.6, -1.5]);
+    let x2 = x + x;
+    let _xc = x.ceil();
+    let _xf = x.floor();
+    let _xr = x.round();
+    let _xt = x.trunc();
+    let _xfma = x.mul_add(x, x);
+    let _xsqrt = x.sqrt();
+    let _ = x2.abs() * x2;
+}
diff --git a/src/test/ui/simd/portable-intrinsics-arent-exposed.rs b/src/test/ui/simd/portable-intrinsics-arent-exposed.rs
index 4d7590323550c..667c8b67b1d46 100644
--- a/src/test/ui/simd/portable-intrinsics-arent-exposed.rs
+++ b/src/test/ui/simd/portable-intrinsics-arent-exposed.rs
@@ -1,7 +1,8 @@
 // May not matter, since people can use them with a nightly feature.
 // However this tests to guarantee they don't leak out via portable_simd,
 // and thus don't accidentally get stabilized.
-use std::simd::intrinsics; //~ERROR E0603
+use core::simd::intrinsics; //~ERROR E0433
+use std::simd::intrinsics; //~ERROR E0432
 
 fn main() {
     ()
diff --git a/src/test/ui/simd/portable-intrinsics-arent-exposed.stderr b/src/test/ui/simd/portable-intrinsics-arent-exposed.stderr
index 9ac73eca19345..f568aa0429525 100644
--- a/src/test/ui/simd/portable-intrinsics-arent-exposed.stderr
+++ b/src/test/ui/simd/portable-intrinsics-arent-exposed.stderr
@@ -1,15 +1,16 @@
-error[E0603]: module `intrinsics` is private
-  --> $DIR/portable-intrinsics-arent-exposed.rs:4:16
+error[E0433]: failed to resolve: maybe a missing crate `core`?
+  --> $DIR/portable-intrinsics-arent-exposed.rs:4:5
    |
-LL | use std::simd::intrinsics;
-   |                ^^^^^^^^^^ private module
-   |
-note: the module `intrinsics` is defined here
-  --> $SRC_DIR/core/src/lib.rs:LL:COL
+LL | use core::simd::intrinsics;
+   |     ^^^^ maybe a missing crate `core`?
+
+error[E0432]: unresolved import `std::simd::intrinsics`
+  --> $DIR/portable-intrinsics-arent-exposed.rs:5:5
    |
-LL |     pub use crate::core_simd::simd::*;
-   |             ^^^^^^^^^^^^^^^^^^^^^^^^^
+LL | use std::simd::intrinsics;
+   |     ^^^^^^^^^^^^^^^^^^^^^ no `intrinsics` in `simd`
 
-error: aborting due to previous error
+error: aborting due to 2 previous errors
 
-For more information about this error, try `rustc --explain E0603`.
+Some errors have detailed explanations: E0432, E0433.
+For more information about an error, try `rustc --explain E0432`.