-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Description
I have some Rust code that performs a floating-point-to-integer conversion in a loop (among other computations). In Rust, this conversion treats NaN as zero and saturates to the minimum/maximum value of the integer type, which matches the semantics of the llvm.fptosi.sat intrinsic.
Unfortunately, this conversion seems to prevent the whole loop from being vectorized when targeting the skylake or skx architectures. The generated assembly instead contains 4 sets of scalar vminsd/vmaxsd/vucomisd/vcvttsd2si instructions.
Godbolt link: https://rust.godbolt.org/z/h4EasKzEY
Simplified Rust code:
/// Converts each `f64` in `input` to an `i32` with an `as` cast and writes it
/// to the matching slot of `output`.
///
/// As described in this report, the `as` cast treats NaN as zero and
/// saturates to the min/max value of the integer. This is the variant the
/// report says does not get vectorized on skylake/skx.
pub fn f64_to_int(input: &[f64;4], output: &mut [i32; 4]) {
// Walk both arrays in lockstep: `o` is the destination slot, `i` the source value.
output.iter_mut().zip(input.iter()).for_each(|(o, i)| {
// Saturating float-to-int cast (the conversion under discussion).
*o = *i as i32
})
}Using a non-saturating conversion gets vectorized, also when manually handling NaN and clamping to the valid range before (Unless I'm missing some edge cases that should have the same semantics as the intrinsic):
/// Converts each `f64` in `input` to an `i32` with `to_int_unchecked` and
/// writes it to the matching slot of `output`.
///
/// This is the non-saturating variant: no NaN handling and no clamping is
/// performed before the conversion.
///
/// # Safety
///
/// The caller must uphold the contract of `f64::to_int_unchecked` for every
/// element of `input`: the value must not be NaN, must not be infinite, and
/// must be representable as an `i32` after truncating its fractional part.
pub unsafe fn f64_to_int_unchecked(input: &[f64; 4], output: &mut [i32; 4]) {
// Walk both arrays in lockstep: `o` is the destination slot, `i` the source value.
output.iter_mut().zip(input.iter()).for_each(|(o, i)| {
// Unchecked conversion; the target type `i32` is inferred from `*o`.
*o = i.to_int_unchecked()
})
}
/// Converts each `f64` in `input` to an `i32` and writes it to the matching
/// slot of `output`, handling NaN and out-of-range values by hand.
///
/// NaN is mapped to `0.0` and the value is clamped to
/// `[i32::MIN, i32::MAX]` before an unchecked conversion. The report intends
/// this to have the same semantics as the saturating `as i32` cast.
pub fn f64_to_int_manual(input: &[f64; 4], output: &mut [i32; 4]) {
// Walk both arrays in lockstep: `o` is the destination slot, `i` the source value.
output.iter_mut().zip(input.iter()).for_each(|(o, i)| {
// Step 1: replace NaN with zero so the later steps never see NaN.
let non_nan = if i.is_nan() {
0.0
} else {
*i
};
// Step 2: saturate to the `i32` range (this also brings +/- infinity into range).
let clamped = non_nan.clamp(i32::MIN as f64, i32::MAX as f64);
// SAFETY: `clamped` is not NaN (NaN was replaced by 0.0 above) and lies
// inside the range of `i32` after clamping.
*o = unsafe { clamped.to_int_unchecked() }
})
}Cleaned up llvm ir for all three methods:
; Saturating float-to-signed-int intrinsic, 4-lane form: <4 x double> in, <4 x i32> out.
declare <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double>)
; Saturating variant (presumably the IR for the `as i32` function above):
; one vector load, one call to the saturating conversion intrinsic, one vector store.
define void @to_int(ptr noalias noundef readonly align 8 dereferenceable(32) %input, ptr noalias noundef align 4 dereferenceable(16) %output) {
; Load all four doubles at once.
%1 = load <4 x double>, ptr %input, align 8
; Saturating conversion of the four lanes.
%2 = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> %1)
; Store the four i32 results.
store <4 x i32> %2, ptr %output, align 4
ret void
}
; Non-saturating variant (presumably the IR for the `to_int_unchecked` function above):
; identical shape to @to_int, but uses the plain `fptosi` instruction instead of the intrinsic.
define void @to_int_unchecked(ptr noalias noundef readonly align 8 dereferenceable(32) %input, ptr noalias noundef align 4 dereferenceable(16) %output) {
; Load all four doubles at once.
%1 = load <4 x double>, ptr %input, align 8
; Plain (non-saturating) conversion of the four lanes.
%2 = fptosi <4 x double> %1 to <4 x i32>
; Store the four i32 results.
store <4 x i32> %2, ptr %output, align 4
ret void
}
; Manual variant (presumably the IR for the NaN-check + clamp function above):
; NaN lanes are zeroed, the value is clamped with compare/select pairs, then converted with plain `fptosi`.
define void @to_int_manual(ptr noalias noundef readonly align 8 dereferenceable(32) %input, ptr noalias noundef align 4 dereferenceable(16) %output) {
; Load all four doubles at once.
%1 = load <4 x double>, ptr %input, align 8
; `fcmp ord` against zero is true for every lane that is not NaN.
%2 = fcmp ord <4 x double> %1, zeroinitializer
; Keep non-NaN lanes, replace NaN lanes with 0.0.
%3 = select <4 x i1> %2, <4 x double> %1, <4 x double> zeroinitializer
; Lower clamp: 0xC1E0000000000000 is -2147483648.0 (-2^31).
%4 = fcmp olt <4 x double> %3, <double 0xC1E0000000000000, double 0xC1E0000000000000, double 0xC1E0000000000000, double 0xC1E0000000000000>
%5 = select <4 x i1> %4, <4 x double> <double 0xC1E0000000000000, double 0xC1E0000000000000, double 0xC1E0000000000000, double 0xC1E0000000000000>, <4 x double> %3
; Upper clamp: 0x41DFFFFFFFC00000 is 2147483647.0 (2^31 - 1).
%6 = fcmp ogt <4 x double> %5, <double 0x41DFFFFFFFC00000, double 0x41DFFFFFFFC00000, double 0x41DFFFFFFFC00000, double 0x41DFFFFFFFC00000>
%7 = select <4 x i1> %6, <4 x double> <double 0x41DFFFFFFFC00000, double 0x41DFFFFFFFC00000, double 0x41DFFFFFFFC00000, double 0x41DFFFFFFFC00000>, <4 x double> %5
; Plain (non-saturating) conversion of the already-clamped lanes.
%8 = fptosi <4 x double> %7 to <4 x i32>
; Store the four i32 results.
store <4 x i32> %8, ptr %output, align 4
ret void
}Conversion to i64 should also be possible to vectorize, but probably requires avx512 target features.