diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4ff84ce8b79f6..f7a4295b11c57 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13826,12 +13826,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduce_fadd_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_mul_d512:
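Note (not part of the patch): the builder's fast-math flags are sticky, so the `FastMathFlagGuard` above is what keeps `reassoc` from leaking onto later floating-point instructions emitted in the same function; the `CHECK-NOT: reassoc` lines in the tests below depend on exactly that. A minimal standalone sketch of the pattern, with an illustrative helper name (`emitReassocFAddReduce` is not from the patch):

```cpp
// Standalone sketch of the FastMathFlagGuard pattern used above: the guard
// saves the builder's current fast-math flags and restores them when it goes
// out of scope, so `reassoc` applies to the reduction call only.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Value *emitReassocFAddReduce(IRBuilder<> &Builder, Module &M,
                                    Value *Acc, Value *Vec) {
  Function *F = Intrinsic::getDeclaration(&M, Intrinsic::vector_reduce_fadd,
                                          {Vec->getType()});
  IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); // restores flags on exit
  Builder.getFastMathFlags().setAllowReassoc(true); // this scope only
  return Builder.CreateCall(F, {Acc, Vec});         // call carries `reassoc`
}
```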
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 2ee4350b14d43..f226382cbb2c6 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9297,9 +9297,12 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
 /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
  * outputs. This class of vector operation forms the basis of many scientific
- * computations. In vector-reduction arithmetic, the evaluation off is
+ * computations. In vector-reduction arithmetic, the evaluation order is
  * independent of the order of the input elements of V.
 
+ * For floating-point types, we always assume the elements are reassociable even
+ * if -ffast-math is off.
+
  * Used bisection method. At each step, we partition the vector with previous
  * step in half, and the operation is performed on its two halves.
  * This takes log2(n) steps where n is the number of elements in the vector.
  */
@@ -9345,8 +9348,11 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
   return __builtin_ia32_reduce_or_q512(__W);
 }
 
+// -0.0 is used to ignore the start value since it is the neutral value of
+// floating-point addition. For more information, please refer to
+// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
-  return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
+  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
 }
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_reduce_mul_pd(__m512d __W) {
@@ -9356,7 +9362,7 @@ static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W)
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
   __W = _mm512_maskz_mov_pd(__M, __W);
-  return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
+  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
@@ -9411,7 +9417,7 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_ps(__m512 __W) {
-  return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
+  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
@@ -9422,7 +9428,7 @@ _mm512_reduce_mul_ps(__m512 __W) {
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
   __W = _mm512_maskz_mov_ps(__M, __W);
-  return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
+  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
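Why -0.0 rather than 0.0: under IEEE-754, +0.0 is not the additive identity, because it loses the sign of a -0.0 input (-0.0 + 0.0 == +0.0), whereas -0.0 + x == x for every x. A standalone C++ check (not part of the patch) of the sign-of-zero rule the comment relies on:

```cpp
// +0.0 as a start value flips the sign of an all-(-0.0) reduction;
// -0.0 is the true neutral element of floating-point addition.
#include <cmath>
#include <cstdio>

int main() {
  std::printf("%d\n", std::signbit(0.0 + -0.0));  // 0: start value +0.0 loses -0.0
  std::printf("%d\n", std::signbit(-0.0 + -0.0)); // 1: start value -0.0 keeps it
}
```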
diff --git a/clang/test/CodeGen/X86/avx512-reduceIntrin.c b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
index d8a1130f3cef0..62580ca1914e9 100644
--- a/clang/test/CodeGen/X86/avx512-reduceIntrin.c
+++ b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
@@ -11,13 +11,13 @@ long long test_mm512_reduce_add_epi64(__m512i __W){
 long long test_mm512_reduce_mul_epi64(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_epi64(
 // CHECK: call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}})
-  return _mm512_reduce_mul_epi64(__W);
+  return _mm512_reduce_mul_epi64(__W);
 }
 
 long long test_mm512_reduce_or_epi64(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_or_epi64(
 // CHECK: call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}})
-  return _mm512_reduce_or_epi64(__W);
+  return _mm512_reduce_or_epi64(__W);
 }
 
 long long test_mm512_reduce_and_epi64(__m512i __W){
@@ -31,7 +31,7 @@ long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK: call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_add_epi64(__M, __W);
+  return _mm512_mask_reduce_add_epi64(__M, __W);
 }
 
 long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){
@@ -39,7 +39,7 @@ long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK: call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_mul_epi64(__M, __W);
+  return _mm512_mask_reduce_mul_epi64(__M, __W);
 }
 
 long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){
@@ -47,7 +47,7 @@ long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK: call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_and_epi64(__M, __W);
+  return _mm512_mask_reduce_and_epi64(__M, __W);
 }
 
 long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){
@@ -55,30 +55,30 @@ long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK: call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_or_epi64(__M, __W);
+  return _mm512_mask_reduce_or_epi64(__M, __W);
 }
 
 int test_mm512_reduce_add_epi32(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_add_epi32(
 // CHECK: call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_add_epi32(__W);
+  return _mm512_reduce_add_epi32(__W);
 }
 
 int test_mm512_reduce_mul_epi32(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_epi32(
 // CHECK: call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_mul_epi32(__W);
+  return _mm512_reduce_mul_epi32(__W);
 }
 
 int test_mm512_reduce_or_epi32(__m512i __W){
 // CHECK: call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_or_epi32(__W);
+  return _mm512_reduce_or_epi32(__W);
 }
 
 int test_mm512_reduce_and_epi32(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_and_epi32(
 // CHECK: call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_and_epi32(__W);
+  return _mm512_reduce_and_epi32(__W);
 }
 
 int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){
@@ -86,7 +86,7 @@ int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK: call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_add_epi32(__M, __W);
+  return _mm512_mask_reduce_add_epi32(__M, __W);
 }
 
 int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){
@@ -94,7 +94,7 @@ int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK: call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_mul_epi32(__M, __W);
+  return _mm512_mask_reduce_mul_epi32(__M, __W);
 }
 
 int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){
@@ -102,7 +102,7 @@ int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK: call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_and_epi32(__M, __W);
+  return _mm512_mask_reduce_and_epi32(__M, __W);
 }
 
 int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
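The floating-point tests in the next hunk bracket the reduction with `CHECK-NOT: reassoc`, ensuring the flag lands only on the reduce call and not on the caller's surrounding arithmetic; the new `ExtraAddOp`/`ExtraMulOp` parameters exist to generate that neighboring `fadd`/`fmul`. Reassociation is observable in the result, as this standalone C++ snippet (not part of the patch) shows:

```cpp
// Reassociating a floating-point sum can change the result, which is why the
// tests pin `reassoc` to the reduction call and CHECK-NOT it on neighbors.
#include <cstdio>

int main() {
  float a = 1e8f, b = -1e8f, c = 1.0f;
  std::printf("%g\n", (a + b) + c); // 1: a and b cancel exactly first
  std::printf("%g\n", a + (b + c)); // 0: c is absorbed into b by rounding
}
```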
@@ -110,61 +110,65 @@ int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK: call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_or_epi32(__M, __W);
+  return _mm512_mask_reduce_or_epi32(__M, __W);
 }
 
-double test_mm512_reduce_add_pd(__m512d __W){
+double test_mm512_reduce_add_pd(__m512d __W, double ExtraAddOp){
 // CHECK-LABEL: @test_mm512_reduce_add_pd(
-// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
-  return _mm512_reduce_add_pd(__W);
+// CHECK-NOT: reassoc
+// CHECK: call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}})
+// CHECK-NOT: reassoc
+  return _mm512_reduce_add_pd(__W) + ExtraAddOp;
 }
 
-double test_mm512_reduce_mul_pd(__m512d __W){
+double test_mm512_reduce_mul_pd(__m512d __W, double ExtraMulOp){
 // CHECK-LABEL: @test_mm512_reduce_mul_pd(
-// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
-  return _mm512_reduce_mul_pd(__W);
+// CHECK-NOT: reassoc
+// CHECK: call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+// CHECK-NOT: reassoc
+  return _mm512_reduce_mul_pd(__W) * ExtraMulOp;
 }
 
 float test_mm512_reduce_add_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_add_ps(
-// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
-  return _mm512_reduce_add_ps(__W);
+// CHECK: call reassoc float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}})
+  return _mm512_reduce_add_ps(__W);
 }
 
 float test_mm512_reduce_mul_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_ps(
-// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
-  return _mm512_reduce_mul_ps(__W);
+// CHECK: call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+  return _mm512_reduce_mul_ps(__W);
 }
 
 double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
-  return _mm512_mask_reduce_add_pd(__M, __W);
+// CHECK: call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}})
+  return _mm512_mask_reduce_add_pd(__M, __W);
 }
 
 double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
-  return _mm512_mask_reduce_mul_pd(__M, __W);
+// CHECK: call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+  return _mm512_mask_reduce_mul_pd(__M, __W);
 }
 
 float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
-// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
-  return _mm512_mask_reduce_add_ps(__M, __W);
+// CHECK: call reassoc float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}})
+  return _mm512_mask_reduce_add_ps(__M, __W);
 }
 
 float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
-// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
-  return _mm512_mask_reduce_mul_ps(__M, __W);
+// CHECK: call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+  return _mm512_mask_reduce_mul_ps(__M, __W);
 }
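For reference, a minimal usage sketch of the affected intrinsic (hypothetical helper, not from the patch; compile with -mavx512f):

```cpp
// The reduction is reassociated internally even without -ffast-math, so the
// low bits of the result may differ from a sequential left-to-right loop.
#include <immintrin.h>

double sum8(const double *p) {
  __m512d v = _mm512_loadu_pd(p);  // load 8 doubles
  return _mm512_reduce_add_pd(v);  // log2(8) = 3 tree-reduction steps
}
```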