diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index a133641dc6a19..7a81198fc9bae 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1883,7 +1883,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
   unsigned KMaskOp = -1U;
   if (X86II::isKMasked(TSFlags)) {
     // For k-zero-masked operations it is Ok to commute the first vector
-    // operand.
+    // operand. Unless this is an intrinsic instruction.
     // For regular k-masked operations a conservative choice is done as the
     // elements of the first vector operand, for which the corresponding bit
     // in the k-mask operand is set to 0, are copied to the result of the
@@ -1902,7 +1902,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
     // The operand with index = 1 is used as a source for those elements for
     // which the corresponding bit in the k-mask is set to 0.
-    if (X86II::isKMergeMasked(TSFlags))
+    if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
       FirstCommutableVecOp = 3;
 
     LastCommutableVecOp++;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index a089e1ee99af5..8844c0e5e95f1 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -5818,6 +5818,35 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo
   ret <4 x float> %res2
 }
 
+; Make sure we don't commute this to fold the load as that source isn't commutable.
+define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, <4 x float>* nocapture readonly %1, float %2, float %3) {
+; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps (%rsi), %xmm2
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + xmm1
+; X64-NEXT:    vmovaps %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
+; X86-NEXT:    retl
+  %5 = load <4 x float>, <4 x float>* %1, align 16
+  %6 = extractelement <4 x float> %5, i64 0
+  %7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2
+  %8 = bitcast i8 %0 to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float 0.000000e+00
+  %11 = insertelement <4 x float> %5, float %10, i64 0
+  ret <4 x float> %11
+}
+
 define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
 ; X64:       # %bb.0: