diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index abd6e64de0097..a032b31a1fc7c 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -175,43 +175,8 @@ class CopyTracker {
       if (MachineInstr *MI = I->second.MI) {
         std::optional<DestSourcePair> CopyOperands =
             isCopyInstr(*MI, TII, UseCopyInstr);
-
-          MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
-          MCRegister Src = CopyOperands->Source->getReg().asMCReg();
-
-          markRegsUnavailable(Def, TRI);
-
-          // Since we clobber the destination of a copy, the semantic of Src's
-          // "DefRegs" to contain Def is no longer effectual. We will also need
-          // to remove the record from the copy maps that indicates Src defined
-          // Def. Failing to do so might cause the target to miss some
-          // opportunities to further eliminate redundant copy instructions.
-          // Consider the following sequence during the
-          // ForwardCopyPropagateBlock procedure:
-          // L1: r0 = COPY r9 <- TrackMI
-          // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
-          // L3: use r0 <- Remove L2 from MaybeDeadCopies
-          // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
-          // L5: r0 = COPY r8 <- Remove NopCopy
-          for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
-            auto SrcCopy = Copies.find(SrcUnit);
-            if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
-              // If SrcCopy defines multiple values, we only need
-              // to erase the record for Def in DefRegs.
-              for (auto itr = SrcCopy->second.DefRegs.begin();
-                   itr != SrcCopy->second.DefRegs.end(); itr++) {
-                if (*itr == Def) {
-                  SrcCopy->second.DefRegs.erase(itr);
-                  // If DefReg becomes empty after removal, we can directly
-                  // remove SrcCopy from the tracker's copy maps.
-                  if (SrcCopy->second.DefRegs.empty()) {
-                    Copies.erase(SrcCopy);
-                  }
-                  break;
-                }
-              }
-            }
-          }
+        markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
+                            TRI);
       }
       // Now we can erase the copy.
       Copies.erase(I);
@@ -820,7 +785,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
       // ...
      // %xmm2 = copy %xmm9
       Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr);
-
       for (const MachineOperand &MO : MI.implicit_operands()) {
         if (!MO.isReg() || !MO.isDef())
           continue;
@@ -831,6 +795,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
       }
 
       Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
+      continue;
     }
   }
 
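The functional change above is easier to see in miniature. What follows is a toy model for illustration only, not LLVM's actual CopyTracker API: plain ints stand in for register units, and the MCRegister/renamable machinery is elided. It replays the L1..L5 sequence from the comment this change deletes, showing what the simpler clobber semantics gives up: since the stale "r9 defines r0" record is no longer scrubbed when r0 is redefined, the later early-clobber of r9 conservatively invalidates r0, and the final r0 = COPY r8 is no longer recognized as a nop. That matches the regenerated tests below, where a few copies reappear.

// toy_copy_tracker.cpp -- hypothetical sketch, not LLVM's CopyTracker.
// Build: c++ -std=c++17 toy_copy_tracker.cpp && ./a.out
#include <iostream>
#include <map>
#include <set>
#include <vector>

struct RegState {
  int CopiedFrom = -1;      // source of the last copy into this register
  std::vector<int> DefRegs; // registers this register was copied into
};

struct ToyTracker {
  std::map<int, RegState> Regs;
  std::set<int> Unavailable;

  // Simplified semantics kept by this change: mark every register this one
  // is believed to define as unavailable, then drop its own entry. The
  // deleted code additionally erased Def from Src's DefRegs here, so a
  // stale "Src defines Def" record could not outlive a redefinition of Def.
  void clobberRegister(int Reg) {
    auto It = Regs.find(Reg);
    if (It == Regs.end())
      return;
    for (int D : It->second.DefRegs)
      Unavailable.insert(D);
    Regs.erase(It);
  }

  void trackCopy(int Def, int Src) {
    clobberRegister(Def); // redefining Def invalidates whatever it held
    Regs[Def].CopiedFrom = Src;
    Regs[Src].DefRegs.push_back(Def);
    Unavailable.erase(Def);
  }

  // Def = COPY Src is a nop if the tracker still knows Def holds Src.
  bool isNopCopy(int Def, int Src) const {
    auto It = Regs.find(Def);
    return It != Regs.end() && It->second.CopiedFrom == Src &&
           !Unavailable.count(Def);
  }
};

int main() {
  enum { r0 = 0, r8 = 8, r9 = 9 };
  ToyTracker T;
  T.trackCopy(r0, r9);   // L1: r0 = COPY r9
  T.trackCopy(r0, r8);   // L2: r0 = COPY r8 ("r9 defines r0" now lingers)
  T.clobberRegister(r9); // L4: early-clobber r9 conservatively kills r0
  // L5: r0 = COPY r8 is no longer provably a nop, so it would be kept.
  std::cout << std::boolalpha << T.isNopCopy(r0, r8) << '\n'; // prints false
}

Per the deleted comment, the removed scrubbing loop would have dropped r0 from r9's DefRegs at L2, leaving the L5 copy recognizable as a nop; the extra register moves in the updated CHECK lines below are that class of missed elimination.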
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
index 565b048008608..8d03594fe1bfd 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
@@ -37,7 +37,7 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
 ; CHECK-NEXT:    std r30, 160(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    ld r30, 272(r1)
 ; CHECK-NEXT:    xxmtacc acc0
-; CHECK-NEXT:    xvf16ger2pp acc0, v28, v30
+; CHECK-NEXT:    xvf16ger2pp acc0, v2, v4
 ; CHECK-NEXT:    xxmfacc acc0
 ; CHECK-NEXT:    stxvp vsp0, 64(r1)
 ; CHECK-NEXT:    stxvp vsp2, 32(r1)
@@ -88,7 +88,7 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
 ; CHECK-BE-NEXT:    std r30, 240(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    ld r30, 368(r1)
 ; CHECK-BE-NEXT:    xxmtacc acc0
-; CHECK-BE-NEXT:    xvf16ger2pp acc0, v28, v30
+; CHECK-BE-NEXT:    xvf16ger2pp acc0, v2, v4
 ; CHECK-BE-NEXT:    xxmfacc acc0
 ; CHECK-BE-NEXT:    stxvp vsp0, 112(r1)
 ; CHECK-BE-NEXT:    stxvp vsp2, 144(r1)
diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
index 6515767e0ef63..33a8260c7bf52 100644
--- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
@@ -14,17 +14,17 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
 ; CHECK-NEXT:    vmr v1, v4
 ; CHECK-NEXT:    vmr v4, v3
 ; CHECK-NEXT:    vmr v0, v2
-; CHECK-NEXT:    vmr v3, v0
-; CHECK-NEXT:    ld r3, 96(r1)
 ; CHECK-NEXT:    xxlor vs3, v5, v5
-; CHECK-NEXT:    vmr v2, v5
+; CHECK-NEXT:    ld r3, 96(r1)
 ; CHECK-NEXT:    xxlor vs0, v0, v0
 ; CHECK-NEXT:    xxlor vs1, v1, v1
 ; CHECK-NEXT:    xxlor vs2, v4, v4
 ; CHECK-NEXT:    xxmtacc acc0
-; CHECK-NEXT:    xvi4ger8pp acc0, v0, v4
-; CHECK-NEXT:    xvf16ger2pp acc0, v0, v1
-; CHECK-NEXT:    pmxvf32gerpn acc0, v4, v5, 0, 0
+; CHECK-NEXT:    xvi4ger8pp acc0, v2, v3
+; CHECK-NEXT:    xvf16ger2pp acc0, v2, v1
+; CHECK-NEXT:    pmxvf32gerpn acc0, v3, v5, 0, 0
+; CHECK-NEXT:    vmr v3, v2
+; CHECK-NEXT:    vmr v2, v5
 ; CHECK-NEXT:    pmxvf64gernp acc0, vsp34, v0, 0, 0
 ; CHECK-NEXT:    xxmfacc acc0
 ; CHECK-NEXT:    stxv vs0, 48(r3)
@@ -38,17 +38,17 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
 ; CHECK-BE-NEXT:    vmr v1, v4
 ; CHECK-BE-NEXT:    vmr v4, v3
 ; CHECK-BE-NEXT:    vmr v0, v2
-; CHECK-BE-NEXT:    vmr v3, v0
-; CHECK-BE-NEXT:    ld r3, 112(r1)
 ; CHECK-BE-NEXT:    xxlor vs3, v5, v5
-; CHECK-BE-NEXT:    vmr v2, v5
+; CHECK-BE-NEXT:    ld r3, 112(r1)
 ; CHECK-BE-NEXT:    xxlor vs0, v0, v0
 ; CHECK-BE-NEXT:    xxlor vs1, v1, v1
 ; CHECK-BE-NEXT:    xxlor vs2, v4, v4
 ; CHECK-BE-NEXT:    xxmtacc acc0
-; CHECK-BE-NEXT:    xvi4ger8pp acc0, v0, v4
-; CHECK-BE-NEXT:    xvf16ger2pp acc0, v0, v1
-; CHECK-BE-NEXT:    pmxvf32gerpn acc0, v4, v5, 0, 0
+; CHECK-BE-NEXT:    xvi4ger8pp acc0, v2, v3
+; CHECK-BE-NEXT:    xvf16ger2pp acc0, v2, v1
+; CHECK-BE-NEXT:    pmxvf32gerpn acc0, v3, v5, 0, 0
+; CHECK-BE-NEXT:    vmr v3, v2
+; CHECK-BE-NEXT:    vmr v2, v5
 ; CHECK-BE-NEXT:    pmxvf64gernp acc0, vsp34, v0, 0, 0
 ; CHECK-BE-NEXT:    xxmfacc acc0
 ; CHECK-BE-NEXT:    stxv vs1, 16(r3)
diff --git a/llvm/test/CodeGen/RISCV/machine-cp.mir b/llvm/test/CodeGen/RISCV/machine-cp.mir
index 14ae069e5ef70..f3674f89cd918 100644
--- a/llvm/test/CodeGen/RISCV/machine-cp.mir
+++ b/llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -9,10 +9,6 @@
   entry:
     ret void
   }
-  define void @bar() {
-  entry:
-    ret void
-  }
 ...
 ---
 name: foo
@@ -25,7 +21,6 @@ body: |
    ; RV32-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
    ; RV32-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
    ; RV32-NEXT: PseudoRET implicit $v28
-    ;
    ; RV64-LABEL: name: foo
    ; RV64: liveins: $v28_v29_v30, $v8_v9, $v1
    ; RV64-NEXT: {{ $}}
@@ -37,30 +32,3 @@ body: |
    renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
    PseudoRET implicit $v28
 ...
----
-name: bar
-body: |
-  bb.0.entry:
-    liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
-    ; RV32-LABEL: name: bar
-    ; RV32: liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
-    ; RV32-NEXT: {{ $}}
-    ; RV32-NEXT: $v0 = COPY renamable $v8
-    ; RV32-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
-    ; RV32-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
-    ; RV32-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
-    ;
-    ; RV64-LABEL: name: bar
-    ; RV64: liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
-    ; RV64-NEXT: {{ $}}
-    ; RV64-NEXT: $v0 = COPY renamable $v8
-    ; RV64-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
-    ; RV64-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
-    ; RV64-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
-    $v0 = COPY killed renamable $v9
-    $v0 = COPY renamable $v8
-    renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5, 1, implicit $vl, implicit $vtype
-    early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5, implicit $vl, implicit $vtype
-    $v0 = COPY killed renamable $v8
-    PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5, implicit $vl, implicit $vtype
-...
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 5407eadb160bd..d9958f4aae350 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -637,6 +637,7 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v1
 ; CHECK-NEXT:    vmflt.vf v1, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2..1fe8d834dbcdd 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -347,6 +347,7 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    shrl %cl, %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl %esi, 28(%ecx)
@@ -488,6 +489,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    sarl %cl, %ebp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl %ebp, 28(%ecx)
@@ -621,9 +623,11 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    shll %cl, %edi
 ; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ecx, %edi
 ; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    negl %ebp
 ; i686-NEXT:    movl 64(%esp,%ebp), %esi
+; i686-NEXT:    movl %edi, %ecx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; i686-NEXT:    shldl %cl, %edi, %esi
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf4225..0e4e706669300 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -78,6 +78,7 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; CHECK-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; CHECK-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
 ; CHECK-NEXT:    movl 28(%esp,%ebp), %edx
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index b2b5bcc5b44b2..abab313f4b12e 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %ecx, %esi
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 9a3bbf1d416cc..dc8fabe3a4329 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14979,7 +14979,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm12, %ymm7
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15]
@@ -15734,7 +15734,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm12
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm16, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm4, %ymm17
 ; AVX512DQ-FAST-NEXT:    vmovdqa %ymm15, %ymm13
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm12, %xmm14
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 831766ececbac..104e42930d4c7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -4642,7 +4642,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-ONLY-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-NEXT:    vpermt2q %zmm1, %zmm4, %zmm10
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-NEXT:    vpermt2q %zmm5, %zmm29, %zmm9
+; AVX512BW-ONLY-NEXT:    vpermt2q %zmm8, %zmm29, %zmm9
 ; AVX512BW-ONLY-NEXT:    vpermt2q %zmm0, %zmm4, %zmm8
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, %zmm1
 ; AVX512BW-ONLY-NEXT:    vpermt2q %zmm12, %zmm7, %zmm3
@@ -5087,7 +5087,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQBW-ONLY-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-ONLY-NEXT:    vpermt2q %zmm1, %zmm4, %zmm10
 ; AVX512DQBW-ONLY-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-ONLY-NEXT:    vpermt2q %zmm5, %zmm29, %zmm9
+; AVX512DQBW-ONLY-NEXT:    vpermt2q %zmm8, %zmm29, %zmm9
 ; AVX512DQBW-ONLY-NEXT:    vpermt2q %zmm0, %zmm4, %zmm8
 ; AVX512DQBW-ONLY-NEXT:    vmovdqa64 %zmm7, %zmm1
 ; AVX512DQBW-ONLY-NEXT:    vpermt2q %zmm12, %zmm7, %zmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 7947dd0ff373c..4d5c8fe891e81 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2476,6 +2476,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
 ; SSE-NEXT:    psllq $48, %xmm0
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm7, %xmm4
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm5, %xmm1
@@ -2532,7 +2533,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pandn %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm8, %xmm1
 ; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT:    movdqa %xmm7, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm0
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 2e305467e0c22..370a7d2213692 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm9, %xmm7
 ; SSE-NEXT:    pand %xmm14, %xmm7
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm5, %xmm15
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm6, %xmm15
 ; SSE-NEXT:    pand %xmm14, %xmm15
 ; SSE-NEXT:    movdqa %xmm11, %xmm3
 ; SSE-NEXT:    pandn %xmm8, %xmm3
@@ -2148,6 +2148,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm5, %xmm9
 ; SSE-NEXT:    pand %xmm13, %xmm9
 ; SSE-NEXT:    por %xmm0, %xmm9
+; SSE-NEXT:    movdqa %xmm6, %xmm3
 ; SSE-NEXT:    movdqa %xmm6, %xmm0
 ; SSE-NEXT:    pand %xmm13, %xmm0
 ; SSE-NEXT:    pandn %xmm10, %xmm13
@@ -2184,7 +2185,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pandn %xmm6, %xmm2
+; SSE-NEXT:    pandn %xmm3, %xmm2
 ; SSE-NEXT:    por %xmm10, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5450,19 +5451,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm14, %xmm6
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm14, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm3
 ; SSE-NEXT:    movdqa %xmm11, %xmm6
 ; SSE-NEXT:    pandn %xmm11, %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pand %xmm14, %xmm5
+; SSE-NEXT:    pand %xmm0, %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    pand %xmm14, %xmm3
+; SSE-NEXT:    pand %xmm0, %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -9964,6 +9965,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %ymm23, %ymm3, %ymm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm13, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, %xmm14
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm8
@@ -9992,12 +9994,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %ymm5, %ymm25, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm11, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm10, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm0, %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm14, %xmm7
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index f2133b9e42d30..0c2df82fd1be5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11212,6 +11212,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm5, %xmm23
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
 ; AVX512F-SLOW-NEXT:    vpsrlq $32, %zmm17, %zmm9
@@ -11288,6 +11289,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm30, %xmm10
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm10, %xmm9
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
+; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm12, %xmm31
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -11300,7 +11302,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm21, %xmm11
+; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm23, %xmm11
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 810ff07eafb63..3a0e1d92b48c0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1343,9 +1343,10 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3]
 ; SSE-NEXT:    movdqa %xmm15, %xmm10
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; SSE-NEXT:    movdqa %xmm5, %xmm1
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
 ; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2]
 ; SSE-NEXT:    andps %xmm8, %xmm1
 ; SSE-NEXT:    orps %xmm6, %xmm1
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc8797..24475360cbbc4 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -1845,6 +1845,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
@@ -2484,6 +2485,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebx), %edx
@@ -3127,6 +3129,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
@@ -3559,6 +3562,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
@@ -4193,6 +4197,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -4874,6 +4879,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r15, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%r10), %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r12, %rsi
@@ -5194,7 +5200,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -5205,6 +5211,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
@@ -5527,6 +5534,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -6225,6 +6233,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
@@ -6863,6 +6872,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -7350,9 +7360,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X64: {{.*}}
-; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
 ; X86: {{.*}}
 ; X86-NO-SHLD: {{.*}}
 ; X86-SHLD: {{.*}}
+; X64: {{.*}}
+; X64-NO-SHLD: {{.*}}
+; X64-SHLD: {{.*}}