[MCP] Enhance MCP copy Instruction removal for special case (#70778)
The Machine Copy Propagation pass may miss opportunities to remove redundant
copy instructions during the ForwardCopyPropagateBlock procedure. When we
clobber a "Def" register, we also need to remove the record from the copy maps
that indicates "Src" defined "Def", to preserve the correct semantics of the
ClobberRegister function.

For more information, see the code generated for the C++ test case's
"vector.body" after the MCP pass: https://gcc.godbolt.org/z/nK4oMaWv5.
LWenH committed Nov 22, 2023
1 parent c2b3f16 commit cae46f6
Showing 13 changed files with 96 additions and 51 deletions.
41 changes: 38 additions & 3 deletions llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -175,8 +175,43 @@ class CopyTracker {
        if (MachineInstr *MI = I->second.MI) {
          std::optional<DestSourcePair> CopyOperands =
              isCopyInstr(*MI, TII, UseCopyInstr);
-          markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
-                              TRI);
+
+          MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+          MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+
+          markRegsUnavailable(Def, TRI);
+
+          // Since we clobber the destination of a copy, the semantics of
+          // Src's "DefRegs" containing Def are no longer valid. We also need
+          // to remove the record from the copy maps that indicates Src
+          // defined Def. Failing to do so might cause the target to miss
+          // some opportunities to further eliminate redundant copy
+          // instructions. Consider the following sequence during the
+          // ForwardCopyPropagateBlock procedure:
+          // L1: r0 = COPY r9 <- TrackMI
+          // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
+          // L3: use r0 <- Remove L2 from MaybeDeadCopies
+          // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
+          // L5: r0 = COPY r8 <- Remove NopCopy
+          for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+            auto SrcCopy = Copies.find(SrcUnit);
+            if (SrcCopy != Copies.end() &&
+                SrcCopy->second.LastSeenUseInCopy) {
+              // If SrcCopy defines multiple values, we only need to erase the
+              // record for Def in DefRegs.
+              for (auto itr = SrcCopy->second.DefRegs.begin();
+                   itr != SrcCopy->second.DefRegs.end(); itr++) {
+                if (*itr == Def) {
+                  SrcCopy->second.DefRegs.erase(itr);
+                  // If DefRegs becomes empty after the removal, we can
+                  // directly remove SrcCopy from the tracker's copy maps.
+                  if (SrcCopy->second.DefRegs.empty()) {
+                    Copies.erase(SrcCopy);
+                  }
+                  break;
+                }
+              }
+            }
+          }
        }
        // Now we can erase the copy.
        Copies.erase(I);
@@ -785,6 +820,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
      // ...
      // %xmm2 = copy %xmm9
      Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr);
+
      for (const MachineOperand &MO : MI.implicit_operands()) {
        if (!MO.isReg() || !MO.isDef())
          continue;
@@ -795,7 +831,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
      }

      Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
-
      continue;
    }
  }
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -9,6 +9,10 @@
  entry:
    ret void
  }
+  define void @bar() {
+  entry:
+    ret void
+  }
...
---
name: foo
@@ -21,6 +25,7 @@ body: |
    ; RV32-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
    ; RV32-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
    ; RV32-NEXT: PseudoRET implicit $v28
+    ;
    ; RV64-LABEL: name: foo
    ; RV64: liveins: $v28_v29_v30, $v8_v9, $v1
    ; RV64-NEXT: {{ $}}
@@ -32,3 +37,30 @@
    renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
    PseudoRET implicit $v28
...
+---
+name: bar
+body: |
+  bb.0.entry:
+    liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
+    ; RV32-LABEL: name: bar
+    ; RV32: liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
+    ; RV32-NEXT: {{ $}}
+    ; RV32-NEXT: $v0 = COPY renamable $v8
+    ; RV32-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+    ; RV32-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+    ; RV32-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+    ;
+    ; RV64-LABEL: name: bar
+    ; RV64: liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
+    ; RV64-NEXT: {{ $}}
+    ; RV64-NEXT: $v0 = COPY renamable $v8
+    ; RV64-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+    ; RV64-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+    ; RV64-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+    $v0 = COPY killed renamable $v9
+    $v0 = COPY renamable $v8
+    renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5, 1, implicit $vl, implicit $vtype
+    early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5, implicit $vl, implicit $vtype
+    $v0 = COPY killed renamable $v8
+    PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5, implicit $vl, implicit $vtype
+...
1 change: 0 additions & 1 deletion llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfabs.v v16, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t
; CHECK-NEXT: frflags a0
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
4 changes: 0 additions & 4 deletions llvm/test/CodeGen/X86/shift-i128.ll
@@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: shrl %cl, %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %esi, 28(%ecx)
@@ -489,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: sarl %cl, %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %ebp, 28(%ecx)
@@ -623,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ecx, %edi
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %ebp
; i686-NEXT: movl 64(%esp,%ebp), %esi
-; i686-NEXT: movl %edi, %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: movl (%esp), %edi # 4-byte Reload
; i686-NEXT: shldl %cl, %edi, %esi
1 change: 0 additions & 1 deletion llvm/test/CodeGen/X86/shift-i256.ll
@@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; CHECK-NEXT: movl 28(%esp,%ebp), %edx
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, %esi
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14979,7 +14979,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7]
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15]
@@ -15734,7 +15734,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm17
; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -5534,7 +5534,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
@@ -5979,7 +5979,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
@@ -6424,7 +6424,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
@@ -6869,7 +6869,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2476,7 +2476,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pandn %xmm5, %xmm1
@@ -2533,7 +2532,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: pandn %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
26 changes: 12 additions & 14 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm9, %xmm7
; SSE-NEXT: pand %xmm14, %xmm7
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm15
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: pand %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: pandn %xmm8, %xmm3
@@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pand %xmm13, %xmm9
; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm13
@@ -2185,7 +2184,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: por %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm14, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm11, %xmm6
; SSE-NEXT: pandn %xmm11, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: pand %xmm14, %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pand %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -9965,7 +9964,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm6
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm14
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm8
@@ -9994,12 +9992,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm7
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9
@@ -11289,7 +11288,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -11302,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11
+; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
