[MCP] Enhance MCP copy Instruction removal for special case (reapply) (#74239)

The Machine Copy Propagation pass may miss opportunities to remove redundant copy
instructions during the ForwardCopyPropagateBlock procedure. When we clobber a "Def"
register, we also need to remove the record in the copy maps indicating that "Src"
defined "Def", to preserve the semantics of the ClobberRegister function. This patch
reapplies #70778 and addresses the corner-case bug #73512, which is specific to the
AMDGPU backend. It also refines the criteria for removing empty records from the copy
maps, improving overall safety.

For more context, see the code generated for the C++ test case's "vector.body" block
after the MCP pass: https://gcc.godbolt.org/z/nK4oMaWv5.
LWenH committed Dec 26, 2023
1 parent c019ed9 commit dc1fade
Showing 14 changed files with 98 additions and 58 deletions.
42 changes: 40 additions & 2 deletions llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -175,8 +175,46 @@ class CopyTracker {
         if (MachineInstr *MI = I->second.MI) {
           std::optional<DestSourcePair> CopyOperands =
               isCopyInstr(*MI, TII, UseCopyInstr);
-          markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
-                              TRI);
+
+          MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+          MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+
+          markRegsUnavailable(Def, TRI);
+
+          // Since we clobber the destination of a copy, the semantics of Src's
+          // "DefRegs" containing Def no longer hold. We will also need
+          // to remove the record from the copy maps that indicates Src defined
+          // Def. Failing to do so might cause the target to miss some
+          // opportunities to further eliminate redundant copy instructions.
+          // Consider the following sequence during the
+          // ForwardCopyPropagateBlock procedure:
+          // L1: r0 = COPY r9 <- TrackMI
+          // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
+          // L3: use r0 <- Remove L2 from MaybeDeadCopies
+          // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
+          // L5: r0 = COPY r8 <- Remove NopCopy
+          for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+            auto SrcCopy = Copies.find(SrcUnit);
+            if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
+              // If SrcCopy defines multiple values, we only need
+              // to erase the record for Def in DefRegs.
+              for (auto itr = SrcCopy->second.DefRegs.begin();
+                   itr != SrcCopy->second.DefRegs.end(); itr++) {
+                if (*itr == Def) {
+                  SrcCopy->second.DefRegs.erase(itr);
+                  // If DefRegs becomes empty after the removal, we can remove
+                  // the SrcCopy from the tracker's copy maps. We only remove
+                  // entries that solely record that Def is defined by Src. If
+                  // an entry also contains the definition records of other Def
+                  // registers, it cannot be cleared.
+                  if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
+                    Copies.erase(SrcCopy);
+                  }
+                  break;
+                }
+              }
+            }
+          }
         }
         // Now we can erase the copy.
         Copies.erase(I);
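For reference, the fields manipulated above (MI, LastSeenUseInCopy, DefRegs) belong to the tracker's per-register-unit record. The sketch below approximates its shape; it assumes LLVM headers on the include path and is not the verbatim upstream definition (the real CopyInfo also carries an availability flag and lives in a DenseMap keyed by MCRegUnit).

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstr.h"

// Approximate shape of the per-unit record consulted by the hunk above.
struct CopyInfoSketch {
  llvm::MachineInstr *MI = nullptr;                // copy whose destination covers this unit
  llvm::MachineInstr *LastSeenUseInCopy = nullptr; // last copy reading this unit as a source
  llvm::SmallVector<llvm::MCRegister, 4> DefRegs;  // destinations still holding this unit's value
};

With this shape in mind, the new loop erases Def from DefRegs for each unit of Src and removes the whole entry only when DefRegs is empty and MI is null, that is, when the entry recorded nothing beyond "Src defined Def".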
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir
@@ -0,0 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -march=amdgcn -mcpu=gfx900 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s

# The MachineCopyPropagation pass should not treat the second
# "$sgpr2_sgpr3 = COPY $sgpr6_sgpr7" instruction as a NopCopy.
# For details, refer to issue #73512.
---
name: foo
body: |
  bb.0.entry:
    liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
    ; CHECK-LABEL: name: foo
    ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
    ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
    ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr0
    ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
    ; CHECK-NEXT: S_NOP 0, implicit $sgpr2_sgpr3
    $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
    $sgpr0 = COPY $sgpr3
    S_NOP 0, implicit-def $sgpr0
    $sgpr3 = COPY killed $sgpr5
    $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
    S_NOP 0, implicit $sgpr2_sgpr3
...
1 change: 0 additions & 1 deletion llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfabs.v v16, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t
; CHECK-NEXT: frflags a0
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
4 changes: 0 additions & 4 deletions llvm/test/CodeGen/X86/shift-i128.ll
@@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: shrl %cl, %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %esi, 28(%ecx)
@@ -489,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, %ebx
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: sarl %cl, %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %ebp, 28(%ecx)
@@ -623,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %ecx, %edi
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %ebp
; i686-NEXT: movl 64(%esp,%ebp), %esi
; i686-NEXT: movl %edi, %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: movl (%esp), %edi # 4-byte Reload
; i686-NEXT: shldl %cl, %edi, %esi
1 change: 0 additions & 1 deletion llvm/test/CodeGen/X86/shift-i256.ll
@@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; CHECK-NEXT: movl 28(%esp,%ebp), %edx
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, %esi
21 changes: 9 additions & 12 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14447,7 +14447,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
@@ -14483,7 +14482,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm11
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -14516,7 +14515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
@@ -14530,8 +14529,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
@@ -14738,7 +14737,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
@@ -14747,7 +14745,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128>
; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm1
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
@@ -14823,14 +14820,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm13
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -8490,12 +8490,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512F-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2480,7 +2480,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pandn %xmm5, %xmm1
@@ -2537,7 +2536,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: pandn %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -1181,13 +1181,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm9, %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm9, %xmm11
; SSE-NEXT: pand %xmm1, %xmm11
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pand %xmm10, %xmm11
; SSE-NEXT: movdqa %xmm10, %xmm4
; SSE-NEXT: pandn %xmm0, %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm13
; SSE-NEXT: movdqa %xmm13, %xmm4
; SSE-NEXT: pand %xmm1, %xmm4
; SSE-NEXT: pand %xmm10, %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 176(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm10
21 changes: 10 additions & 11 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm9, %xmm7
; SSE-NEXT: pand %xmm14, %xmm7
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm6, %xmm15
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: pand %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: pandn %xmm8, %xmm3
@@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pand %xmm13, %xmm9
; SSE-NEXT: por %xmm0, %xmm9
; SSE-NEXT: movdqa %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm13
@@ -2185,7 +2184,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm3, %xmm2
; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: por %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm14, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: movdqa %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm11, %xmm6
; SSE-NEXT: pandn %xmm11, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm0, %xmm5
; SSE-NEXT: pand %xmm14, %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: pand %xmm0, %xmm3
; SSE-NEXT: pand %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9
@@ -11289,7 +11288,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -11302,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
