[DAG] visitVECTOR_SHUFFLE - fold splat(insert_vector_elt()) and splat(scalar_to_vector()) to build_vector splats

Addresses a number of regressions identified in D127115
RKSimon committed Jun 11, 2022
1 parent ae2ae84 commit cf5c63d
Showing 8 changed files with 53 additions and 74 deletions.
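
For context, a minimal IR sketch of the two shuffle shapes the new fold targets (hypothetical functions written for illustration, not tests taken from this commit). In both cases the shuffle splats exactly the lane that was just written with the scalar %x, so the combiner can rebuild the node as a BUILD_VECTOR splat of %x instead of keeping the shuffle:

define <4 x i32> @splat_from_scalar_to_vector(i32 %x) {
  ; an insert into undef at lane 0 is typically built as scalar_to_vector(x)
  %v = insertelement <4 x i32> undef, i32 %x, i32 0
  ; splat of lane 0 -> build_vector(x, x, x, x)
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}

define <4 x i32> @splat_from_insert_vector_elt(<4 x i32> %v, i32 %x) {
  ; insert_vector_elt(v, x, 2)
  %i = insertelement <4 x i32> %v, i32 %x, i32 2
  ; splat index 2 matches the insert index -> build_vector(x, x, x, x)
  %s = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %s
}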
13 changes: 13 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22313,6 +22313,19 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
      return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
    }

    // splat(scalar_to_vector(x), 0) -> build_vector(x,...,x)
    // splat(insert_vector_elt(v, x, c), c) -> build_vector(x,...,x)
    if ((!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) &&
        N0.hasOneUse()) {
      if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && SplatIndex == 0)
        return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(0));

      if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT)
        if (auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(2)))
          if (Idx->getAPIntValue() == SplatIndex)
            return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1));
    }

    // If this is a bit convert that changes the element type of the vector but
    // not the number of vector elements, look through it. Be careful not to
    // look through conversions that change things like v4f32 to v2f64.
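
Both forms are gated the same way: BUILD_VECTOR must be legal for VT (or operation legalization has not run yet), and the shuffled operand must have a single use, presumably so the insert/scalar_to_vector chain goes dead rather than being kept alive alongside the new splat. The INSERT_VECTOR_ELT form additionally requires the insert index to be a constant equal to the splat index; a variable index is left alone. The remaining diffs below are the PowerPC and X86 test updates picking up the new lowering.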
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -1002,13 +1002,10 @@ define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_
; CHECK-NOVSX-LABEL: testSplat8:
; CHECK-NOVSX: # %bb.0: # %entry
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha
; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
; CHECK-NOVSX-NEXT: addi r4, r1, -16
; CHECK-NOVSX-NEXT: std r3, -8(r1)
; CHECK-NOVSX-NEXT: std r3, -16(r1)
; CHECK-NOVSX-NEXT: addi r3, r1, -16
; CHECK-NOVSX-NEXT: lvx v3, 0, r3
; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
; CHECK-NOVSX-NEXT: blr
;
; CHECK-P7-LABEL: testSplat8:
14 changes: 4 additions & 10 deletions llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -546,11 +546,8 @@ define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) {
;
; P7-LABEL: unadjusted_lxvwsx:
; P7: # %bb.0: # %entry
; P7-NEXT: lwz r3, 0(r3)
; P7-NEXT: addi r4, r1, -16
; P7-NEXT: stw r3, -16(r1)
; P7-NEXT: lxvw4x vs0, 0, r4
; P7-NEXT: xxspltw v2, vs0, 0
; P7-NEXT: lfiwzx f0, 0, r3
; P7-NEXT: xxspltw v2, vs0, 1
; P7-NEXT: blr
;
; P9-AIX32-LABEL: unadjusted_lxvwsx:
@@ -566,11 +563,8 @@ define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) {
;
; P7-AIX32-LABEL: unadjusted_lxvwsx:
; P7-AIX32: # %bb.0: # %entry
; P7-AIX32-NEXT: lwz r3, 0(r3)
; P7-AIX32-NEXT: addi r4, r1, -16
; P7-AIX32-NEXT: stw r3, -16(r1)
; P7-AIX32-NEXT: lxvw4x vs0, 0, r4
; P7-AIX32-NEXT: xxspltw v2, vs0, 0
; P7-AIX32-NEXT: lfiwzx f0, 0, r3
; P7-AIX32-NEXT: xxspltw v2, vs0, 1
; P7-AIX32-NEXT: blr
entry:
%0 = bitcast i32* %s to <4 x i8>*
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -36,7 +36,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
; AVX2-LABEL: ext_i2_2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
@@ -207,7 +207,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -418,8 +418,7 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -39,7 +39,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
; AVX2-LABEL: ext_i2_2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
@@ -263,7 +263,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -541,8 +541,7 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -31,7 +31,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
; AVX2-LABEL: bitcast_i2_2i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
11 changes: 5 additions & 6 deletions llvm/test/CodeGen/X86/pr15296.ll
@@ -36,12 +36,11 @@ allocas:
define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
; X86-LABEL: shiftInput___canonical:
; X86: # %bb.0: # %allocas
; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-NEXT: vpsrld %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vpsrld %xmm2, %xmm1, %xmm1
; X86-NEXT: vpsrld %xmm2, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: shiftInput___canonical:
64 changes: 21 additions & 43 deletions llvm/test/CodeGen/X86/pr51615.ll
@@ -81,49 +81,27 @@ define void @volatile_load_2_elts_bitcast() {
}

define void @elts_from_consecutive_loads(<2 x i64>* %arg, i32* %arg12, <8 x i32>* %arg13, float %arg14, i1 %arg15) {
; AVX-LABEL: elts_from_consecutive_loads:
; AVX: # %bb.0: # %bb
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB3_1: # %bb16
; AVX-NEXT: # =>This Loop Header: Depth=1
; AVX-NEXT: # Child Loop BB3_2 Depth 2
; AVX-NEXT: testb $1, %cl
; AVX-NEXT: je .LBB3_1
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB3_2: # %bb17
; AVX-NEXT: # Parent Loop BB3_1 Depth=1
; AVX-NEXT: # => This Inner Loop Header: Depth=2
; AVX-NEXT: movl (%rdi), %eax
; AVX-NEXT: vbroadcastss (%rdi), %ymm2
; AVX-NEXT: movl %eax, (%rsi)
; AVX-NEXT: vmovaps %ymm2, (%rdx)
; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: jne .LBB3_2
; AVX-NEXT: jp .LBB3_2
; AVX-NEXT: jmp .LBB3_1
;
; AVX2-LABEL: elts_from_consecutive_loads:
; AVX2: # %bb.0: # %bb
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB3_1: # %bb16
; AVX2-NEXT: # =>This Loop Header: Depth=1
; AVX2-NEXT: # Child Loop BB3_2 Depth 2
; AVX2-NEXT: testb $1, %cl
; AVX2-NEXT: je .LBB3_1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB3_2: # %bb17
; AVX2-NEXT: # Parent Loop BB3_1 Depth=1
; AVX2-NEXT: # => This Inner Loop Header: Depth=2
; AVX2-NEXT: vmovaps (%rdi), %xmm2
; AVX2-NEXT: vmovss %xmm2, (%rsi)
; AVX2-NEXT: vbroadcastss %xmm2, %ymm2
; AVX2-NEXT: vmovaps %ymm2, (%rdx)
; AVX2-NEXT: vucomiss %xmm1, %xmm0
; AVX2-NEXT: jne .LBB3_2
; AVX2-NEXT: jp .LBB3_2
; AVX2-NEXT: jmp .LBB3_1
; ALL-LABEL: elts_from_consecutive_loads:
; ALL: # %bb.0: # %bb
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: .p2align 4, 0x90
; ALL-NEXT: .LBB3_1: # %bb16
; ALL-NEXT: # =>This Loop Header: Depth=1
; ALL-NEXT: # Child Loop BB3_2 Depth 2
; ALL-NEXT: testb $1, %cl
; ALL-NEXT: je .LBB3_1
; ALL-NEXT: .p2align 4, 0x90
; ALL-NEXT: .LBB3_2: # %bb17
; ALL-NEXT: # Parent Loop BB3_1 Depth=1
; ALL-NEXT: # => This Inner Loop Header: Depth=2
; ALL-NEXT: movl (%rdi), %eax
; ALL-NEXT: vbroadcastss (%rdi), %ymm2
; ALL-NEXT: movl %eax, (%rsi)
; ALL-NEXT: vmovaps %ymm2, (%rdx)
; ALL-NEXT: vucomiss %xmm1, %xmm0
; ALL-NEXT: jne .LBB3_2
; ALL-NEXT: jp .LBB3_2
; ALL-NEXT: jmp .LBB3_1
bb:
br label %bb16

