[DAG] isSplatValue - node is a splat if all demanded elts have the same whole constant value #74443
Merged
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-arm
Author: Simon Pilgrim (RKSimon)
Changes: Patch is 326.97 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/74443.diff
13 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 51ae8b703e50f..5be1892a44f6d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2881,6 +2881,12 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
}
}
+ // Fallback - this is a splat if all demanded elts are the same constant.
+ if (computeKnownBits(V, DemandedElts, Depth).isConstant()) {
+ UndefElts = ~DemandedElts;
+ return true;
+ }
+
return false;
}
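The intent of the new fallback is that if computeKnownBits over the demanded elements yields a fully known constant, then every demanded lane must hold that same value, so the node can be treated as a splat (with the undemanded lanes reported through UndefElts, matching the `UndefElts = ~DemandedElts` assignment above). The following is a minimal standalone C++ sketch of that reasoning, with hypothetical names; it models a known-bits intersection across demanded lanes rather than calling the real LLVM APIs:

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

// Tracks which bits are known 0 / known 1 across every lane inspected so far.
struct KnownBitsModel {
  uint64_t Zero = ~0ull; // bits known to be 0 in all lanes seen so far
  uint64_t One = ~0ull;  // bits known to be 1 in all lanes seen so far

  void intersectWith(uint64_t V) {
    Zero &= ~V; // a bit remains "known 0" only if it is 0 in V as well
    One &= V;   // a bit remains "known 1" only if it is 1 in V as well
  }
  // Every bit is still accounted for => one constant fits all lanes seen.
  bool isConstant() const { return (Zero | One) == ~0ull; }
  uint64_t getConstant() const { return One; }
};

// Returns the splat constant if every *demanded* lane holds the same value.
std::optional<uint64_t> getSplatConstant(const std::vector<uint64_t> &Lanes,
                                         const std::vector<bool> &Demanded) {
  KnownBitsModel Known;
  bool AnyDemanded = false;
  for (size_t I = 0; I != Lanes.size(); ++I) {
    if (!Demanded[I])
      continue; // undemanded lanes may hold anything (they land in UndefElts)
    Known.intersectWith(Lanes[I]);
    AnyDemanded = true;
  }
  if (AnyDemanded && Known.isConstant())
    return Known.getConstant();
  return std::nullopt;
}

int main() {
  // Lane 1 is not demanded, so <42, 7, 42, 42> still counts as a splat of 42.
  std::vector<uint64_t> Lanes = {42, 7, 42, 42};
  std::vector<bool> Demanded = {true, false, true, true};
  if (auto C = getSplatConstant(Lanes, Demanded))
    std::printf("splat of %llu\n", (unsigned long long)*C);
}
```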
diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll
index a8a1031637afc..9c8ea7a2c440c 100644
--- a/llvm/test/CodeGen/ARM/vector-store.ll
+++ b/llvm/test/CodeGen/ARM/vector-store.ll
@@ -403,17 +403,14 @@ define void @v3i8store(ptr %p) {
; CHECK-LABEL: v3i8store:
; CHECK: @ %bb.0:
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vmov.i32 d16, #0xff
-; CHECK-NEXT: mov r1, sp
-; CHECK-NEXT: vmov.i32 d17, #0x0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: vand d16, d17, d16
-; CHECK-NEXT: vst1.32 {d16[0]}, [r1:32]
-; CHECK-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: str r1, [sp]
+; CHECK-NEXT: vld1.32 {d16[0]}, [r2:32]
+; CHECK-NEXT: strb r1, [r0, #2]
; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: strb r2, [r0, #2]
-; CHECK-NEXT: vmov.32 r1, d16[0]
-; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov.32 r2, d16[0]
+; CHECK-NEXT: strh r2, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
store <3 x i8> zeroinitializer, ptr %p, align 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index a56a81f5f793b..769bfe8cd5ba9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -244,14 +244,14 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0)
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV32-NEXT: vle16.v v16, (a0)
-; RV32-NEXT: vrgatherei16.vv v12, v8, v16
+; RV32-NEXT: vmv.v.i v20, 5
; RV32-NEXT: lui a0, %hi(.LCPI13_1)
; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1)
-; RV32-NEXT: vle16.v v8, (a0)
-; RV32-NEXT: li a0, 140
+; RV32-NEXT: vle16.v v17, (a0)
+; RV32-NEXT: li a0, 115
; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vmv.v.i v16, 5
-; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
+; RV32-NEXT: vrgatherei16.vv v12, v20, v16
+; RV32-NEXT: vrgatherei16.vv v12, v8, v17, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 88425ea87845d..5d563b141ed66 100644
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -54,7 +54,10 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X32-NEXT: mulps %xmm0, %xmm0
+; X32-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; X32-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
@@ -138,7 +141,10 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X64-NEXT: mulps %xmm0, %xmm0
+; X64-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; X64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 6c07c4ca523f8..853d8a278960d 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -25,18 +25,20 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
;
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
-; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd %ymm1, %ymm4, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64:
@@ -88,15 +90,16 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
;
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3]
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32:
@@ -445,18 +448,20 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
;
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
-; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd %ymm1, %ymm4, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64:
@@ -508,15 +513,16 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
;
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3]
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32:
@@ -569,17 +575,19 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices)
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
-; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
@@ -633,14 +641,15 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices)
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3]
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
@@ -990,17 +999,19 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in
; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
-; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
@@ -1054,14 +1065,15 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3]
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
@@ -1271,20 +1283,22 @@ define <4 x i64> @var_shuffle_v4i64_with_v16i8_indices(<4 x i64> %v, <16 x i8> %
;
; AVX1-LABEL: var_shuffle_v4i64_with_v16i8_indices:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; AVX1-NEXT: vpermilpd %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpermilpd %ymm1, %ymm4, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index c9cbb0994810f..e2195f1fc25a1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -156,64 +156,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i8_stride4_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm3
; SSE-NEXT: movdqa 16(%rdi), %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
-; SSE-NEXT: packuswb %xmm8, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm4, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13]...
[truncated]
RKSimon commented on Dec 5, 2023
phoebewang reviewed on Dec 5, 2023
RKSimon force-pushed the perf/splat-constant branch from 0e81d9c to f21c863 on December 5, 2023
RKSimon force-pushed the perf/splat-constant branch 2 times, most recently from 0b9a603 to ec63552 on December 7, 2023
phoebewang approved these changes on Dec 8, 2023
LGTM.
RKSimon force-pushed the perf/splat-constant branch from ec63552 to 642834a on December 8, 2023
ampandey-1995 pushed a commit to ampandey-1995/llvm-project that referenced this pull request on Jan 19, 2024:
… demanded elts have the same whole constant value (llvm#74443)
Relying on ComputeKnownBits to find a splat is causing miscompilations where a shift of zero is being assumed to give zero, but further simplification leads to a shift of zero by undef, resulting in an unexpected undef value. Fixes llvm#78109
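To make that failure mode concrete, here is a toy C++ model of the interaction the commit message describes. The names and values are invented for illustration; this is not the llvm#78109 reproducer. A known-bits style query concludes that a lane computed as "0 shifted by some amount" is always zero, but if the shift amount is later simplified to undef (modelled here as an empty optional), the shift itself folds to undef rather than the constant zero the earlier splat query promised:

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>

struct ShiftOfZero {
  std::optional<uint32_t> Amt; // nullopt models an undef shift amount

  // Known-bits style answer: "0 << anything is 0" for any defined amount.
  // This is the over-optimistic assumption behind the miscompile.
  bool knownBitsSayZero() const { return true; }

  // What a later simplification actually produces.
  std::optional<uint32_t> fold() const {
    if (!Amt)
      return std::nullopt; // a shift by undef folds to undef, not to 0
    return 0u;             // 0 shifted by any defined amount really is 0
  }
};

int main() {
  ShiftOfZero S{3u};
  std::printf("known-bits splat-of-zero? %d\n", S.knownBitsSayZero()); // 1
  S.Amt.reset(); // a later combine replaces the shift amount with undef
  std::printf("still a defined zero?     %d\n", S.fold().has_value()); // 0
}
```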