[X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128
As reported on PR26235, we don't currently make use of the VBROADCASTF128/VBROADCASTI128 instructions (or the AVX512 equivalents) to load+splat a 128-bit vector to both lanes of a 256-bit vector.

This patch enables lowering from subvector insertion/concatenation patterns and auto-upgrades the llvm.x86.avx.vbroadcastf128.pd.256 / llvm.x86.avx.vbroadcastf128.ps.256 intrinsics to match.
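
For reference, here is a minimal sketch of the pattern in question, mirroring the updated avx-vbroadcastf128.ll tests below (the function and value names are illustrative):

    define <8 x float> @splat_4f32_8f32(<4 x float>* %p) {
      %v = load <4 x float>, <4 x float>* %p
      %s = shufflevector <4 x float> %v, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
      ret <8 x float> %s
    }

    ; AVX, before this patch:  vmovaps (%rdi), %xmm0
    ;                          vinsertf128 $1, %xmm0, %ymm0, %ymm0
    ; AVX, with this patch:    vbroadcastf128 (%rdi), %ymm0  ## ymm0 = mem[0,1,0,1]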

We could also investigate using VBROADCASTF128/VBROADCASTI128 to load repeated constants (similar to how we already do for scalar broadcasts).

Differential Revision: https://reviews.llvm.org/D22460

llvm-svn: 276281
RKSimon committed Jul 21, 2016
1 parent 62aee52 commit c8e20b1
Showing 11 changed files with 167 additions and 207 deletions.
21 changes: 14 additions & 7 deletions llvm/lib/IR/AutoUpgrade.cpp
@@ -296,6 +296,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name.startswith("avx.blend.p") ||
Name == "avx2.pblendw" ||
Name.startswith("avx2.pblendd.") ||
Name.startswith("avx.vbroadcastf128") ||
Name == "avx2.vbroadcasti128" ||
Name == "xop.vpcmov" ||
(Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
@@ -886,7 +887,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});
Rep = Builder.CreateZExt(Rep, CI->getType(), "");
} else if (IsX86 && Name.startswith("avx.vbroadcast")) {
} else if (IsX86 && Name.startswith("avx.vbroadcast.s")) {
// Replace broadcasts with a series of insertelements.
Type *VecTy = CI->getType();
Type *EltTy = VecTy->getVectorElementType();
@@ -918,15 +919,21 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
bool DoSext = (StringRef::npos != Name.find("pmovsx"));
Rep = DoSext ? Builder.CreateSExt(SV, DstTy)
: Builder.CreateZExt(SV, DstTy);
} else if (IsX86 && Name == "avx2.vbroadcasti128") {
// Replace vbroadcasts with a vector shuffle.
Type *VT = VectorType::get(Type::getInt64Ty(C), 2);
} else if (IsX86 && (Name.startswith("avx.vbroadcastf128") ||
Name == "avx2.vbroadcasti128")) {
// Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
Type *EltTy = CI->getType()->getVectorElementType();
unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits();
Type *VT = VectorType::get(EltTy, NumSrcElts);
Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),
PointerType::getUnqual(VT));
Value *Load = Builder.CreateLoad(VT, Op);
uint32_t Idxs[4] = { 0, 1, 0, 1 };
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
Idxs);
if (NumSrcElts == 2)
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
{ 0, 1, 0, 1 });
else
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
{ 0, 1, 2, 3, 0, 1, 2, 3 });
} else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
Name.startswith("avx2.vbroadcast") ||
Name.startswith("avx512.pbroadcast") ||
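
In effect the upgrade rewrites calls to the old intrinsics into plain IR. A sketch of what the IRBuilder code above produces for the pd.256 variant (value names are illustrative):

    ; Before:
    %r = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %p)

    ; After (load + shuffle):
    %vp = bitcast i8* %p to <2 x double>*
    %ld = load <2 x double>, <2 x double>* %vp
    %r  = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>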
8 changes: 8 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12805,6 +12805,10 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
// (insert_subvector (insert_subvector undef, (load addr), 0),
// (load addr + 16), Elts/2)
// --> load32 addr
// or a 16-byte broadcast:
// (insert_subvector (insert_subvector undef, (load addr), 0),
// (load addr), Elts/2)
// --> X86SubVBroadcast(load16 addr)
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
@@ -12823,6 +12827,10 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
return Ld;
}

// If lower/upper loads are the same then lower to a VBROADCASTF128.
if (SubVec2 == peekThroughBitcasts(SubVec))
return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
}
}
}
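
To make the two cases from the comment concrete, a sketch (the function name and expected assembly are assumptions, not taken from the tests):

    ; Two consecutive 16-byte loads concatenated --> one 32-byte load:
    define <8 x float> @concat_2x16(<4 x float>* %p) {
      %q  = getelementptr <4 x float>, <4 x float>* %p, i64 1
      %lo = load <4 x float>, <4 x float>* %p
      %hi = load <4 x float>, <4 x float>* %q
      %s  = shufflevector <4 x float> %lo, <4 x float> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
      ret <8 x float> %s        ; expected: vmovups (%rdi), %ymm0
    }

    ; The same 16-byte load feeding both halves instead becomes X86SubVBroadcast,
    ; i.e. a single vbroadcastf128 (%rdi), %ymm0 (see the tests below).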
18 changes: 18 additions & 0 deletions llvm/lib/Target/X86/X86InstrAVX512.td
@@ -986,6 +986,10 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}

//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
//

defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
v16i32_info, v4i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
@@ -1006,7 +1010,13 @@ defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;

def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
}

let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
v4i64x_info, v2i64x_info>, VEX_W,
@@ -1015,6 +1025,14 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
v4f64x_info, v2f64x_info>, VEX_W,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
}

let Predicates = [HasVLX, NoDQI] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF32X4Z256rm addr:$src)>;
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI32X4Z256rm addr:$src)>;
}
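
The net effect on AVX-512VL targets, as a sketch (the -mattr strings and the expected assembly are assumptions based on the predicates above):

    define <4 x double> @splat_2f64_4f64(<2 x double>* %p) {
      %v = load <2 x double>, <2 x double>* %p
      %s = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
      ret <4 x double> %s
    }

    ; llc -mattr=+avx512vl            expected: vbroadcastf32x4 (%rdi), %ymm0  (NoDQI patterns above)
    ; llc -mattr=+avx512vl,+avx512dq  expected: vbroadcastf64x2 (%rdi), %ymm0  (HasDQI variants)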

let Predicates = [HasDQI] in {
defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
v8i64_info, v2i64x_info>, VEX_W,
37 changes: 32 additions & 5 deletions llvm/lib/Target/X86/X86InstrSSE.td
@@ -7759,23 +7759,50 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;

//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
(ins i128mem:$src),
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteLoad]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
(ins f128mem:$src),
"vbroadcastf128\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
"vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteFShuffleLd]>, VEX, VEX_L;

let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
(VBROADCASTI128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI128 addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
(VBROADCASTF128 addr:$src)>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTF128 addr:$src)>;
}
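
One design note: AVX1 provides no vbroadcasti128, so the HasAVX1Only patterns above reuse the floating-point-domain VBROADCASTF128 for integer splats. A sketch (expected output mirrors the avx-vbroadcastf128.ll tests below):

    define <4 x i64> @splat_2i64_4i64(<2 x i64>* %p) {
      %v = load <2 x i64>, <2 x i64>* %p
      %s = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
      ret <4 x i64> %s
    }

    ; AVX1:  vbroadcastf128 (%rdi), %ymm0   (float domain stands in for integer)
    ; AVX2:  vbroadcasti128 (%rdi), %ymm0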

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
26 changes: 25 additions & 1 deletion llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -95,6 +95,30 @@ define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
}


define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
; CHECK-LABEL: test_x86_avx_vbroadcastf128_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly


define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
; CHECK-LABEL: test_x86_avx_vbroadcastf128_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly


define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
; CHECK: ## BB#0:
@@ -364,7 +388,7 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
40 changes: 2 additions & 38 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -3970,42 +3970,6 @@ define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone


define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_pd_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly


define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
; AVX-LABEL: test_x86_avx_vbroadcastf128_ps_256:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_ps_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly


define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
; AVX-LABEL: test_x86_avx_vperm2f128_pd_256:
; AVX: ## BB#0:
@@ -4585,15 +4549,15 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %ymm0, (%eax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
;
; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
36 changes: 12 additions & 24 deletions llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
@@ -6,14 +6,12 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -24,14 +22,12 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-LABEL: test_broadcast_2i64_4i64:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2i64_4i64:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -42,14 +38,12 @@ define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float> *%p
%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -60,14 +54,12 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-LABEL: test_broadcast_4i32_8i32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4i32_8i32:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32> *%p
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -78,14 +70,12 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_8i16_16i16:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -96,14 +86,12 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_16i8_32i8:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -505,14 +505,12 @@ define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X32-LABEL: test_mm256_broadcastsi128_si256_mem:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64: # BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%a0 = load <2 x i64>, <2 x i64>* %p0
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
