Skip to content

Commit

Permalink
[X86] Fix several issues related to X86's psadbw instruction.
Browse files Browse the repository at this point in the history
This patch fixes the following issues:

1. Fix the return type of X86psadbw: it should not be the same type of inputs.
   For vNi8 inputs the output should be vMi64, where M = N/8.
2. Fix the return type of int_x86_avx512_psad_bw_512 accordingly.
3. Fix the definiton of PSADBW, VPSADBW, and VPSADBWY accordingly.
4. Adjust the return type when building a DAG node of X86ISD::PSADBW type.
5. Update related tests.


Differential revision: http://reviews.llvm.org/D14897

llvm-svn: 254010
  • Loading branch information
Cong Hou committed Nov 24, 2015
1 parent b098f0c commit db6220f
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 78 deletions.
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -5448,7 +5448,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
[IntrNoMem]>;
}
// FP logical ops
Expand Down
8 changes: 5 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19316,7 +19316,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
// chunks, thus directly computes the pop count for v2i64 and v4i64.
if (EltVT == MVT::i64) {
SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
}

Expand All @@ -19332,9 +19333,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,

// Do the horizontal sums into two v2i64s.
Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);

// Merge them together.
Expand Down
30 changes: 16 additions & 14 deletions llvm/lib/Target/X86/X86InstrAVX512.td
Original file line number Diff line number Diff line change
Expand Up @@ -7369,32 +7369,34 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",


multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86VectorVTInfo _src>{
string OpcodeStr, X86VectorVTInfo _dst,
X86VectorVTInfo _src>{
def rr : AVX512BI<opc, MRMSrcReg,
(outs _src.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _src.RC:$dst,(_src.VT
(OpNode _src.RC:$src1, _src.RC:$src2)))]>;
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT _src.RC:$src2))))]>;
let mayLoad = 1 in
def rm : AVX512BI<opc, MRMSrcMem,
(outs _src.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _src.RC:$dst,(_src.VT
(OpNode _src.RC:$src1,
(_src.VT (bitconvert
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT (bitconvert
(_src.LdFrag addr:$src2))))))]>;
}

multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
string OpcodeStr, Predicate prd> {
let Predicates = [prd] in
defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v64i8_info>,
EVEX_V512;
defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v32i8x_info>,
EVEX_V256;
defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v16i8x_info>,
EVEX_V128;
defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
v32i8x_info>, EVEX_V256;
defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
v16i8x_info>, EVEX_V128;
}
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86psadbw : SDNode<"X86ISD::PSADBW",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>]>>;
def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
Expand Down
24 changes: 10 additions & 14 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -4066,22 +4066,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
int_x86_avx2_psad_bw, SSE_PMADD, 1>;

let Predicates = [HasAVX2] in
def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1),
(v32i8 VR256:$src2))),
(VPSADBWYrr VR256:$src2, VR256:$src1)>;

let Predicates = [HasAVX] in
def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
(v16i8 VR128:$src2))),
(VPSADBWrr VR128:$src2, VR128:$src1)>;

def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
(v16i8 VR128:$src2))),
(PSADBWrr VR128:$src2, VR128:$src1)>;
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V;
let Predicates = [HasAVX2] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;

let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86IntrinsicsInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
Expand Down Expand Up @@ -1710,6 +1711,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1255,15 +1255,15 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
%res2 = add <8 x i64> %res, %res1
ret <8 x i64> %res2
}
declare <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)

; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512
; CHECK-NOT: call
; CHECK: vpsadbw %zmm1
; CHECK: vpsadbw %zmm2
define <64 x i8>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
%res = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
%res1 = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
%res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
%res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
%res2 = add <8 x i64> %res, %res1
ret <8 x i64> %res2
}
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vector-popcnt-128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
ret <2 x i64> %out
Expand Down Expand Up @@ -207,9 +207,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/vector-popcnt-256.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
Expand All @@ -37,7 +37,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in)
ret <4 x i64> %out
Expand All @@ -57,9 +57,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsadbw %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
Expand All @@ -68,9 +68,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
Expand All @@ -87,9 +87,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
%out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/vector-tzcnt-128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
Expand All @@ -297,9 +297,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
Expand Down Expand Up @@ -430,9 +430,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
Expand All @@ -452,9 +452,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
Expand Down

0 comments on commit db6220f

Please sign in to comment.