[x86] narrow a shuffle that doesn't use or set any high elements
This isn't the final fix for our reduction/horizontal codegen, but it takes care 
of a lot of the problems. After we narrow the shuffle, existing combines for 
insert/extract and binops kick in, and we end up with cheaper 128-bit ops.

The avg and mul reduction tests show an existing shuffle lowering hole for 
AVX2/AVX512. I think in its most minimal form this is:
https://bugs.llvm.org/show_bug.cgi?id=40434
...but we might need multiple fixes to get it right.

Differential Revision: https://reviews.llvm.org/D57156

llvm-svn: 352209
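
To illustrate the effect on reduction tails, here is a hand-written C++/AVX2 sketch for context only (not code from this patch; the helper name hsum256_epi32 is made up). Once the wide shuffle is narrowed, the horizontal sum of a 256-bit vector stays in 128-bit registers after a single extract, which is the shape the updated test checks below now expect:

#include <immintrin.h>
#include <cstdint>

// Horizontal sum of the eight i32 lanes of a __m256i, written in the shape
// the improved codegen produces: extract the high 128 bits once, then keep
// all remaining adds and shuffles at 128-bit width. Requires AVX2.
static int32_t hsum256_epi32(__m256i v) {
  __m128i lo = _mm256_castsi256_si128(v);        // free subregister access
  __m128i hi = _mm256_extracti128_si256(v, 1);   // vextracti128 $1
  __m128i s  = _mm_add_epi32(lo, hi);            // vpaddd on xmm, not ymm
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2))); // fold lanes 2,3 into 0,1
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(3, 2, 1, 1))); // fold lane 1 into lane 0
  return _mm_cvtsi128_si32(s);                   // vmovd
}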
rotateright committed Jan 25, 2019
1 parent b120127 commit 21aa6dd
Showing 20 changed files with 1,406 additions and 1,344 deletions.
49 changes: 47 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32328,14 +32328,59 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
return SDValue();
}

/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
  if (!Shuf->getValueType(0).isSimple())
    return SDValue();
  MVT VT = Shuf->getSimpleValueType(0);
  if (!VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  // See if we can ignore all of the high elements of the shuffle.
  ArrayRef<int> Mask = Shuf->getMask();
  if (!isUndefUpperHalf(Mask))
    return SDValue();

  // Check if the shuffle mask accesses only the low half of each input vector
  // (half-index output is 0 or 2).
  int HalfIdx1, HalfIdx2;
  SmallVector<int, 8> HalfMask(Mask.size() / 2);
  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
      (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
    return SDValue();

  // Create 4 instructions to replace the unnecessarily wide shuffle.
  // The trick is knowing that all of the insert/extract are actually free
  // subregister (zmm->ymm or ymm->xmm) ops. That leaves us with a shuffle
  // of narrow inputs into a narrow output, and that is always cheaper than
  // the wide shuffle that we started with.
  unsigned NumElts = Mask.size();
  SDValue Op0 = Shuf->getOperand(0);
  SDValue Op1 = Shuf->getOperand(1);
  SDLoc DL(Shuf);
  SDValue Index0 = DAG.getIntPtrConstant(0, DL);
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts / 2);
  SDValue Extr0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op0, Index0);
  SDValue Extr1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op1, Index0);
  SDValue NewShuf = DAG.getVectorShuffle(HalfVT, DL, Extr0, Extr1, HalfMask);
  SDValue UndefV = DAG.getUNDEF(VT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, UndefV, NewShuf, Index0);
}

static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
    if (SDValue V = narrowShuffle(Shuf, DAG))
      return V;

  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
  if (TLI.isTypeLegal(VT)) {
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;
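
As a rough standalone restatement of the mask checks in narrowShuffle above (a simplified sketch only; isUndefUpperHalf and getHalfShuffleMask are existing helpers in this file and handle more cases than shown, and canNarrowShuffleMask is a made-up name), the combined legality test and the construction of the narrowed mask amount to:

#include <cstddef>
#include <vector>

// Simplified restatement of the mask checks (illustration only; the real
// helpers in X86ISelLowering.cpp cover more cases). Mask uses -1 for undef.
// Indices [0, NumElts) select from op0, [NumElts, 2*NumElts) from op1.
static bool canNarrowShuffleMask(const std::vector<int> &Mask,
                                 std::vector<int> &HalfMask) {
  const int NumElts = (int)Mask.size();
  const int Half = NumElts / 2;

  // The high half of the destination must be entirely undef.
  for (int i = Half; i != NumElts; ++i)
    if (Mask[i] != -1)
      return false;

  // Every used element must come from the low half of its source, and the
  // indices are remapped onto a shuffle of the two extracted low halves.
  HalfMask.assign(Half, -1);
  for (int i = 0; i != Half; ++i) {
    const int M = Mask[i];
    if (M == -1)
      continue;
    const bool FromOp1 = M >= NumElts;
    const int Elt = FromOp1 ? M - NumElts : M;
    if (Elt >= Half)            // touches a source's high half -> give up
      return false;
    HalfMask[i] = FromOp1 ? Elt + Half : Elt;
  }
  return true;
}

For example, an <8 x i32> mask of <0, 8, 2, 10, undef, undef, undef, undef> (as in the fhadd_16_4 test below, per 256-bit half) passes the check and narrows to the <4 x i32> mask <0, 4, 2, 6>.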
205 changes: 103 additions & 102 deletions llvm/test/CodeGen/X86/avg.ll

Large diffs are not rendered by default.

40 changes: 10 additions & 30 deletions llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -109,16 +109,12 @@ define float @fhsub_16(<16 x float> %x225) {
define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) {
; KNL-LABEL: hadd_16_3:
; KNL: # %bb.0:
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: hadd_16_3:
; SKX: # %bb.0:
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -132,16 +128,12 @@ define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) {
define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) {
; KNL-LABEL: fhadd_16_3:
; KNL: # %bb.0:
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; KNL-NEXT: vaddps %ymm0, %ymm2, %ymm0
; KNL-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16_3:
; SKX: # %bb.0:
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; SKX-NEXT: vaddps %ymm0, %ymm2, %ymm0
; SKX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -154,16 +146,12 @@ define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) {
define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fhadd_16_4:
; KNL: # %bb.0:
; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16_4:
; SKX: # %bb.0:
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
%x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 undef ,i32 undef, i32 undef, i32 undef>
@@ -174,16 +162,12 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fadd_noundef_low:
; KNL: # %bb.0:
; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fadd_noundef_low:
; SKX: # %bb.0:
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -221,16 +205,12 @@ define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) {
define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
; KNL-LABEL: hadd_16_3_sv:
; KNL: # %bb.0:
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: hadd_16_3_sv:
; SKX: # %bb.0:
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/X86/madd.ll
@@ -152,9 +152,9 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
; AVX256-NEXT: jne .LBB1_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
@@ -277,9 +277,9 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -465,9 +465,9 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -726,9 +726,9 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; AVX256-NEXT: jne .LBB5_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
@@ -860,9 +860,9 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1067,9 +1067,9 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1335,9 +1335,9 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
; AVX256-NEXT: jne .LBB9_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
@@ -1490,9 +1490,9 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1743,9 +1743,9 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -2695,9 +2695,9 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
; AVX256-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -187,9 +187,9 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
@@ -317,9 +317,9 @@ define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
; CHECK-NEXT: # %bb.2: # %middle.block
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/X86/sad.ll
@@ -78,9 +78,9 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -339,9 +339,9 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -933,9 +933,9 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1355,7 +1355,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1367,7 +1367,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -1445,9 +1445,9 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1547,9 +1547,9 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax