Skip to content

Commit

Permalink
[X86][SSE] Add cpu feature for aggressive combining to variable shuffles
Browse files Browse the repository at this point in the history
As mentioned in D38318 and D40865, modern Intel processors prefer to combine multiple shuffles to a variable shuffle mask (PSHUFB/VPERMPS etc.) instead of having multiple stage 'fixed' shuffles which put more pressure on Port 5 (at the expense of extra shuffle mask loads).

This patch provides a FeatureFastVariableShuffle target flag for Haswell+ CPUs that prefers combining 2 or more fixed shuffles to a single variable shuffle (default is 3 shuffles).

The long term aim is to drive more of this from schedule data (probably via the MC) but we're not close to being ready for that yet.

Differential Revision: https://reviews.llvm.org/D41323

llvm-svn: 321074
  • Loading branch information
RKSimon committed Dec 19, 2017
1 parent b536a2a commit fd5df63
Show file tree
Hide file tree
Showing 10 changed files with 1,362 additions and 498 deletions.
9 changes: 8 additions & 1 deletion llvm/lib/Target/X86/X86.td
Expand Up @@ -263,6 +263,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
// On recent X86 (port bound) processors, its preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;
// On some X86 processors, there is no performance hazard to writing only the
// lower parts of a YMM or ZMM register without clearing the upper part.
def FeatureFastPartialYMMorZMMWrite
Expand Down Expand Up @@ -620,7 +626,8 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureERMSB,
FeatureFMA,
FeatureLZCNT,
FeatureMOVBE
FeatureMOVBE,
FeatureFastVariableShuffle
]>;

class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -28592,8 +28592,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return SDValue();

// Depth threshold above which we can efficiently use variable mask shuffles.
// TODO This should probably be target specific.
bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;

bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/X86/X86Subtarget.h
Expand Up @@ -228,6 +228,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;

/// True if its preferable to combine to a single shuffle using a variable
/// mask over multiple fixed shuffles.
bool HasFastVariableShuffle;

/// True if there is no performance penalty to writing only the lower parts
/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite;
Expand Down Expand Up @@ -527,6 +531,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
bool hasFastPartialYMMorZMMWrite() const {
return HasFastPartialYMMorZMMWrite;
}
Expand Down
59 changes: 47 additions & 12 deletions llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
Expand Up @@ -3,8 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL

define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
Expand Down Expand Up @@ -115,11 +116,28 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
ret <16 x i8> %shuffle
}
Expand All @@ -131,11 +149,28 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
ret <16 x i8> %shuffle
}
Expand Down

0 comments on commit fd5df63

Please sign in to comment.