Skip to content

Commit

Permalink
[CostModel][X86] Add support for broadcast shuffle costs
Browse files Browse the repository at this point in the history
Currently only for broadcasts with input and output of the same width.

Differential Revision: https://reviews.llvm.org/D27811

llvm-svn: 291122
  • Loading branch information
RKSimon committed Jan 5, 2017
1 parent 406acdb commit bca02f9
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 11 deletions.
57 changes: 48 additions & 9 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Expand Up @@ -605,7 +605,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) {
if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate ||
Kind == TTI::SK_Broadcast) {
// For Broadcasts we are splatting the first element from the first input
// register, so only need to reference that input and all the output
// registers are the same.
if (Kind == TTI::SK_Broadcast)
LT.first = 1;

static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{ TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
{ TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb
Expand All @@ -617,10 +624,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry AVX512BWShuffleTbl[] = {
{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
{ TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
// + 2*pshufb + vinserti64x4
{ TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
{ TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb

{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
{ TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
// + 2*pshufb + vinserti64x4
};

if (ST->hasBWI())
Expand All @@ -629,10 +639,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry AVX512ShuffleTbl[] = {
{ TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
{ TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
{ TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
{ TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
{ TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastsd
{ TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastss
{ TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
{ TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd

{ TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
{ TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
{ TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
{ TTI::SK_Reverse, MVT::v16i32, 1 } // vpermd
};

if (ST->hasAVX512())
Expand All @@ -641,6 +656,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry AVX2ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastsd
{ TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastss
{ TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
{ TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
{ TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
{ TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb

{ TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
{ TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
{ TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
Expand All @@ -657,6 +679,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry AVX1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
{ TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
{ TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
{ TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
{ TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
{ TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128

{ TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
{ TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
{ TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
Expand Down Expand Up @@ -692,6 +721,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry SSSE3ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb

{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb

Expand All @@ -704,6 +736,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry SSE2ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
{ TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
{ TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
{ TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd

{ TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
{ TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
Expand All @@ -723,6 +761,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;

static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
};
Expand Down
140 changes: 138 additions & 2 deletions llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
Expand Up @@ -18,14 +18,150 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V256 = shufflevector
; AVX: cost of 1 {{.*}} %V256 = shufflevector
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
%V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V512 = shufflevector
; AVX: cost of 1 {{.*}} %V512 = shufflevector
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
%V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer

ret void
}

; Broadcast cost expectations for i64 vectors of 128, 256 and 512 bits.
; Every shufflevector below uses an all-zero mask with an undef second
; operand, so each result lane takes element 0 of the first operand.
; The directive lines above each shuffle give the expected cost for one
; check prefix (the prefix-to-target mapping comes from RUN lines that are
; not visible in this excerpt).
; NOTE(review): for a given prefix the expected cost does not grow once the
; vector is wider than 256 bits - presumably because the cost model counts a
; broadcast as one legalized register; confirm against getShuffleCost.
; CHECK-LABEL: 'test_vXi64'
define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
; SSE: cost of 1 {{.*}} %V128 = shufflevector
; AVX: cost of 1 {{.*}} %V128 = shufflevector
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V256 = shufflevector
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
%V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V512 = shufflevector
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
%V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer

ret void
}

; Broadcast cost expectations for float vectors of 64, 128, 256 and 512 bits.
; Every shufflevector below uses an all-zero mask with an undef second
; operand, so each result lane takes element 0 of the first operand.
; The directive lines above each shuffle give the expected cost for one
; check prefix (the prefix-to-target mapping comes from RUN lines that are
; not visible in this excerpt).
; The 64-bit case (<2 x float>) is expected to cost the same as the 128-bit
; case under every prefix listed for it.
; CHECK-LABEL: 'test_vXf32'
define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
; SSE: cost of 1 {{.*}} %V64 = shufflevector
; AVX: cost of 1 {{.*}} %V64 = shufflevector
; AVX512: cost of 1 {{.*}} %V64 = shufflevector
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V128 = shufflevector
; AVX: cost of 1 {{.*}} %V128 = shufflevector
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
%V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V256 = shufflevector
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
%V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V512 = shufflevector
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
%V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer

ret void
}

; Broadcast cost expectations for i32 vectors of 64, 128, 256 and 512 bits.
; Every shufflevector below uses an all-zero mask with an undef second
; operand, so each result lane takes element 0 of the first operand.
; The directive lines above each shuffle give the expected cost for one
; check prefix (the prefix-to-target mapping comes from RUN lines that are
; not visible in this excerpt).
; The 64-bit case (<2 x i32>) is expected to cost the same as the 128-bit
; case under every prefix listed for it.
; CHECK-LABEL: 'test_vXi32'
define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
; SSE: cost of 1 {{.*}} %V64 = shufflevector
; AVX: cost of 1 {{.*}} %V64 = shufflevector
; AVX512: cost of 1 {{.*}} %V64 = shufflevector
%V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V128 = shufflevector
; AVX: cost of 1 {{.*}} %V128 = shufflevector
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
%V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V256 = shufflevector
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
%V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer

; SSE: cost of 1 {{.*}} %V512 = shufflevector
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
%V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer

ret void
}

; Broadcast cost expectations for i16 vectors of 128, 256 and 512 bits.
; Every shufflevector below uses an all-zero mask with an undef second
; operand, so each result lane takes element 0 of the first operand.
; The directive lines above each shuffle give the expected cost for one
; check prefix (the prefix-to-target mapping comes from RUN lines that are
; not visible in this excerpt).
; Unlike the 32/64-bit element tests, the expectations here are split per
; feature level: one prefix expects 2 at every width, and another expects 3
; for the 256-bit and 512-bit vectors.
; The 512-bit case carries two separate prefixes that both expect a cost of 1.
; CHECK-LABEL: 'test_vXi16'
define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
; SSE2: cost of 2 {{.*}} %V128 = shufflevector
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
; AVX: cost of 1 {{.*}} %V128 = shufflevector
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
%V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer

; SSE2: cost of 2 {{.*}} %V256 = shufflevector
; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
; SSE42: cost of 1 {{.*}} %V256 = shufflevector
; AVX1: cost of 3 {{.*}} %V256 = shufflevector
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
%V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer

; SSE2: cost of 2 {{.*}} %V512 = shufflevector
; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
; SSE42: cost of 1 {{.*}} %V512 = shufflevector
; AVX1: cost of 3 {{.*}} %V512 = shufflevector
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
%V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer

ret void
}

; Broadcast cost expectations for i8 vectors of 128, 256 and 512 bits.
; Every shufflevector below uses an all-zero mask with an undef second
; operand, so each result lane takes element 0 of the first operand.
; The directive lines above each shuffle give the expected cost for one
; check prefix (the prefix-to-target mapping comes from RUN lines that are
; not visible in this excerpt).
; Unlike the 32/64-bit element tests, the expectations here are split per
; feature level: one prefix expects 3 at every width, and another expects 2
; for the 256-bit and 512-bit vectors.
; The 512-bit case carries two separate prefixes that both expect a cost of 1.
; CHECK-LABEL: 'test_vXi8'
define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
; SSE2: cost of 3 {{.*}} %V128 = shufflevector
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
; AVX: cost of 1 {{.*}} %V128 = shufflevector
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
%V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer

; SSE2: cost of 3 {{.*}} %V256 = shufflevector
; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
; SSE42: cost of 1 {{.*}} %V256 = shufflevector
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
%V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer

; SSE2: cost of 3 {{.*}} %V512 = shufflevector
; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
; SSE42: cost of 1 {{.*}} %V512 = shufflevector
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
%V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer

ret void
}

0 comments on commit bca02f9

Please sign in to comment.