Skip to content

Commit

Permalink
[X86][CostModel] Improve costs for vXi1 sign_extend/zero_extend with …
Browse files Browse the repository at this point in the history
…avx512.

With avx512 vXi1 is legal and uses k-registers with many custom cases
for extending.
  • Loading branch information
topperc committed Apr 26, 2020
1 parent 0844337 commit 19cb26f
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 190 deletions.
106 changes: 101 additions & 5 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Expand Up @@ -1369,10 +1369,28 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },

// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },

// Mask zero extend is a load + broadcast.
// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },

Expand Down Expand Up @@ -1410,9 +1428,44 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,

{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 9 }, // FIXME

// v16i1 -> v16i32 - load + broadcast
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
// Sign extend is zmm vpternlogd+vptruncdb.
// Zero extend is zmm broadcast load+vptruncdw.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },

// Sign extend is zmm vpternlogd+vptruncdw.
// Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

{ ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq

{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq

{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
Expand Down Expand Up @@ -1457,12 +1510,22 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,

static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },

// Mask zero extend is a load + broadcast.
// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
Expand Down Expand Up @@ -1492,6 +1555,39 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
};

static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },

// sign extend is vpcmpeq+maskedmove+vpmovdw
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },

{ ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq

{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/Analysis/CostModel/X86/cast.ll
Expand Up @@ -42,11 +42,11 @@ define i32 @add(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'add'
; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %A = zext <4 x i1> undef to <4 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %B = sext <4 x i1> undef to <4 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A = zext <4 x i1> undef to <4 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <4 x i1> undef to <4 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %E = sext <8 x i1> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %D = zext <8 x i1> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = sext <8 x i1> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
Expand Down Expand Up @@ -191,8 +191,8 @@ define i32 @zext_sext(<8 x i1> %in) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'zext_sext'
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32>
Expand Down Expand Up @@ -276,8 +276,8 @@ define i32 @masks8(<8 x i1> %in) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'masks8'
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%Z = zext <8 x i1> %in to <8 x i32>
Expand All @@ -302,8 +302,8 @@ define i32 @masks4(<4 x i1> %in) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'masks4'
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <4 x i1> %in to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %S = sext <4 x i1> %in to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%Z = zext <4 x i1> %in to <4 x i64>
Expand Down

0 comments on commit 19cb26f

Please sign in to comment.