Skip to content

Commit

Permalink
[TTI] BasicTTI::getInterleavedMemoryOpCost(): use getScalarizationOverhead()
Browse files Browse the repository at this point in the history

Using getScalarizationOverhead() gives a somewhat better cost estimate than summing the individual insertion/extraction costs directly. Note that this still overestimates the costs.

Original Patch by: @lebedev.ri (Roman Lebedev)

Differential Revision: https://reviews.llvm.org/D110713
  • Loading branch information
RKSimon committed Sep 29, 2021
1 parent 7674bd4 commit 17f1fc1
Show file tree
Hide file tree
Showing 14 changed files with 132 additions and 116 deletions.
55 changes: 21 additions & 34 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Expand Up @@ -1237,6 +1237,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Then plus the cost of interleave operation.
assert(Indices.size() <= Factor &&
"Interleaved memory op has too many members");

APInt DemandedLoadStoreElts = APInt::getNullValue(NumElts);
for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");
for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
DemandedLoadStoreElts.setBit(Index + Elm * Factor);
}

if (Opcode == Instruction::Load) {
// The interleave cost is similar to extract sub vectors' elements
// from the wide vector, and insert them into sub vectors.
Expand All @@ -1246,21 +1254,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.
for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");

// Extract elements from loaded vector for each sub vector.
for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
Index + Elm * Factor);
}

InstructionCost InsSubCost = 0;
for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
InsSubCost +=
thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm);

InstructionCost InsSubCost =
getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
Cost += Indices.size() * InsSubCost;
Cost +=
thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
/*Insert*/ false, /*Extract*/ true);
} else {
// The interleave cost is extract elements from sub vectors, and
// insert them into the wide vector.
Expand All @@ -1275,20 +1274,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// The cost is estimated as extract all elements (of actual members,
// excluding gaps) from both <4 x i32> vectors and insert into the <12 x
// i32> vector.
InstructionCost ExtSubCost = 0;
for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement,
SubVT, Elm);
InstructionCost ExtSubCost =
getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
Cost += ExtSubCost * Indices.size();

for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");

// Insert elements from loaded vector for each sub vector.
for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT,
Index + Elm * Factor);
}
Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
/*Insert*/ true,
/*Extract*/ false);
}

if (!UseMaskForCond)
Expand All @@ -1308,13 +1299,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// The cost is estimated as extract all mask elements from the <8xi1> mask
// vector and insert them factor times into the <24xi1> shuffled mask
// vector.
for (unsigned i = 0; i < NumSubElts; i++)
Cost +=
thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);

for (unsigned i = 0; i < NumElts; i++)
Cost +=
thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
Cost +=
getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);

// The Gaps mask is invariant and created outside the loop, therefore the
// cost of creating it is not accounted for here. However if we have both
Expand Down
Expand Up @@ -22,8 +22,8 @@ target triple = "x86_64-unknown-linux-gnu"
; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
Expand Down
Expand Up @@ -22,15 +22,15 @@ target triple = "x86_64-unknown-linux-gnu"
; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
Expand Down
Expand Up @@ -22,8 +22,8 @@ target triple = "x86_64-unknown-linux-gnu"
; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 41 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 82 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
Expand Down
Expand Up @@ -21,13 +21,13 @@ target triple = "x86_64-unknown-linux-gnu"
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
Expand Down
Expand Up @@ -21,7 +21,7 @@ target triple = "x86_64-unknown-linux-gnu"
; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 342 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
Expand Up @@ -13,24 +13,24 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 160 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1
; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX1: LV: Found an estimated cost of 81 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX2: LV: Found an estimated cost of 33 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX2: LV: Found an estimated cost of 81 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX2: LV: Found an estimated cost of 226 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX2: LV: Found an estimated cost of 166 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1
; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1
Expand Down
Expand Up @@ -21,9 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 49 for VF 8 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction: store i16 %v1, i16* %out1, align 2
; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction: store i16 %v1, i16* %out1, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, i16* %out1, align 2
; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, i16* %out1, align 2
Expand Down
Expand Up @@ -20,17 +20,17 @@ target triple = "x86_64-unknown-linux-gnu"
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 53 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 30 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 53 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
Expand Down
Expand Up @@ -20,10 +20,10 @@ target triple = "x86_64-unknown-linux-gnu"
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction: store i16 %v3, i16* %out3, align 2
; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction: store i16 %v3, i16* %out3, align 2
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, i16* %out3, align 2
; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v3, i16* %out3, align 2
Expand Down

0 comments on commit 17f1fc1

Please sign in to comment.