[SLP][TTI][AMDGPU] Add TTI hook preferSLPInstCountCheck for per-target opt-out #199696
Conversation
Created using spr 1.3.7
|
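@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: Add `preferSLPInstCountCheck()` to TTI so targets can opt out of the 2-element inst-count heuristic in `getTreeCost()`. Fixes #199662. Full diff: https://github.com/llvm/llvm-project/pull/199696.diff 7 Files Affected: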
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 333c5e4868395..7f3d12ef46c58 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1939,6 +1939,12 @@ class TargetTransformInfo {
/// vectorization, false - otherwise.
LLVM_ABI bool preferAlternateOpcodeVectorization() const;
+ /// \returns True if the SLP vectorizer should apply the instruction-count
+ /// check that rejects 2-element vector trees when the vector instruction
+ /// count exceeds the scalar instruction count, false if the target opts out
+ /// of this heuristic.
+ LLVM_ABI bool preferSLPInstCountCheck() const;
+
/// \returns True if the target prefers reductions of \p Kind to be performed
/// in the loop.
LLVM_ABI bool preferInLoopReduction(RecurKind Kind, Type *Ty) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 774e535775534..a4a8926c99fe3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1155,6 +1155,8 @@ class TargetTransformInfoImplBase {
}
virtual bool preferAlternateOpcodeVectorization() const { return true; }
+ virtual bool preferSLPInstCountCheck() const { return true; }
+
virtual bool preferPredicatedReductionSelect() const { return false; }
virtual bool preferEpilogueVectorization(ElementCount Iters) const {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 51221a6369e91..856950ccae595 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1473,6 +1473,10 @@ bool TargetTransformInfo::preferAlternateOpcodeVectorization() const {
return TTIImpl->preferAlternateOpcodeVectorization();
}
+bool TargetTransformInfo::preferSLPInstCountCheck() const {
+ return TTIImpl->preferSLPInstCountCheck();
+}
+
bool TargetTransformInfo::preferPredicatedReductionSelect() const {
return TTIImpl->preferPredicatedReductionSelect();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7a2d720e5c497..70871dde80ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -357,6 +357,15 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
: 1;
}
+bool GCNTTIImpl::preferSLPInstCountCheck() const {
+ // The integer inst-count heuristic causes regressions on gfx94x and gfx950
+ // because 2-element vector trees that pass the scalar/vector instruction
+ // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
+ // after codegen, increasing register pressure and throughput cost without
+ // reducing the total instruction count.
+ return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
+}
+
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e49881dee57db..4a239f4d6983d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -133,6 +133,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
unsigned getMinVectorRegisterBitWidth() const override;
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
+ bool preferSLPInstCountCheck() const override;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const override;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index caee72eb0547a..b5ccccb2d0208 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19420,7 +19420,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
// shuffles, inserts, and extracts.
// FIXME: remove this as soon as correct fractional model is landed for all
// targets.
- if (SLPInstCountCheck && VectorizableTree.front()->getVectorFactor() == 2 &&
+ if (SLPInstCountCheck && TTI->preferSLPInstCountCheck() &&
+ VectorizableTree.front()->getVectorFactor() == 2 &&
SLPCostThreshold == 0 &&
(!SLPReVec ||
!isa<VectorType>(
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
index 067e2c77c0624..ff77b7392a997 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
@@ -12,21 +12,26 @@ define amdgpu_kernel void @phi5_rotate(
; GFX950-LABEL: define amdgpu_kernel void @phi5_rotate(
; GFX950-SAME: ptr addrspace(1) captures(none) [[OUT:%.*]], i32 [[N:%.*]], i32 [[S0:%.*]], i32 [[S1:%.*]], i32 [[S2:%.*]], i32 [[S3:%.*]], i32 [[S4:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX950-NEXT: [[ENTRY:.*]]:
+; GFX950-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[S0]], i32 0
+; GFX950-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[S1]], i32 1
+; GFX950-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[S4]], i32 0
+; GFX950-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[S0]], i32 1
+; GFX950-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[S3]], i32 0
+; GFX950-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[S4]], i32 1
+; GFX950-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[S2]], i32 0
+; GFX950-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[S3]], i32 1
; GFX950-NEXT: br label %[[LOOP:.*]]
; GFX950: [[LOOP]]:
-; GFX950-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; GFX950-NEXT: [[X0:%.*]] = phi i32 [ [[S0]], %[[ENTRY]] ], [ [[X4:%.*]], %[[LOOP]] ]
-; GFX950-NEXT: [[X1:%.*]] = phi i32 [ [[S1]], %[[ENTRY]] ], [ [[X0]], %[[LOOP]] ]
-; GFX950-NEXT: [[X2:%.*]] = phi i32 [ [[S2]], %[[ENTRY]] ], [ [[X1]], %[[LOOP]] ]
-; GFX950-NEXT: [[X3:%.*]] = phi i32 [ [[S3]], %[[ENTRY]] ], [ [[X2]], %[[LOOP]] ]
-; GFX950-NEXT: [[X4]] = phi i32 [ [[S4]], %[[ENTRY]] ], [ [[X3]], %[[LOOP]] ]
-; GFX950-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I]]
-; GFX950-NEXT: store i32 [[X0]], ptr addrspace(1) [[GEP0]], align 4
-; GFX950-NEXT: [[I1:%.*]] = or disjoint i32 [[I]], 1
+; GFX950-NEXT: [[I1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[LOOP]] ]
; GFX950-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I1]]
-; GFX950-NEXT: store i32 [[X1]], ptr addrspace(1) [[GEP1]], align 4
-; GFX950-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 2
+; GFX950-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[GEP1]], align 4
+; GFX950-NEXT: [[I_NEXT]] = add nuw i32 [[I1]], 2
; GFX950-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; GFX950-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 2>
; GFX950-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
; GFX950: [[EXIT]]:
; GFX950-NEXT: ret void
@@ -34,21 +39,26 @@ define amdgpu_kernel void @phi5_rotate(
; GFX942-LABEL: define amdgpu_kernel void @phi5_rotate(
; GFX942-SAME: ptr addrspace(1) captures(none) [[OUT:%.*]], i32 [[N:%.*]], i32 [[S0:%.*]], i32 [[S1:%.*]], i32 [[S2:%.*]], i32 [[S3:%.*]], i32 [[S4:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX942-NEXT: [[ENTRY:.*]]:
+; GFX942-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[S0]], i32 0
+; GFX942-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[S1]], i32 1
+; GFX942-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[S4]], i32 0
+; GFX942-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[S0]], i32 1
+; GFX942-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[S3]], i32 0
+; GFX942-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[S4]], i32 1
+; GFX942-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[S2]], i32 0
+; GFX942-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[S3]], i32 1
; GFX942-NEXT: br label %[[LOOP:.*]]
; GFX942: [[LOOP]]:
-; GFX942-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; GFX942-NEXT: [[X0:%.*]] = phi i32 [ [[S0]], %[[ENTRY]] ], [ [[X4:%.*]], %[[LOOP]] ]
-; GFX942-NEXT: [[X1:%.*]] = phi i32 [ [[S1]], %[[ENTRY]] ], [ [[X0]], %[[LOOP]] ]
-; GFX942-NEXT: [[X2:%.*]] = phi i32 [ [[S2]], %[[ENTRY]] ], [ [[X1]], %[[LOOP]] ]
-; GFX942-NEXT: [[X3:%.*]] = phi i32 [ [[S3]], %[[ENTRY]] ], [ [[X2]], %[[LOOP]] ]
-; GFX942-NEXT: [[X4]] = phi i32 [ [[S4]], %[[ENTRY]] ], [ [[X3]], %[[LOOP]] ]
-; GFX942-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I]]
-; GFX942-NEXT: store i32 [[X0]], ptr addrspace(1) [[GEP0]], align 4
-; GFX942-NEXT: [[I1:%.*]] = or disjoint i32 [[I]], 1
+; GFX942-NEXT: [[I1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[LOOP]] ]
; GFX942-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I1]]
-; GFX942-NEXT: store i32 [[X1]], ptr addrspace(1) [[GEP1]], align 4
-; GFX942-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 2
+; GFX942-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[GEP1]], align 4
+; GFX942-NEXT: [[I_NEXT]] = add nuw i32 [[I1]], 2
; GFX942-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; GFX942-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 2>
; GFX942-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
; GFX942: [[EXIT]]:
; GFX942-NEXT: ret void
|
|
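@llvm/pr-subscribers-backend-amdgpu Author: Alexey Bataev (alexey-bataev) Changes: Add `preferSLPInstCountCheck()` to TTI so targets can opt out of the 2-element inst-count heuristic in `getTreeCost()`. Fixes #199662. Full diff: https://github.com/llvm/llvm-project/pull/199696.diff 7 Files Affected: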
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 333c5e4868395..7f3d12ef46c58 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1939,6 +1939,12 @@ class TargetTransformInfo {
/// vectorization, false - otherwise.
LLVM_ABI bool preferAlternateOpcodeVectorization() const;
+ /// \returns True if the SLP vectorizer should apply the instruction-count
+ /// check that rejects 2-element vector trees when the vector instruction
+ /// count exceeds the scalar instruction count, false if the target opts out
+ /// of this heuristic.
+ LLVM_ABI bool preferSLPInstCountCheck() const;
+
/// \returns True if the target prefers reductions of \p Kind to be performed
/// in the loop.
LLVM_ABI bool preferInLoopReduction(RecurKind Kind, Type *Ty) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 774e535775534..a4a8926c99fe3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1155,6 +1155,8 @@ class TargetTransformInfoImplBase {
}
virtual bool preferAlternateOpcodeVectorization() const { return true; }
+ virtual bool preferSLPInstCountCheck() const { return true; }
+
virtual bool preferPredicatedReductionSelect() const { return false; }
virtual bool preferEpilogueVectorization(ElementCount Iters) const {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 51221a6369e91..856950ccae595 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1473,6 +1473,10 @@ bool TargetTransformInfo::preferAlternateOpcodeVectorization() const {
return TTIImpl->preferAlternateOpcodeVectorization();
}
+bool TargetTransformInfo::preferSLPInstCountCheck() const {
+ return TTIImpl->preferSLPInstCountCheck();
+}
+
bool TargetTransformInfo::preferPredicatedReductionSelect() const {
return TTIImpl->preferPredicatedReductionSelect();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7a2d720e5c497..70871dde80ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -357,6 +357,15 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
: 1;
}
+bool GCNTTIImpl::preferSLPInstCountCheck() const {
+ // The integer inst-count heuristic causes regressions on gfx94x and gfx950
+ // because 2-element vector trees that pass the scalar/vector instruction
+ // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
+ // after codegen, increasing register pressure and throughput cost without
+ // reducing the total instruction count.
+ return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
+}
+
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e49881dee57db..4a239f4d6983d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -133,6 +133,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
unsigned getMinVectorRegisterBitWidth() const override;
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
+ bool preferSLPInstCountCheck() const override;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const override;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index caee72eb0547a..b5ccccb2d0208 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19420,7 +19420,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
// shuffles, inserts, and extracts.
// FIXME: remove this as soon as correct fractional model is landed for all
// targets.
- if (SLPInstCountCheck && VectorizableTree.front()->getVectorFactor() == 2 &&
+ if (SLPInstCountCheck && TTI->preferSLPInstCountCheck() &&
+ VectorizableTree.front()->getVectorFactor() == 2 &&
SLPCostThreshold == 0 &&
(!SLPReVec ||
!isa<VectorType>(
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
index 067e2c77c0624..ff77b7392a997 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
@@ -12,21 +12,26 @@ define amdgpu_kernel void @phi5_rotate(
; GFX950-LABEL: define amdgpu_kernel void @phi5_rotate(
; GFX950-SAME: ptr addrspace(1) captures(none) [[OUT:%.*]], i32 [[N:%.*]], i32 [[S0:%.*]], i32 [[S1:%.*]], i32 [[S2:%.*]], i32 [[S3:%.*]], i32 [[S4:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX950-NEXT: [[ENTRY:.*]]:
+; GFX950-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[S0]], i32 0
+; GFX950-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[S1]], i32 1
+; GFX950-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[S4]], i32 0
+; GFX950-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[S0]], i32 1
+; GFX950-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[S3]], i32 0
+; GFX950-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[S4]], i32 1
+; GFX950-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[S2]], i32 0
+; GFX950-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[S3]], i32 1
; GFX950-NEXT: br label %[[LOOP:.*]]
; GFX950: [[LOOP]]:
-; GFX950-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; GFX950-NEXT: [[X0:%.*]] = phi i32 [ [[S0]], %[[ENTRY]] ], [ [[X4:%.*]], %[[LOOP]] ]
-; GFX950-NEXT: [[X1:%.*]] = phi i32 [ [[S1]], %[[ENTRY]] ], [ [[X0]], %[[LOOP]] ]
-; GFX950-NEXT: [[X2:%.*]] = phi i32 [ [[S2]], %[[ENTRY]] ], [ [[X1]], %[[LOOP]] ]
-; GFX950-NEXT: [[X3:%.*]] = phi i32 [ [[S3]], %[[ENTRY]] ], [ [[X2]], %[[LOOP]] ]
-; GFX950-NEXT: [[X4]] = phi i32 [ [[S4]], %[[ENTRY]] ], [ [[X3]], %[[LOOP]] ]
-; GFX950-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I]]
-; GFX950-NEXT: store i32 [[X0]], ptr addrspace(1) [[GEP0]], align 4
-; GFX950-NEXT: [[I1:%.*]] = or disjoint i32 [[I]], 1
+; GFX950-NEXT: [[I1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[LOOP]] ]
; GFX950-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I1]]
-; GFX950-NEXT: store i32 [[X1]], ptr addrspace(1) [[GEP1]], align 4
-; GFX950-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 2
+; GFX950-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[GEP1]], align 4
+; GFX950-NEXT: [[I_NEXT]] = add nuw i32 [[I1]], 2
; GFX950-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; GFX950-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 2>
; GFX950-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
; GFX950: [[EXIT]]:
; GFX950-NEXT: ret void
@@ -34,21 +39,26 @@ define amdgpu_kernel void @phi5_rotate(
; GFX942-LABEL: define amdgpu_kernel void @phi5_rotate(
; GFX942-SAME: ptr addrspace(1) captures(none) [[OUT:%.*]], i32 [[N:%.*]], i32 [[S0:%.*]], i32 [[S1:%.*]], i32 [[S2:%.*]], i32 [[S3:%.*]], i32 [[S4:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX942-NEXT: [[ENTRY:.*]]:
+; GFX942-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[S0]], i32 0
+; GFX942-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[S1]], i32 1
+; GFX942-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[S4]], i32 0
+; GFX942-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[S0]], i32 1
+; GFX942-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[S3]], i32 0
+; GFX942-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[S4]], i32 1
+; GFX942-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[S2]], i32 0
+; GFX942-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[S3]], i32 1
; GFX942-NEXT: br label %[[LOOP:.*]]
; GFX942: [[LOOP]]:
-; GFX942-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; GFX942-NEXT: [[X0:%.*]] = phi i32 [ [[S0]], %[[ENTRY]] ], [ [[X4:%.*]], %[[LOOP]] ]
-; GFX942-NEXT: [[X1:%.*]] = phi i32 [ [[S1]], %[[ENTRY]] ], [ [[X0]], %[[LOOP]] ]
-; GFX942-NEXT: [[X2:%.*]] = phi i32 [ [[S2]], %[[ENTRY]] ], [ [[X1]], %[[LOOP]] ]
-; GFX942-NEXT: [[X3:%.*]] = phi i32 [ [[S3]], %[[ENTRY]] ], [ [[X2]], %[[LOOP]] ]
-; GFX942-NEXT: [[X4]] = phi i32 [ [[S4]], %[[ENTRY]] ], [ [[X3]], %[[LOOP]] ]
-; GFX942-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I]]
-; GFX942-NEXT: store i32 [[X0]], ptr addrspace(1) [[GEP0]], align 4
-; GFX942-NEXT: [[I1:%.*]] = or disjoint i32 [[I]], 1
+; GFX942-NEXT: [[I1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[LOOP]] ]
; GFX942-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I1]]
-; GFX942-NEXT: store i32 [[X1]], ptr addrspace(1) [[GEP1]], align 4
-; GFX942-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 2
+; GFX942-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[GEP1]], align 4
+; GFX942-NEXT: [[I_NEXT]] = add nuw i32 [[I1]], 2
; GFX942-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; GFX942-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 2>
; GFX942-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
; GFX942: [[EXIT]]:
; GFX942-NEXT: ret void
|
|
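@llvm/pr-subscribers-vectorizers Author: Alexey Bataev (alexey-bataev) Changes: Add `preferSLPInstCountCheck()` to TTI so targets can opt out of the 2-element inst-count heuristic in `getTreeCost()`. Fixes #199662. Full diff: https://github.com/llvm/llvm-project/pull/199696.diff 7 Files Affected: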
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 333c5e4868395..7f3d12ef46c58 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1939,6 +1939,12 @@ class TargetTransformInfo {
/// vectorization, false - otherwise.
LLVM_ABI bool preferAlternateOpcodeVectorization() const;
+ /// \returns True if the SLP vectorizer should apply the instruction-count
+ /// check that rejects 2-element vector trees when the vector instruction
+ /// count exceeds the scalar instruction count, false if the target opts out
+ /// of this heuristic.
+ LLVM_ABI bool preferSLPInstCountCheck() const;
+
/// \returns True if the target prefers reductions of \p Kind to be performed
/// in the loop.
LLVM_ABI bool preferInLoopReduction(RecurKind Kind, Type *Ty) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 774e535775534..a4a8926c99fe3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1155,6 +1155,8 @@ class TargetTransformInfoImplBase {
}
virtual bool preferAlternateOpcodeVectorization() const { return true; }
+ virtual bool preferSLPInstCountCheck() const { return true; }
+
virtual bool preferPredicatedReductionSelect() const { return false; }
virtual bool preferEpilogueVectorization(ElementCount Iters) const {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 51221a6369e91..856950ccae595 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1473,6 +1473,10 @@ bool TargetTransformInfo::preferAlternateOpcodeVectorization() const {
return TTIImpl->preferAlternateOpcodeVectorization();
}
+bool TargetTransformInfo::preferSLPInstCountCheck() const {
+ return TTIImpl->preferSLPInstCountCheck();
+}
+
bool TargetTransformInfo::preferPredicatedReductionSelect() const {
return TTIImpl->preferPredicatedReductionSelect();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7a2d720e5c497..70871dde80ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -357,6 +357,15 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
: 1;
}
+bool GCNTTIImpl::preferSLPInstCountCheck() const {
+ // The integer inst-count heuristic causes regressions on gfx94x and gfx950
+ // because 2-element vector trees that pass the scalar/vector instruction
+ // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
+ // after codegen, increasing register pressure and throughput cost without
+ // reducing the total instruction count.
+ return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
+}
+
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e49881dee57db..4a239f4d6983d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -133,6 +133,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
unsigned getMinVectorRegisterBitWidth() const override;
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
+ bool preferSLPInstCountCheck() const override;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const override;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index caee72eb0547a..b5ccccb2d0208 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19420,7 +19420,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
// shuffles, inserts, and extracts.
// FIXME: remove this as soon as correct fractional model is landed for all
// targets.
- if (SLPInstCountCheck && VectorizableTree.front()->getVectorFactor() == 2 &&
+ if (SLPInstCountCheck && TTI->preferSLPInstCountCheck() &&
+ VectorizableTree.front()->getVectorFactor() == 2 &&
SLPCostThreshold == 0 &&
(!SLPReVec ||
!isa<VectorType>(
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
index 067e2c77c0624..ff77b7392a997 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/inst-count-heuristic.ll
@@ -12,21 +12,26 @@ define amdgpu_kernel void @phi5_rotate(
; GFX950-LABEL: define amdgpu_kernel void @phi5_rotate(
; GFX950-SAME: ptr addrspace(1) captures(none) [[OUT:%.*]], i32 [[N:%.*]], i32 [[S0:%.*]], i32 [[S1:%.*]], i32 [[S2:%.*]], i32 [[S3:%.*]], i32 [[S4:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX950-NEXT: [[ENTRY:.*]]:
+; GFX950-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[S0]], i32 0
+; GFX950-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[S1]], i32 1
+; GFX950-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[S4]], i32 0
+; GFX950-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[S0]], i32 1
+; GFX950-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[S3]], i32 0
+; GFX950-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[S4]], i32 1
+; GFX950-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[S2]], i32 0
+; GFX950-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[S3]], i32 1
; GFX950-NEXT: br label %[[LOOP:.*]]
; GFX950: [[LOOP]]:
-; GFX950-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; GFX950-NEXT: [[X0:%.*]] = phi i32 [ [[S0]], %[[ENTRY]] ], [ [[X4:%.*]], %[[LOOP]] ]
-; GFX950-NEXT: [[X1:%.*]] = phi i32 [ [[S1]], %[[ENTRY]] ], [ [[X0]], %[[LOOP]] ]
-; GFX950-NEXT: [[X2:%.*]] = phi i32 [ [[S2]], %[[ENTRY]] ], [ [[X1]], %[[LOOP]] ]
-; GFX950-NEXT: [[X3:%.*]] = phi i32 [ [[S3]], %[[ENTRY]] ], [ [[X2]], %[[LOOP]] ]
-; GFX950-NEXT: [[X4]] = phi i32 [ [[S4]], %[[ENTRY]] ], [ [[X3]], %[[LOOP]] ]
-; GFX950-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I]]
-; GFX950-NEXT: store i32 [[X0]], ptr addrspace(1) [[GEP0]], align 4
-; GFX950-NEXT: [[I1:%.*]] = or disjoint i32 [[I]], 1
+; GFX950-NEXT: [[I1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; GFX950-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[LOOP]] ]
; GFX950-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I1]]
-; GFX950-NEXT: store i32 [[X1]], ptr addrspace(1) [[GEP1]], align 4
-; GFX950-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 2
+; GFX950-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[GEP1]], align 4
+; GFX950-NEXT: [[I_NEXT]] = add nuw i32 [[I1]], 2
; GFX950-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; GFX950-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 2>
; GFX950-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
; GFX950: [[EXIT]]:
; GFX950-NEXT: ret void
@@ -34,21 +39,26 @@ define amdgpu_kernel void @phi5_rotate(
; GFX942-LABEL: define amdgpu_kernel void @phi5_rotate(
; GFX942-SAME: ptr addrspace(1) captures(none) [[OUT:%.*]], i32 [[N:%.*]], i32 [[S0:%.*]], i32 [[S1:%.*]], i32 [[S2:%.*]], i32 [[S3:%.*]], i32 [[S4:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX942-NEXT: [[ENTRY:.*]]:
+; GFX942-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[S0]], i32 0
+; GFX942-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[S1]], i32 1
+; GFX942-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[S4]], i32 0
+; GFX942-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[S0]], i32 1
+; GFX942-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[S3]], i32 0
+; GFX942-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[S4]], i32 1
+; GFX942-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[S2]], i32 0
+; GFX942-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[S3]], i32 1
; GFX942-NEXT: br label %[[LOOP:.*]]
; GFX942: [[LOOP]]:
-; GFX942-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; GFX942-NEXT: [[X0:%.*]] = phi i32 [ [[S0]], %[[ENTRY]] ], [ [[X4:%.*]], %[[LOOP]] ]
-; GFX942-NEXT: [[X1:%.*]] = phi i32 [ [[S1]], %[[ENTRY]] ], [ [[X0]], %[[LOOP]] ]
-; GFX942-NEXT: [[X2:%.*]] = phi i32 [ [[S2]], %[[ENTRY]] ], [ [[X1]], %[[LOOP]] ]
-; GFX942-NEXT: [[X3:%.*]] = phi i32 [ [[S3]], %[[ENTRY]] ], [ [[X2]], %[[LOOP]] ]
-; GFX942-NEXT: [[X4]] = phi i32 [ [[S4]], %[[ENTRY]] ], [ [[X3]], %[[LOOP]] ]
-; GFX942-NEXT: [[GEP0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I]]
-; GFX942-NEXT: store i32 [[X0]], ptr addrspace(1) [[GEP0]], align 4
-; GFX942-NEXT: [[I1:%.*]] = or disjoint i32 [[I]], 1
+; GFX942-NEXT: [[I1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP9:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
+; GFX942-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP12:%.*]], %[[LOOP]] ]
; GFX942-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[I1]]
-; GFX942-NEXT: store i32 [[X1]], ptr addrspace(1) [[GEP1]], align 4
-; GFX942-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 2
+; GFX942-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[GEP1]], align 4
+; GFX942-NEXT: [[I_NEXT]] = add nuw i32 [[I1]], 2
; GFX942-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; GFX942-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 2>
; GFX942-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
; GFX942: [[EXIT]]:
; GFX942-NEXT: ret void
|
| } | ||
|
|
||
| bool GCNTTIImpl::preferSLPInstCountCheck() const { | ||
| // The integer inst-count heuristic causes regressions on gfx94x and gfx950 |
There was a problem hiding this comment.
This depends on what the element type is, at least? And based on the comment it would be based on VMovB64Inst?
There was a problem hiding this comment.
Don't know. If you need to tune it more, provide some more details or do it later.
There was a problem hiding this comment.
@arsenm I asked @alexey-bataev to implement the hooks. I'll work on fine-tuning when needed.
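Add `preferSLPInstCountCheck()` to TTI so targets can opt out of the 2-element inst-count heuristic in `getTreeCost()`. Default returns `true` (existing behavior unchanged).
`getTreeCost()` ANDs the existing `SLPInstCountCheck` flag with the hook result. `GCNTTIImpl` returns `false` for gfx940/gfx941/gfx942 and gfx950.
Fixes #199662.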
getTreeCost()ANDs the existingSLPInstCountCheckflag with the hook result.GCNTTIImplreturnsfalsefor gfx940/gfx941/gfx942 and gfx950.Fixes #199662.