-
Notifications
You must be signed in to change notification settings - Fork 15.3k
AMDGPU: Improve getShuffleCost accuracy for 8- and 16-bit shuffles #168818
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-analysis Author: Nicolai Hähnle (nhaehnle) ChangesThese shuffles can always be implemented using v_perm_b32, and so this The test changes in Transforms/SLPVectorizer/reduction.ll are Stack:
Patch is 233.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168818.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..e2cf78a6c0158 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1241,46 +1241,108 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
- unsigned RequestedElts =
- count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
- unsigned EltsPerReg = 32 / ScalarSize;
- if (RequestedElts == 0)
+ //
+ // We assume that shuffling at a register granularity can be done for free.
+ // This is not true for vectors fed into memory instructions, but it is
+ // effectively true for all other shuffling. The emphasis of the logic here
+ // is to assist generic transform in cleaning up / canonicalizing those
+ // shuffles.
+ unsigned NumDstElts = cast<FixedVectorType>(DstTy)->getNumElements();
+ unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+
+ // With op_sel VOP3P instructions freely can access the low half or high
+ // half of a register, so any swizzle of two elements is free.
+ if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
+ (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
+ Kind == TTI::SK_PermuteSingleSrc))
return 0;
+
+ unsigned EltsPerReg = 32 / ScalarSize;
switch (Kind) {
case TTI::SK_Broadcast:
+ // A single v_perm_b32 can be re-used for all destination registers.
+ return 1;
case TTI::SK_Reverse:
- case TTI::SK_PermuteSingleSrc: {
- // With op_sel VOP3P instructions freely can access the low half or high
- // half of a register, so any swizzle of two elements is free.
- if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
- return 0;
- unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
- // SK_Broadcast just reuses the same mask
- unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
- return NumPerms + NumPermMasks;
- }
+ // One instruction per register.
+ return divideCeil(NumDstElts, EltsPerReg);
case TTI::SK_ExtractSubvector:
+ if (Index % EltsPerReg == 0)
+ return 0; // Shuffling at register granularity
+ return divideCeil(NumDstElts, EltsPerReg);
case TTI::SK_InsertSubvector: {
- // Even aligned accesses are free
- if (!(Index % 2))
- return 0;
- // Insert/extract subvectors only require shifts / extract code to get the
- // relevant bits
- return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
+ unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ unsigned EndIndex = Index + NumInsertElts;
+ unsigned BeginSubIdx = Index % EltsPerReg;
+ unsigned EndSubIdx = EndIndex % EltsPerReg;
+ unsigned Cost = 0;
+
+ if (BeginSubIdx != 0) {
+ // Need to shift the inserted vector into place. The cost is the number
+ // of destination registers overlapped by the inserted vector.
+ Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
+ }
+
+ // If the last register overlap is partial, there may be three source
+ // registers feeding into it; that takes an extra instruction.
+ if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
+ Cost += 1;
+
+ return Cost;
}
- case TTI::SK_PermuteTwoSrc:
- case TTI::SK_Splice:
- case TTI::SK_Select: {
- unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
- // SK_Select just reuses the same mask
- unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
- return NumPerms + NumPermMasks;
+ case TTI::SK_Splice: {
+ assert(NumDstElts == NumSrcElts);
+ // Determine the sub-region of the result vector that requires
+ // sub-register shuffles / mixing.
+ unsigned EltsFromLHS = NumSrcElts - Index;
+ bool LHSIsAligned = (Index % EltsPerReg) == 0;
+ bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
+ if (LHSIsAligned && RHSIsAligned)
+ return 0;
+ if (LHSIsAligned && !RHSIsAligned)
+ return divideCeil(NumDstElts - EltsFromLHS, EltsPerReg);
+ if (!LHSIsAligned && RHSIsAligned)
+ return divideCeil(EltsFromLHS, EltsPerReg);
+ return divideCeil(NumDstElts, EltsPerReg);
}
-
default:
break;
}
+
+ if (!Mask.empty()) {
+ // Generically estimate the cost by assuming that each destination
+ // register is derived from sources via v_perm_b32 instructions if it
+ // can't be copied as-is.
+ //
+ // For each destination register, derive the cost of obtaining it based
+ // on the number of source registers that feed into it.
+ unsigned Cost = 0;
+ for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
+ SmallVector<int, 4> Regs;
+ bool Aligned = true;
+ for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
+ int SrcIdx = Mask[DstIdx + I];
+ if (SrcIdx == -1)
+ continue;
+ int Reg;
+ if (SrcIdx < (int)NumSrcElts) {
+ Reg = SrcIdx / EltsPerReg;
+ if (SrcIdx % EltsPerReg != I)
+ Aligned = false;
+ } else {
+ Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
+ if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
+ Aligned = false;
+ }
+ if (!llvm::is_contained(Regs, Reg))
+ Regs.push_back(Reg);
+ }
+ if (Regs.size() >= 2)
+ Cost += Regs.size() - 1;
+ else if (!Aligned)
+ Cost += 1;
+ }
+ return Cost;
+ }
}
return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index b66e19e8bc563..78d43e8949269 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -13,153 +13,153 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruct...
[truncated]
|
🐧 Linux x64 Test Results
|
| unsigned NumDstElts = cast<FixedVectorType>(DstTy)->getNumElements(); | ||
| unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you keep this in terms of ElementCount to avoid crashing on scalable vectors
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not really clear to me what most of these are meant to mean for scalable vectors, so instead of doing it in terms of ElementCount I'm adding type checks and returning InstructionCost::getInvalid() if it's not fixed vectors, which matches what AArch64 does at least.
| if (Regs.size() >= 2) | ||
| Cost += Regs.size() - 1; | ||
| else if (!Aligned) | ||
| Cost += 1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this accounting for?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the case where this destination register needs bytes from only a single source register, but those bytes aren't at the same position. So, one instruction is needed to produce the destination register.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense. Could do with a comment.
0be4495 to
aa6362b
Compare
jayfoad
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
These shuffles can always be implemented using v_perm_b32, and so this rewrites the analysis from the perspective of "how many v_perm_b32s does it take to assemble each register of the result?" The test changes in Transforms/SLPVectorizer/reduction.ll are reasonable: VI (gfx8) has native f16 math, but not packed math. commit-id:8b76e888
aa6362b to
d5944fb
Compare
These shuffles can always be implemented using v_perm_b32, and so this
rewrites the analysis from the perspective of "how many v_perm_b32s does
it take to assemble each register of the result?"
The test changes in Transforms/SLPVectorizer/reduction.ll are
reasonable: VI (gfx8) has native f16 math, but not packed math.
Stack: