-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Use GFNI for vXi8 shifts/rotates #89115
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-analysis Author: Simon Pilgrim (RKSimon) ChangesAs detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h We can use the gf2p8affine instruction to lower byte shifts/rotates as well as the existing bitreverse case. There's a few other GFNI patterns we can probably handle - e.g. TZCNT/LZCNT were detailed on PR47394 Based off the original patch here: https://reviews.llvm.org/D137026 CC @shamithoke Patch is 160.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89115.diff 23 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 27107f554fccf1..63982ca4d35402 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28964,6 +28964,33 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
+// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
+uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
+ switch (Opcode) {
+ case ISD::BITREVERSE:
+ return 0x8040201008040201ULL;
+ case ISD::SHL:
+ assert((0 < Amt && Amt < 8) && "Shift amount out of range");
+ return ((0x0102040810204080ULL >> (Amt)) &
+ (0x0101010101010101ULL * (0xFF >> (Amt))));
+ case ISD::SRL:
+ assert((0 < Amt && Amt < 8) && "Shift amount out of range");
+ return ((0x0102040810204080ULL << (Amt)) &
+ (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
+ case ISD::SRA:
+ assert((0 < Amt && Amt < 8) && "Shift amount out of range");
+ return (getGFNICtrlImm(ISD::SRL, Amt) |
+ (0x8080808080808080ULL >> (64 - (8 * Amt))));
+ case ISD::ROTL:
+ assert((0 < Amt && Amt < 8) && "Rotate amount out of range");
+ return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
+ case ISD::ROTR:
+ assert((0 < Amt && Amt < 8) && "Rotate amount out of range");
+ return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
+ }
+ llvm_unreachable("Unsupported GFNI opcode");
+}
+
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
@@ -29151,6 +29178,14 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
+ if (Subtarget.hasGFNI()) {
+ uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
+ MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+ SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
+ DAG.getTargetConstant(0, dl, MVT::i8));
+ }
+
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
@@ -30022,6 +30057,18 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
}
+ // Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant.
+ if (IsCstSplat && Subtarget.hasGFNI() &&
+ (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasAVX()) ||
+ (VT == MVT::v64i8 && Subtarget.useBWIRegs()))) {
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+ uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
+ MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
+ DAG.getTargetConstant(0, DL, MVT::i8));
+ }
+
// Split 256-bit integers on XOP/pre-AVX2 targets.
if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
return splitVectorIntBinary(Op, DAG, DL);
@@ -31345,7 +31392,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
if (Subtarget.hasGFNI()) {
MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
- SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+ SDValue Matrix =
+ DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
Matrix = DAG.getBitcast(VT, Matrix);
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
DAG.getTargetConstant(0, DL, MVT::i8));
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d111c4d4ecc1ae..fb6d81ea61de27 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -345,6 +345,24 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
Op1Info.getNoProps(), Op2Info.getNoProps());
}
+ static const CostKindTblEntry GFNIUniformConstCostTable[] = {
+ { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ };
+
+ if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
+ if (const auto *Entry =
+ CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
+ if (auto KindCost = Entry->Cost[CostKind])
+ return LT.first * *KindCost;
+
static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
@@ -3860,6 +3878,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
{ ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
{ ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
+ { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
};
static const CostKindTblEntry GLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll
index a7585a4d9f39e1..71927002b599fd 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
index 7105f713fdc349..c40394ba9a7283 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
@@ -1549,9 +1549,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
index 5d7361e2931769..7b0daf50485505 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff 6f02120ac4463e5e0cda25e2aafc485a4fe634ea 931d21618cf90ca9dc818fe1382e5c9538af665a -- llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86TargetTransformInfo.cpp View the diff from clang-format here.diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ed9d59e4ad..b71ceb30c3 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -346,15 +346,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
}
static const CostKindTblEntry GFNIUniformConstCostTable[] = {
- { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ {ISD::SHL, MVT::v16i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SRL, MVT::v16i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SRA, MVT::v16i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SHL, MVT::v32i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SRL, MVT::v32i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SRA, MVT::v32i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SHL, MVT::v64i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SRL, MVT::v64i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::SRA, MVT::v64i8, {1, 6, 1, 2}}, // gf2p8affineqb
};
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
@@ -3870,25 +3870,25 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
};
static const CostKindTblEntry GFNICostTbl[] = {
- { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
- { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
- { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
- { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::i8, {3, 3, 3, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::i16, {3, 3, 4, 6}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::i32, {3, 3, 4, 5}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::i64, {3, 3, 4, 6}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v16i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v32i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v64i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v8i16, {1, 8, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v16i16, {1, 9, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v32i16, {1, 9, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v4i32, {1, 8, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v8i32, {1, 9, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v16i32, {1, 9, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v2i64, {1, 8, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v4i64, {1, 9, 2, 4}}, // gf2p8affineqb
+ {ISD::BITREVERSE, MVT::v8i64, {1, 9, 2, 4}}, // gf2p8affineqb
+ {X86ISD::VROTLI, MVT::v16i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {X86ISD::VROTLI, MVT::v32i8, {1, 6, 1, 2}}, // gf2p8affineqb
+ {X86ISD::VROTLI, MVT::v64i8, {1, 6, 1, 2}}, // gf2p8affineqb
};
static const CostKindTblEntry GLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
|
Once #89115 has landed, we can handle per-element shifts as well using (V)GF2P8MULB
Once #89115 has landed, we can handle per-element rotates as well using (V)GF2P8MULB
25e2381
to
158d9c3
Compare
…el shifts This patch moves the promotion to vXi16 shifts and the upper/lower bit masking into LowerFunnelShift for targets that have a bit-select instruction (XOP's VPCMOV and AVX512's VPTERNLOG). This prevents the regressions in #89115 due to the masking of ((X << V) | (Y >> (8-V))) vXi8 shifts.
…el shifts This patch moves the promotion to vXi16 shifts and the upper/lower bit masking into LowerFunnelShift for targets that have a bit-select instruction (XOP's VPCMOV and AVX512's VPTERNLOG). This prevents the regressions in llvm#89115 due to the masking of ((X << V) | (Y >> (8-V))) vXi8 shifts.
…overage Once #89115 has landed, we can handle per-element rotates as well using (V)GF2P8MULB
This looks net positive, but looks like there are some regressions related to demanded bits where we can actually just use a single 16-bit shift. Maybe add support analyzing these patterns in the x86 knownbits functions? Or maybe make this a tablgen pattern that matches the complete patterns? |
Which test changes are you worried about? |
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h These are a bit more complicated than gf2p8affine look ups, requiring us to convert a SHL shift value / amount into a GF so we can perform a multiplication. SRL/SRA need to be converted to SHL via bitreverse/variable-sign-extension. Followup to llvm#89115
The three outstanding comments. |
LGTM. Probably wait on 1 more review or 48hr. |
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h We can use the gf2p8affine instruction to lower byte shifts/rotates as well as the existing bitreverse case. There's a few other GFNI patterns we can probably handle - e.g. TZCNT/LZCNT were detailed on PR47394 Based off the original patch here: https://reviews.llvm.org/D137026
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h These are a bit more complicated than gf2p8affine look ups, requiring us to convert a SHL shift value / amount into a GF so we can perform a multiplication. SRL/SRA need to be converted to SHL via bitreverse/variable-sign-extension. Followup to llvm#89115
I'll rebase and commit tomorrow unless I hear anything more. |
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h These are a bit more complicated than gf2p8affine look ups, requiring us to convert a SHL shift value / amount into a GF so we can perform a multiplication. SRL/SRA need to be converted to SHL via bitreverse/variable-sign-extension. Followup to llvm#89115
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h
We can use the gf2p8affine instruction to lower byte shifts/rotates as well as the existing bitreverse case.
There's a few other GFNI patterns we can probably handle - e.g. TZCNT/LZCNT were detailed on PR47394
Based off the original patch here: https://reviews.llvm.org/D137026
CC @shamithoke