diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index a1ff20bb36121a..422bd11dca52e6 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -203,6 +203,22 @@ def FeatureLogicalFusion : SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true", "Target supports Logical Operations fusion", [FeatureFusion]>; +def FeatureSha3Fusion : + SubtargetFeature<"fuse-sha3", "HasSha3Fusion", "true", + "Target supports SHA3 assist fusion", + [FeatureFusion]>; +def FeatureCompareFusion: + SubtargetFeature<"fuse-cmp", "HasCompareFusion", "true", + "Target supports Comparison Operations fusion", + [FeatureFusion]>; +def FeatureWideImmFusion: + SubtargetFeature<"fuse-wideimm", "HasWideImmFusion", "true", + "Target supports Wide-Immediate fusion", + [FeatureFusion]>; +def FeatureZeroMoveFusion: + SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", + "Target supports move to SPR with branch fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; @@ -393,7 +409,7 @@ def ProcessorFeatures { // still exist with the exception of those we know are Power9 specific. 
list FusionFeatures = [ FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion, - FeatureLogicalFusion, FeatureArithAddFusion + FeatureLogicalFusion, FeatureArithAddFusion, FeatureSha3Fusion, ]; list P10AdditionalFeatures = !listconcat(FusionFeatures, [ diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp index bdff5109c1e134..9d5206f8fd43c0 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp @@ -149,6 +149,79 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd, case FusionFeature::FK_SldiAdd: return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) || (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57)); + + // rldicl rx, ra, 1, 0 - xor + case FusionFeature::FK_RotateLeftXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 0); + + // rldicr rx, ra, 1, 63 - xor + case FusionFeature::FK_RotateRightXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 63); + + // The opcodes used are CMPW*/CMPD*, so the instruction has no 'L' operand.
+ + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp1: + // { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } + // { ld,ldx } - cmpli 0,1,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp2: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + if (SecondMI.getOpcode() == PPC::CMPDI && + matchingImmOps(SecondMI, 2, -1, 16)) + return true; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1); + } + + // { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } + case FusionFeature::FK_LoadCmp3: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1) || + matchingImmOps(SecondMI, 2, -1, 16); + } + + // mtctr - { bcctr,bcctrl } + case FusionFeature::FK_ZeroMoveCTR: + // ( mtctr rx ) is alias of ( mtspr 9, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 9); + + // mtlr - { bclr,bclrl } + case FusionFeature::FK_ZeroMoveLR: + // ( mtlr rx ) is alias of ( mtspr 8, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 8); + + // addis rx,ra,si - addi rt,rx,SI, SI >= 0 + case FusionFeature::FK_AddisAddi: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = SecondMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + return SignExtend64(SI.getImm(), 16) >= 0; + } + + // addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 + case FusionFeature::FK_AddiAddis: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = 
FirstMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + int64_t ExtendedSI = SignExtend64(SI.getImm(), 16); + return ExtendedSI >= 2; + } } llvm_unreachable("All the cases should have been handled"); diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def index 469a24800423a7..e4954b722fd0f5 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -78,5 +78,80 @@ FUSION_FEATURE(VecLogical, hasLogicalFusion, -1, FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8)) +// rldicl rx, ra, 1, 0 - xor +FUSION_FEATURE(RotateLeftXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICL, RLDICL_32, RLDICL_32_64), + FUSION_OP_SET(XOR, XOR8)) + +// rldicr rx, ra, 1, 63 - xor +FUSION_FEATURE(RotateRightXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(XOR, XOR8)) + +// There are two special cases in the 'load-compare' series, so we have to +// split them into several pattern groups to fit into the current framework. +// This can be made clearer once we switch to a more expressive approach.
+ +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp1, hasCompareFusion, 1, + FUSION_OP_SET(LBZ, LBZ8, LBZX, LBZX8, LBZXTLS, LBZXTLS_, + LBZXTLS_32, LHZ, LHZ8, LHZX, LHZX8, LHZXTLS, + LHZXTLS_, LHZXTLS_32, LWZ, LWZ8, LWZX, LWZX8, + LWZXTLS, LWZXTLS_, LWZXTLS_32), + FUSION_OP_SET(CMPDI, CMPLDI, CMPLWI)) + +// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } +// { ld,ldx } - cmpli 0,1,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp2, hasCompareFusion, 1, + FUSION_OP_SET(LD, LDX, LDXTLS, LDXTLS_), + FUSION_OP_SET(CMPDI, CMPLDI)) + +// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } +FUSION_FEATURE(LoadCmp3, hasCompareFusion, 1, + FUSION_OP_SET(LHA, LHA8, LHAX, LHAX8, LWA, LWA_32, LWAX, + LWAX_32), + FUSION_OP_SET(CMPLDI, CMPLWI)) + +// ori - oris +FUSION_FEATURE(OriOris, hasWideImmFusion, 1, FUSION_OP_SET(ORI, ORI8), + FUSION_OP_SET(ORIS, ORIS8)) + +// lis - ori +FUSION_FEATURE(LisOri, hasWideImmFusion, 1, FUSION_OP_SET(LIS, LIS8), + FUSION_OP_SET(ORI, ORI8)) + +// oris - ori +FUSION_FEATURE(OrisOri, hasWideImmFusion, 1, FUSION_OP_SET(ORIS, ORIS8), + FUSION_OP_SET(ORI, ORI8)) + +// xori - xoris +FUSION_FEATURE(XoriXoris, hasWideImmFusion, 1, FUSION_OP_SET(XORI, XORI8), + FUSION_OP_SET(XORIS, XORIS8)) + +// xoris - xori +FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), + FUSION_OP_SET(XORI, XORI8)) + +// addis rx,ra,si - addi rt,rx,SI, SI >= 0 +FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL)) + +// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 +FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, + FUSION_OP_SET(ADDI, ADDI8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + +// mtctr - { bcctr,bcctrl } +FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTCTR, MTCTRloop, MTSPR8, MTSPR), + FUSION_OP_SET(BCCTR, BCCTRn, BCCTR8, BCCTR8n, BCCTRL, BCCTRLn, + BCCTRL8, BCCTRL8n, 
gBCCTR, gBCCTRL)) + +// mtlr - { bclr,bclrl } +FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), + FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index dfc29dbb10f19b..1258a1281597a4 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -131,6 +131,10 @@ void PPCSubtarget::initializeEnvironment() { HasAddLogicalFusion = false; HasLogicalAddFusion = false; HasLogicalFusion = false; + HasSha3Fusion = false; + HasCompareFusion = false; + HasWideImmFusion = false; + HasZeroMoveFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 783ea121ccb839..d52833cb1465de 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -151,6 +151,10 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasAddLogicalFusion; bool HasLogicalAddFusion; bool HasLogicalFusion; + bool HasSha3Fusion; + bool HasCompareFusion; + bool HasWideImmFusion; + bool HasZeroMoveFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -340,6 +344,10 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasAddLogicalFusion() const { return HasAddLogicalFusion; } bool hasLogicalAddFusion() const { return HasLogicalAddFusion; } bool hasLogicalFusion() const { return HasLogicalFusion; } + bool hasCompareFusion() const { return HasCompareFusion; } + bool hasWideImmFusion() const { return HasWideImmFusion; } + bool hasSha3Fusion() const { return HasSha3Fusion; } + bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/llvm/test/CodeGen/PowerPC/macro-fusion.mir 
b/llvm/test/CodeGen/PowerPC/macro-fusion.mir index 16391a2ab8fa2c..91c435d290ffc1 100644 --- a/llvm/test/CodeGen/PowerPC/macro-fusion.mir +++ b/llvm/test/CodeGen/PowerPC/macro-fusion.mir @@ -1,6 +1,7 @@ # REQUIRES: asserts # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -x=mir < %s \ # RUN: -debug-only=machine-scheduler -start-before=postmisched 2>&1 \ +# RUN: -mattr=+fuse-zeromove,+fuse-cmp,+fuse-wideimm \ # RUN: | FileCheck %s # CHECK: add_mulld:%bb.0 @@ -93,3 +94,55 @@ body: | renamable $x3 = ADD8 killed renamable $x4, $x5 BLR8 implicit $lr8, implicit $rm, implicit $x3 ... + +# CHECK: rldicl_xor:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / RLDICL - XOR8 +--- +name: rldicl_xor +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x4 = RLDICL $x3, 1, 0 + renamable $x3 = XOR8 killed renamable $x4, $x5 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: rldicr_xor:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / RLDICR - XOR8 +--- +name: rldicr_xor +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x4 = RLDICR $x3, 1, 63 + renamable $x3 = XOR8 killed renamable $x4, $x5 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: ori_oris:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / ORI8 - ORIS8 +--- +name: ori_oris +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4 + renamable $x4 = ORI8 $x3, 63 + renamable $x3 = ORIS8 killed renamable $x4, 20 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: load_cmp:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / LD - CMPDI +--- +name: load_cmp +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x3 = LD 0, killed renamable $x3 + renamable $cr0 = CMPDI killed renamable $x3, 0 + renamable $x3 = ISEL8 killed renamable $x5, killed renamable $x4, renamable $cr0lt, implicit killed $cr0 + BLR8 implicit $lr8, implicit $rm, implicit $x3