-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Macro-fusion support for veyron-v1 CPU. #70012
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Mikhail Gudim (mgudim) ChangesSupport was added for the following fusions: Full diff: https://github.com/llvm/llvm-project/pull/70012.diff 4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 979bc0ea8c7d065..13565ed361dbc2d 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -937,6 +937,16 @@ def TuneLUIADDIFusion
: SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion",
"true", "Enable LUI+ADDI macrofusion">;
+def TuneAUIPCADDIFusion
+ : SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion",
+ "true", "Enable AUIPC+ADDI macrofusion">;
+def TuneSLLISRLIFusion
+ : SubtargetFeature<"slli-srli-fusion", "HasSLLISRLIFusion",
+ "true", "Enable SLLI+SRLI macrofusion">;
+def TuneLDADDFusion
+ : SubtargetFeature<"ld-add-fusion", "HasLDADDFusion",
+ "true", "Enable fusion of load with the last instruction of the address calculation">;
+
def TuneNoDefaultUnroll
: SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false",
"Disable default unroll preference.">;
@@ -954,6 +964,14 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
[TuneNoDefaultUnroll,
TuneShortForwardBranchOpt]>;
+def TuneVeyronFusions : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
+ "Ventana Veyron-Series processors",
+ [TuneLUIADDIFusion,
+ TuneAUIPCADDIFusion,
+ TuneSLLISRLIFusion,
+ TuneLDADDFusion]>;
+
+
// Assume that lock-free native-width atomics are available, even if the target
// and operating system combination would not usually provide them. The user
// is responsible for providing any necessary __sync implementations. Code
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
index 02a8d5c18fe1a0e..c33b3503aed0f97 100644
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
@@ -18,6 +18,90 @@
using namespace llvm;
+static bool checkRegisters(Register FirstDest, const MachineInstr &SecondMI) {
+ if (SecondMI.getOperand(1).getReg() != FirstDest)
+ return false;
+
+ // If the input is virtual make sure this is the only user.
+ if (FirstDest.isVirtual()) {
+ auto &MRI = SecondMI.getMF()->getRegInfo();
+ return MRI.hasOneNonDBGUse(FirstDest);
+ }
+
+ return SecondMI.getOperand(0).getReg() == FirstDest;
+}
+
+// Fuse Load
+static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() != RISCV::LD)
+ return false;
+
+ if (!SecondMI.getOperand(2).isImm())
+ return false;
+
+ if (SecondMI.getOperand(2).getImm() != 0)
+ return false;
+
+ // Given SecondMI, when FirstMI is unspecified, we must return
+ // if SecondMI may be part of a fused pair at all.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->getOpcode() != RISCV::ADD)
+ return true;
+
+ return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
+}
+
+// Fuse SLLI by 32 feeding into SRLI by 32 or less or
+// SLLI by exactly 48 feeding into SRLI by exactly 48.
+static bool isSLLISRLI(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() != RISCV::SRLI)
+ return false;
+
+ if (!SecondMI.getOperand(2).isImm())
+ return false;
+
+ unsigned SRLIImm = SecondMI.getOperand(2).getImm();
+ bool IsShiftBy48 = SRLIImm == 48;
+ if (SRLIImm > 32 && !IsShiftBy48)
+ return false;
+
+ // Given SecondMI, when FirstMI is unspecified, we must return
+ // if SecondMI may be part of a fused pair at all.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->getOpcode() != RISCV::SLLI)
+ return false;
+
+ unsigned SLLIImm = FirstMI->getOperand(2).getImm();
+ if (IsShiftBy48 ? (SLLIImm != 48) : (SLLIImm > 32))
+ return false;
+
+ return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
+}
+
+// Fuse AUIPC followed by ADDI
+static bool isAUIPCADDI(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() != RISCV::ADDI)
+ return false;
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->getOpcode() != RISCV::AUIPC)
+ return false;
+
+ // The first operand of ADDI might be a frame index.
+ if (!SecondMI.getOperand(1).isReg())
+ return false;
+
+ return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
+}
+
// Fuse LUI followed by ADDI or ADDIW.
// rd = imm[31:0] which decomposes to
// lui rd, imm[31:12]
@@ -27,7 +111,6 @@ static bool isLUIADDI(const MachineInstr *FirstMI,
if (SecondMI.getOpcode() != RISCV::ADDI &&
SecondMI.getOpcode() != RISCV::ADDIW)
return false;
-
// Assume the 1st instr to be a wildcard if it is unspecified.
if (!FirstMI)
return true;
@@ -35,21 +118,11 @@ static bool isLUIADDI(const MachineInstr *FirstMI,
if (FirstMI->getOpcode() != RISCV::LUI)
return false;
- Register FirstDest = FirstMI->getOperand(0).getReg();
-
- // Destination of LUI should be the ADDI(W) source register.
- if (SecondMI.getOperand(1).getReg() != FirstDest)
+ // The first operand of ADDI might be a frame index.
+ if (!SecondMI.getOperand(1).isReg())
return false;
- // If the input is virtual make sure this is the only user.
- if (FirstDest.isVirtual()) {
- auto &MRI = SecondMI.getMF()->getRegInfo();
- return MRI.hasOneNonDBGUse(FirstDest);
- }
-
- // If the FirstMI destination is non-virtual, it should match the SecondMI
- // destination.
- return SecondMI.getOperand(0).getReg() == FirstDest;
+ return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
}
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
@@ -61,6 +134,15 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
if (ST.hasLUIADDIFusion() && isLUIADDI(FirstMI, SecondMI))
return true;
+ if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI))
+ return true;
+
+ if (ST.hasSLLISRLIFusion() && isSLLISRLI(FirstMI, SecondMI))
+ return true;
+
+ if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI))
+ return true;
+
return false;
}
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index e4008d145ffa572..3a242e20edb5aa4 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -242,4 +242,5 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
FeatureStdExtZicbom,
FeatureStdExtZicbop,
FeatureStdExtZicboz,
- FeatureVendorXVentanaCondOps]>;
+ FeatureVendorXVentanaCondOps,
+ TuneVeyronFusions]>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 6b915e61c136086..ed23b15bbca9bb5 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -182,7 +182,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
return UserReservedRegister[i];
}
- bool hasMacroFusion() const { return hasLUIADDIFusion(); }
+ bool hasMacroFusion() const {
+ return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasSLLISRLIFusion() ||
+ hasLDADDFusion();
+ }
// Vector codegen related methods.
bool hasVInstructions() const { return HasStdExtZve32x; }
|
As far as I know, there may be up to 30+ instruction pairs that can be fused for some macroarchitecture (for example, , Xiangshan - Decode Unit, Chinese). If we define subtarget features for all of them, that would be a disaster I think. So, I'm working on a TableGen backend that generates predicators for macro fusion (with some refactors of Anyway, this PR looks great to me, and I'm just sharing some information here. (For current implementation, I don't know if we can refer to PPC's, in which there is a script-generated header.) |
602da9a
to
514d689
Compare
Thanks for taking a look. I agree that we shouldn't have too many features defined. I'm happy to rework this in the future if we adopt a new approach. |
return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); | ||
} | ||
|
||
// Fuse SLLI by 32 feeding into SRLI by 32 or less or |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we split this into two separate sentences? The 32 or less or
is kind of hard to parse.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've replaced this with pseudocode.
|
||
// Destination of LUI should be the ADDI(W) source register. | ||
if (SecondMI.getOperand(1).getReg() != FirstDest) | ||
// The first operand of ADDI might be a frame index. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This check was removed previously because it wasn't needed. Was this a bad merge or is it needed now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this code:
$rd0 = lui $imm0
$rd1 = addi $rs0, $imm1
In order for fusion to happen it must be that $rd0 == rd1
and $rd0 == rs0
. Same for other fusions. Both checks are done in checkRegisters
now. I think both checks were needed before two, according to this comment:
// Fuse LUI followed by ADDI or ADDIW.
// rd = imm[31:0] which decomposes to
// lui rd, imm[31:12]
// addi(w) rd, rd, imm[11:0]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was specifically refering to the FrameIndex comment. It was removed by #68701
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we do need the check SecondMI.getOperand(1).isReg()
because later we do SecondMI.getOperand(1).getReg() != FirstDest
- this may crash, right?
I've put that check as a first line of checkRegisters
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As was explained in #68701, if the ADDI operand isn't a register then there can be no data dependency between the LUI and the ADDi so it wouldn't be considered for macrofusion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, sorry I didn't realize that by the time we call shouldScheduleAdjacent
we know that there is a data dependency. Ok, I looked at MacroFusion.cpp : 177
- we only consider pairs of vertices connected by an edge of a schedule dag. But now I am starting to wonder: can we guarantee that an edge always implies data dependency? What if some other mutation creates an artificial edge without data dependency for example?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But now I am starting to wonder: can we guarantee that an edge always implies data dependency? What if some other mutation creates an artificial edge without data dependency for example?
Yes, I have to say that it's possible to have other order dependencies like Artificial
. For example, BaseMemOpClusterMutation::clusterNeighboringMemOps
. Though we may not add artificial edges (not for fusion) between ALU instructions. I haven't run into any problem after remove the isReg()
yet, but I think we may add it back since there are some potential risks of assertion theoretically.
c862441
to
d98d166
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there an optimization guide or other public reference which describes these fusions?
@@ -242,4 +242,5 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", | |||
FeatureStdExtZicbom, | |||
FeatureStdExtZicbop, | |||
FeatureStdExtZicboz, | |||
FeatureVendorXVentanaCondOps]>; | |||
FeatureVendorXVentanaCondOps, | |||
TuneVeyronFusions]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like you're adding this to the feature list, not the tune list. That's probably not what you meant.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done in #70414
@@ -954,6 +964,14 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", | |||
[TuneNoDefaultUnroll, | |||
TuneShortForwardBranchOpt]>; | |||
|
|||
def TuneVeyronFusions : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please separate this out in a separate PR which only adds the proc family and the existing LUI/ADDI fusion. Then this change can extend the list with the new fusions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done: #70414
return false; | ||
|
||
unsigned SRLIImm = SecondMI.getOperand(2).getImm(); | ||
bool IsShiftBy48 = SRLIImm == 48; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems a very oddly restricted fusion. Are you sure that e.g. a shift pair by 37 that clears the top bits isn't fused?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The two shifts by 48 case is the i16 zext pattern for rv64 without Zbb. So it makes some sense.
def TuneAUIPCADDIFusion | ||
: SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion", | ||
"true", "Enable AUIPC+ADDI macrofusion">; | ||
def TuneSLLISRLIFusion |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is pretty generic name given the specific cases that are recognized for fusion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about ShiftedZExtFusion
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok
c4dac98
to
909e139
Compare
Could we add tests for this please? I know we have macro-fusion-lui-addi.ll already though I'm not sure .ll tests are the best for this. PowerPC's macro-fusion.mir test seemed like a nice way to handle it, to my eye at least. |
909e139
to
39198f7
Compare
Yes, good idea. Done. |
b32afa5
to
de5d9dc
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
de5d9dc
to
40c3a0c
Compare
Support was added for the following fusions: auipc-addi, slli-srli, ld-add Some parts of the code became repetative, so small refactoring of existing lui-addi fusion was done.
40c3a0c
to
4ad00f3
Compare
Added pseudocode to explain fusions. Replace the "spacer" `ADDI` in the test with `XORI`- it was confusing in case of LDADD fusion.
4ad00f3
to
c6dff10
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like to see this landed so that there are more use cases for TableGen way. :-)
@@ -0,0 +1,159 @@ | |||
# REQUIRES: asserts |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need end-to-end llc tests like llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
// add rd, rs1, rs2 | ||
// ld rd, 0(rd) | ||
static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { | ||
if (SecondMI.getOpcode() != RISCV::LD) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mgudim Is this fusion really restricted to just 64-bit load?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes.
Support was added for the following fusions:
auipc-addi, slli-srli, ld-add
Some parts of the code became repetative, so small refactoring of existing lui-addi fusion was done.