[RISCV] Implement RVV scheduling model for andes 45 series processor. #167821
Conversation
This patch implements the RVV scheduling model for Andes 45-series processors. Some scheduling categories are left with a TODO mark; we will revise their latency and throughput in the future.
@llvm/pr-subscribers-backend-risc-v

Author: Jim Lin (tclin914)

Changes: This patch implements the RVV scheduling model for Andes 45-series processors. Some scheduling categories are left with a TODO mark; we will revise their latency and throughput in the future.

Patch is 4.56 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167821.diff

18 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
index 8cf15fa26e22d..207a240e5c896 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
@@ -8,7 +8,238 @@
//===----------------------------------------------------------------------===//
-// FIXME: Implement sheduling model for V and other extensions.
+// Refer to `Table 5: Supported VLEN and DLEN` for legal VLEN and DLEN.
+defvar Andes45VLEN = 512;
+defvar Andes45DLEN = 512;
+defvar Andes45VLEN_DLEN_RATIO = !div(Andes45VLEN, Andes45DLEN);
+
+assert !or(!eq(Andes45VLEN_DLEN_RATIO, 1), !eq(Andes45VLEN_DLEN_RATIO, 2)),
+ "Andes45VLEN / Andes45DLEN should be 1 or 2";
+
+// Refer to `Table 6: Supported DLEN and BIU_DATA_WIDTH` for legal BIU_DATA_WIDTH.
+defvar Andes45BIU_DATA_WIDTH = 512;
+defvar Andes45DLEN_BIU_DATA_WIDTH_RATIO = !div(Andes45DLEN, Andes45BIU_DATA_WIDTH);
+
+assert !or(!eq(Andes45DLEN_BIU_DATA_WIDTH_RATIO, 1), !eq(Andes45DLEN_BIU_DATA_WIDTH_RATIO, 2)),
+ "Andes45DLEN / Andes45DLEN_BIU_DATA_WIDTH_RATIO should be 1 or 2";
+
+// HVM region: VLSU_MEM_DW equals DLEN
+// Cachable/Non-cachable region: VLSU_MEM_DW equals BIU_DATA_WIDTH
+defvar Andes45VLSU_MEM_DW = Andes45BIU_DATA_WIDTH;
+defvar Andes45VLEN_VLSU_MEM_DW_RATIO = !div(Andes45VLEN, Andes45VLSU_MEM_DW);
+
+// The latency varies depending on the memory type and status.
+defvar VLSU_MEM_LATENCY = 13;
+
+// The worst case LMUL is the largest LMUL.
+class Andes45IsWorstCaseMX<string mx, list<string> MxList> {
+ defvar LLMUL = LargestLMUL<MxList>.r;
+ bit c = !eq(mx, LLMUL);
+}
+
+// The worst case is the largest LMUL with the smallest SEW.
+class Andes45IsWorstCaseMXSEW<string mx, int sew, list<string> MxList,
+ bit isF = 0> {
+ defvar LLMUL = LargestLMUL<MxList>.r;
+ defvar SSEW = SmallestSEW<mx, isF>.r;
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+// When fractional LMUL is used, the LMUL used in calculation is 1.
+class Andes45GetLMULValue<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : 1,
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ !eq(mx, "MF2") : 1,
+ !eq(mx, "MF4") : 1,
+ !eq(mx, "MF8") : 1
+ );
+}
+
+// (VLEN/DLEN)*LMUL
+// When fractional LMUL is used, the LMUL used in calculation is 1.
+class Andes45GetCyclesDefault<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : !mul(Andes45VLEN_DLEN_RATIO, 1),
+ !eq(mx, "M2") : !mul(Andes45VLEN_DLEN_RATIO, 2),
+ !eq(mx, "M4") : !mul(Andes45VLEN_DLEN_RATIO, 4),
+ !eq(mx, "M8") : !mul(Andes45VLEN_DLEN_RATIO, 8),
+ !eq(mx, "MF2") : !mul(Andes45VLEN_DLEN_RATIO, 1),
+ !eq(mx, "MF4") : !mul(Andes45VLEN_DLEN_RATIO, 1),
+ !eq(mx, "MF8") : !mul(Andes45VLEN_DLEN_RATIO, 1)
+ );
+}
+
+// (VLEN/DLEN)*LMUL*2, if LMUL >= 1,
+// (VLEN != DLEN) ? 4 : 1, if LMUL < 1.
+class Andes45GetCyclesWidening<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : !mul(Andes45VLEN_DLEN_RATIO, 2),
+ !eq(mx, "M2") : !mul(Andes45VLEN_DLEN_RATIO, 4),
+ !eq(mx, "M4") : !mul(Andes45VLEN_DLEN_RATIO, 8),
+ // FIXME: .v* and .w* are different if LMUL < 1.
+ !eq(mx, "MF2") : !if(!ne(Andes45VLEN, Andes45DLEN), 4, 1),
+ !eq(mx, "MF4") : !if(!ne(Andes45VLEN, Andes45DLEN), 4, 1),
+ !eq(mx, "MF8") : !if(!ne(Andes45VLEN, Andes45DLEN), 4, 1),
+ );
+}
+
+// (VLEN/DLEN)*LMUL*2, if LMUL >= 1,
+// (VLEN != DLEN) ? 4 : 1, if LMUL < 1.
+class Andes45GetCyclesNarrowing<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : !mul(Andes45VLEN_DLEN_RATIO, 2),
+ !eq(mx, "M2") : !mul(Andes45VLEN_DLEN_RATIO, 4),
+ !eq(mx, "M4") : !mul(Andes45VLEN_DLEN_RATIO, 8),
+ !eq(mx, "MF2") : !if(!ne(Andes45VLEN, Andes45DLEN), 4, 1),
+ !eq(mx, "MF4") : !if(!ne(Andes45VLEN, Andes45DLEN), 4, 1),
+ !eq(mx, "MF8") : !if(!ne(Andes45VLEN, Andes45DLEN), 4, 1)
+ );
+}
+
+// 3, if LMUL >= 1,
+// (VLEN != DLEN) ? 3 : 2, if LMUL < 1.
+class Andes45GetLatencyNarrowing<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : 3,
+ !eq(mx, "M2") : 3,
+ !eq(mx, "M4") : 3,
+ !eq(mx, "MF2") : !if(!ne(Andes45VLEN, Andes45DLEN), 3, 2),
+ !eq(mx, "MF4") : !if(!ne(Andes45VLEN, Andes45DLEN), 3, 2),
+ !eq(mx, "MF8") : !if(!ne(Andes45VLEN, Andes45DLEN), 3, 2)
+ );
+}
+
+// (VLEN/VLSU_MEM_DW)*EMUL
+class Andes45GetCyclesLoadStore<string mx> {
+ int c = !cond(
+ !eq(mx, "M1") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 1),
+ !eq(mx, "M2") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 2),
+ !eq(mx, "M4") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 4),
+ !eq(mx, "M8") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 8),
+ !eq(mx, "MF2") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 1),
+ !eq(mx, "MF4") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 1),
+ !eq(mx, "MF8") : !mul(Andes45VLEN_VLSU_MEM_DW_RATIO, 1)
+ );
+}
+
+class Andes45GetCyclesOnePerElement<string mx, int sew> {
+ int VL = !div(Andes45VLEN, sew);
+ int c = !cond(
+ !eq(mx, "M1") : VL,
+ !eq(mx, "M2") : !mul(VL, 2),
+ !eq(mx, "M4") : !mul(VL, 4),
+ !eq(mx, "M8") : !mul(VL, 8),
+ !eq(mx, "MF2") : !div(VL, 2),
+ !eq(mx, "MF4") : !div(VL, 4),
+ !eq(mx, "MF8") : !div(VL, 8)
+ );
+}
+
+class Andes45GetLatecyDiv<int sew> {
+ int c = !cond(
+ !eq(sew, 8) : 12,
+ !eq(sew, 16) : 20,
+ !eq(sew, 32) : 36,
+ !eq(sew, 64) : 68
+ );
+}
+
+// (VLEN/DLEN)*LMUL*SEW+(VLEN/DLEN)*LMUL*2+1
+// = (VLEN/DLEN)*LMUL*(SEW+2)+1
+class Andes45GetCyclesDiv<string mx, int sew> {
+ int a = !mul(Andes45VLEN_DLEN_RATIO, !add(sew, 2));
+ int b = !cond(
+ !eq(mx, "M1") : !mul(a, 1),
+ !eq(mx, "M2") : !mul(a, 2),
+ !eq(mx, "M4") : !mul(a, 4),
+ !eq(mx, "M8") : !mul(a, 8),
+ !eq(mx, "MF2") : !mul(a, 1),
+ !eq(mx, "MF4") : !mul(a, 1),
+ !eq(mx, "MF8") : !mul(a, 1)
+ );
+
+ int c = !add(b, 1);
+}
+
+class Andes45GetFDivFactor<int sew> {
+ int c = !cond(
+ !eq(sew, 16) : 22,
+ !eq(sew, 32) : 36,
+ !eq(sew, 64) : 64
+ );
+}
+
+class Andes45GetFSqrtFactor<int sew> {
+ int c = !cond(
+ !eq(sew, 16) : 20,
+ !eq(sew, 32) : 34,
+ !eq(sew, 64) : 62
+ );
+}
+
+// (VLEN/DLEN)*LMUL+LOG2(DLEN/64)*2+LOG2(64/SEW)
+class Andes45GetReductionCycles<string mx, int sew> {
+ int d = Andes45GetCyclesDefault<mx>.c;
+ int c = !add(d,
+ !add(!mul(!logtwo(!div(Andes45DLEN, 64)), 2),
+ !logtwo(!div(64, sew))));
+}
+
+// (VLEN/DLEN)*LMUL*2+LOG2(DLEN/64)*2+LOG2(64/2/SEW)
+class Andes45GetReductionCyclesWidening<string mx, int sew> {
+ int w = !mul(Andes45GetCyclesDefault<mx>.c, 2);
+ int c = !add(w,
+ !add(!mul(!logtwo(!div(Andes45DLEN, 64)), 2),
+ !logtwo(!div(64, sew))));
+}
+
+// (VLEN/DLEN)*LMUL+LOG2(DLEN/SEW)
+class Andes45GetFReductionCycles<string mx, int sew> {
+ int d = Andes45GetCyclesDefault<mx>.c;
+ int c = !add(d, !logtwo(!div(Andes45DLEN, sew)));
+}
+
+// (VLEN/DLEN)*LMUL*2+LOG2(DLEN/SEW)-1
+class Andes45GetFWReductionCycles<string mx, int sew> {
+ int a = !mul(Andes45GetCyclesDefault<mx>.c, 2);
+ int b = !add(a, !logtwo(!div(Andes45DLEN, sew)));
+ int c = !sub(b, 1);
+}
+
+// (VLEN*LMUL)/SEW
+class Andes45GetOrderedFReductionCycles<string mx, int sew> {
+ int b = !cond(
+ !eq(mx, "M1") : !mul(Andes45VLEN, 1),
+ !eq(mx, "M2") : !mul(Andes45VLEN, 2),
+ !eq(mx, "M4") : !mul(Andes45VLEN, 4),
+ !eq(mx, "M8") : !mul(Andes45VLEN, 8),
+ !eq(mx, "MF2") : !mul(Andes45VLEN, 1),
+ !eq(mx, "MF4") : !mul(Andes45VLEN, 1),
+ !eq(mx, "MF8") : !mul(Andes45VLEN, 1)
+ );
+
+ int c = !div(b, sew);
+}
+
+// (VLEN*LMUL)/SEW
+class Andes45GetOrderedFWReductionCycles<string mx, int sew> {
+ int b = !cond(
+ !eq(mx, "M1") : !mul(Andes45VLEN, 1),
+ !eq(mx, "M2") : !mul(Andes45VLEN, 2),
+ !eq(mx, "M4") : !mul(Andes45VLEN, 4),
+ !eq(mx, "M8") : !mul(Andes45VLEN, 8),
+ !eq(mx, "MF2") : !mul(Andes45VLEN, 1),
+ !eq(mx, "MF4") : !mul(Andes45VLEN, 1),
+ !eq(mx, "MF8") : !mul(Andes45VLEN, 1)
+ );
+
+ int c = !div(b, sew);
+}
+
def Andes45Model : SchedMachineModel {
let MicroOpBufferSize = 0; // Andes45 is in-order processor
let IssueWidth = 2; // 2 micro-ops dispatched per cycle
@@ -32,6 +263,15 @@ let SchedModel = Andes45Model in {
// - Floating Point Divide / SQRT Unit (FDIV)
// - Floating Point Move Unit (FMV)
// - Floating Point Misc Unit (FMISC)
+//
+// Andes 45 series VPU
+// - Vector Arithmetic and Logical Unit (VALU)
+// - Vector Multiply Accumulate Unit (VMAC)
+// - Vector Divide Unit (VDIV)
+// - Vector Permutation Unit (VPERMUT)
+// - Vector Mask Unit (VMASK)
+// - Vector Floating-Point Miscellaneous Unit (VFMIS)
+// - Vector Floating-Point Divide Unit (VFDIV)
//===----------------------------------------------------------------------===//
let BufferSize = 0 in {
@@ -44,6 +284,24 @@ def Andes45FMAC : ProcResource<1>;
def Andes45FDIV : ProcResource<1>;
def Andes45FMV : ProcResource<1>;
def Andes45FMISC : ProcResource<1>;
+
+def Andes45VALU : ProcResource<1>;
+def Andes45VMAC : ProcResource<1>;
+def Andes45VFMIS : ProcResource<1>;
+def Andes45VPERMUT : ProcResource<1>;
+def Andes45VDIV : ProcResource<1>;
+def Andes45VFDIV : ProcResource<1>;
+def Andes45VMASK : ProcResource<1>;
+def Andes45VLSU : ProcResource<1>;
+
+def Andes45VPU : ProcResGroup<[Andes45VALU,
+ Andes45VMAC,
+ Andes45VFMIS,
+ Andes45VPERMUT,
+ Andes45VDIV,
+ Andes45VFDIV,
+ Andes45VMASK,
+ Andes45VLSU]>;
}
// Integer arithmetic and logic
@@ -333,10 +591,843 @@ def : ReadAdvance<ReadSingleBit, 0>;
def : ReadAdvance<ReadSingleBitImm, 0>;
def : ReadAdvance<ReadCSR, 0>;
+// RVV Scheduling
+
+// 6. Configuration-Setting Instructions
+def : WriteRes<WriteVSETVLI, [Andes45CSR]>;
+def : WriteRes<WriteVSETIVLI, [Andes45CSR]>;
+def : WriteRes<WriteVSETVL, [Andes45CSR]>;
+
+// 7. Vector Loads and Stores
+
+// Unit-stride loads and stores.
+
+// The latency for loads is (4+VLSU_MEM_LATENCY)
+// The throughput for loads and stores is (VLEN/VLSU_MEM_DW)*EMUL.
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesLoadStore<mx>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(4, VLSU_MEM_LATENCY), ReleaseAtCycles = [Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVLDE", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDFF", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDM", [Andes45VLSU], mx, IsWorstCase>;
+ }
+ let Latency = 1, ReleaseAtCycles = [Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVSTE", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTM", [Andes45VLSU], mx, IsWorstCase>;
+ }
+}
+
+// Strided loads and stores.
+
+// Strided loads and stores operate at one element per cycle.
+// We use the SEW to compute the number of elements for throughput.
+// The latency for loads is (4+VLSU_MEM_LATENCY+(DLEN/EEW)).
+// The throughput for loads and stores is (VLEN/SEW).
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 8>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(4, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 8))),
+ ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVLDS8", [Andes45VLSU], mx, IsWorstCase>;
+ let Latency = 1, ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVSTS8", [Andes45VLSU], mx, IsWorstCase>;
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 16>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(4, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 16))),
+ ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVLDS16", [Andes45VLSU], mx, IsWorstCase>;
+ let Latency = 1, ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVSTS16", [Andes45VLSU], mx, IsWorstCase>;
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 32>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(4, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 32))),
+ ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVLDS32", [Andes45VLSU], mx, IsWorstCase>;
+ let Latency = 1, ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVSTS32", [Andes45VLSU], mx, IsWorstCase>;
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 64>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(4, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 64))),
+ ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVLDS64", [Andes45VLSU], mx, IsWorstCase>;
+ let Latency = 1, ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVSTS64", [Andes45VLSU], mx, IsWorstCase>;
+}
+
+// Indexed loads and stores
+
+// Indexed loads and stores operate at one element per cycle.
+// We use the SEW to compute the number of elements for throughput.
+// The latency for loads is (5+VLSU_MEM_LATENCY+(DLEN/EEW)).
+// The throughput for loads and stores is (VL+EMUL-1).
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 8>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(5, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 8))),
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVLDUX8", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX8", [Andes45VLSU], mx, IsWorstCase>;
+ }
+ let Latency = 1,
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVSTUX8", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX8", [Andes45VLSU], mx, IsWorstCase>;
+ }
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 16>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(5, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 16))),
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVLDUX16", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX16", [Andes45VLSU], mx, IsWorstCase>;
+ }
+ let Latency = 1,
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVSTUX16", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX16", [Andes45VLSU], mx, IsWorstCase>;
+ }
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 32>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(5, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 32))),
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVLDUX32", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX32", [Andes45VLSU], mx, IsWorstCase>;
+ }
+ let Latency = 1,
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVSTUX32", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX32", [Andes45VLSU], mx, IsWorstCase>;
+ }
+}
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, 64>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(5, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, 64))),
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVLDUX64", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX64", [Andes45VLSU], mx, IsWorstCase>;
+ }
+ let Latency = 1,
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVSTUX64", [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX64", [Andes45VLSU], mx, IsWorstCase>;
+ }
+}
+
+// TODO: Confirm the throughput and latency for whole-register
+// loads and stores.
+// VLD*R is LMUL aware
+let Latency = 6, ReleaseAtCycles = [2] in
+ def : WriteRes<WriteVLD1R, [Andes45VLSU]>;
+let Latency = 6, ReleaseAtCycles = [4] in
+ def : WriteRes<WriteVLD2R, [Andes45VLSU]>;
+let Latency = 6, ReleaseAtCycles = [8] in
+ def : WriteRes<WriteVLD4R, [Andes45VLSU]>;
+let Latency = 6, ReleaseAtCycles = [16] in
+ def : WriteRes<WriteVLD8R, [Andes45VLSU]>;
+// VST*R is LMUL aware
+let Latency = 1, ReleaseAtCycles = [2] in
+ def : WriteRes<WriteVST1R, [Andes45VLSU]>;
+let Latency = 1, ReleaseAtCycles = [4] in
+ def : WriteRes<WriteVST2R, [Andes45VLSU]>;
+let Latency = 1, ReleaseAtCycles = [8] in
+ def : WriteRes<WriteVST4R, [Andes45VLSU]>;
+let Latency = 1, ReleaseAtCycles = [16] in
+ def : WriteRes<WriteVST8R, [Andes45VLSU]>;
+
+// Unit-Stride Segmented Loads and Stores
+
+// The latency for loads is (4+VLSU_MEM_LATENCY+EMUL*NFIELDS+2).
+// The throughput for loads and stores is (VLEN/VLSU_MEM_DW)*EMUL*NFIELDS.
+foreach mx = SchedMxList in {
+ foreach eew = [8, 16, 32, 64] in {
+ defvar Cycles = Andes45GetCyclesLoadStore<mx>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ foreach nf=2-8 in {
+ defvar Size = !mul(Andes45GetLMULValue<mx>.c, nf);
+ let Latency = !add(4, !add(VLSU_MEM_LATENCY, !add(Size, 2))),
+ ReleaseAtCycles = [!mul(Cycles, nf)] in {
+ defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ }
+ // TODO
+ let Latency = 1, ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ }
+ }
+}
+
+// Strided Segmented Loads and Stores
+
+// The latency for loads is (5+VLSU_MEM_LATENCY+(DLEN/EEW))
+// The throughput for loads and stores is VL.
+foreach mx = SchedMxList in {
+ foreach nf=2-8 in {
+ foreach eew = [8, 16, 32, 64] in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, eew>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(5, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, eew))),
+ ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ let Latency = 1, ReleaseAtCycles = [Cycles] in
+ defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ }
+ }
+}
+
+// Indexed Segmented Loads and Stores
+
+// The latency for loads is (6+VLSU_MEM_LATENCY+(DLEN/EEW))
+// The throughput for loads and stores is (VL+EMUL-1).
+foreach mx = SchedMxList in {
+ foreach nf=2-8 in {
+ foreach eew = [8, 16, 32, 64] in {
+ defvar Cycles = Andes45GetCyclesOnePerElement<mx, eew>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = !add(6, !add(VLSU_MEM_LATENCY, !div(Andes45DLEN, eew))),
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ }
+ let Latency = 1,
+ ReleaseAtCycles = [!add(Cycles, !sub(Andes45GetLMULValue<mx>.c, 1))] in {
+ defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew,
+ [Andes45VLSU], mx, IsWorstCase>;
+ }
+ }
+ }
+}
+
+// 11. Vector Integer Arithmetic Instructions
+foreach mx = SchedMxList in {
+ defvar Cycles = Andes45GetCyclesDefault<mx>.c;
+ defvar IsWorstCase = Andes45IsWorstCaseMX<mx, SchedMxList>...
[truncated]
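A minimal sketch (not part of the patch) of how the cycle formulas above evaluate for the default VLEN=DLEN=512 configuration. The file name and record names are hypothetical; dumping the file with llvm-tblgen prints the computed fields.

// sched-cycles-check.td (hypothetical): reproduces the cycle arithmetic
// from RISCVSchedAndes45.td under the assumption VLEN = DLEN = 512.
defvar VLEN = 512;
defvar DLEN = 512;
defvar RATIO = !div(VLEN, DLEN);   // 1 in the default configuration

// (VLEN/DLEN)*LMUL, with fractional LMUL clamped to 1,
// mirroring Andes45GetCyclesDefault. lmul = 0 denotes a fractional LMUL.
class CyclesDefault<int lmul> {
  int c = !mul(RATIO, !if(!ge(lmul, 1), lmul, 1));
}

// (VLEN/DLEN)*LMUL+LOG2(DLEN/64)*2+LOG2(64/SEW),
// mirroring Andes45GetReductionCycles.
class ReductionCycles<int lmul, int sew> {
  int c = !add(CyclesDefault<lmul>.c,
               !add(!mul(!logtwo(!div(DLEN, 64)), 2),
                    !logtwo(!div(64, sew))));
}

// Worst case (M8, SEW=8): 8 + 3*2 + 3 = 17 cycles.
def RedM8E8 { int cycles = ReductionCycles<8, 8>.c; }
// Fractional LMUL clamps to 1 (MF2, SEW=32): 1 + 3*2 + 1 = 8 cycles.
def RedMF2E32 { int cycles = ReductionCycles<0, 32>.c; }

With VLEN = DLEN the ratio term is 1, which is what the substitution question below is about.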
I assume you might share this code among many scheduling models in your codebase, where Andes45VLEN and Andes45DLEN might be factored out with different values. But in this case they're always the same; could we just substitute this expression with 1?
We would like to keep Andes45VLEN and Andes45DLEN factored out, because their values may change in the future. The VLEN and DLEN of Andes45 are actually configurable. We chose the most common configuration, VLEN=512 and DLEN=512, as our default. If Andes45VLEN and Andes45DLEN are not factored out, it will be difficult to locate all the places that need to be updated.
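For illustration, a hypothetical narrower-datapath build (the ratio-2 case the assert in the patch permits) would touch only the two defvars, and every count derived from Andes45VLEN_DLEN_RATIO updates automatically:

// Hypothetical alternate configuration; not proposed in this patch.
defvar Andes45VLEN = 512;
defvar Andes45DLEN = 256;  // ratio becomes 2: e.g. M1 default cycles go
                           // from 1 to 2, M8 from 8 to 16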
ditto
Just want to make sure it's intended that rthroughput will be (much) larger than latency in some cases. Though latency is usually larger than rthroughput, the opposite is also supported in both MachineScheduler and MCA -- just want to double-check here.
Yes, I've confirmed against our spec that there really are some cases where rthroughput is greater than latency.
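The whole-register loads in this patch are a concrete case: the single VLSU resource stays occupied well past the result latency, so the modeled reciprocal throughput exceeds the latency.

// From the patch: VLD8R produces its result after 6 cycles but holds the
// VLSU for 16 cycles, i.e. rthroughput 16 > latency 6.
let Latency = 6, ReleaseAtCycles = [16] in
  def : WriteRes<WriteVLD8R, [Andes45VLSU]>;

llvm-mca will report such instructions as resource-pressure bound rather than latency bound.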