Skip to content

Commit 8952225

Browse files
[RISC-V] Update SpacemiT-X60 Vector Integer latencies (#149207)
This PR adds hardware-measured latencies for all instructions defined in Section 11 of the RVV specification: "Vector Integer Arithmetic Instructions" to the SpacemiT-X60 scheduling model. The code in this PR was extracted from PR #144564, so it's smaller to review. I made a few adjustments here and there, and the code is almost identical; the only change was to add ReleaseAtCycles to all instructions modified in this patch, except for the vmul, vdiv, and vrem ones.
1 parent becde6d commit 8952225

File tree

9 files changed

+5816
-5708
lines changed

9 files changed

+5816
-5708
lines changed

llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td

Lines changed: 154 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
2424
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2525
}
2626

27+
// NOTE(review): presumably the vector register length (VLEN) in bits -- confirm.
defvar SMX60VLEN = 256;
28+
// Half of SMX60VLEN, i.e. 128.
// NOTE(review): presumably the vector datapath width (DLEN) -- confirm.
defvar SMX60DLEN = !div(SMX60VLEN, 2);
29+
30+
// Maps an LMUL name to a value `c` of 2, 4 or 8 for "M2", "M4" or "M8".
// Every other mx value falls through to the default of 1.
// Used further down as the base factor of the integer divide/remainder latency.
class Get1248Latency<string mx> {
31+
int c = !cond(
32+
!eq(mx, "M2") : 2,
33+
!eq(mx, "M4") : 4,
34+
!eq(mx, "M8") : 8,
35+
true: 1
36+
);
37+
}
38+
39+
// Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
40+
// Yields `c` = 8 for "M4" and 16 for "M8"; every other mx value yields 4.
// Note that "M2" is not listed, so it also takes the default of 4.
class Get4816Latency<string mx> {
41+
int c = !cond(
42+
!eq(mx, "M4") : 8,
43+
!eq(mx, "M8") : 16,
44+
true: 4
45+
);
46+
}
47+
48+
// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
49+
// Yields `c` = 5 for "M4" and 8 for "M8"; every other mx value yields 4.
// Note that "M2" is not listed, so it also takes the default of 4.
class Get458Latency<string mx> {
50+
int c = !cond(
51+
!eq(mx, "M4") : 5,
52+
!eq(mx, "M8") : 8,
53+
true: 4
54+
);
55+
}
56+
57+
// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
58+
// Used for: widening operations
59+
// Yields `c` = 5 for "M2" and 8 for both "M4" and "M8"; every other mx value
// yields 4. The latency therefore stops growing after "M4".
class Get4588Latency<string mx> {
60+
int c = !cond(
61+
!eq(mx, "M2") : 5,
62+
!eq(mx, "M4") : 8,
63+
!eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
64+
true: 4
65+
);
66+
}
67+
68+
// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
69+
// Yields `c` = 6, 10 or 18 for "M2", "M4" or "M8"; every other mx value
// yields 4.
class Get461018Latency<string mx> {
70+
int c = !cond(
71+
!eq(mx, "M2") : 6,
72+
!eq(mx, "M4") : 10,
73+
!eq(mx, "M8") : 18,
74+
true: 4
75+
);
76+
}
77+
78+
// Used for: e64 multiply pattern, complex ops
79+
// Yields `c` = 8, 16 or 32 for "M2", "M4" or "M8"; every other mx value
// yields 7.
class Get781632Latency<string mx> {
80+
int c = !cond(
81+
!eq(mx, "M2") : 8,
82+
!eq(mx, "M4") : 16,
83+
!eq(mx, "M8") : 32,
84+
true: 7
85+
);
86+
}
87+
2788
def SpacemitX60Model : SchedMachineModel {
2889
let IssueWidth = 2; // dual-issue
2990
let MicroOpBufferSize = 0; // in-order
@@ -322,71 +383,118 @@ foreach LMul = [1, 2, 4, 8] in {
322383
foreach mx = SchedMxList in {
323384
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
324385

325-
defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
326-
defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
327-
defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
328-
defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
329-
defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
330-
defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
331-
defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
332-
defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
333-
defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
334-
defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
335-
defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
336-
defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
337-
defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
338-
defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
339-
defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
340-
defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
341-
defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
342-
defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
343-
defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
344-
defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
345-
defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
346-
347-
defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
348-
defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
349-
defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
350-
351-
defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
352-
defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
353-
defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
354-
defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
386+
let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
387+
defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
388+
defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
389+
}
390+
391+
let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
392+
// Pattern of vadd, vsub, vrsub: 4/4/5/8
393+
// Pattern of vand, vor, vxor: 4/4/8/16
394+
// They are grouped together, so we used the worst case 4/4/8/16
395+
// TODO: use InstRW to override individual instructions' scheduling data
396+
defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
397+
defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
398+
defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
399+
400+
defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
401+
defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
402+
defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
403+
defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
404+
defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
405+
defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
406+
defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
407+
defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
408+
defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
409+
defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
410+
411+
defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
412+
defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
413+
defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
414+
}
415+
416+
let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
417+
defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
418+
defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
419+
defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
420+
defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
421+
defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
422+
defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
423+
}
424+
425+
// Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
426+
// e64 = 7/8/16/32. We use the worst case until we can split by SEW.
427+
// TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
428+
let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
429+
defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
430+
defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
431+
defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
432+
defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
433+
}
355434
}
356435

357436
// Widening
437+
// Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8
438+
// We use the worst-case for all.
358439
foreach mx = SchedMxListW in {
359440
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
360441

361-
defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
362-
defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
363-
defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
364-
defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
365-
defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
366-
defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
367-
defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
442+
let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
443+
defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
444+
defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
445+
defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
446+
defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
447+
defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
448+
defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
449+
defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
450+
}
368451
}
369452

370-
// Vector Integer Division and Remainder
453+
// Division and remainder operations
454+
// Pattern of vdivu: 11/11/11/20/40/80/160
455+
// Pattern of vdiv: 12/12/12/22/44/88/176
456+
// Pattern of vremu: 12/12/12/22/44/88/176
457+
// Pattern of vrem: 13/13/13/24/48/96/192
458+
// We use for all: 12/12/12/24/48/96/192
459+
// TODO: Create separate WriteVIRem to more closely match the latencies
371460
foreach mx = SchedMxList in {
372461
foreach sew = SchedSEWSet<mx>.val in {
373462
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
374463

375-
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
376-
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
464+
// Fractional LMULs use half the multiplier of integer LMULs (12 vs. 24)
465+
defvar Multiplier = !cond(
466+
!eq(mx, "MF8") : 12,
467+
!eq(mx, "MF4") : 12,
468+
!eq(mx, "MF2") : 12,
469+
true: 24
470+
);
471+
472+
let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
473+
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
474+
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
475+
}
377476
}
378477
}
379478

380479
// Narrowing Shift and Clips
381480
foreach mx = SchedMxListW in {
382481
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
383482

384-
defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
385-
defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
386-
defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
387-
defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
388-
defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
389-
defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
483+
// Latency is doubled for M2 and M4; all other LMULs keep the base value
484+
defvar Multiplier = !cond(
485+
!eq(mx, "M2") : 2,
486+
!eq(mx, "M4") : 2,
487+
true: 1
488+
);
489+
490+
let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
491+
defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
492+
defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
493+
defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
494+
defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
495+
defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
496+
defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
497+
}
390498
}
391499

392500
// 12. Vector Fixed-Point Arithmetic Instructions

0 commit comments

Comments
 (0)