@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
24
24
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
25
25
}
26
26
27
+ defvar SMX60VLEN = 256;
28
+ defvar SMX60DLEN = !div(SMX60VLEN, 2);
29
+
30
+ class Get1248Latency<string mx> {
31
+ int c = !cond(
32
+ !eq(mx, "M2") : 2,
33
+ !eq(mx, "M4") : 4,
34
+ !eq(mx, "M8") : 8,
35
+ true: 1
36
+ );
37
+ }
38
+
39
+ // Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
40
+ class Get4816Latency<string mx> {
41
+ int c = !cond(
42
+ !eq(mx, "M4") : 8,
43
+ !eq(mx, "M8") : 16,
44
+ true: 4
45
+ );
46
+ }
47
+
48
+ // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
49
+ class Get458Latency<string mx> {
50
+ int c = !cond(
51
+ !eq(mx, "M4") : 5,
52
+ !eq(mx, "M8") : 8,
53
+ true: 4
54
+ );
55
+ }
56
+
57
+ // Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
58
+ // Used for: widening operations
59
+ class Get4588Latency<string mx> {
60
+ int c = !cond(
61
+ !eq(mx, "M2") : 5,
62
+ !eq(mx, "M4") : 8,
63
+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
64
+ true: 4
65
+ );
66
+ }
67
+
68
+ // Used for: mask-producing comparisons, carry ops with mask, FP comparisons
69
+ class Get461018Latency<string mx> {
70
+ int c = !cond(
71
+ !eq(mx, "M2") : 6,
72
+ !eq(mx, "M4") : 10,
73
+ !eq(mx, "M8") : 18,
74
+ true: 4
75
+ );
76
+ }
77
+
78
+ // Used for: e64 multiply pattern, complex ops
79
+ class Get781632Latency<string mx> {
80
+ int c = !cond(
81
+ !eq(mx, "M2") : 8,
82
+ !eq(mx, "M4") : 16,
83
+ !eq(mx, "M8") : 32,
84
+ true: 7
85
+ );
86
+ }
87
+
27
88
def SpacemitX60Model : SchedMachineModel {
28
89
let IssueWidth = 2; // dual-issue
29
90
let MicroOpBufferSize = 0; // in-order
@@ -322,71 +383,118 @@ foreach LMul = [1, 2, 4, 8] in {
322
383
foreach mx = SchedMxList in {
323
384
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
324
385
325
- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
326
- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
327
- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
328
- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
329
- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
330
- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
331
- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
332
- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
333
- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
334
- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
335
- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
336
- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
337
- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
338
- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
339
- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
340
- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
341
- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
342
- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
343
- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
344
- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
345
- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
346
-
347
- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
348
- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
349
- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
350
-
351
- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
352
- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
353
- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
354
- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
386
+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
387
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
388
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
389
+ }
390
+
391
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
392
+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
393
+ // Pattern of vand, vor, vxor: 4/4/8/16
394
+ // They are grouped together, so we use the worst case: 4/4/8/16
395
+ // TODO: use InstRW to override individual instructions' scheduling data
396
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
397
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
398
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
399
+
400
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
401
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
402
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
403
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
404
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
405
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
406
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
407
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
408
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
409
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
410
+
411
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
412
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
413
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
414
+ }
415
+
416
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
417
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
418
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
419
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
420
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
421
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
422
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
423
+ }
424
+
425
+ // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
426
+ // e64 = 7/8/16/32. We use the worst case until we can split by SEW.
427
+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
428
+ let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
429
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
430
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
431
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
432
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
433
+ }
355
434
}
356
435
357
436
// Widening
437
+ // Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8
438
+ // We use the worst-case for all.
358
439
foreach mx = SchedMxListW in {
359
440
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
360
441
361
- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
362
- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
363
- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
364
- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
365
- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
366
- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
367
- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
442
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
443
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
444
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
445
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
446
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
447
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
448
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
449
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
450
+ }
368
451
}
369
452
370
- // Vector Integer Division and Remainder
453
+ // Division and remainder operations
454
+ // Pattern of vdivu: 11/11/11/20/40/80/160
455
+ // Pattern of vdiv: 12/12/12/22/44/88/176
456
+ // Pattern of vremu: 12/12/12/22/44/88/176
457
+ // Pattern of vrem: 13/13/13/24/48/96/192
458
+ // We use for all: 12/12/12/24/48/96/192
459
+ // TODO: Create separate WriteVIRem to more closely match the latencies
371
460
foreach mx = SchedMxList in {
372
461
foreach sew = SchedSEWSet<mx>.val in {
373
462
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
374
463
375
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
376
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
464
+ // Base latency: 12 for fractional LMULs, 24 otherwise (scaled per LMUL below)
465
+ defvar Multiplier = !cond(
466
+ !eq(mx, "MF8") : 12,
467
+ !eq(mx, "MF4") : 12,
468
+ !eq(mx, "MF2") : 12,
469
+ true: 24
470
+ );
471
+
472
+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
473
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
474
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
475
+ }
377
476
}
378
477
}
379
478
380
479
// Narrowing Shift and Clips
381
480
foreach mx = SchedMxListW in {
382
481
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
383
482
384
- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
385
- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
386
- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
387
- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
388
- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
389
- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
483
+ // Latency is doubled for M2 and M4; all other LMULs use the base value
484
+ defvar Multiplier = !cond(
485
+ !eq(mx, "M2") : 2,
486
+ !eq(mx, "M4") : 2,
487
+ true: 1
488
+ );
489
+
490
+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
491
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
492
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
493
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
494
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
495
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
496
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
497
+ }
390
498
}
391
499
392
500
// 12. Vector Fixed-Point Arithmetic Instructions
0 commit comments