-
Notifications
You must be signed in to change notification settings - Fork 10.8k
/
ModuloSchedule.cpp
2823 lines (2579 loc) · 107 KB
/
ModuloSchedule.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "pipeliner"
using namespace llvm;
void ModuloSchedule::print(raw_ostream &OS) {
for (MachineInstr *MI : ScheduledInstrs)
OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
}
//===----------------------------------------------------------------------===//
// ModuloScheduleExpander implementation
//===----------------------------------------------------------------------===//
/// Return the register values for the operands of a Phi instruction.
/// This function assume the instruction is a Phi.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
unsigned &InitVal, unsigned &LoopVal) {
assert(Phi.isPHI() && "Expecting a Phi.");
InitVal = 0;
LoopVal = 0;
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() != Loop)
InitVal = Phi.getOperand(i).getReg();
else
LoopVal = Phi.getOperand(i).getReg();
assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
}
/// Return the Phi register value that comes from the incoming block.
static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() != LoopBB)
return Phi.getOperand(i).getReg();
return 0;
}
/// Return the Phi register value that comes the loop block.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() == LoopBB)
return Phi.getOperand(i).getReg();
return 0;
}
void ModuloScheduleExpander::expand() {
BB = Schedule.getLoop()->getTopBlock();
Preheader = *BB->pred_begin();
if (Preheader == BB)
Preheader = *std::next(BB->pred_begin());
// Iterate over the definitions in each instruction, and compute the
// stage difference for each use. Keep the maximum value.
for (MachineInstr *MI : Schedule.getInstructions()) {
int DefStage = Schedule.getStage(MI);
for (const MachineOperand &Op : MI->all_defs()) {
Register Reg = Op.getReg();
unsigned MaxDiff = 0;
bool PhiIsSwapped = false;
for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
MachineInstr *UseMI = UseOp.getParent();
int UseStage = Schedule.getStage(UseMI);
unsigned Diff = 0;
if (UseStage != -1 && UseStage >= DefStage)
Diff = UseStage - DefStage;
if (MI->isPHI()) {
if (isLoopCarried(*MI))
++Diff;
else
PhiIsSwapped = true;
}
MaxDiff = std::max(Diff, MaxDiff);
}
RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
}
}
generatePipelinedLoop();
}
void ModuloScheduleExpander::generatePipelinedLoop() {
LoopInfo = TII->analyzeLoopForPipelining(BB);
assert(LoopInfo && "Must be able to analyze loop!");
// Create a new basic block for the kernel and add it to the CFG.
MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
unsigned MaxStageCount = Schedule.getNumStages() - 1;
// Remember the registers that are used in different stages. The index is
// the iteration, or stage, that the instruction is scheduled in. This is
// a map between register names in the original block and the names created
// in each stage of the pipelined loop.
ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
// The renaming destination by Phis for the registers across stages.
// This map is updated during Phis generation to point to the most recent
// renaming destination.
ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
InstrMapTy InstrMap;
SmallVector<MachineBasicBlock *, 4> PrologBBs;
// Generate the prolog instructions that set up the pipeline.
generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
MF.insert(BB->getIterator(), KernelBB);
// Rearrange the instructions to generate the new, pipelined loop,
// and update register names as needed.
for (MachineInstr *CI : Schedule.getInstructions()) {
if (CI->isPHI())
continue;
unsigned StageNum = Schedule.getStage(CI);
MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
KernelBB->push_back(NewMI);
InstrMap[NewMI] = CI;
}
// Copy any terminator instructions to the new kernel, and update
// names as needed.
for (MachineInstr &MI : BB->terminators()) {
MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
KernelBB->push_back(NewMI);
InstrMap[NewMI] = &MI;
}
NewKernel = KernelBB;
KernelBB->transferSuccessors(BB);
KernelBB->replaceSuccessor(BB, KernelBB);
generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
InstrMap, MaxStageCount, MaxStageCount, false);
generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
InstrMap, MaxStageCount, MaxStageCount, false);
LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
SmallVector<MachineBasicBlock *, 4> EpilogBBs;
// Generate the epilog instructions to complete the pipeline.
generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
PrologBBs);
// We need this step because the register allocation doesn't handle some
// situations well, so we insert copies to help out.
splitLifetimes(KernelBB, EpilogBBs);
// Remove dead instructions due to loop induction variables.
removeDeadInstructions(KernelBB, EpilogBBs);
// Add branches between prolog and epilog blocks.
addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
delete[] VRMap;
delete[] VRMapPhi;
}
void ModuloScheduleExpander::cleanup() {
// Remove the original loop since it's no longer referenced.
for (auto &I : *BB)
LIS.RemoveMachineInstrFromMaps(I);
BB->clear();
BB->eraseFromParent();
}
/// Generate the pipeline prolog code.
void ModuloScheduleExpander::generateProlog(unsigned LastStage,
MachineBasicBlock *KernelBB,
ValueMapTy *VRMap,
MBBVectorTy &PrologBBs) {
MachineBasicBlock *PredBB = Preheader;
InstrMapTy InstrMap;
// Generate a basic block for each stage, not including the last stage,
// which will be generated in the kernel. Each basic block may contain
// instructions from multiple stages/iterations.
for (unsigned i = 0; i < LastStage; ++i) {
// Create and insert the prolog basic block prior to the original loop
// basic block. The original loop is removed later.
MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
PrologBBs.push_back(NewBB);
MF.insert(BB->getIterator(), NewBB);
NewBB->transferSuccessors(PredBB);
PredBB->addSuccessor(NewBB);
PredBB = NewBB;
// Generate instructions for each appropriate stage. Process instructions
// in original program order.
for (int StageNum = i; StageNum >= 0; --StageNum) {
for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
BBE = BB->getFirstTerminator();
BBI != BBE; ++BBI) {
if (Schedule.getStage(&*BBI) == StageNum) {
if (BBI->isPHI())
continue;
MachineInstr *NewMI =
cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
NewBB->push_back(NewMI);
InstrMap[NewMI] = &*BBI;
}
}
}
rewritePhiValues(NewBB, i, VRMap, InstrMap);
LLVM_DEBUG({
dbgs() << "prolog:\n";
NewBB->dump();
});
}
PredBB->replaceSuccessor(BB, KernelBB);
// Check if we need to remove the branch from the preheader to the original
// loop, and replace it with a branch to the new loop.
unsigned numBranches = TII->removeBranch(*Preheader);
if (numBranches) {
SmallVector<MachineOperand, 0> Cond;
TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
}
}
/// Generate the pipeline epilog code. The epilog code finishes the iterations
/// that were started in either the prolog or the kernel. We create a basic
/// block for each stage that needs to complete.
void ModuloScheduleExpander::generateEpilog(
unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
MBBVectorTy &PrologBBs) {
// We need to change the branch from the kernel to the first epilog block, so
// this call to analyze branch uses the kernel rather than the original BB.
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
assert(!checkBranch && "generateEpilog must be able to analyze the branch");
if (checkBranch)
return;
MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
if (*LoopExitI == KernelBB)
++LoopExitI;
assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
MachineBasicBlock *LoopExitBB = *LoopExitI;
MachineBasicBlock *PredBB = KernelBB;
MachineBasicBlock *EpilogStart = LoopExitBB;
InstrMapTy InstrMap;
// Generate a basic block for each stage, not including the last stage,
// which was generated for the kernel. Each basic block may contain
// instructions from multiple stages/iterations.
int EpilogStage = LastStage + 1;
for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
EpilogBBs.push_back(NewBB);
MF.insert(BB->getIterator(), NewBB);
PredBB->replaceSuccessor(LoopExitBB, NewBB);
NewBB->addSuccessor(LoopExitBB);
if (EpilogStart == LoopExitBB)
EpilogStart = NewBB;
// Add instructions to the epilog depending on the current block.
// Process instructions in original program order.
for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
for (auto &BBI : *BB) {
if (BBI.isPHI())
continue;
MachineInstr *In = &BBI;
if ((unsigned)Schedule.getStage(In) == StageNum) {
// Instructions with memoperands in the epilog are updated with
// conservative values.
MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
NewBB->push_back(NewMI);
InstrMap[NewMI] = In;
}
}
}
generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
InstrMap, LastStage, EpilogStage, i == 1);
generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
InstrMap, LastStage, EpilogStage, i == 1);
PredBB = NewBB;
LLVM_DEBUG({
dbgs() << "epilog:\n";
NewBB->dump();
});
}
// Fix any Phi nodes in the loop exit block.
LoopExitBB->replacePhiUsesWith(BB, PredBB);
// Create a branch to the new epilog from the kernel.
// Remove the original branch and add a new branch to the epilog.
TII->removeBranch(*KernelBB);
assert((OrigBB == TBB || OrigBB == FBB) &&
"Unable to determine looping branch direction");
if (OrigBB != TBB)
TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
else
TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
// Add a branch to the loop exit.
if (EpilogBBs.size() > 0) {
MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
SmallVector<MachineOperand, 4> Cond1;
TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
}
}
/// Replace all uses of FromReg that appear outside the specified
/// basic block with ToReg.
static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
MachineBasicBlock *MBB,
MachineRegisterInfo &MRI,
LiveIntervals &LIS) {
for (MachineOperand &O :
llvm::make_early_inc_range(MRI.use_operands(FromReg)))
if (O.getParent()->getParent() != MBB)
O.setReg(ToReg);
if (!LIS.hasInterval(ToReg))
LIS.createEmptyInterval(ToReg);
}
/// Return true if the register has a use that occurs outside the
/// specified loop.
static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
MachineRegisterInfo &MRI) {
for (const MachineOperand &MO : MRI.use_operands(Reg))
if (MO.getParent()->getParent() != BB)
return true;
return false;
}
/// Generate Phis for the specific block in the generated pipelined code.
/// This function looks at the Phis from the original code to guide the
/// creation of new Phis.
void ModuloScheduleExpander::generateExistingPhis(
MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
// Compute the stage number for the initial value of the Phi, which
// comes from the prolog. The prolog to use depends on to which kernel/
// epilog that we're adding the Phi.
unsigned PrologStage = 0;
unsigned PrevStage = 0;
bool InKernel = (LastStageNum == CurStageNum);
if (InKernel) {
PrologStage = LastStageNum - 1;
PrevStage = CurStageNum;
} else {
PrologStage = LastStageNum - (CurStageNum - LastStageNum);
PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
}
for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
BBE = BB->getFirstNonPHI();
BBI != BBE; ++BBI) {
Register Def = BBI->getOperand(0).getReg();
unsigned InitVal = 0;
unsigned LoopVal = 0;
getPhiRegs(*BBI, BB, InitVal, LoopVal);
unsigned PhiOp1 = 0;
// The Phi value from the loop body typically is defined in the loop, but
// not always. So, we need to check if the value is defined in the loop.
unsigned PhiOp2 = LoopVal;
if (VRMap[LastStageNum].count(LoopVal))
PhiOp2 = VRMap[LastStageNum][LoopVal];
int StageScheduled = Schedule.getStage(&*BBI);
int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
unsigned NumStages = getStagesForReg(Def, CurStageNum);
if (NumStages == 0) {
// We don't need to generate a Phi anymore, but we need to rename any uses
// of the Phi value.
unsigned NewReg = VRMap[PrevStage][LoopVal];
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
InitVal, NewReg);
if (VRMap[CurStageNum].count(LoopVal))
VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
}
// Adjust the number of Phis needed depending on the number of prologs left,
// and the distance from where the Phi is first scheduled. The number of
// Phis cannot exceed the number of prolog stages. Each stage can
// potentially define two values.
unsigned MaxPhis = PrologStage + 2;
if (!InKernel && (int)PrologStage <= LoopValStage)
MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
unsigned NumPhis = std::min(NumStages, MaxPhis);
unsigned NewReg = 0;
unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
// In the epilog, we may need to look back one stage to get the correct
// Phi name, because the epilog and prolog blocks execute the same stage.
// The correct name is from the previous block only when the Phi has
// been completely scheduled prior to the epilog, and Phi value is not
// needed in multiple stages.
int StageDiff = 0;
if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
NumPhis == 1)
StageDiff = 1;
// Adjust the computations below when the phi and the loop definition
// are scheduled in different stages.
if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
StageDiff = StageScheduled - LoopValStage;
for (unsigned np = 0; np < NumPhis; ++np) {
// If the Phi hasn't been scheduled, then use the initial Phi operand
// value. Otherwise, use the scheduled version of the instruction. This
// is a little complicated when a Phi references another Phi.
if (np > PrologStage || StageScheduled >= (int)LastStageNum)
PhiOp1 = InitVal;
// Check if the Phi has already been scheduled in a prolog stage.
else if (PrologStage >= AccessStage + StageDiff + np &&
VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
// Check if the Phi has already been scheduled, but the loop instruction
// is either another Phi, or doesn't occur in the loop.
else if (PrologStage >= AccessStage + StageDiff + np) {
// If the Phi references another Phi, we need to examine the other
// Phi to get the correct value.
PhiOp1 = LoopVal;
MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
int Indirects = 1;
while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
int PhiStage = Schedule.getStage(InstOp1);
if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
PhiOp1 = getInitPhiReg(*InstOp1, BB);
else
PhiOp1 = getLoopPhiReg(*InstOp1, BB);
InstOp1 = MRI.getVRegDef(PhiOp1);
int PhiOpStage = Schedule.getStage(InstOp1);
int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
break;
}
++Indirects;
}
} else
PhiOp1 = InitVal;
// If this references a generated Phi in the kernel, get the Phi operand
// from the incoming block.
if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
// In the epilog, a map lookup is needed to get the value from the kernel,
// or previous epilog block. How is does this depends on if the
// instruction is scheduled in the previous block.
if (!InKernel) {
int StageDiffAdj = 0;
if (LoopValStage != -1 && StageScheduled > LoopValStage)
StageDiffAdj = StageScheduled - LoopValStage;
// Use the loop value defined in the kernel, unless the kernel
// contains the last definition of the Phi.
if (np == 0 && PrevStage == LastStageNum &&
(StageScheduled != 0 || LoopValStage != 0) &&
VRMap[PrevStage - StageDiffAdj].count(LoopVal))
PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
// Use the value defined by the Phi. We add one because we switch
// from looking at the loop value to the Phi definition.
else if (np > 0 && PrevStage == LastStageNum &&
VRMap[PrevStage - np + 1].count(Def))
PhiOp2 = VRMap[PrevStage - np + 1][Def];
// Use the loop value defined in the kernel.
else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
// Use the value defined by the Phi, unless we're generating the first
// epilog and the Phi refers to a Phi in a different stage.
else if (VRMap[PrevStage - np].count(Def) &&
(!LoopDefIsPhi || (PrevStage != LastStageNum) ||
(LoopValStage == StageScheduled)))
PhiOp2 = VRMap[PrevStage - np][Def];
}
// Check if we can reuse an existing Phi. This occurs when a Phi
// references another Phi, and the other Phi is scheduled in an
// earlier stage. We can try to reuse an existing Phi up until the last
// stage of the current Phi.
if (LoopDefIsPhi) {
if (static_cast<int>(PrologStage - np) >= StageScheduled) {
int LVNumStages = getStagesForPhi(LoopVal);
int StageDiff = (StageScheduled - LoopValStage);
LVNumStages -= StageDiff;
// Make sure the loop value Phi has been processed already.
if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
NewReg = PhiOp2;
unsigned ReuseStage = CurStageNum;
if (isLoopCarried(*PhiInst))
ReuseStage -= LVNumStages;
// Check if the Phi to reuse has been generated yet. If not, then
// there is nothing to reuse.
if (VRMap[ReuseStage - np].count(LoopVal)) {
NewReg = VRMap[ReuseStage - np][LoopVal];
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
Def, NewReg);
// Update the map with the new Phi name.
VRMap[CurStageNum - np][Def] = NewReg;
PhiOp2 = NewReg;
if (VRMap[LastStageNum - np - 1].count(LoopVal))
PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
if (IsLast && np == NumPhis - 1)
replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
continue;
}
}
}
if (InKernel && StageDiff > 0 &&
VRMap[CurStageNum - StageDiff - np].count(LoopVal))
PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
}
const TargetRegisterClass *RC = MRI.getRegClass(Def);
NewReg = MRI.createVirtualRegister(RC);
MachineInstrBuilder NewPhi =
BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
TII->get(TargetOpcode::PHI), NewReg);
NewPhi.addReg(PhiOp1).addMBB(BB1);
NewPhi.addReg(PhiOp2).addMBB(BB2);
if (np == 0)
InstrMap[NewPhi] = &*BBI;
// We define the Phis after creating the new pipelined code, so
// we need to rename the Phi values in scheduled instructions.
unsigned PrevReg = 0;
if (InKernel && VRMap[PrevStage - np].count(LoopVal))
PrevReg = VRMap[PrevStage - np][LoopVal];
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
NewReg, PrevReg);
// If the Phi has been scheduled, use the new name for rewriting.
if (VRMap[CurStageNum - np].count(Def)) {
unsigned R = VRMap[CurStageNum - np][Def];
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
NewReg);
}
// Check if we need to rename any uses that occurs after the loop. The
// register to replace depends on whether the Phi is scheduled in the
// epilog.
if (IsLast && np == NumPhis - 1)
replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
// In the kernel, a dependent Phi uses the value from this Phi.
if (InKernel)
PhiOp2 = NewReg;
// Update the map with the new Phi name.
VRMap[CurStageNum - np][Def] = NewReg;
}
while (NumPhis++ < NumStages) {
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
NewReg, 0);
}
// Check if we need to rename a Phi that has been eliminated due to
// scheduling.
if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
}
}
/// Generate Phis for the specified block in the generated pipelined code.
/// These are new Phis needed because the definition is scheduled after the
/// use in the pipelined sequence.
void ModuloScheduleExpander::generatePhis(
MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
bool IsLast) {
// Compute the stage number that contains the initial Phi value, and
// the Phi from the previous stage.
unsigned PrologStage = 0;
unsigned PrevStage = 0;
unsigned StageDiff = CurStageNum - LastStageNum;
bool InKernel = (StageDiff == 0);
if (InKernel) {
PrologStage = LastStageNum - 1;
PrevStage = CurStageNum;
} else {
PrologStage = LastStageNum - StageDiff;
PrevStage = LastStageNum + StageDiff - 1;
}
for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
BBE = BB->instr_end();
BBI != BBE; ++BBI) {
for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = BBI->getOperand(i);
if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
continue;
int StageScheduled = Schedule.getStage(&*BBI);
assert(StageScheduled != -1 && "Expecting scheduled instruction.");
Register Def = MO.getReg();
unsigned NumPhis = getStagesForReg(Def, CurStageNum);
// An instruction scheduled in stage 0 and is used after the loop
// requires a phi in the epilog for the last definition from either
// the kernel or prolog.
if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
hasUseAfterLoop(Def, BB, MRI))
NumPhis = 1;
if (!InKernel && (unsigned)StageScheduled > PrologStage)
continue;
unsigned PhiOp2;
if (InKernel) {
PhiOp2 = VRMap[PrevStage][Def];
if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
}
// The number of Phis can't exceed the number of prolog stages. The
// prolog stage number is zero based.
if (NumPhis > PrologStage + 1 - StageScheduled)
NumPhis = PrologStage + 1 - StageScheduled;
for (unsigned np = 0; np < NumPhis; ++np) {
// Example for
// Org:
// %Org = ... (Scheduled at Stage#0, NumPhi = 2)
//
// Prolog0 (Stage0):
// %Clone0 = ...
// Prolog1 (Stage1):
// %Clone1 = ...
// Kernel (Stage2):
// %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
// %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
// %Clone2 = ...
// Epilog0 (Stage3):
// %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
// %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
// Epilog1 (Stage4):
// %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
//
// VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
// VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
// VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}
unsigned PhiOp1 = VRMap[PrologStage][Def];
if (np <= PrologStage)
PhiOp1 = VRMap[PrologStage - np][Def];
if (!InKernel) {
if (PrevStage == LastStageNum && np == 0)
PhiOp2 = VRMap[LastStageNum][Def];
else
PhiOp2 = VRMapPhi[PrevStage - np][Def];
}
const TargetRegisterClass *RC = MRI.getRegClass(Def);
Register NewReg = MRI.createVirtualRegister(RC);
MachineInstrBuilder NewPhi =
BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
TII->get(TargetOpcode::PHI), NewReg);
NewPhi.addReg(PhiOp1).addMBB(BB1);
NewPhi.addReg(PhiOp2).addMBB(BB2);
if (np == 0)
InstrMap[NewPhi] = &*BBI;
// Rewrite uses and update the map. The actions depend upon whether
// we generating code for the kernel or epilog blocks.
if (InKernel) {
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
NewReg);
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
NewReg);
PhiOp2 = NewReg;
VRMapPhi[PrevStage - np - 1][Def] = NewReg;
} else {
VRMapPhi[CurStageNum - np][Def] = NewReg;
if (np == NumPhis - 1)
rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
NewReg);
}
if (IsLast && np == NumPhis - 1)
replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
}
}
}
}
/// Remove instructions that generate values with no uses.
/// Typically, these are induction variable operations that generate values
/// used in the loop itself. A dead instruction has a definition with
/// no uses, or uses that occur in the original loop only.
void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
MBBVectorTy &EpilogBBs) {
// For each epilog block, check that the value defined by each instruction
// is used. If not, delete it.
for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
ME = MBB->instr_rend();
MI != ME;) {
// From DeadMachineInstructionElem. Don't delete inline assembly.
if (MI->isInlineAsm()) {
++MI;
continue;
}
bool SawStore = false;
// Check if it's safe to remove the instruction due to side effects.
// We can, and want to, remove Phis here.
if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
++MI;
continue;
}
bool used = true;
for (const MachineOperand &MO : MI->all_defs()) {
Register reg = MO.getReg();
// Assume physical registers are used, unless they are marked dead.
if (reg.isPhysical()) {
used = !MO.isDead();
if (used)
break;
continue;
}
unsigned realUses = 0;
for (const MachineOperand &U : MRI.use_operands(reg)) {
// Check if there are any uses that occur only in the original
// loop. If so, that's not a real use.
if (U.getParent()->getParent() != BB) {
realUses++;
used = true;
break;
}
}
if (realUses > 0)
break;
used = false;
}
if (!used) {
LIS.RemoveMachineInstrFromMaps(*MI);
MI++->eraseFromParent();
continue;
}
++MI;
}
// In the kernel block, check if we can remove a Phi that generates a value
// used in an instruction removed in the epilog block.
for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
Register reg = MI.getOperand(0).getReg();
if (MRI.use_begin(reg) == MRI.use_end()) {
LIS.RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
}
}
}
/// For loop carried definitions, we split the lifetime of a virtual register
/// that has uses past the definition in the next iteration. A copy with a new
/// virtual register is inserted before the definition, which helps with
/// generating a better register assignment.
///
/// v1 = phi(a, v2) v1 = phi(a, v2)
/// v2 = phi(b, v3) v2 = phi(b, v3)
/// v3 = .. v4 = copy v1
/// .. = V1 v3 = ..
/// .. = v4
void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
MBBVectorTy &EpilogBBs) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
for (auto &PHI : KernelBB->phis()) {
Register Def = PHI.getOperand(0).getReg();
// Check for any Phi definition that used as an operand of another Phi
// in the same block.
for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
E = MRI.use_instr_end();
I != E; ++I) {
if (I->isPHI() && I->getParent() == KernelBB) {
// Get the loop carried definition.
unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
if (!LCDef)
continue;
MachineInstr *MI = MRI.getVRegDef(LCDef);
if (!MI || MI->getParent() != KernelBB || MI->isPHI())
continue;
// Search through the rest of the block looking for uses of the Phi
// definition. If one occurs, then split the lifetime.
unsigned SplitReg = 0;
for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
KernelBB->instr_end()))
if (BBJ.readsRegister(Def)) {
// We split the lifetime when we find the first use.
if (SplitReg == 0) {
SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
BuildMI(*KernelBB, MI, MI->getDebugLoc(),
TII->get(TargetOpcode::COPY), SplitReg)
.addReg(Def);
}
BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
}
if (!SplitReg)
continue;
// Search through each of the epilog blocks for any uses to be renamed.
for (auto &Epilog : EpilogBBs)
for (auto &I : *Epilog)
if (I.readsRegister(Def))
I.substituteRegister(Def, SplitReg, 0, *TRI);
break;
}
}
}
}
/// Remove the incoming block from the Phis in a basic block.
static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
for (MachineInstr &MI : *BB) {
if (!MI.isPHI())
break;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
if (MI.getOperand(i + 1).getMBB() == Incoming) {
MI.removeOperand(i + 1);
MI.removeOperand(i);
break;
}
}
}
/// Create branches from each prolog basic block to the appropriate epilog
/// block. These edges are needed if the loop ends before reaching the
/// kernel.
void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
MBBVectorTy &PrologBBs,
MachineBasicBlock *KernelBB,
MBBVectorTy &EpilogBBs,
ValueMapTy *VRMap) {
assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
MachineBasicBlock *LastPro = KernelBB;
MachineBasicBlock *LastEpi = KernelBB;
// Start from the blocks connected to the kernel and work "out"
// to the first prolog and the last epilog blocks.
SmallVector<MachineInstr *, 4> PrevInsts;
unsigned MaxIter = PrologBBs.size() - 1;
for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
// Add branches to the prolog that go to the corresponding
// epilog, and the fall-thru prolog/kernel block.
MachineBasicBlock *Prolog = PrologBBs[j];
MachineBasicBlock *Epilog = EpilogBBs[i];
SmallVector<MachineOperand, 4> Cond;
std::optional<bool> StaticallyGreater =
LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
unsigned numAdded = 0;
if (!StaticallyGreater) {
Prolog->addSuccessor(Epilog);
numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
} else if (*StaticallyGreater == false) {
Prolog->addSuccessor(Epilog);
Prolog->removeSuccessor(LastPro);
LastEpi->removeSuccessor(Epilog);
numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
removePhis(Epilog, LastEpi);
// Remove the blocks that are no longer referenced.
if (LastPro != LastEpi) {
LastEpi->clear();
LastEpi->eraseFromParent();
}
if (LastPro == KernelBB) {
LoopInfo->disposed();
NewKernel = nullptr;
}
LastPro->clear();
LastPro->eraseFromParent();
} else {
numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
removePhis(Epilog, Prolog);
}
LastPro = Prolog;
LastEpi = Epilog;
for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
E = Prolog->instr_rend();
I != E && numAdded > 0; ++I, --numAdded)
updateInstruction(&*I, false, j, 0, VRMap);
}
if (NewKernel) {
LoopInfo->setPreheader(PrologBBs[MaxIter]);
LoopInfo->adjustTripCount(-(MaxIter + 1));
}
}
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineOperand *BaseOp;
int64_t Offset;
bool OffsetIsScalable;
if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
return false;
// FIXME: This algorithm assumes instructions have fixed-size offsets.
if (OffsetIsScalable)
return false;
if (!BaseOp->isReg())
return false;
Register BaseReg = BaseOp->getReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
// Check if there is a Phi. If so, get the definition in the loop.
MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
if (BaseDef && BaseDef->isPHI()) {
BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
BaseDef = MRI.getVRegDef(BaseReg);
}
if (!BaseDef)
return false;
int D = 0;
if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
return false;
Delta = D;
return true;
}
/// Update the memory operand with a new offset when the pipeliner
/// generates a new copy of the instruction that refers to a
/// different memory location.
void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
MachineInstr &OldMI,
unsigned Num) {
if (Num == 0)
return;
// If the instruction has memory operands, then adjust the offset
// when the instruction appears in different stages.
if (NewMI.memoperands_empty())
return;
SmallVector<MachineMemOperand *, 2> NewMMOs;
for (MachineMemOperand *MMO : NewMI.memoperands()) {
// TODO: Figure out whether isAtomic is really necessary (see D57601).
if (MMO->isVolatile() || MMO->isAtomic() ||
(MMO->isInvariant() && MMO->isDereferenceable()) ||
(!MMO->getValue())) {
NewMMOs.push_back(MMO);
continue;
}
unsigned Delta;
if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
int64_t AdjOffset = Delta * Num;
NewMMOs.push_back(
MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
} else {
NewMMOs.push_back(
MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
}
}
NewMI.setMemRefs(MF, NewMMOs);
}
/// Clone the instruction for the new pipelined loop and update the
/// memory operands, if needed.
MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
unsigned CurStageNum,
unsigned InstStageNum) {
MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
return NewMI;
}
/// Clone the instruction for the new pipelined loop. If needed, this
/// function updates the instruction using the values saved in the