Skip to content

Commit cbf682c

Browse files
committed
[SystemZ] Improve codegen for memset.
Memset with a constant length was previously implemented as a single store followed by a series of MVCs. This patch instead emits one store of the byte for each MVC, which avoids data dependencies between the MVCs: each block is set with an MVI/STC followed by an MVC of length len-1. In addition, memset with a variable length is now also handled without a libcall. Since the byte is first stored and the MVC then copies from that address, a length of two must now be subtracted instead of one for the loop and the EXRL. This requires an extra check for the one-byte case, which is handled in a special block containing just a single MVI/STC (as GCC does). Review: Ulrich Weigand. Differential Revision: https://reviews.llvm.org/D112004
1 parent 327d966 commit cbf682c

11 files changed

+366
-96
lines changed

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

Lines changed: 108 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5714,6 +5714,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
57145714
OPCODE(OC);
57155715
OPCODE(XC);
57165716
OPCODE(CLC);
5717+
OPCODE(MEMSET_MVC);
57175718
OPCODE(STPCPY);
57185719
OPCODE(STRCMP);
57195720
OPCODE(SEARCH_STRING);
@@ -7860,8 +7861,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
78607861
return MBB;
78617862
}
78627863

7863-
MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
7864-
MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
7864+
MachineBasicBlock *
7865+
SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
7866+
MachineBasicBlock *MBB,
7867+
unsigned Opcode, bool IsMemset) const {
78657868
MachineFunction &MF = *MBB->getParent();
78667869
const SystemZInstrInfo *TII =
78677870
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -7870,18 +7873,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
78707873

78717874
MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
78727875
uint64_t DestDisp = MI.getOperand(1).getImm();
7873-
MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
7874-
uint64_t SrcDisp = MI.getOperand(3).getImm();
7875-
MachineOperand &LengthMO = MI.getOperand(4);
7876+
MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
7877+
uint64_t SrcDisp;
7878+
7879+
// Fold the displacement Disp if it is out of range.
7880+
auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
7881+
if (!isUInt<12>(Disp)) {
7882+
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
7883+
unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp);
7884+
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg)
7885+
.add(Base).addImm(Disp).addReg(0);
7886+
Base = MachineOperand::CreateReg(Reg, false);
7887+
Disp = 0;
7888+
}
7889+
};
7890+
7891+
if (!IsMemset) {
7892+
SrcBase = earlyUseOperand(MI.getOperand(2));
7893+
SrcDisp = MI.getOperand(3).getImm();
7894+
} else {
7895+
SrcBase = DestBase;
7896+
SrcDisp = DestDisp++;
7897+
foldDisplIfNeeded(DestBase, DestDisp);
7898+
}
7899+
7900+
MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
78767901
bool IsImmForm = LengthMO.isImm();
78777902
bool IsRegForm = !IsImmForm;
78787903

7904+
// Build and insert one Opcode of Length, with special treatment for memset.
7905+
auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
7906+
MachineBasicBlock::iterator InsPos,
7907+
MachineOperand DBase, uint64_t DDisp,
7908+
MachineOperand SBase, uint64_t SDisp,
7909+
unsigned Length) -> void {
7910+
assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
7911+
if (IsMemset) {
7912+
MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
7913+
if (ByteMO.isImm())
7914+
BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
7915+
.add(SBase).addImm(SDisp).add(ByteMO);
7916+
else
7917+
BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
7918+
.add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
7919+
if (--Length == 0)
7920+
return;
7921+
}
7922+
BuildMI(*MBB, InsPos, DL, TII->get(Opcode))
7923+
.add(DBase).addImm(DDisp).addImm(Length)
7924+
.add(SBase).addImm(SDisp)
7925+
.setMemRefs(MI.memoperands());
7926+
};
7927+
78797928
bool NeedsLoop = false;
78807929
uint64_t ImmLength = 0;
7881-
Register LenMinus1Reg = SystemZ::NoRegister;
7930+
Register LenAdjReg = SystemZ::NoRegister;
78827931
if (IsImmForm) {
78837932
ImmLength = LengthMO.getImm();
7884-
ImmLength++; // Add back the '1' subtracted originally.
7933+
ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
78857934
if (ImmLength == 0) {
78867935
MI.eraseFromParent();
78877936
return MBB;
@@ -7905,7 +7954,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
79057954
NeedsLoop = true;
79067955
} else {
79077956
NeedsLoop = true;
7908-
LenMinus1Reg = LengthMO.getReg();
7957+
LenAdjReg = LengthMO.getReg();
79097958
}
79107959

79117960
// When generating more than one CLC, all but the last will need to
@@ -7923,17 +7972,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
79237972
ImmLength &= 255;
79247973
} else {
79257974
BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
7926-
.addReg(LenMinus1Reg)
7975+
.addReg(LenAdjReg)
79277976
.addReg(0)
79287977
.addImm(8);
79297978
}
79307979

7980+
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
79317981
auto loadZeroAddress = [&]() -> MachineOperand {
79327982
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
79337983
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
79347984
return MachineOperand::CreateReg(Reg, false);
79357985
};
7936-
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
79377986
if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
79387987
DestBase = loadZeroAddress();
79397988
if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
@@ -7968,14 +8017,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
79688017
DoneMBB = SystemZ::emitBlockAfter(NextMBB);
79698018

79708019
// MBB:
7971-
// # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
8020+
// # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB.
79728021
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
7973-
.addReg(LenMinus1Reg).addImm(-1);
8022+
.addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
79748023
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
79758024
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
79768025
.addMBB(AllDoneMBB);
79778026
MBB->addSuccessor(AllDoneMBB);
7978-
MBB->addSuccessor(StartMBB);
8027+
if (!IsMemset)
8028+
MBB->addSuccessor(StartMBB);
8029+
else {
8030+
// MemsetOneCheckMBB:
8031+
// # Jump to MemsetOneMBB for a memset of length 1, or
8032+
// # fall thru to StartMBB.
8033+
MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
8034+
MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
8035+
MBB->addSuccessor(MemsetOneCheckMBB);
8036+
MBB = MemsetOneCheckMBB;
8037+
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8038+
.addReg(LenAdjReg).addImm(-1);
8039+
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8040+
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
8041+
.addMBB(MemsetOneMBB);
8042+
MBB->addSuccessor(MemsetOneMBB, {10, 100});
8043+
MBB->addSuccessor(StartMBB, {90, 100});
8044+
8045+
// MemsetOneMBB:
8046+
// # Jump back to AllDoneMBB after a single MVI or STC.
8047+
MBB = MemsetOneMBB;
8048+
insertMemMemOp(MBB, MBB->end(),
8049+
MachineOperand::CreateReg(StartDestReg, false), DestDisp,
8050+
MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
8051+
1);
8052+
BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
8053+
MBB->addSuccessor(AllDoneMBB);
8054+
}
79798055

79808056
// StartMBB:
79818057
// # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
@@ -8032,10 +8108,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
80328108
if (Opcode == SystemZ::MVC)
80338109
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
80348110
.addImm(SystemZ::PFD_WRITE)
8035-
.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
8036-
BuildMI(MBB, DL, TII->get(Opcode))
8037-
.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
8038-
.addReg(ThisSrcReg).addImm(SrcDisp);
8111+
.addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
8112+
insertMemMemOp(MBB, MBB->end(),
8113+
MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
8114+
MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
80398115
if (EndMBB) {
80408116
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
80418117
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
@@ -8075,7 +8151,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
80758151
// # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
80768152
// # Use EXecute Relative Long for the remainder of the bytes. The target
80778153
// instruction of the EXRL will have a length field of 1 since 0 is an
8078-
// illegal value. The number of bytes processed becomes (%LenMinus1Reg &
8154+
// illegal value. The number of bytes processed becomes (%LenAdjReg &
80798155
// 0xff) + 1.
80808156
// # Fall through to AllDoneMBB.
80818157
Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
@@ -8088,10 +8164,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
80888164
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
80898165
.addReg(StartSrcReg).addMBB(StartMBB)
80908166
.addReg(NextSrcReg).addMBB(NextMBB);
8167+
if (IsMemset)
8168+
insertMemMemOp(MBB, MBB->end(),
8169+
MachineOperand::CreateReg(RemDestReg, false), DestDisp,
8170+
MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
80918171
MachineInstrBuilder EXRL_MIB =
80928172
BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
80938173
.addImm(Opcode)
8094-
.addReg(LenMinus1Reg)
8174+
.addReg(LenAdjReg)
80958175
.addReg(RemDestReg).addImm(DestDisp)
80968176
.addReg(RemSrcReg).addImm(SrcDisp);
80978177
MBB->addSuccessor(AllDoneMBB);
@@ -8107,32 +8187,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
81078187
while (ImmLength > 0) {
81088188
uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
81098189
// The previous iteration might have created out-of-range displacements.
8110-
// Apply them using LAY if so.
8111-
if (!isUInt<12>(DestDisp)) {
8112-
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8113-
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
8114-
.add(DestBase)
8115-
.addImm(DestDisp)
8116-
.addReg(0);
8117-
DestBase = MachineOperand::CreateReg(Reg, false);
8118-
DestDisp = 0;
8119-
}
8120-
if (!isUInt<12>(SrcDisp)) {
8121-
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8122-
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
8123-
.add(SrcBase)
8124-
.addImm(SrcDisp)
8125-
.addReg(0);
8126-
SrcBase = MachineOperand::CreateReg(Reg, false);
8127-
SrcDisp = 0;
8128-
}
8129-
BuildMI(*MBB, MI, DL, TII->get(Opcode))
8130-
.add(DestBase)
8131-
.addImm(DestDisp)
8132-
.addImm(ThisLength)
8133-
.add(SrcBase)
8134-
.addImm(SrcDisp)
8135-
.setMemRefs(MI.memoperands());
8190+
// Apply them using LA/LAY if so.
8191+
foldDisplIfNeeded(DestBase, DestDisp);
8192+
foldDisplIfNeeded(SrcBase, SrcDisp);
8193+
insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
81368194
DestDisp += ThisLength;
81378195
SrcDisp += ThisLength;
81388196
ImmLength -= ThisLength;
@@ -8630,6 +8688,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
86308688
case SystemZ::CLCImm:
86318689
case SystemZ::CLCReg:
86328690
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
8691+
case SystemZ::MemsetImmImm:
8692+
case SystemZ::MemsetImmReg:
8693+
case SystemZ::MemsetRegImm:
8694+
case SystemZ::MemsetRegReg:
8695+
return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
86338696
case SystemZ::CLSTLoop:
86348697
return emitStringWrapper(MI, MBB, SystemZ::CLST);
86358698
case SystemZ::MVSTLoop:

llvm/lib/Target/SystemZ/SystemZISelLowering.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ enum NodeType : unsigned {
126126
// as for MVC.
127127
CLC,
128128

129+
// Use MVC to set a block of memory after storing the first byte.
130+
MEMSET_MVC,
131+
129132
// Use an MVST-based sequence to implement stpcpy().
130133
STPCPY,
131134

@@ -709,7 +712,8 @@ class SystemZTargetLowering : public TargetLowering {
709712
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
710713
MachineBasicBlock *BB) const;
711714
MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
712-
unsigned Opcode) const;
715+
unsigned Opcode,
716+
bool IsMemset = false) const;
713717
MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
714718
unsigned Opcode) const;
715719
MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,

llvm/lib/Target/SystemZ/SystemZInstrFormats.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
52565256
let Constraints = "$R1 = $R1src";
52575257
}
52585258

5259+
class MemsetPseudo<DAGOperand lenop, DAGOperand byteop>
5260+
: Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B),
5261+
[(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> {
5262+
let Defs = [CC];
5263+
let mayLoad = 1;
5264+
let mayStore = 1;
5265+
let usesCustomInserter = 1;
5266+
let hasNoSchedulingInfo = 1;
5267+
}
5268+
52595269
//===----------------------------------------------------------------------===//
52605270
// Multiclasses that emit both real and pseudo instructions
52615271
//===----------------------------------------------------------------------===//

llvm/lib/Target/SystemZ/SystemZInstrInfo.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in {
510510
def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
511511
}
512512

513+
// Memset[Length][Byte] pseudos.
514+
def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>;
515+
def MemsetImmReg : MemsetPseudo<imm64, GR32>;
516+
def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>;
517+
def MemsetRegReg : MemsetPseudo<ADDR64, GR32>;
518+
513519
// Move right.
514520
let Predicates = [FeatureMiscellaneousExtensions3],
515521
mayLoad = 1, mayStore = 1, Uses = [R0L] in

llvm/lib/Target/SystemZ/SystemZOperators.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
102102
SDTCisPtrTy<1>,
103103
SDTCisPtrTy<2>,
104104
SDTCisVT<3, i64>]>;
105+
def SDT_ZMemsetMVC : SDTypeProfile<0, 3,
106+
[SDTCisPtrTy<0>,
107+
SDTCisVT<1, i64>,
108+
SDTCisVT<2, i32>]>;
105109
def SDT_ZString : SDTypeProfile<1, 3,
106110
[SDTCisPtrTy<0>,
107111
SDTCisPtrTy<1>,
@@ -413,6 +417,8 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
413417
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
414418
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
415419
[SDNPHasChain, SDNPMayLoad]>;
420+
def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
421+
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
416422
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
417423
[SDNPHasChain, SDNPMayLoad]>;
418424
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,

0 commit comments

Comments
 (0)