[RISCV] Optimize the spill/reload of segment registers #153184
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

The simplest way is:
1. Save `vtype` to a scalar register.
2. Insert a `vsetvli`.
3. Use segment load/store.
4. Restore `vtype` via `vsetvl`.

But `vsetvl` is usually slow, so this PR does not take that approach. Instead, we use wider whole load/store instructions if the register encoding is aligned. We have done the same optimization for COPY in #84455.

We found this suboptimal implementation when porting some video codec kernels via RVV intrinsics.

Patch is 54.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153184.diff

8 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 085064eee896a..7b4a1de167695 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -382,7 +382,7 @@ void RISCVInstrInfo::copyPhysRegVector(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, bool KillSrc,
const TargetRegisterClass *RegClass) const {
- const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
RISCVVType::VLMUL LMul = RISCVRI::getLMul(RegClass->TSFlags);
unsigned NF = RISCVRI::getNF(RegClass->TSFlags);
@@ -444,13 +444,7 @@ void RISCVInstrInfo::copyPhysRegVector(
return {RISCVVType::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V,
RISCV::PseudoVMV_V_V_M1, RISCV::PseudoVMV_V_I_M1};
};
- auto FindRegWithEncoding = [TRI](const TargetRegisterClass &RegClass,
- uint16_t Encoding) {
- MCRegister Reg = RISCV::V0 + Encoding;
- if (RISCVRI::getLMul(RegClass.TSFlags) == RISCVVType::LMUL_1)
- return Reg;
- return TRI->getMatchingSuperReg(Reg, RISCV::sub_vrm1_0, &RegClass);
- };
+
while (I != NumRegs) {
// For non-segment copying, we only do this once as the registers are always
// aligned.
@@ -470,9 +464,9 @@ void RISCVInstrInfo::copyPhysRegVector(
// Emit actual copying.
// For reversed copying, the encoding should be decreased.
- MCRegister ActualSrcReg = FindRegWithEncoding(
+ MCRegister ActualSrcReg = TRI->findVRegWithEncoding(
RegClass, ReversedCopy ? (SrcEncoding - NumCopied + 1) : SrcEncoding);
- MCRegister ActualDstReg = FindRegWithEncoding(
+ MCRegister ActualDstReg = TRI->findVRegWithEncoding(
RegClass, ReversedCopy ? (DstEncoding - NumCopied + 1) : DstEncoding);
auto MIB = BuildMI(MBB, MBBI, DL, get(Opc), ActualDstReg);
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 7e58b6f342689..758bf64ff197c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -389,6 +389,22 @@ void RISCVRegisterInfo::adjustReg(MachineBasicBlock &MBB,
.setMIFlag(Flag);
}
+static std::tuple<RISCVVType::VLMUL, const TargetRegisterClass &, unsigned>
+getSpillReloadInfo(unsigned Idx, unsigned Total, uint16_t RegEncoding,
+ bool IsSpill) {
+ if (Idx + 8 <= Total && RegEncoding % 8 == 0)
+ return {RISCVVType::LMUL_8, RISCV::VRM8RegClass,
+ IsSpill ? RISCV::VS8R_V : RISCV::VL8RE8_V};
+ if (Idx + 4 <= Total && RegEncoding % 4 == 0)
+ return {RISCVVType::LMUL_4, RISCV::VRM4RegClass,
+ IsSpill ? RISCV::VS4R_V : RISCV::VL4RE8_V};
+ if (Idx + 2 <= Total && RegEncoding % 2 == 0)
+ return {RISCVVType::LMUL_2, RISCV::VRM2RegClass,
+ IsSpill ? RISCV::VS2R_V : RISCV::VL2RE8_V};
+ return {RISCVVType::LMUL_1, RISCV::VRRegClass,
+ IsSpill ? RISCV::VS1R_V : RISCV::VL1RE8_V};
+}
+
// Split a VSPILLx_Mx pseudo into multiple whole register stores separated by
// LMUL*VLENB bytes.
void RISCVRegisterInfo::lowerVSPILL(MachineBasicBlock::iterator II) const {
@@ -403,47 +419,11 @@ void RISCVRegisterInfo::lowerVSPILL(MachineBasicBlock::iterator II) const {
auto ZvlssegInfo = RISCV::isRVVSpillForZvlsseg(II->getOpcode());
unsigned NF = ZvlssegInfo->first;
unsigned LMUL = ZvlssegInfo->second;
- assert(NF * LMUL <= 8 && "Invalid NF/LMUL combinations.");
- unsigned Opcode, SubRegIdx;
- switch (LMUL) {
- default:
- llvm_unreachable("LMUL must be 1, 2, or 4.");
- case 1:
- Opcode = RISCV::VS1R_V;
- SubRegIdx = RISCV::sub_vrm1_0;
- break;
- case 2:
- Opcode = RISCV::VS2R_V;
- SubRegIdx = RISCV::sub_vrm2_0;
- break;
- case 4:
- Opcode = RISCV::VS4R_V;
- SubRegIdx = RISCV::sub_vrm4_0;
- break;
- }
- static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
- "Unexpected subreg numbering");
- static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
- "Unexpected subreg numbering");
- static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
- "Unexpected subreg numbering");
-
- Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- // Optimize for constant VLEN.
- if (auto VLEN = STI.getRealVLen()) {
- const int64_t VLENB = *VLEN / 8;
- int64_t Offset = VLENB * LMUL;
- STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset);
- } else {
- BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
- uint32_t ShiftAmount = Log2_32(LMUL);
- if (ShiftAmount != 0)
- BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
- .addReg(VL)
- .addImm(ShiftAmount);
- }
+ unsigned NumRegs = NF * LMUL;
+ assert(NumRegs <= 8 && "Invalid NF/LMUL combinations.");
Register SrcReg = II->getOperand(0).getReg();
+ uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
Register Base = II->getOperand(1).getReg();
bool IsBaseKill = II->getOperand(1).isKill();
Register NewBase = MRI.createVirtualRegister(&RISCV::GPRRegClass);
@@ -451,23 +431,53 @@ void RISCVRegisterInfo::lowerVSPILL(MachineBasicBlock::iterator II) const {
auto *OldMMO = *(II->memoperands_begin());
LocationSize OldLoc = OldMMO->getSize();
assert(OldLoc.isPrecise() && OldLoc.getValue().isKnownMultipleOf(NF));
- TypeSize NewSize = OldLoc.getValue().divideCoefficientBy(NF);
- auto *NewMMO = MF.getMachineMemOperand(OldMMO, OldMMO->getOffset(), NewSize);
- for (unsigned I = 0; I < NF; ++I) {
- // Adding implicit-use of super register to describe we are using part of
- // super register, that prevents machine verifier complaining when part of
- // subreg is undef, see comment in MachineVerifier::checkLiveness for more
- // detail.
- BuildMI(MBB, II, DL, TII->get(Opcode))
- .addReg(TRI->getSubReg(SrcReg, SubRegIdx + I))
- .addReg(Base, getKillRegState(I == NF - 1))
- .addMemOperand(NewMMO)
- .addReg(SrcReg, RegState::Implicit);
- if (I != NF - 1)
+ TypeSize NewSize = OldLoc.getValue().divideCoefficientBy(NumRegs);
+
+ Register VLENB = 0;
+ unsigned PreSavedNum = 0;
+ unsigned I = 0;
+ while (I != NumRegs) {
+ auto [LMulSaved, RegClass, Opcode] =
+ getSpillReloadInfo(I, NumRegs, SrcEncoding, true);
+ auto [NumSaved, _] = RISCVVType::decodeVLMUL(LMulSaved);
+ if (PreSavedNum) {
+ Register Step = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ if (auto VLEN = STI.getRealVLen()) {
+ const int64_t VLENB = *VLEN / 8;
+ int64_t Offset = VLENB * PreSavedNum;
+ STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset);
+ } else {
+ if (!VLENB) {
+ VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB);
+ }
+ uint32_t ShiftAmount = Log2_32(PreSavedNum);
+ if (ShiftAmount == 0)
+ Step = VLENB;
+ else
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step)
+ .addReg(VLENB)
+ .addImm(ShiftAmount);
+ }
+
BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase)
.addReg(Base, getKillRegState(I != 0 || IsBaseKill))
- .addReg(VL, getKillRegState(I == NF - 2));
- Base = NewBase;
+ .addReg(Step, getKillRegState(true));
+ Base = NewBase;
+ }
+
+ MCRegister ActualSrcReg = findVRegWithEncoding(RegClass, SrcEncoding);
+
+ BuildMI(MBB, II, DL, TII->get(Opcode))
+ .addReg(ActualSrcReg)
+ .addReg(Base, getKillRegState(I + NumSaved == NumRegs))
+ .addMemOperand(MF.getMachineMemOperand(OldMMO, OldMMO->getOffset(),
+ NewSize * NumSaved))
+ .addReg(SrcReg, RegState::Implicit);
+
+ PreSavedNum = NumSaved;
+ SrcEncoding += NumSaved;
+ I += NumSaved;
}
II->eraseFromParent();
}
@@ -486,65 +496,63 @@ void RISCVRegisterInfo::lowerVRELOAD(MachineBasicBlock::iterator II) const {
auto ZvlssegInfo = RISCV::isRVVSpillForZvlsseg(II->getOpcode());
unsigned NF = ZvlssegInfo->first;
unsigned LMUL = ZvlssegInfo->second;
- assert(NF * LMUL <= 8 && "Invalid NF/LMUL combinations.");
- unsigned Opcode, SubRegIdx;
- switch (LMUL) {
- default:
- llvm_unreachable("LMUL must be 1, 2, or 4.");
- case 1:
- Opcode = RISCV::VL1RE8_V;
- SubRegIdx = RISCV::sub_vrm1_0;
- break;
- case 2:
- Opcode = RISCV::VL2RE8_V;
- SubRegIdx = RISCV::sub_vrm2_0;
- break;
- case 4:
- Opcode = RISCV::VL4RE8_V;
- SubRegIdx = RISCV::sub_vrm4_0;
- break;
- }
- static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
- "Unexpected subreg numbering");
- static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
- "Unexpected subreg numbering");
- static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
- "Unexpected subreg numbering");
-
- Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- // Optimize for constant VLEN.
- if (auto VLEN = STI.getRealVLen()) {
- const int64_t VLENB = *VLEN / 8;
- int64_t Offset = VLENB * LMUL;
- STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset);
- } else {
- BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
- uint32_t ShiftAmount = Log2_32(LMUL);
- if (ShiftAmount != 0)
- BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
- .addReg(VL)
- .addImm(ShiftAmount);
- }
+ unsigned NumRegs = NF * LMUL;
+ assert(NumRegs <= 8 && "Invalid NF/LMUL combinations.");
Register DestReg = II->getOperand(0).getReg();
+ uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
Register Base = II->getOperand(1).getReg();
bool IsBaseKill = II->getOperand(1).isKill();
Register NewBase = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+
auto *OldMMO = *(II->memoperands_begin());
LocationSize OldLoc = OldMMO->getSize();
assert(OldLoc.isPrecise() && OldLoc.getValue().isKnownMultipleOf(NF));
- TypeSize NewSize = OldLoc.getValue().divideCoefficientBy(NF);
- auto *NewMMO = MF.getMachineMemOperand(OldMMO, OldMMO->getOffset(), NewSize);
- for (unsigned I = 0; I < NF; ++I) {
- BuildMI(MBB, II, DL, TII->get(Opcode),
- TRI->getSubReg(DestReg, SubRegIdx + I))
- .addReg(Base, getKillRegState(I == NF - 1))
- .addMemOperand(NewMMO);
- if (I != NF - 1)
+ TypeSize NewSize = OldLoc.getValue().divideCoefficientBy(NumRegs);
+
+ Register VLENB = 0;
+ unsigned PreReloadedNum = 0;
+ unsigned I = 0;
+ while (I != NumRegs) {
+ auto [LMulReloaded, RegClass, Opcode] =
+ getSpillReloadInfo(I, NumRegs, DestEncoding, false);
+ auto [NumReloaded, _] = RISCVVType::decodeVLMUL(LMulReloaded);
+ if (PreReloadedNum) {
+ Register Step = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ if (auto VLEN = STI.getRealVLen()) {
+ const int64_t VLENB = *VLEN / 8;
+ int64_t Offset = VLENB * PreReloadedNum;
+ STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset);
+ } else {
+ if (!VLENB) {
+ VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB);
+ }
+ uint32_t ShiftAmount = Log2_32(PreReloadedNum);
+ if (ShiftAmount == 0)
+ Step = VLENB;
+ else
+ BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step)
+ .addReg(VLENB)
+ .addImm(ShiftAmount);
+ }
+
BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase)
.addReg(Base, getKillRegState(I != 0 || IsBaseKill))
- .addReg(VL, getKillRegState(I == NF - 2));
- Base = NewBase;
+ .addReg(Step, getKillRegState(true));
+ Base = NewBase;
+ }
+
+ MCRegister ActualDestReg = findVRegWithEncoding(RegClass, DestEncoding);
+
+ BuildMI(MBB, II, DL, TII->get(Opcode), ActualDestReg)
+ .addReg(Base, getKillRegState(I + NumReloaded == NumRegs))
+ .addMemOperand(MF.getMachineMemOperand(OldMMO, OldMMO->getOffset(),
+ NewSize * NumReloaded));
+
+ PreReloadedNum = NumReloaded;
+ DestEncoding += NumReloaded;
+ I += NumReloaded;
}
II->eraseFromParent();
}
@@ -635,9 +643,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
// Handle spill/fill of synthetic register classes for segment operations to
- // ensure correctness in the edge case one gets spilled. There are many
- // possible optimizations here, but given the extreme rarity of such spills,
- // we prefer simplicity of implementation for now.
+ // ensure correctness in the edge case one gets spilled.
switch (MI.getOpcode()) {
case RISCV::PseudoVSPILL2_M1:
case RISCV::PseudoVSPILL2_M2:
@@ -1052,3 +1058,12 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
+
+Register
+RISCVRegisterInfo::findVRegWithEncoding(const TargetRegisterClass &RegClass,
+ uint16_t Encoding) const {
+ MCRegister Reg = RISCV::V0 + Encoding;
+ if (RISCVRI::getLMul(RegClass.TSFlags) == RISCVVType::LMUL_1)
+ return Reg;
+ return getMatchingSuperReg(Reg, RISCV::sub_vrm1_0, &RegClass);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index b368399e2ad14..ffb4f84afb9a3 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,9 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
+ Register findVRegWithEncoding(const TargetRegisterClass &RegClass,
+ uint16_t Encoding) const;
+
static bool isVRRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags) &&
RISCVRI::getNF(RC->TSFlags) == 1;
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index 0afdcdccd9246..6a7c73672bf6c 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -40,15 +40,7 @@ define void @_Z3foov() {
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45)
; CHECK-NEXT: vle16.v v12, (a0)
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs2r.v v10, (a0) # vscale x 16-byte Folded Spill
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs2r.v v12, (a0) # vscale x 16-byte Folded Spill
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs2r.v v14, (a0) # vscale x 16-byte Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_40)
; CHECK-NEXT: #APP
@@ -59,15 +51,7 @@ define void @_Z3foov() {
; CHECK-NEXT: addi a0, a0, 928
; CHECK-NEXT: vmsbc.vx v0, v8, a0
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2r.v v14, (a0) # vscale x 16-byte Folded Reload
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
index 878b180e34c01..f3c88923c15e2 100644
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -32,11 +32,7 @@ define void @last_chance_recoloring_failure() {
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs4r.v v20, (a0) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-NEXT: li s0, 36
; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma
; CHECK-NEXT: vfwadd.vv v16, v8, v12, v0.t
@@ -47,11 +43,7 @@ define void @last_chance_recoloring_failure() {
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma
@@ -92,11 +84,7 @@ define void @last_chance_recoloring_failure() {
; SUBREGLIVENESS-NEXT: slli a0, a0, 3
; SUBREGLIVENESS-NEXT: add a0, sp, a0
; SUBREGLIVENESS-NEXT: addi a0, a0, 16
-; SUBREGLIVENESS-NEXT: csrr a1, vlenb
-; SUBREGLIVENESS-NEXT: slli a1, a1, 2
-; SUBREGLIVENESS-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
-; SUBREGLIVENESS-NEXT: add a0, a0, a1
-; SUBREGLIVENESS-NEXT: vs4r.v v20, (a0) # vscale x 32-byte Folded Spill
+; SUBREGLIVENESS-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; SUBREGLIVENESS-NEXT: li s0, 36
; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma
; SUBREGLIVENESS-NEXT: vfwadd.vv v16, v8, v12, v0.t
@@ -107,11 +95,7 @@ define void @last_chance_recoloring_failure() {
; SUBREGLIVENESS-NEXT: slli a0, a0, 3
; SUBREGLIVENESS-NEXT: add a0, sp, a0
; SUBREGLIVENESS-NEXT: addi a0, a0, 16
-; SUBREGLIVENESS-NEXT: csrr a1, vlenb
-; SUBREGLIVENESS-NEXT: slli a1, a1, 2
-; SUBREGLIVENESS-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
-; SUBREGLIVENESS-NEXT: add a0, a0, a1
-; SUBREGLIVENESS-NEXT: vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; SUBREGLIVENESS-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; SUBREGLIVENESS-NEXT: addi a0, sp, 16
; SUBREGLIVENESS-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
index 663bb1fc15517..d69a166b04080 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
@@ -41,14 +41,11 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind {
; SPILL-O2-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
; SPILL-O2-NEXT: vlseg2e32.v v8, (a0)
; SPILL-O2-NEXT: addi a0, sp, 16
-; SPILL-O2-NEXT: csrr a1, vlenb
-; SPILL-O2-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; SPILL-O2-NEXT: add a0, a0, a1
-; SPILL-O2-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
; SPILL-O2-NEXT: #APP
; SPILL-O2-NEXT: #NO_APP
-; SPILL-O2-NEXT: addi a0, sp, 16
; SPILL-O2-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: csrr a1, vlenb
; SPILL-O2-NEXT: add a0, a0, a1
; SPILL-O2-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
; SPILL-O2-NEXT: csrr a0, vlenb
@@ -64,15 +61,11 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind {
; SPILL-O2-VLEN128-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
; SPILL-O2-VLEN128-NEXT: vlseg2e32.v v8, (a0)
; SPILL-O2-VLEN128-NEXT: addi a0, sp, 16
-; SPILL-O2-VLEN128-NEXT: li a1, 16
-; SPILL-O2-VLEN1...
[truncated]
Good idea, a few detail bits inline, but looks straightforward.
This isn't a new problem with this patch, but the offset and base here seem wrong for everything except the first store.
Yeah, do you have any thought on fixing it? Or does it matter?
Makes sense to me
Instead of keeping track of `PreReloadedNum`, is it possible to increment `Base` at the end of the loop if there are still more registers? Something like:
-      PreReloadedNum = NumReloaded;
       DestEncoding += NumReloaded;
       I += NumReloaded;
+      if (I == NumRegs)
+        break;
+      // the code above in if (PreReloadedNum) { ...
+      Base = NewBase;
     }
My initial implementation was like this. I changed it to increase the base pointer at the beginning of the loop because we need to know whether there will be more registers to handle, so that we can set `IsKill` properly.
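A minimal runnable sketch (not LLVM code) of the emission order in the patch as written, using a hypothetical decomposition of a 7-register group at v9 into chunks of 1, 2, and 4 registers: the ADD that advances the base pointer for the current chunk is emitted at the top of each iteration using the previous chunk's size, and the kill flag on the base register is set only on the final store.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical decomposition: a 7-register group starting at v9 is stored
  // as chunks of 1 (v9), 2 (v10-v11) and 4 (v12-v15) registers.
  std::vector<unsigned> Chunks = {1, 2, 4};
  unsigned Total = 7, Encoding = 9, Done = 0, PrevWidth = 0;
  for (unsigned Width : Chunks) {
    if (PrevWidth) // advance the base past the chunk stored last iteration
      std::printf("add    base, base, %u*vlenb\n", PrevWidth);
    bool KillBase = (Done + Width == Total); // only the final store kills base
    std::printf("vs%ur.v v%u, (base)%s\n", Width, Encoding,
                KillBase ? "   # base killed" : "");
    PrevWidth = Width;
    Encoding += Width;
    Done += Width;
  }
  return 0;
}
```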
(I am not so confident about the `IsKill` settings, please double-check that.)
LGTM
The simplest way is:
1. Save `vtype` to a scalar register.
2. Insert a `vsetvli`.
3. Use segment load/store.
4. Restore `vtype` via `vsetvl`.

But `vsetvl` is usually slow, so this PR does not take that approach. Instead, we use wider whole load/store instructions if the register encoding is aligned. We have done the same optimization for COPY in llvm#84455.

We found this suboptimal implementation when porting some video codec kernels via RVV intrinsics.
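For illustration, here is a minimal standalone sketch (not the in-tree code) of the greedy selection the new `getSpillReloadInfo` helper performs: at each step, pick the widest whole-register load/store that still fits in the remaining registers and whose starting encoding is suitably aligned. The example group (3 registers starting at v10) is hypothetical.

```cpp
#include <cstdio>

// Mirrors the LMUL_8/4/2/1 cases of getSpillReloadInfo: how many vector
// registers does the chosen whole-register access cover?
static unsigned chooseWidth(unsigned Idx, unsigned Total, unsigned Encoding) {
  if (Idx + 8 <= Total && Encoding % 8 == 0)
    return 8; // VS8R_V / VL8RE8_V
  if (Idx + 4 <= Total && Encoding % 4 == 0)
    return 4; // VS4R_V / VL4RE8_V
  if (Idx + 2 <= Total && Encoding % 2 == 0)
    return 2; // VS2R_V / VL2RE8_V
  return 1;   // VS1R_V / VL1RE8_V
}

int main() {
  // A 3-register segment group starting at v10 (encoding 10) is spilled as
  // one vs2r.v (v10-v11) followed by one vs1r.v (v12).
  unsigned Total = 3, Encoding = 10;
  for (unsigned I = 0; I != Total;) {
    unsigned Width = chooseWidth(I, Total, Encoding);
    std::printf("vs%ur.v v%u, ...\n", Width, Encoding);
    I += Width;
    Encoding += Width;
  }
  return 0;
}
```

With the previous lowering, the same group was always split into NF separate stores of the segment's own LMUL, regardless of how the registers were aligned.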
Force-pushed from efe4f85 to 5c1e8bb:
1. Add `NumRemaining`.
2. Rename `NewSize` to `VRegSize`.
3. Add argument comments.
4. Don't create `Step` for the `ShiftAmount==0` case.