[VE] fastcc and vreg-to-vreg copy
This defines a 'fastcc' for the VE target and implements vreg-to-vreg
copy for parameter passing.  The 'fastcc' extends the standard CC for
SX-Aurora with register passing of vector-typed parameters and return
values.

Reviewed By: kaz7

Differential Revision: https://reviews.llvm.org/D90842
simoll committed Nov 16, 2020
1 parent 4369223 commit a598c08
Showing 6 changed files with 465 additions and 0 deletions.
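
For context, a minimal usage sketch (not part of the diff): under the new convention, fastcc keeps <256 x i32> arguments and return values in vector registers instead of passing them through memory.

; Hypothetical example; register assignments follow CC_VE_Fast below
; (%a in %v0, %b in %v1, result returned in %v0).
define fastcc <256 x i32> @vadd(<256 x i32> %a, <256 x i32> %b) {
  %sum = add <256 x i32> %a, %b
  ret <256 x i32> %sum
}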
60 changes: 60 additions & 0 deletions llvm/lib/Target/VE/VECallingConv.td
@@ -97,6 +97,66 @@ def RetCC_VE_C : CallingConv<[
[SX0, SX1, SX3, SX5]>>,
]>;

///// Custom fastcc /////
//
// This CC passes vector parameters and return values in registers. Scalar
// values are handled according to the standard CC.
def CC_VE_Fast : CallingConv<[
// vector --> generic vector registers
CCIfType<[v2i32, v2i64, v2f32, v2f64,
v4i32, v4i64, v4f32, v4f64,
v8i32, v8i64, v8f32, v8f64,
v16i32, v16i64, v16f32, v16f64,
v32i32, v32i64, v32f32, v32f64,
v64i32, v64i64, v64f32, v64f64,
v128i32, v128i64, v128f32, v128f64,
v256i32, v256f32, v256i64, v256f64],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
// TODO: make this conditional on packed mode
CCIfType<[v512i32, v512f32],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,

// vector mask --> generic vector mask registers
CCIfType<[v256i1],
CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,

// pair of vector mask --> generic vector mask registers
CCIfType<[v512i1],
CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
[VM1, VM1, VM3]>>,

// Follow the standard C CC for scalars.
CCDelegateTo<CC_VE_C>
]>;

def RetCC_VE_Fast : CallingConv<[
// vector --> generic vector registers
CCIfType<[v2i32, v2i64, v2f32, v2f64,
v4i32, v4i64, v4f32, v4f64,
v8i32, v8i64, v8f32, v8f64,
v16i32, v16i64, v16f32, v16f64,
v32i32, v32i64, v32f32, v32f64,
v64i32, v64i64, v64f32, v64f64,
v128i32, v128i64, v128f32, v128f64,
v256i32, v256f32, v256i64, v256f64],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
// TODO: make this conditional on packed mode
CCIfType<[v512i32, v512f32],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,

// vector mask --> generic vector mask registers
CCIfType<[v256i1],
CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,

// pair of vector mask --> generic vector mask registers
CCIfType<[v512i1],
CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
[VM1, VM1, VM3]>>,

// Follow the standard C CC for scalars.
CCDelegateTo<RetCC_VE_C>
]>;

// Callee-saved registers
def CSR : CalleeSavedRegs<(add (sequence "SX%u", 18, 33))>;
def CSR_NoRegs : CalleeSavedRegs<(add)>;
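
As an illustrative sketch inferred from the tables above (not a test in this commit), a v256i1 mask argument is assigned to %vm1 by CC_VE_Fast and returned in %vm1 by RetCC_VE_Fast:

; Hypothetical example.
define fastcc <256 x i1> @mask_identity(<256 x i1> %m) {
  ret <256 x i1> %m
}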
4 changes: 4 additions & 0 deletions llvm/lib/Target/VE/VEISelLowering.cpp
@@ -44,6 +44,8 @@ CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
switch (CallConv) {
default:
return RetCC_VE_C;
case CallingConv::Fast:
return RetCC_VE_Fast;
}
}

@@ -53,6 +55,8 @@ CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
switch (CallConv) {
default:
return CC_VE_C;
case CallingConv::Fast:
return CC_VE_Fast;
}
}
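
For reference, a simplified sketch (not the verbatim VE lowering code) of how these accessors feed LLVM's generic calling-convention analysis; MF, DAG, Ins, CallConv, and IsVarArg are assumed from a typical LowerFormalArguments implementation:

// Analyze the incoming arguments against the selected assignment function.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, IsVarArg));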

19 changes: 19 additions & 0 deletions llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -352,6 +352,25 @@ void VEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(VE::ORri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0);
} else if (VE::V64RegClass.contains(DestReg, SrcReg)) {
// Generate the following instructions:
// %sw16 = LEA32zii 256
// VORmvl %dest, (0)1, %src, %sw16
// TODO: reuse a register if vl is already assigned to a register
// FIXME: it would be better to scavenge a register here instead of
// reserving SX16 all of the time.
const TargetRegisterInfo *TRI = &getRegisterInfo();
Register TmpReg = VE::SX16;
Register SubTmp = TRI->getSubReg(TmpReg, VE::sub_i32);
BuildMI(MBB, I, DL, get(VE::LEAzii), TmpReg)
.addImm(0)
.addImm(0)
.addImm(256);
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(VE::VORmvl), DestReg)
.addImm(M1(0)) // Represent (0)1.
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(SubTmp, getKillRegState(true));
MIB.getInstr()->addRegisterKilled(TmpReg, TRI, true);
} else if (VE::F128RegClass.contains(DestReg, SrcReg)) {
// Use two instructions.
const unsigned SubRegIdx[] = {VE::sub_even, VE::sub_odd};
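
For the V64 (vector register) case added above, the emitted sequence in VE assembly is roughly the following sketch for a %v1-to-%v0 copy (the exact pattern is pinned down by the fastcc_callee.ll tests below): lea materializes the vector length 256 in %s16, lvl loads it into the vector-length register, and vor with the zero immediate (0)1 acts as a plain move.

lea %s16, 256
lvl %s16
vor %v0, (0)1, %v1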
4 changes: 4 additions & 0 deletions llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -35,6 +35,8 @@ VERegisterInfo::VERegisterInfo() : VEGenRegisterInfo(VE::SX10) {}
const MCPhysReg *
VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
switch (MF->getFunction().getCallingConv()) {
case CallingConv::Fast:
// Being explicit (same as standard CC).
default:
return CSR_SaveList;
case CallingConv::PreserveAll:
@@ -45,6 +47,8 @@ VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *VERegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {
case CallingConv::Fast:
// Being explicit (same as standard CC).
default:
return CSR_RegMask;
case CallingConv::PreserveAll:
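
A sketch of where this mask is typically consumed during call lowering (assumed shape, not verbatim VE code); since fastcc maps to the same CSR_RegMask as the standard CC, fastcc calls clobber exactly the same registers:

// Attach the preserved-register mask of the callee's CC to the call node.
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallingConv::Fast);
Ops.push_back(DAG.getRegisterMask(Mask));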
122 changes: 122 additions & 0 deletions llvm/test/CodeGen/VE/Vector/fastcc_callee.ll
@@ -0,0 +1,122 @@
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s

; Scalar argument passing must not change (same tests as in VE/Scalar/callee.ll, repeated below with +vpu).

define fastcc i32 @stack_stack_arg_i32_r9(i1 %0, i8 %1, i16 %2, i32 %3, i64 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
; CHECK-LABEL: stack_stack_arg_i32_r9:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: ldl.sx %s0, 424(, %s11)
; CHECK-NEXT: or %s11, 0, %s9
ret i32 %9
}

define fastcc i64 @stack_stack_arg_i64_r9(i1 %0, i8 %1, i16 %2, i32 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) {
; CHECK-LABEL: stack_stack_arg_i64_r9:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: ld %s0, 424(, %s11)
; CHECK-NEXT: or %s11, 0, %s9
ret i64 %9
}

define fastcc float @stack_stack_arg_f32_r9(float %p0, float %p1, float %p2, float %p3, float %p4, float %p5, float %p6, float %p7, float %s0, float %s1) {
; CHECK-LABEL: stack_stack_arg_f32_r9:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: ldu %s0, 428(, %s11)
; CHECK-NEXT: or %s11, 0, %s9
ret float %s1
}

define fastcc i32 @stack_stack_arg_i32f32_r8(i32 %p0, float %p1, i32 %p2, float %p3, i32 %p4, float %p5, i32 %p6, float %p7, i32 %s0, float %s1) {
; CHECK-LABEL: stack_stack_arg_i32f32_r8:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: ldl.sx %s0, 416(, %s11)
; CHECK-NEXT: or %s11, 0, %s9
ret i32 %s0
}

define fastcc float @stack_stack_arg_i32f32_r9(i32 %p0, float %p1, i32 %p2, float %p3, i32 %p4, float %p5, i32 %p6, float %p7, i32 %s0, float %s1) {
; CHECK-LABEL: stack_stack_arg_i32f32_r9:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: ldu %s0, 428(, %s11)
; CHECK-NEXT: or %s11, 0, %s9
ret float %s1
}

; Vector argument passing (fastcc feature)

; v0-to-v0 passthrough case without vreg copy.
define fastcc <256 x i32> @vreg_arg_v256i32_r0(<256 x i32> %p0) {
; CHECK-LABEL: vreg_arg_v256i32_r0:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p0
}

define fastcc <256 x i32> @vreg_arg_v256i32_r1(<256 x i32> %p0, <256 x i32> %p1) {
; CHECK-LABEL: vreg_arg_v256i32_r1:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: lea %s16, 256
; CHECK-NEXT: lvl %s16
; CHECK-NEXT: vor %v0, (0)1, %v1
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p1
}

define fastcc <256 x i32> @vreg_arg_v256i32_r2(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2) {
; CHECK-LABEL: vreg_arg_v256i32_r2:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: lea %s16, 256
; CHECK-NEXT: lvl %s16
; CHECK-NEXT: vor %v0, (0)1, %v2
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p2
}

define fastcc <256 x i32> @vreg_arg_v256i32_r3(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3) {
; CHECK-LABEL: vreg_arg_v256i32_r3:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: lea %s16, 256
; CHECK-NEXT: lvl %s16
; CHECK-NEXT: vor %v0, (0)1, %v3
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p3
}

define fastcc <256 x i32> @vreg_arg_v256i32_r4(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4) {
; CHECK-LABEL: vreg_arg_v256i32_r4:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: lea %s16, 256
; CHECK-NEXT: lvl %s16
; CHECK-NEXT: vor %v0, (0)1, %v4
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p4
}

define fastcc <256 x i32> @vreg_arg_v256i32_r5(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5) {
; CHECK-LABEL: vreg_arg_v256i32_r5:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: lea %s16, 256
; CHECK-NEXT: lvl %s16
; CHECK-NEXT: vor %v0, (0)1, %v5
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p5
}

define fastcc <256 x i32> @vreg_arg_v256i32_r6(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6) {
; CHECK-LABEL: vreg_arg_v256i32_r6:
; CHECK: .LBB{{[0-9]+}}_2:
; CHECK-NEXT: lea %s16, 256
; CHECK-NEXT: lvl %s16
; CHECK-NEXT: vor %v0, (0)1, %v6
; CHECK-NEXT: or %s11, 0, %s9
ret <256 x i32> %p6
}

; TODO: Uncomment test when vector loads are upstream (vreg stack passing).
; define <256 x i32> @vreg_arg_v256i32_r7(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p7) {
; ret <256 x i32> %p7
; }

; define <256 x i32> @vreg_arg_v256i32_r8(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p7, <256 x i32> %p8) {
; ret <256 x i32> %p8
; }
