diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6442d24..9a804c12939c4 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
 static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
                                      AArch64::Z3, AArch64::Z4, AArch64::Z5,
                                      AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+                                     AArch64::P3};
 
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -59,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
   // CCAssignFn again we want it to behave as if all remaining registers are
   // allocated. This will force the code to pass the tuple indirectly in
   // accordance with the PCS.
-  bool RegsAllocated[8];
+  bool ZRegsAllocated[8];
   for (int I = 0; I < 8; I++) {
-    RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+    ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
     State.AllocateReg(ZRegList[I]);
   }
+  // The same applies to P registers.
+  bool PRegsAllocated[4];
+  for (int I = 0; I < 4; I++) {
+    PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+    State.AllocateReg(PRegList[I]);
+  }
 
   auto &It = PendingMembers[0];
   CCAssignFn *AssignFn =
@@ -79,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
   // Return the register state back to how it was before, leaving any
   // unallocated registers available for other smaller types.
   for (int I = 0; I < 8; I++)
-    if (!RegsAllocated[I])
+    if (!ZRegsAllocated[I])
       State.DeallocateReg(ZRegList[I]);
+  for (int I = 0; I < 4; I++)
+    if (!PRegsAllocated[I])
+      State.DeallocateReg(PRegList[I]);
 
   // All pending members have now been allocated
   PendingMembers.clear();
@@ -140,9 +151,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
     RegList = DRegList;
   else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
     RegList = QRegList;
-  else if (LocVT.isScalableVector())
-    RegList = ZRegList;
-  else {
+  else if (LocVT.isScalableVector()) {
+    // Scalable masks should be passed in predicate registers.
+    if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+        LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+        LocVT == MVT::aarch64svcount)
+      RegList = PRegList;
+    else
+      RegList = ZRegList;
+  } else {
     // Not an array we want to split up after all.
     return false;
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7344387ffe552..8ba12fea19bc7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7178,7 +7178,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
       unsigned NumParts = 1;
       if (Ins[i].Flags.isInConsecutiveRegs()) {
-        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
         while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
           ++NumParts;
       }
@@ -8175,7 +8174,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       uint64_t PartSize = StoreSize;
       unsigned NumParts = 1;
       if (Outs[i].Flags.isInConsecutiveRegs()) {
-        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
         while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
           ++NumParts;
         StoreSize *= NumParts;
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c92fa7e..8ce24ceb33d71 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,203 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<vscale x 16 x i1>
   ret <vscale x 16 x i1> %ret
 }
 
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_1xv16i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: RET_ReallyLR implicit $p0
+  %res = extractvalue [1 x <vscale x 16 x i1>] %arg2, 0
+  ret <vscale x 16 x i1> %res
+}
+
+; Test that arg1 is stored to the stack from p0; and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; x0 = stack_loc_for_args
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i1_4xv16i1([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK:   - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 16 x i1>) into %stack.0)
+; CHECK: [[STACK:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_1xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
+  ret <vscale x 16 x i1> %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3; and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK:   - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 16 x i1>))
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 16 x i1>))
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 16 x i1>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 16 x i1>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_4xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+  ret [4 x <vscale x 16 x i1>] %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg1, [2 x <vscale x 32 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_1xv16i1_2xv32i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  ret [2 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3; and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
+; CHECK: stack:
+; CHECK:   - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 16 x i1>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 16 x i1>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], killed [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 16 x i1>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 16 x i1>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_1xv16i1_2xv32i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 32 x i1>] %arg1)
+  ret [2 x <vscale x 32 x i1>] %res
+}
+
+; Test that arg1 and arg3 are passed via P0~P3; arg2 is passed indirectly through an address on the stack in x0
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
+; CHECK: name: callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1
+; CHECK: [[P3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[P2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[X0:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[P1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[P0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET3]]
+; CHECK: [[P7:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET2]]
+; CHECK: [[P6:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[X0]], killed [[OFFSET1]]
+; CHECK: [[P5:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[P4:%[0-9]+]]:ppr = LDR_PXI [[X0]], 0 :: (load (<vscale x 16 x i1>))
+; CHECK: [[RES0:%[0-9]+]]:ppr = AND_PPzPP [[P0]], [[P0]], killed [[P4]]
+; CHECK: [[RES1:%[0-9]+]]:ppr = AND_PPzPP [[P1]], [[P1]], killed [[P5]]
+; CHECK: [[RES2:%[0-9]+]]:ppr = AND_PPzPP [[P2]], [[P2]], killed [[P6]]
+; CHECK: [[RES3:%[0-9]+]]:ppr = AND_PPzPP [[P3]], [[P3]], killed [[P7]]
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  %p0 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 0
+  %p1 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 1
+  %p2 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 0
+  %p3 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 1
+  %p4 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 0
+  %p5 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 1
+  %p6 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 2
+  %p7 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 3
+  %r0 = and <vscale x 16 x i1> %p0, %p4
+  %r1 = and <vscale x 16 x i1> %p1, %p5
+  %r2 = and <vscale x 16 x i1> %p2, %p6
+  %r3 = and <vscale x 16 x i1> %p3, %p7
+  %1 = insertvalue [4 x <vscale x 16 x i1>] undef, <vscale x 16 x i1> %r0, 0
+  %2 = insertvalue [4 x <vscale x 16 x i1>] %1, <vscale x 16 x i1> %r1, 1
+  %3 = insertvalue [4 x <vscale x 16 x i1>] %2, <vscale x 16 x i1> %r2, 2
+  %4 = insertvalue [4 x <vscale x 16 x i1>] %3, <vscale x 16 x i1> %r3, 3
+  ret [4 x <vscale x 16 x i1>] %4
+}
+
 ; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
 ; i.e. x0 = %x0
 ;       :
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244f12be5..bfb750517cbf9 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,52 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
   ret <vscale x 4 x i1> %arg2
 }
 
+; Test that a scalable predicate argument of type [1 x <vscale x 4 x i1>] is properly assigned to a P register.
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  ret [1 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that up to two scalable predicate arguments of type [2 x <vscale x 4 x i1>] can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+  ret [2 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of type [1 x <vscale x 32 x i1>] is assigned to two P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+  ret [1 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of type [2 x <vscale x 32 x i1>] is assigned to four P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1) nounwind {
+  ret [2 x <vscale x 32 x i1>] %arg1
+}
+
 ; CHECK-LABEL: name: sve_signature_vec_caller
 ; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
 ; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +202,84 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <v
   ret <vscale x 4 x i1> %res
 }
 
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+  ret [1 x <vscale x 4 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+  ret [2 x <vscale x 4 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1_caller([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+  %res = call [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg2, [1 x <vscale x 32 x i1>] %arg1)
+  ret [1 x <vscale x 32 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1_caller
+; CHECK-DAG: [[ARG3:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG0:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG0]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-DAG: $p2 = COPY [[ARG2]]
+; CHECK-DAG: $p3 = COPY [[ARG3]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1_caller([2 x <vscale x 32 x i1>] %arg1) {
+  %res = call [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1)
+  ret [2 x <vscale x 32 x i1>] %res
+}
+
 ; Test that functions returning or taking SVE arguments use the correct
 ; callee-saved set when using the default C calling convention (as opposed
 ; to aarch64_sve_vector_pcs)
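
The save/allocate/restore dance that the first hunk extends from Z0-Z7 to P0-P3 is easy to miss in diff form: `finishStackBlock` temporarily marks every candidate register as allocated so the recursive `CCAssignFn` is forced onto the indirect path, then hands back only the registers that were actually free beforehand. Below is a minimal standalone C++ sketch of that pattern under stated assumptions: `AllocState` and `withRegsMarkedAllocated` are hypothetical stand-ins for `CCState` and the inline loops in `finishStackBlock`, not LLVM API.

```cpp
#include <array>
#include <bitset>
#include <cstddef>
#include <cstdio>

// Hypothetical, minimal model of CCState's register bookkeeping; only the
// operations the save/allocate/restore pattern relies on.
struct AllocState {
  std::bitset<32> Allocated; // one bit per physical register id
  bool isAllocated(unsigned R) const { return Allocated[R]; }
  void allocate(unsigned R) { Allocated.set(R); }
  void deallocate(unsigned R) { Allocated.reset(R); }
};

// Temporarily mark a whole register list as allocated while an assignment
// function runs, then restore exactly the registers that were free before.
// The patch applies this to PRegList[4] alongside the existing ZRegList[8].
template <std::size_t N>
void withRegsMarkedAllocated(AllocState &S,
                             const std::array<unsigned, N> &Regs,
                             void (*AssignFn)(AllocState &)) {
  std::array<bool, N> WasAllocated;
  for (std::size_t I = 0; I < N; ++I) {
    WasAllocated[I] = S.isAllocated(Regs[I]); // remember the prior state
    S.allocate(Regs[I]);                      // pretend nothing is left
  }
  AssignFn(S); // assignment now has to take the indirect/stack path
  for (std::size_t I = 0; I < N; ++I)
    if (!WasAllocated[I])
      S.deallocate(Regs[I]); // leave untouched registers free for later args
}

int main() {
  AllocState S;
  S.allocate(1); // pretend P1 was already taken by an earlier argument
  const std::array<unsigned, 4> PRegs = {0, 1, 2, 3};
  withRegsMarkedAllocated(S, PRegs,
                          [](AllocState &) { /* assign the tuple to stack */ });
  // P0 is free again for smaller later arguments; P1 stays taken.
  std::printf("P0 free: %d, P1 taken: %d\n", !S.isAllocated(0),
              S.isAllocated(1));
}
```

Restoring only the previously free registers is what lets a later, smaller predicate argument still land in an unused P register after a large tuple has been forced indirect, which is exactly the behavior the `caller_with_svepred_arg_*` tests above pin down.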