diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 22a3c2e6d9bfe..d6755a2331c39 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -814,6 +814,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [llvm_anyvector_ty, llvm_i32_ty],
                 [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
+  class AdvSIMD_ManyVec_PredLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
+                [IntrReadMem, IntrArgMemOnly]>;
+
   class AdvSIMD_1Vec_PredLoad_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1346,6 +1350,10 @@ def int_aarch64_sve_tuple_set : AdvSIMD_SVE_Set_Vector_Tuple;
 
 def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
+def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
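The new AdvSIMD_ManyVec_PredLoad_Intrinsic class gives llvm.aarch64.sve.ld2/ld3/ld4 the shape (predicate, pointer) -> tuple vector, with the tuple, predicate and pointer types all overloaded. As a minimal sketch, not part of this patch (the helper name is hypothetical, and it assumes a tree where ScalableVectorType and Intrinsic::getDeclaration are available), this is how a pass could emit the LD3 form that the lowering code further down uses as its worked example:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emits:
//   %res = call <vscale x 12 x i32>
//              @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(
//                  <vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr)
static Value *emitSVELd3(IRBuilder<> &B, Module &M, Value *Pred, Value *Addr) {
  // The LD3 tuple is three <vscale x 4 x i32> parts, i.e. <vscale x 12 x i32>.
  auto *TupleTy = ScalableVectorType::get(B.getInt32Ty(), 12);
  // Overloaded types in declaration order: result tuple, predicate, pointer.
  Function *Ld3 = Intrinsic::getDeclaration(
      &M, Intrinsic::aarch64_sve_ld3,
      {TupleTy, Pred->getType(), Addr->getType()});
  return B.CreateCall(Ld3, {Pred, Addr});
}

The overloaded types appear in the mangled name in declaration order, which is where the .nxv12i32.nxv4i1.p0nxv4i32 style suffixes in the test declarations at the end of this patch come from.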
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ed297d338855a..3e4dd878120c0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -245,6 +245,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
                  unsigned SubRegIdx);
   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
@@ -1441,6 +1442,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
   return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
 }
 
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+                                               const unsigned Opc) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SDValue Ops[] = {N->getOperand(1), // Predicate
+                   N->getOperand(2), // Memory operand
+                   CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
+  SDValue SuperReg = SDValue(Load, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+                                   AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  // Copy chain
+  unsigned ChainIdx = NumVecs;
+  ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                       unsigned Opc) {
   SDLoc dl(N);
@@ -4603,6 +4628,54 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case AArch64ISD::SVE_LD2: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD3: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD4: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+      return;
+    }
+    break;
+  }
   }
 
   // Select the default instruction
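The three new cases in Select() above differ only in the vector count and in the opcode chosen per element type. Purely as an illustration of that structure (this helper is hypothetical and not part of the patch; it is written as if it lived in AArch64ISelDAGToDAG.cpp, where the referenced opcode and MVT names are in scope), the same dispatch can be expressed as a single table:

// Hypothetical helper, equivalent in effect to the switch in Select() above.
// Returns 0 for an unsupported element type so a caller could fall back to
// default selection.
static unsigned getSVEStructLoadOpcode(unsigned NumVecs, MVT VT) {
  assert(NumVecs >= 2 && NumVecs <= 4 && "expected LD2/LD3/LD4");
  struct OpcEntry {
    MVT::SimpleValueType IntVT, FPVT; // Integer and FP form of one element size.
    unsigned Opc[3];                  // Indexed by NumVecs - 2.
  };
  static const OpcEntry Table[] = {
      {MVT::nxv16i8, MVT::nxv16i8,
       {AArch64::LD2B_IMM, AArch64::LD3B_IMM, AArch64::LD4B_IMM}},
      {MVT::nxv8i16, MVT::nxv8f16,
       {AArch64::LD2H_IMM, AArch64::LD3H_IMM, AArch64::LD4H_IMM}},
      {MVT::nxv4i32, MVT::nxv4f32,
       {AArch64::LD2W_IMM, AArch64::LD3W_IMM, AArch64::LD4W_IMM}},
      {MVT::nxv2i64, MVT::nxv2f64,
       {AArch64::LD2D_IMM, AArch64::LD3D_IMM, AArch64::LD4D_IMM}}};
  for (const OpcEntry &E : Table)
    if (VT.SimpleTy == E.IntVT || VT.SimpleTy == E.FPVT)
      return E.Opc[NumVecs - 2];
  return 0;
}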
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 43a85d510d6a7..854c94638877a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1467,6 +1467,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::LDFF1S: return "AArch64ISD::LDFF1S";
   case AArch64ISD::LD1RQ: return "AArch64ISD::LD1RQ";
   case AArch64ISD::LD1RO: return "AArch64ISD::LD1RO";
+  case AArch64ISD::SVE_LD2: return "AArch64ISD::SVE_LD2";
+  case AArch64ISD::SVE_LD3: return "AArch64ISD::SVE_LD3";
+  case AArch64ISD::SVE_LD4: return "AArch64ISD::SVE_LD4";
   case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
@@ -9796,6 +9799,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   return true;
 }
 
+// Lower an SVE structured load intrinsic returning a tuple type to target
+// specific intrinsic taking the same input but returning a multi-result value
+// of the split tuple type.
+//
+// E.g. Lowering an LD3:
+//
+//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+//                                                    <vscale x 4 x i1> %pred,
+//                                                    <vscale x 4 x i32>* %addr)
+//
+//  Output DAG:
+//
+//    t0: ch = EntryToken
+//        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+//        t4: i64,ch = CopyFromReg t0, Register:i64 %1
+//    t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+//    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called pre-legalization to avoid widening/splitting issues with
+// non-power-of-2 tuple types used for LD3, such as nxv12i32.
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
+                                                  ArrayRef<SDValue> LoadOps,
+                                                  EVT VT, SelectionDAG &DAG,
+                                                  const SDLoc &DL) const {
+  assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+  unsigned N, Opcode;
+  static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
+      {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2}},
+      {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3}},
+      {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4}}};
+
+  std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+  assert(VT.getVectorElementCount().Min % N == 0 &&
+         "invalid tuple vector type!");
+
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                 VT.getVectorElementCount() / N);
+  assert(isTypeLegal(SplitVT));
+
+  SmallVector<EVT, 5> VTs(N, SplitVT);
+  VTs.push_back(MVT::Other); // Chain
+  SDVTList NodeTys = DAG.getVTList(VTs);
+
+  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+  SmallVector<SDValue, 4> PseudoLoadOps;
+  for (unsigned I = 0; I < N; ++I)
+    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
+}
 
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
@@ -13728,6 +13781,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
     return DAG.getMergeValues({Concat, Chain}, DL);
   }
+  case Intrinsic::aarch64_sve_ld2:
+  case Intrinsic::aarch64_sve_ld3:
+  case Intrinsic::aarch64_sve_ld4: {
+    SDLoc DL(N);
+    SDValue Chain = N->getOperand(0);
+    SDValue Mask = N->getOperand(2);
+    SDValue BasePtr = N->getOperand(3);
+    SDValue LoadOps[] = {Chain, Mask, BasePtr};
+    unsigned IntrinsicID =
+        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    SDValue Result =
+        LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
+    return DAG.getMergeValues({Result, Chain}, DL);
+  }
   default:
     break;
   }
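One remark on LowerSVEStructLoad above: the function-local static std::map is constructed lazily at run time on first call. The same Intrinsic -> (NumVecs, ISD opcode) mapping can also be written as a switch; the sketch below is hypothetical, not part of the patch, and is shown only to make that mapping explicit (it assumes the usual in-file context, i.e. llvm_unreachable and the AArch64ISD and Intrinsic enums are in scope):

// Hypothetical equivalent of the IntrinsicMap lookup in LowerSVEStructLoad.
static std::pair<unsigned, unsigned> getStructLoadInfo(unsigned Intrinsic) {
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_ld2:
    return {2, AArch64ISD::SVE_LD2};
  case Intrinsic::aarch64_sve_ld3:
    return {3, AArch64ISD::SVE_LD3};
  case Intrinsic::aarch64_sve_ld4:
    return {4, AArch64ISD::SVE_LD4};
  default:
    llvm_unreachable("unexpected SVE structured load intrinsic");
  }
}

Either way, the DAG-combine hook above stays a thin wrapper: it unpacks the INTRINSIC_W_CHAIN operands (chain, intrinsic ID, predicate, base pointer) and merges the concatenated result back with the original chain.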
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e42c0b6e05b7a..2a68220b62837 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -255,6 +255,11 @@ enum NodeType : unsigned {
   LD1RQ,
   LD1RO,
 
+  // Structured loads.
+  SVE_LD2,
+  SVE_LD3,
+  SVE_LD4,
+
   // Unsigned gather loads.
   GLD1,
   GLD1_SCALED,
@@ -835,6 +840,8 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                          SDValue &Size,
                                          SelectionDAG &DAG) const;
+  SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
+                             EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index 74717d393f550..1244782bd56bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
 ;
 ; LD1RQB
 ;
@@ -252,6 +252,244 @@ define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr)
   ret <vscale x 2 x double> %res
 }
 
+;
+; LD2B
+;
+
+define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8:
+; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+                                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 32 x i8> %res
+}
+
+;
+; LD2H
+;
+
+define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+                                                                                  <vscale x 8 x i16>* %addr)
+  ret <vscale x 16 x i16> %res
+}
+
+define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+                                                                                   <vscale x 8 x half>* %addr)
+  ret <vscale x 16 x half> %res
+}
+
+;
+; LD2W
+;
+
+define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld2w_i32:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+                                                                                <vscale x 4 x i32>* %addr)
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld2w_f32:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+                                                                                  <vscale x 4 x float>* %addr)
+  ret <vscale x 8 x float> %res
+}
+
+;
+; LD2D
+;
+
+define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld2d_i64:
+; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+                                                                                <vscale x 2 x i64>* %addr)
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld2d_f64:
+; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+                                                                                   <vscale x 2 x double>* %addr)
+  ret <vscale x 4 x double> %res
+}
+
+;
+; LD3B
+;
+
+define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld3b_i8:
+; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+                                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 48 x i8> %res
+}
+
+;
+; LD3H
+;
+
+define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld3h_i16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+                                                                                  <vscale x 8 x i16>* %addr)
+  ret <vscale x 24 x i16> %res
+}
+
+define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld3h_f16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+                                                                                   <vscale x 8 x half>* %addr)
+  ret <vscale x 24 x half> %res
+}
+
+;
+; LD3W
+;
+
+define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld3w_i32:
+; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+                                                                                  <vscale x 4 x i32>* %addr)
+  ret <vscale x 12 x i32> %res
+}
+
+define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld3w_f32:
+; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+                                                                                    <vscale x 4 x float>* %addr)
+  ret <vscale x 12 x float> %res
+}
+
+;
+; LD3D
+;
+
+define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld3d_i64:
+; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+                                                                                <vscale x 2 x i64>* %addr)
+  ret <vscale x 6 x i64> %res
+}
+
+define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld3d_f64:
+; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+                                                                                   <vscale x 2 x double>* %addr)
+  ret <vscale x 6 x double> %res
+}
+
+;
+; LD4B
+;
+
+define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
+; CHECK-LABEL: ld4b_i8:
+; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred,
+                                                                                 <vscale x 16 x i8>* %addr)
+  ret <vscale x 64 x i8> %res
+}
+
+;
+; LD4H
+;
+
+define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
+; CHECK-LABEL: ld4h_i16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred,
+                                                                                  <vscale x 8 x i16>* %addr)
+  ret <vscale x 32 x i16> %res
+}
+
+define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
+; CHECK-LABEL: ld4h_f16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred,
+                                                                                   <vscale x 8 x half>* %addr)
+  ret <vscale x 32 x half> %res
+}
+
+;
+; LD4W
+;
+
+define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
+; CHECK-LABEL: ld4w_i32:
+; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred,
+                                                                                  <vscale x 4 x i32>* %addr)
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
+; CHECK-LABEL: ld4w_f32:
+; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred,
+                                                                                    <vscale x 4 x float>* %addr)
+  ret <vscale x 16 x float> %res
+}
+
+;
+; LD4D
+;
+
+define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
+; CHECK-LABEL: ld4d_i64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred,
+                                                                                <vscale x 2 x i64>* %addr)
+  ret <vscale x 8 x i64> %res
+}
+
+define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
+; CHECK-LABEL: ld4d_f64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred,
+                                                                                   <vscale x 2 x double>* %addr)
+  ret <vscale x 8 x double> %res
+}
+
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
@@ -267,3 +505,27 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
 declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)