diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 570675b590eae..cc76b3a229be0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9886,18 +9886,10 @@ Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags, Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { - if (Ops.size() == 3) { - Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); - llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb"); - - llvm::Value *VecNum = Ops[2]; - llvm::Value *MulVL = Builder.CreateMul(CntsbCall, VecNum, "mulvl"); - - Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL); - Ops[0] = Builder.CreateAdd( - Ops[0], Builder.CreateIntCast(VecNum, Int32Ty, true), "tileslice"); - Ops.erase(&Ops[2]); - } + if (Ops.size() == 2) + Ops.push_back(Builder.getInt32(0)); + else + Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true); Function *F = CGM.getIntrinsic(IntID, {}); return Builder.CreateCall(F, Ops); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index e85c47072f2df..9af0778e89c5e 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -6,86 +6,53 @@ #include -// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z18test_svldr_vnum_zajPKv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_vnum_za( +// CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: ret void // void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) { svldr_vnum_za(slice_base, ptr, 0); } -// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za_1( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z20test_svldr_vnum_za_1jPKv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: 
ret void +// CHECK-C-LABEL: @test_svldr_vnum_za_1( +// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) +// CHECK-NEXT: ret void // void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) { svldr_vnum_za(slice_base, ptr, 15); } -// CHECK-C-LABEL: define dso_local void @test_svldr_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z13test_svldr_zajPKv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_za( +// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: ret void // void test_svldr_za(uint32_t slice_base, const void *ptr) { svldr_za(slice_base, ptr); } -// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za_var( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z22test_svldr_vnum_za_varjPKvl( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) +// CHECK-NEXT: ret void // void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) { svldr_vnum_za(slice_base, ptr, vnum); } -//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -// CHECK: {{.*}} + +// CHECK-C-LABEL: @test_svldr_vnum_za_2( +// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_2jPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: ret void +// +void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) { + svldr_vnum_za(slice_base, ptr, 16); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index e53a3c6c57de3..baadfc18563a0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -6,86 +6,53 @@ #include -// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z18test_svstr_vnum_zajPv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svstr_vnum_za( +// CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: ret void // void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { svstr_vnum_za(slice_base, ptr, 0); } -// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_1( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z20test_svstr_vnum_za_1jPv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svstr_vnum_za_1( +// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) +// CHECK-NEXT: ret void // void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { svstr_vnum_za(slice_base, ptr, 15); } -// CHECK-C-LABEL: define dso_local void @test_svstr_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: 
ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z13test_svstr_zajPv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svstr_za( +// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: ret void // void test_svstr_za(uint32_t slice_base, void *ptr) { svstr_za(slice_base, ptr); } -// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_var( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z22test_svstr_vnum_za_varjPvl( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svstr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) +// CHECK-NEXT: ret void // void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) { svstr_vnum_za(slice_base, ptr, vnum); } -//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
-// CHECK: {{.*}}
+
+// CHECK-C-LABEL: @test_svstr_vnum_za_2(
+// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_2jPv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
+// CHECK-NEXT:    ret void
+//
+void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) {
+  svstr_vnum_za(slice_base, ptr, 16);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a42e2c49cb477..1b701a91455c9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2679,10 +2679,10 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic<llvm_nxv1i1_ty>;

   // Spill + fill
-  def int_aarch64_sme_ldr : DefaultAttrsIntrinsic<
-      [], [llvm_i32_ty, llvm_ptr_ty]>;
-  def int_aarch64_sme_str : DefaultAttrsIntrinsic<
-      [], [llvm_i32_ty, llvm_ptr_ty]>;
+  class SME_LDR_STR_ZA_Intrinsic
+      : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
+  def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic;
+  def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic;

   class SME_TileToVector_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -3454,4 +3454,9 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic;
   def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic;

+  class SME_LDR_STR_ZT_Intrinsic
+      : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>;
+  def int_aarch64_sme_ldr_zt : SME_LDR_STR_ZT_Intrinsic;
+  def int_aarch64_sme_str_zt : SME_LDR_STR_ZT_Intrinsic;
+
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fd4df07f04bfe..47c57edbd16cc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2406,6 +2406,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::FCMP)
     MAKE_CASE(AArch64ISD::STRICT_FCMP)
     MAKE_CASE(AArch64ISD::STRICT_FCMPE)
+    MAKE_CASE(AArch64ISD::SME_ZA_LDR)
+    MAKE_CASE(AArch64ISD::SME_ZA_STR)
     MAKE_CASE(AArch64ISD::DUP)
     MAKE_CASE(AArch64ISD::DUPLANE8)
     MAKE_CASE(AArch64ISD::DUPLANE16)
@@ -4830,6 +4832,90 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
                      Mask);
 }

+// Lower an SME LDR/STR ZA intrinsic
+// Case 1: If the vector number (vecnum) is an immediate in range, it gets
+// folded into the instruction
+//   ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
+// Case 2: If the vecnum is not an immediate, then it is used to modify the
+// base and tile slice registers
+//   ldr(%tileslice, %ptr, %vecnum)
+//   ->
+//   %svl = rdsvl
+//   %ptr2 = %ptr + %svl * %vecnum
+//   %tileslice2 = %tileslice + %vecnum
+//   ldr [%tileslice2, 0], [%ptr2, 0]
+// Case 3: If the vecnum is an immediate out of range, then the same is done
+// as case 2, but the base and slice registers are modified by the greatest
+// multiple of 16 not exceeding the vecnum, and the remainder is folded into
+// the instruction. This means that successive loads and stores that are
+// offset from each other can share the same base and slice register updates.
+//   ldr(%tileslice, %ptr, 22)
+//   ldr(%tileslice, %ptr, 23)
+//   ->
+//   %svl = rdsvl
+//   %ptr2 = %ptr + %svl * 16
+//   %tileslice2 = %tileslice + 16
+//   ldr [%tileslice2, 6], [%ptr2, 6]
+//   ldr [%tileslice2, 7], [%ptr2, 7]
+// Case 4: If the vecnum is an add of an immediate, then the non-immediate
+// operand is handled as in case 2 and the immediate is folded as in case 1.
+//   ldr(%tileslice, %ptr, %vecnum + 7)
+//   ldr(%tileslice, %ptr, %vecnum + 8)
+//   ->
+//   %svl = rdsvl
+//   %ptr2 = %ptr + %svl * %vecnum
+//   %tileslice2 = %tileslice + %vecnum
+//   ldr [%tileslice2, 7], [%ptr2, 7]
+//   ldr [%tileslice2, 8], [%ptr2, 8]
+// Case 5: If the vecnum is an add of an out-of-range immediate, the same
+// remainder logic as in case 3 is applied to the immediate operand.
+SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
+  SDLoc DL(N);
+
+  SDValue TileSlice = N->getOperand(2);
+  SDValue Base = N->getOperand(3);
+  SDValue VecNum = N->getOperand(4);
+  int32_t ConstAddend = 0;
+  SDValue VarAddend = VecNum;
+
+  // If the vnum is an add of an immediate, we can fold it into the instruction
+  if (VecNum.getOpcode() == ISD::ADD &&
+      isa<ConstantSDNode>(VecNum.getOperand(1))) {
+    ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
+    VarAddend = VecNum.getOperand(0);
+  } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
+    ConstAddend = ImmNode->getSExtValue();
+    VarAddend = SDValue();
+  }
+
+  int32_t ImmAddend = ConstAddend % 16;
+  if (int32_t C = (ConstAddend - ImmAddend)) {
+    SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
+    VarAddend = VarAddend
+                    ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
+                    : CVal;
+  }
+
+  if (VarAddend) {
+    // Get the vector length that will be multiplied by vnum
+    auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                           DAG.getConstant(1, DL, MVT::i32));
+
+    // Multiply SVL and vnum then add it to the base
+    SDValue Mul = DAG.getNode(
+        ISD::MUL, DL, MVT::i64,
+        {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
+    Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
+    // Just add vnum to the tileslice
+    TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
+  }
+
+  return DAG.getNode(IsLoad ?
AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, + DL, MVT::Other, + {/*Chain=*/N.getOperand(0), TileSlice, Base, + DAG.getTargetConstant(ImmAddend, DL, MVT::i32)}); +} + SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -4853,6 +4939,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); } + case Intrinsic::aarch64_sme_str: + case Intrinsic::aarch64_sme_ldr: { + return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr); + } case Intrinsic::aarch64_sme_za_enable: return DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f7d004fa3cbcc..2a039488f2a9a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -442,6 +442,10 @@ enum NodeType : unsigned { STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, + // SME ZA loads and stores + SME_ZA_LDR, + SME_ZA_STR, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 4f40fa538b0c3..6c9b1f11a4dec 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -33,6 +33,12 @@ def tileslicerange0s4 : ComplexPattern", []>; def am_sme_indexed_b4 :ComplexPattern", [], [SDNPWantRoot]>; +def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; +def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64SMEStr : SDNode<"AArch64ISD::SME_ZA_STR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; + //===----------------------------------------------------------------------===// // SME Pseudo Classes //===----------------------------------------------------------------------===// @@ -779,23 +785,23 @@ class sme_spill_inst : sme_spill_fill_base<0b1, (outs), (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; let mayLoad = 1 in class sme_fill_inst : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt), (ins MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; multiclass sme_spill { def NAME : sme_spill_inst; def : InstAlias(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; - // base - def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast(NAME) ZA, $idx, 0, $base, 0)>; + + def : Pat<(AArch64SMEStr (i32 MatrixIndexGPR32Op12_15:$slice), (i64 GPR64sp:$base), (i32 sme_elm_idx0_15:$imm)), + (!cast(NAME) ZA, MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base, imm32_0_15:$imm)>; } multiclass sme_fill { @@ -805,16 +811,15 @@ multiclass sme_fill { MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; def NAME # _PSEUDO : Pseudo<(outs), - (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + (ins MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm4, GPR64sp:$base), []>, Sched<[]> { // Translated to actual instruction in AArch64ISelLowering.cpp let usesCustomInserter = 1; let mayLoad = 1; } - // base - def : 
Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast(NAME # _PSEUDO) $idx, 0, $base)>; + def : Pat<(AArch64SMELdr MatrixIndexGPR32Op12_15:$slice, GPR64sp:$base, sme_elm_idx0_15:$imm), + (!cast(NAME # _PSEUDO) MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index c96aca366ed43..da764cf52445b 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -252,7 +252,7 @@ define void @ldr(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: ldr za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0) ret void; } @@ -264,7 +264,7 @@ define void @ldr_with_off_15(ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) ret void; } @@ -278,7 +278,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) ret void; } @@ -292,23 +292,205 @@ define void @ldr_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0) ret void; } +define void @ldr_with_off_var(ptr %base, i32 %off) { +; CHECK-LABEL: ldr_with_off_var: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w1, #16 +; CHECK-NEXT: madd x8, x9, x8, x0 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off) + ret void; +} + +define void @ldr_with_off_15imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_15imm: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, #16 // =0x10 +; CHECK-NEXT: ldr za[w12, 15], [x0, #15, mul vl] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15) + ret void; +} + +define void @ldr_with_off_16imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, #32 // =0x20 +; CHECK-NEXT: add x8, x0, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16) + ret void; +} + +define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: ldr za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void 
@ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_15_18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 15], [x1, #15, mul vl] +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + ret void +} + +define void @ldr_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_16_19: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 19) + ret void +} + +define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_31_34: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x9, x1, x8, lsl #4 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: ldr za[w12, 15], [x9, #15, mul vl] +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + ret void +} + +define void @ldr_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_imm_32_35: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 35) + ret void +} + +define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; 
CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + +define void @ldr_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var_high: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w2, #32 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: sxtw x9, w8 +; CHECK-NEXT: add w12, w0, w8 +; CHECK-NEXT: madd x9, x10, x9, x1 +; CHECK-NEXT: ldr za[w12, 1], [x9, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x9, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x9, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + %1 = add i32 %0, 33 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 34 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 35 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + %4 = add i32 %0, 36 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %4) + ret void +} + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. define void @test_ld1_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB14_1: // %for.body +; CHECK-NEXT: .LBB24_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0] ; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0] -; CHECK-NEXT: b.ne .LBB14_1 +; CHECK-NEXT: b.ne .LBB24_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -341,5 +523,5 @@ declare void @llvm.aarch64.sme.ld1w.vert(, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1d.vert(, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1q.vert(, ptr, i32, i32) -declare void @llvm.aarch64.sme.ldr(i32, ptr) +declare void @llvm.aarch64.sme.ldr(i32, ptr, i32) declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index 2bb9c3d05b9da..53e9b6300951c 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -252,7 +252,7 @@ define void @str(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: str za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.str(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0) ret void; } @@ -264,7 +264,7 @@ define void @str_with_off_15(ptr %ptr) { ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.str(i32 15, ptr %base) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) ret void; } @@ -278,7 +278,7 @@ define void @str_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 15, ptr %base) + call void 
@llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) ret void; } @@ -292,23 +292,210 @@ define void @str_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 16, ptr %base) + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 0) ret void; } +define void @str_with_off_var(ptr %base, i32 %off) { +; CHECK-LABEL: str_with_off_var: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w1, #16 +; CHECK-NEXT: madd x8, x9, x8, x0 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) + ret void; +} + +define void @str_with_off_15imm(ptr %ptr) { +; CHECK-LABEL: str_with_off_15imm: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, #15 // =0xf +; CHECK-NEXT: add x8, x0, #15 +; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl] +; CHECK-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 15 + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15) + ret void; +} + +define void @str_with_off_16imm(ptr %ptr) { +; CHECK-LABEL: str_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, #31 // =0x1f +; CHECK-NEXT: add x8, x0, x8, lsl #4 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 15 + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16) + ret void; +} + +define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: str za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_15_18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: str za[w12, 15], [x1, #15, mul vl] +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + ret void +} + +define void @str_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_16_19: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, 
i32 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 19) + ret void +} + +define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_31_34: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add w13, w0, #32 +; CHECK-NEXT: add x9, x1, x8, lsl #4 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: str za[w12, 15], [x9, #15, mul vl] +; CHECK-NEXT: str za[w13, 0], [x8] +; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 31) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + ret void +} + +define void @str_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_32_35: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 35) + ret void +} + +define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + +define void @str_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var_high: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w2, #32 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: sxtw x9, w8 +; CHECK-NEXT: add w12, w0, w8 +; CHECK-NEXT: madd x9, x10, x9, x1 +; CHECK-NEXT: str za[w12, 1], [x9, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x9, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x9, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + %1 = add i32 %0, 33 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 34 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 35 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr 
%ptr, i32 %3) + %4 = add i32 %0, 36 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %4) + ret void +} + + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. define void @test_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB14_1: // %for.body +; CHECK-NEXT: .LBB24_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0] ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0] -; CHECK-NEXT: b.ne .LBB14_1 +; CHECK-NEXT: b.ne .LBB24_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -340,5 +527,5 @@ declare void @llvm.aarch64.sme.st1w.vert(, ptr, i32, i32) declare void @llvm.aarch64.sme.st1d.vert(, ptr, i32, i32) declare void @llvm.aarch64.sme.st1q.vert(, ptr, i32, i32) -declare void @llvm.aarch64.sme.str(i32, ptr) +declare void @llvm.aarch64.sme.str(i32, ptr, i32) declare i64 @llvm.vscale.i64() diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td index bcf2466b13a73..b75918ebf2f6d 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td @@ -133,7 +133,8 @@ def LLVM_aarch64_sme_st1q_vert : ArmSME_IntrStoreOp<"st1q.vert">; def LLVM_aarch64_sme_str : ArmSME_IntrOp<"str">, Arguments<(ins Arg:$index, - Arg:$store_address)>; + Arg:$store_address, + Arg:$offset)>; // Vector to tile slice class LLVM_aarch64_sme_write diff --git a/mlir/test/Target/LLVMIR/arm-sme.mlir b/mlir/test/Target/LLVMIR/arm-sme.mlir index aa0389e888b60..767d89a75eec3 100644 --- a/mlir/test/Target/LLVMIR/arm-sme.mlir +++ b/mlir/test/Target/LLVMIR/arm-sme.mlir @@ -214,7 +214,7 @@ llvm.func @arm_sme_store(%nxv1i1 : vector<[1]xi1>, "arm_sme.intr.st1b.vert"(%nxv16i1, %ptr, %c0, %c0) : (vector<[16]xi1>, !llvm.ptr, i32, i32) -> () // CHECK: call void @llvm.aarch64.sme.str - "arm_sme.intr.str"(%c0, %ptr) : (i32, !llvm.ptr) -> () + "arm_sme.intr.str"(%c0, %ptr, %c0) : (i32, !llvm.ptr, i32) -> () llvm.return }
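
For context, a minimal C-level sketch of the pattern the new vnum operand is meant to improve. It is illustrative only: the function name is hypothetical, the header name and the SME streaming/ZA attributes and -march options used by the tests above are assumed, and exact codegen may differ. Consecutive svldr_vnum_za/svstr_vnum_za calls whose vnum values differ by small constants should now share a single rdsvl-based base and tile-slice update, with each call's constant folded into the ldr/str immediate, as described in cases 3-5 of the lowering comment and exercised by the *_many_imm and *_many_var tests.

#include <arm_sme.h>   // assumed header name; use the SME ACLE header the tests above include
#include <stdint.h>

// Hypothetical helper: load four consecutive ZA vectors starting at 'vnum'
// from 'src' and store them back out to 'dst'. Each svldr_vnum_za/svstr_vnum_za
// call maps to @llvm.aarch64.sme.ldr/@llvm.aarch64.sme.str with vnum passed
// through as the new third operand; ISel is then expected to emit one shared
// rdsvl/madd/add per group and fold the +0..+3 into the instruction immediates.
void copy_four_za_vectors(uint32_t slice_base, const void *src, void *dst,
                          int64_t vnum) {
  svldr_vnum_za(slice_base, src, vnum + 0);
  svstr_vnum_za(slice_base, dst, vnum + 0);
  svldr_vnum_za(slice_base, src, vnum + 1);
  svstr_vnum_za(slice_base, dst, vnum + 1);
  svldr_vnum_za(slice_base, src, vnum + 2);
  svstr_vnum_za(slice_base, dst, vnum + 2);
  svldr_vnum_za(slice_base, src, vnum + 3);
  svstr_vnum_za(slice_base, dst, vnum + 3);
}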