[AArch64][SVE] Add intrinsics to assembly mapping for svpmov #81861
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be notified. If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write permissions for the repository, in which case you can instead tag reviewers by name in a comment. If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR with a comment. If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-ir

Author: None (Lukacma)

Changes

This patch enables translation of the svpmov intrinsic to the correct assembly instruction, instead of a function call.

Full diff: https://github.com/llvm/llvm-project/pull/81861.diff

2 Files Affected:
- llvm/include/llvm/IR/IntrinsicsAArch64.td
- llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
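For context, here is a minimal standalone example in the style of the updated test file below. This is a sketch: the function name @pmov_example is illustrative, and the expected output assumes llc is run with SVE2.1 enabled, as in the test file.

; Before this patch, the intrinsic was incorrectly emitted as a call to a
; function named after the intrinsic, e.g.:
;   bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
; With this patch it selects directly to the SVE2.1 instruction:
;   pmov p0.b, z0
define <vscale x 16 x i1> @pmov_example(<vscale x 16 x i8> %zn) {
entry:
  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
  ret <vscale x 16 x i1> %res
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8>, i32)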
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 921e5b95ae03e8..d044bf6858376d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1367,6 +1367,17 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
llvm_i32_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+
+ class SVE2_1VectorArg_Pred_Intrinsic
+ : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
+ class SVE2_1VectorArgIndexed_Pred_Intrinsic
+ : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [llvm_anyvector_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
// NOTE: There is no relationship between these intrinsics beyond an attempt
// to reuse currently identical class definitions.
@@ -3610,15 +3621,10 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
//
// SVE2.1 - Move predicate to/from vector
//
-def int_aarch64_sve_pmov_to_pred_lane :
- DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
- [llvm_anyvector_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
+
-def int_aarch64_sve_pmov_to_pred_lane_zero :
- DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
- [llvm_anyvector_ty],
- [IntrNoMem]>;
+def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
def int_aarch64_sve_pmov_to_vector_lane_merging :
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
index 7cae1d2c216b61..a592dcd4b8ce99 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
@@ -4,12 +4,7 @@
define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov p0.b, z0
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
@@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: pmov p1.h, z0[0]
+; CHECK-NEXT: pmov p2.h, z0[1]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0)
@@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: pmov p1.s, z0[0]
+; CHECK-NEXT: pmov p2.s, z0[3]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0)
@@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: pmov p1.d, z0[0]
+; CHECK-NEXT: pmov p2.d, z0[7]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0)
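As a usage note on the lane-indexed form: the i32 lane operand carries ImmArg<ArgIndex<1>>, so the IR verifier requires it to be a compile-time constant. A minimal sketch (the function name @pmov_lane_example is illustrative; the expected instruction mirrors the z0[1] form checked in the i16 test above):

; The lane operand must be an immediate: ImmArg<ArgIndex<1>> in the
; intrinsic definition means a non-constant lane is rejected by the verifier.
; Expected to select to something like: pmov p0.h, z0[1]
define <vscale x 8 x i1> @pmov_lane_example(<vscale x 8 x i16> %zn) {
entry:
  %res = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 1)
  ret <vscale x 8 x i1> %res
}

declare <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16>, i32)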
Seems like I cannot add reviewers yet, so I am tagging you for review: @momchil-velikov @hassnaaHamdi @CarolineConcatto
Thank you, @Lukacma.
@Lukacma Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR. Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. If you don't get any reports, no action is required from you. Your changes are working as expected, well done!