-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NFC][AArch64] Tests for guarding unrolling with scalable vec ins/ext #81132
[NFC][AArch64] Tests for guarding unrolling with scalable vec ins/ext #81132
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms Author: Graham Hunter (huntergr-arm) ChangesFull diff: https://github.com/llvm/llvm-project/pull/81132.diff 1 Files Affected:
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
new file mode 100644
index 00000000000000..4e9885750008f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=128 %s | FileCheck %s -check-prefix=UNROLL-128
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck %s -check-prefix=UNROLL-256
+
+;; This test contains IR similar to what would be generated when SVE ACLE
+;; routines are used with fixed-width vector types -- lots of subvector inserts
+;; and extracts that are effectively just bitcasts since the types are the
+;; same at a given SVE bit size. We want to make sure that they are not a
+;; barrier to unrolling simple loops with a fixed trip count which could be
+;; further optimized.
+
+define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c, ptr noalias %d) {
+; UNROLL-128-LABEL: define void @test_ins_ext_cost(
+; UNROLL-128-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-128-NEXT: entry:
+; UNROLL-128-NEXT: br label [[FOR_BODY:%.*]]
+; UNROLL-128: for.body:
+; UNROLL-128-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-128-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-128-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-128-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-128-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-128-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-128-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-128-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-128-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-128-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-128-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-128-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-128-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-128-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-128-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-128-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-128-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-128: exit:
+; UNROLL-128-NEXT: ret void
+;
+; UNROLL-256-LABEL: define void @test_ins_ext_cost(
+; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-256-NEXT: entry:
+; UNROLL-256-NEXT: br label [[FOR_BODY:%.*]]
+; UNROLL-256: for.body:
+; UNROLL-256-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-256-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-256-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-256-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-256-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-256-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-256-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-256-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-256-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-256-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-256-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-256-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-256-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-256-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-256-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-256-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-256-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-256: exit:
+; UNROLL-256-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %exit.cond = phi i1 [ true, %entry ], [ false, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ 1, %for.body ]
+ %gep.a = getelementptr inbounds <8 x float>, ptr %a, i64 %iv
+ %load.a = load <8 x float>, ptr %gep.a, align 16
+ %gep.b = getelementptr inbounds <8 x float>, ptr %b, i64 %iv
+ %load.b = load <8 x float>, ptr %gep.b, align 16
+ %gep.c = getelementptr inbounds <8 x float>, ptr %c, i64 %iv
+ %load.c = load <8 x float>, ptr %gep.c, align 16
+ %cast.scalable.b = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.b, i64 0)
+ %cast.scalable.c = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.c, i64 0)
+ %add = fadd <vscale x 4 x float> %cast.scalable.b, %cast.scalable.c
+ %cast.scalable.a = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.a, i64 0)
+ %mul = fmul <vscale x 4 x float> %cast.scalable.a, %add
+ %cast.fixed.d = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %mul, i64 0)
+ %gep.d = getelementptr inbounds <8 x float>, ptr %d, i64 0, i64 %iv
+ store <8 x float> %cast.fixed.d, ptr %gep.d, align 16
+ br i1 %exit.cond, label %for.body, label %exit
+
+exit:
+ ret void
+}
|
@@ -0,0 +1,87 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 | |||
; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=128 %s | FileCheck %s -check-prefix=UNROLL-128 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure about this runline. I used it to make sure <8 x float>
was not a legal type, but if 128b is the actual size of the vectors at runtime then we won't really be able to insert that much data. Should this be ignored, or be rejected by the verifier if we know that it may be a invalid operation?
I think I can write another test for a negative case, e.g. one with non-power-of-2 integers that would require promotion rather than being directly legal.
This work might need to be extended in future to cover cases where we would legalize both sides in the same way, but it's a start for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you test the cost of the intrinsics more directly, like the tests in llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll or llvm/test/Analysis/CostModel/AArch64/splice.ll?
@@ -30,6 +30,41 @@ declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> | |||
declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32>, i64) | |||
declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32>, <vscale x 4 x i32>, i64) | |||
|
|||
define void @vector_insert_extract_legal_idxzero_128b(<vscale x 4 x float> %v0, <4 x float> %v1, <vscale x 2 x double> %v2) #1 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be good to have some tests for inserting/extracting predicates here too, i.e.
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %v2, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void | ||
; | ||
%insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %v0, <4 x float> %v1, i64 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just a suggestion, but you can make the tests simpler by just passing in undef everywhere, i.e.
%insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
and remove the arguments passed in to the function. This is a trick used quite often for cost model tests.
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %v2, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void | ||
; | ||
%insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> %v0, <16 x i16> %v1, i64 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same comment here about tests using legal predicate vectors.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM (with nit addressed)
%insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0) | ||
%extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0) | ||
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Could you also add a test-case of extracting a fixed-length predicate from a scalable predicate, e.g. <8 x i1>
from a <vscale x 8 x i1>
?
No description provided.