diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 2941b3677af81..8830ce33aecff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 6 ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -5,14 +6,41 @@ target triple = "aarch64-unknown-linux-gnu" ; Tests basic vectorization of scalable homogeneous struct literal returns. define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 -; CHECK: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_A]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_B]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-LABEL: define void @struct_return_f32_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP9]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP10]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -36,14 +64,41 @@ exit: } define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 -; CHECK: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_A]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_B]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-LABEL: define void @struct_return_f64_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP10]], ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -67,15 +122,59 @@ exit: } define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) -; CHECK: entry: -; CHECK: br label %vector.memcheck -; CHECK: vector.memcheck: -; CHECK: vector.body: -; CHECK: call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; CHECK: for.body: -; CHECK: call { float, float } @foo(float [[LOAD:%.*]]) +; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks( +; CHECK-SAME: ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IN3:%.*]] = ptrtoint ptr [[IN]] to i64 +; CHECK-NEXT: [[OUT_A2:%.*]] = ptrtoint ptr [[OUT_A]] to i64 +; CHECK-NEXT: [[OUT_B1:%.*]] = ptrtoint ptr [[OUT_B]] to i64 +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[OUT_B1]], [[OUT_A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[OUT_A2]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[OUT_B1]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 1024, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 1024, [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP16:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[TMP16]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[TMP16]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP17]], ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP18]], ptr [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index b721e9e489804..f2e2e2846614b 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 6 ; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s ; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s @@ -7,14 +8,30 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1 -; CHECK: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4 -; CHECK: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4 +; CHECK-LABEL: define void @struct_return_f32_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -39,14 +56,30 @@ exit: ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1 -; CHECK: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8 -; CHECK: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8 +; CHECK-LABEL: define void @struct_return_f64_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -71,14 +104,43 @@ exit: ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) -; CHECK: entry: -; CHECK: br label %vector.memcheck -; CHECK: vector.memcheck: -; CHECK: vector.body: -; CHECK: call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) -; CHECK: for.body: +; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks( +; CHECK-SAME: ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IN3:%.*]] = ptrtoint ptr [[IN]] to i32 +; CHECK-NEXT: [[OUT_A2:%.*]] = ptrtoint ptr [[OUT_A]] to i32 +; CHECK-NEXT: [[OUT_B1:%.*]] = ptrtoint ptr [[OUT_B]] to i32 +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[OUT_B1]], [[OUT_A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[OUT_A2]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i32 [[TMP1]], 8 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[OUT_B1]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i32 [[TMP2]], 8 +; CHECK-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; ; CHECK call { float, float } @foo(float [[LOAD:%.*]]) entry: br label %for.body @@ -105,9 +167,28 @@ exit: ; TODO: Allow mixed-struct type vectorization and mark overflow intrinsics as trivially vectorizable. ; CHECK-REMARKS: remark: {{.*}} loop not vectorized: call instruction cannot be vectorized define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @test_overflow_intrinsic -; CHECK-NOT: vector.body: -; CHECK-NOT: @llvm.sadd.with.overflow.v{{.+}}i32 +; CHECK-LABEL: define void @test_overflow_intrinsic( +; CHECK-SAME: ptr noalias readonly [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[IN_VAL]], i32 [[IN_VAL]]) +; CHECK-NEXT: [[EXTRACT_RET:%.*]] = extractvalue { i32, i1 } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[CALL]], 1 +; CHECK-NEXT: [[ZEXT_OVERFLOW:%.*]] = zext i1 [[EXTRACT_OVERFLOW]] to i8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[EXTRACT_RET]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store i8 [[ZEXT_OVERFLOW]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -133,9 +214,27 @@ exit: ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias writeonly %out_a) { -; CHECK-LABEL: define void @struct_return_i32_three_results_widen -; CHECK: vector.body: -; CHECK: call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD:%.*]]) +; CHECK-LABEL: define void @struct_return_i32_three_results_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -159,10 +258,50 @@ exit: ; (mainly it does not crash). ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @scalarized_predicated_struct_return(ptr %a) { -; CHECK-LABEL: define void @scalarized_predicated_struct_return -; CHECK: vector.body: -; CHECK: pred.store.if: -; CHECK: tail call { i64, i64 } @bar_i64(i64 {{.+}}) +; CHECK-LABEL: define void @scalarized_predicated_struct_return( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_IF1]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR4]] +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64 } [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_CONTINUE2]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -192,8 +331,27 @@ exit: ; Negative test. Widening structs of vectors is not supported. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_struct_of_vectors -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_of_vectors( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load <1 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { <1 x float>, <1 x float> } @foo(<1 x float> [[IN_VAL]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { <1 x float>, <1 x float> } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { <1 x float>, <1 x float> } [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store <1 x float> [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store <1 x float> [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -219,9 +377,27 @@ exit: ; Negative test. Widening structs with mixed element types is not supported. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_mixed_element_type_struct_return -; CHECK-NOT: vector.body: -; CHECK-NOT: call {{.*}} @fixed_vec_baz +; CHECK-LABEL: define void @negative_mixed_element_type_struct_return( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, i32 } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, i32 } [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store i32 [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -249,9 +425,27 @@ exit: ; Negative test. Widening non-literal structs is not supported. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_named_struct_return -; CHECK-NOT: vector.body: -; CHECK-NOT: call {{.*}} @fixed_vec_bar +; CHECK-LABEL: define void @negative_named_struct_return( +; CHECK-SAME: ptr noalias readonly [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store double [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store double [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -277,8 +471,28 @@ exit: ; Negative test. Nested homogeneous structs are not supported. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_nested_struct(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_nested_struct -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_nested_struct( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { { float, float } } @foo_nested_struct(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[EXTRACT_INNER:%.*]] = extractvalue { { float, float } } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[EXTRACT_INNER]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[EXTRACT_INNER]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -305,8 +519,24 @@ exit: ; Negative test. The second element of the struct cannot be widened. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_non_widenable_element(ptr noalias %in, ptr noalias writeonly %out_a) { -; CHECK-LABEL: define void @negative_non_widenable_element -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_non_widenable_element( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, [1 x float] } @foo_one_non_widenable_element(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, [1 x float] } [[CALL]], 0 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -329,8 +559,28 @@ exit: ; Negative test. Homogeneous structs of arrays are not supported. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_array_elements(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_struct_array_elements -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_array_elements( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { [2 x float] } @foo_arrays(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[EXTRACT_INNER:%.*]] = extractvalue { [2 x float] } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [2 x float] [[EXTRACT_INNER]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [2 x float] [[EXTRACT_INNER]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -357,8 +607,26 @@ exit: ; Negative test. Widening struct loads is not supported. ; CHECK-REMARKS: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_load(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_struct_load -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_load( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds { float, float }, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[CALL:%.*]] = load { float, float }, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -383,8 +651,23 @@ exit: ; Negative test. Widening struct stores is not supported. ; CHECK-REMARKS: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_return_store_struct(ptr noalias %in, ptr noalias writeonly %out) { -; CHECK-LABEL: define void @negative_struct_return_store_struct -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_return_store_struct( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds { float, float }, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, float } @foo(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds { float, float }, ptr [[OUT]], i64 [[IV]] +; CHECK-NEXT: store { float, float } [[CALL]], ptr [[OUT_PTR]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body