diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 83dee09f94b99..5cc01a9f974b4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7307,7 +7307,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(

   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
-      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
+      TTI.getRegisterBitWidth(BestVF.isScalable()
+                                  ? TargetTransformInfo::RGK_ScalableVector
+                                  : TargetTransformInfo::RGK_FixedWidthVector));
   VPlanTransforms::removeDeadRecipes(BestVPlan);
   VPlanTransforms::convertToConcreteRecipes(BestVPlan);

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2423b95930319..7cef98f465715 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4177,8 +4177,9 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
 /// members both equal to \p VF. The interleave group must also access the full
 /// vector width \p VectorRegWidth.
 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
-                                         unsigned VF, VPTypeAnalysis &TypeInfo,
-                                         unsigned VectorRegWidth) {
+                                         ElementCount VF,
+                                         VPTypeAnalysis &TypeInfo,
+                                         TypeSize VectorRegWidth) {
   if (!InterleaveR || InterleaveR->getMask())
     return false;

@@ -4200,9 +4201,11 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
       return false;
   }

-  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
-  auto IG = InterleaveR->getInterleaveGroup();
-  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
+  unsigned VFMin = VF.getKnownMinValue();
+  TypeSize GroupSize = TypeSize::get(
+      GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
+  const auto *IG = InterleaveR->getInterleaveGroup();
+  return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
          GroupSize == VectorRegWidth;
 }

@@ -4268,14 +4271,13 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
 }

 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                             unsigned VectorRegWidth) {
+                                             TypeSize VectorRegWidth) {
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
   if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
     return;

   VPTypeAnalysis TypeInfo(Plan);
-  unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R))
       continue;
@@ -4310,7 +4312,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       continue;

     // Bail out on non-consecutive interleave groups.
-    if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
+    if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
                                       VectorRegWidth))
       return;

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34850743e7b62..5e67d8fd2a0eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -349,7 +349,7 @@ struct VPlanTransforms {
   /// form of loop-aware SLP, where we use interleave groups to identify
   /// candidates.
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                     unsigned VectorRegWidth);
+                                     TypeSize VectorRegWidth);

   /// Predicate and linearize the control-flow in the only loop region of
   /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
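The functional core of the change is the comparison `GroupSize == VectorRegWidth`: it now compares `TypeSize` values, which are equal only when both the known-minimum bit width and the scalable flag agree, so a scalable interleave group can no longer be matched against a fixed register width (or vice versa). Below is a minimal standalone sketch of that semantics, using only `llvm/Support/TypeSize.h`; the helper name and arguments are illustrative and not part of the patch:

```cpp
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Mirrors the check in isConsecutiveInterleaveGroup: build the group size
// with the same scalability as VF, then compare against the register width.
static bool groupFillsRegister(ElementCount VF, unsigned ScalarBits,
                               TypeSize VectorRegWidth) {
  unsigned VFMin = VF.getKnownMinValue();
  // TypeSize equality requires both the known-minimum value and the
  // scalable flag to match.
  TypeSize GroupSize = TypeSize::get(ScalarBits * VFMin, VF.isScalable());
  return GroupSize == VectorRegWidth;
}

// groupFillsRegister(ElementCount::getScalable(2), 64,
//                    TypeSize::getScalable(128)) -> true
// groupFillsRegister(ElementCount::getScalable(2), 64,
//                    TypeSize::getFixed(128))    -> false
```

The second case is why the `executePlan` hunk above must query `RGK_ScalableVector` for scalable VFs: under the new comparison, a fixed-width register width can never match a scalable group size.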
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 0000000000000..d4d7d398185a1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S %s | FileCheck -check-prefix=CHECK %s
+; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S %s -prefer-predicate-over-epilogue=scalar-epilogue | FileCheck -check-prefix=EPILOGUE %s
+
+define void @load_store_interleave_group(ptr noalias %data) {
+; CHECK-LABEL: define void @load_store_interleave_group(
+; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[EVL_BASED_IV]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP0]], 2
+; CHECK-NEXT:    [[WIDE_VP_LOAD:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP2]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VP_LOAD]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP0]], 2
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP2]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+; EPILOGUE-LABEL: define void @load_store_interleave_group(
+; EPILOGUE-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
+; EPILOGUE-NEXT:  [[ENTRY:.*]]:
+; EPILOGUE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EPILOGUE-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
+; EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
+; EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EPILOGUE:       [[VECTOR_PH]]:
+; EPILOGUE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EPILOGUE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
+; EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; EPILOGUE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EPILOGUE:       [[VECTOR_BODY]]:
+; EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EPILOGUE-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
+; EPILOGUE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; EPILOGUE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8
+; EPILOGUE-NEXT:    store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8
+; EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EPILOGUE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EPILOGUE-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EPILOGUE:       [[MIDDLE_BLOCK]]:
+; EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; EPILOGUE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; EPILOGUE:       [[SCALAR_PH]]:
+; EPILOGUE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; EPILOGUE-NEXT:    br label %[[LOOP:.*]]
+; EPILOGUE:       [[LOOP]]:
+; EPILOGUE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EPILOGUE-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; EPILOGUE-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]]
+; EPILOGUE-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; EPILOGUE-NEXT:    store i64 [[L_0]], ptr [[DATA_0]], align 8
+; EPILOGUE-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; EPILOGUE-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]]
+; EPILOGUE-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; EPILOGUE-NEXT:    store i64 [[L_1]], ptr [[DATA_1]], align 8
+; EPILOGUE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EPILOGUE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; EPILOGUE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; EPILOGUE:       [[EXIT]]:
+; EPILOGUE-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul.2 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+  %l.0 = load i64, ptr %data.0, align 8
+  store i64 %l.0, ptr %data.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+  %l.1 = load i64, ptr %data.1, align 8
+  store i64 %l.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
+define void @load_store_interleave_group_i32(ptr noalias %data) {
+; CHECK-LABEL: define void @load_store_interleave_group_i32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[EVL_BASED_IV]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP0]], 4
+; CHECK-NEXT:    [[WIDE_VP_LOAD:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr align 8 [[TMP2]], <vscale x 16 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VP_LOAD]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP0]], 4
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[INTERLEAVED_VEC]], ptr align 8 [[TMP2]], <vscale x 16 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+; EPILOGUE-LABEL: define void @load_store_interleave_group_i32(
+; EPILOGUE-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0]] {
+; EPILOGUE-NEXT:  [[ENTRY:.*]]:
+; EPILOGUE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EPILOGUE-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
+; EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EPILOGUE:       [[VECTOR_PH]]:
+; EPILOGUE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EPILOGUE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
+; EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; EPILOGUE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EPILOGUE:       [[VECTOR_BODY]]:
+; EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EPILOGUE-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 2
+; EPILOGUE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP4]]
+; EPILOGUE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 8
+; EPILOGUE-NEXT:    store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP5]], align 8
+; EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EPILOGUE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EPILOGUE-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EPILOGUE:       [[MIDDLE_BLOCK]]:
+; EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; EPILOGUE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; EPILOGUE:       [[SCALAR_PH]]:
+; EPILOGUE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; EPILOGUE-NEXT:    br label %[[LOOP:.*]]
+; EPILOGUE:       [[LOOP]]:
+; EPILOGUE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EPILOGUE-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 2
+; EPILOGUE-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[MUL_2]]
+; EPILOGUE-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; EPILOGUE-NEXT:    store i32 [[L_0]], ptr [[DATA_0]], align 8
+; EPILOGUE-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; EPILOGUE-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_1]]
+; EPILOGUE-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; EPILOGUE-NEXT:    store i32 [[L_1]], ptr [[DATA_1]], align 8
+; EPILOGUE-NEXT:    [[ADD_2:%.*]] = add i64 [[MUL_2]], 2
+; EPILOGUE-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_2]]
+; EPILOGUE-NEXT:    [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; EPILOGUE-NEXT:    store i32 [[L_2]], ptr [[DATA_2]], align 8
+; EPILOGUE-NEXT:    [[ADD_3:%.*]] = add i64 [[MUL_2]], 3
+; EPILOGUE-NEXT:    [[DATA_3:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_3]]
+; EPILOGUE-NEXT:    [[L_3:%.*]] = load i32, ptr [[DATA_3]], align 8
+; EPILOGUE-NEXT:    store i32 [[L_3]], ptr [[DATA_3]], align 8
+; EPILOGUE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EPILOGUE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; EPILOGUE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; EPILOGUE:       [[EXIT]]:
+; EPILOGUE-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul.4 = shl nsw i64 %iv, 2
+  %data.0 = getelementptr inbounds i32, ptr %data, i64 %mul.4
+  %l.0 = load i32, ptr %data.0, align 8
+  store i32 %l.0, ptr %data.0, align 8
+  %add.1 = or disjoint i64 %mul.4, 1
+  %data.1 = getelementptr inbounds i32, ptr %data, i64 %add.1
+  %l.1 = load i32, ptr %data.1, align 8
+  store i32 %l.1, ptr %data.1, align 8
+  %add.2 = add i64 %mul.4, 2
+  %data.2 = getelementptr inbounds i32, ptr %data, i64 %add.2
+  %l.2 = load i32, ptr %data.2, align 8
+  store i32 %l.2, ptr %data.2, align 8
+  %add.3 = add i64 %mul.4, 3
+  %data.3 = getelementptr inbounds i32, ptr %data, i64 %add.3
+  %l.3 = load i32, ptr %data.3, align 8
+  store i32 %l.3, ptr %data.3, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
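For reference, the call-site selection from the first hunk, restated outside the diff context; the wrapper function is purely illustrative, while `TTI`, `BestVF`, and the `getRegisterBitWidth` query are exactly what `LoopVectorizationPlanner::executePlan` uses:

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Pick the register-kind query that matches the scalability of the chosen
// VF, as the LoopVectorize.cpp hunk does before calling
// VPlanTransforms::narrowInterleaveGroups.
static TypeSize regWidthForVF(const TargetTransformInfo &TTI,
                              ElementCount BestVF) {
  return TTI.getRegisterBitWidth(
      BestVF.isScalable() ? TargetTransformInfo::RGK_ScalableVector
                          : TargetTransformInfo::RGK_FixedWidthVector);
}
```

On the riscv64 `+v` RUN lines above, the scalable query evidently reports a vscale x 128-bit register: that is exactly the width the narrowed `<vscale x 2 x i64>` (2 x 64-bit) and `<vscale x 4 x i32>` (4 x 32-bit) accesses in the EPILOGUE checks occupy, while the tail-folded EVL runs keep their deinterleave/interleave groups.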