diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index 20edd8b9eb8d0..6693b9b10a654 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux-gnu"
; the loop, preventing the gep (and consequently the loop induction
; update variable) from being classified as 'uniform'.

-define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
+define void @test_no_scalarization(ptr %a, i32 %idx, i32 %n) #0 {
; CHECK-LABEL: @test_no_scalarization(
; CHECK-NEXT: L.entry:
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[IDX:%.*]], 1
@@ -39,23 +39,21 @@ define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[A:%.*]], <vscale x 2 x i32> [[VEC_IND]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to double*
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <vscale x 2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP16]], align 8
-; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP18]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], <vscale x 2 x i32> [[VEC_IND]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x ptr> [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[DOTSPLAT2]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 [[TMP22]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 2
+; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP19]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x ptr> [[TMP12]], i32 [[TMP20]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[L_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -64,14 +62,13 @@ define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
; CHECK: L.LoopBody:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[L_LOOPBODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INDVAR_NEXT]] = add nsw i32 [[INDVAR]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, i64* [[A]], i32 [[INDVAR]]
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast i64* [[TMP24]] to double*
-; CHECK-NEXT: [[TMP26:%.*]] = load double, double* [[TMP25]], align 8
-; CHECK-NEXT: [[TMP27:%.*]] = icmp slt i32 [[INDVAR_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[TMP27]], label [[L_LOOPBODY]], label [[L_EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i64, ptr [[A]], i32 [[INDVAR]]
+; CHECK-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP22]], align 8
+; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i32 [[INDVAR_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[L_LOOPBODY]], label [[L_EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: L.exit:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64* [ [[TMP24]], [[L_LOOPBODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store i64 1, i64* [[DOTLCSSA]], align 8
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP22]], [[L_LOOPBODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i64 1, ptr [[DOTLCSSA]], align 8
; CHECK-NEXT: ret void
;
L.entry:
@@ -80,14 +77,13 @@ L.entry:
L.LoopBody: ; preds = %L.LoopBody, %L.entry
 %indvar = phi i32 [ %indvar.next, %L.LoopBody ], [ %idx, %L.entry ]
 %indvar.next = add nsw i32 %indvar, 1
- %0 = getelementptr i64, i64* %a, i32 %indvar
- %1 = bitcast i64* %0 to double*
- %2 = load double, double* %1, align 8
- %3 = icmp slt i32 %indvar.next, %n
- br i1 %3, label %L.LoopBody, label %L.exit
+ %0 = getelementptr i64, ptr %a, i32 %indvar
+ %1 = load double, ptr %0, align 8
+ %2 = icmp slt i32 %indvar.next, %n
+ br i1 %2, label %L.LoopBody, label %L.exit

L.exit: ; preds = %L.LoopBody
- store i64 1, i64* %0, align 8
+ store i64 1, ptr %0, align 8
 ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
index 8e11a1c988247..52af3bfdd6034 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -2,7 +2,7 @@
; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=scalar-epilogue -mtriple aarch64-unknown-linux-gnu \
; RUN: -mattr=+sve -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s

-define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
+define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %cond, i64 %N){
; CHECK-LABEL: @cond_fadd(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -17,43 +17,41 @@ define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias noc
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
-; CHECK-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = select fast <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP10]])
+; CHECK-NEXT: [[TMP12]] = fadd fast float [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[INDVARS]]
-; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 2.000000e+00
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[COND]], i64 [[INDVARS]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP16]], 2.000000e+00
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS]]
-; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP19]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS]]
+; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP17]]
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[RES]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
@@ -61,7 +59,7 @@ define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias noc
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RES_LCSSA]]
;
entry:
@@ -70,14 +68,14 @@ entry:
for.body:
 %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.inc ]
 %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ]
- %arrayidx = getelementptr inbounds float, float* %cond, i64 %indvars
- %0 = load float, float* %arrayidx
+ %arrayidx = getelementptr inbounds float, ptr %cond, i64 %indvars
+ %0 = load float, ptr %arrayidx
 %tobool = fcmp une float %0, 2.000000e+00
 br i1 %tobool, label %if.then, label %for.inc

if.then:
- %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars
- %1 = load float, float* %arrayidx2
+ %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars
+ %1 = load float, ptr %arrayidx2
 %fadd = fadd fast float %rdx, %1
 br label %for.inc

@@ -91,7 +89,7 @@ for.end:
 ret float %res
}

-define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
+define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) {
; CHECK-LABEL: @cond_cmp_sel(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -108,24 +106,22 @@ define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0x7FF0000000000000, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP12]])
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP13]], float [[VEC_PHI]]
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = select fast <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0x7FF0000000000000, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP10]])
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP11]], float [[VEC_PHI]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -136,15 +132,15 @@ define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 3.000000e+00
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[COND]], i64 [[IV]]
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP15]], 3.000000e+00
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[FCMP:%.*]] = fcmp fast olt float [[RDX]], [[TMP18]]
-; CHECK-NEXT: [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP18]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[FCMP:%.*]] = fcmp fast olt float [[RDX]], [[TMP16]]
+; CHECK-NEXT: [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP16]]
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FSEL]], [[IF_THEN]] ]
@@ -161,14 +157,14 @@ entry:
for.body:
 %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
 %rdx = phi float [ %res, %for.inc ], [ 1.000000e+00, %entry ]
- %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
- %0 = load float, float* %arrayidx
+ %arrayidx = getelementptr inbounds float, ptr %cond, i64 %iv
+ %0 = load float, ptr %arrayidx
 %tobool = fcmp une float %0, 3.000000e+00
 br i1 %tobool, label %if.then, label %for.inc

if.then:
- %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
- %1 = load float, float* %arrayidx2
+ %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %iv
+ %1 = load float, ptr %arrayidx2
 %fcmp = fcmp fast olt float %rdx, %1
 %fsel = select fast i1 %fcmp, float %rdx, float %1
 br label %for.inc
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index fbc436fc87d1d..bbeda2af2babb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -4,7 +4,7 @@
;
; In-loop integer and reduction
;
-define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
+define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-LABEL: @int_reduction_and(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
@@ -22,35 +22,33 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP19]] = and i64 [[TMP18]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD3]])
-; CHECK-NEXT: [[TMP21]] = and i64 [[TMP20]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD3]])
+; CHECK-NEXT: [[TMP19]] = and i64 [[TMP18]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = and i64 [[TMP21]], [[TMP19]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and i64 [[TMP19]], [[TMP17]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
@@ -64,36 +62,35 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]])
-; CHECK-NEXT: [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI8]]
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 
[ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]]) +; CHECK-NEXT: [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]] +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP30]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]] -; CHECK-NEXT: [[L3:%.*]] = load i64, i64* [[L2]], align 4 +; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[L3:%.*]] = load i64, ptr [[L2]], align 4 ; CHECK-NEXT: [[AND]] = and i64 [[RDX]], [[L3]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i64 [ [[AND]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[TMP30]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i64 [ [[AND]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[AND_LCSSA]] ; entry: @@ -102,8 +99,8 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] %rdx = phi i64 [ %and, %for.body ], [ 1, %entry ] - %l2 = getelementptr inbounds i64, i64* %a, i64 %iv - %l3 = load i64, i64* %l2 + %l2 = getelementptr inbounds i64, ptr %a, i64 %iv + %l3 = load i64, ptr %l2 %and = and i64 %rdx, %l3 %iv.next = add i64 %iv, 1 %exitcond = icmp eq i64 %iv.next, %N diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll index 18e5ccae271b1..82119bab1568d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -4,7 +4,7 @@
;
; Integer reduction with interleaving & a start value of 5
;
-define i64 @int_reduction_add(i64* %a, i64 %N) {
+define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-LABEL: @int_reduction_add(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
@@ -22,34 +22,32 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ insertelement (<vscale x 2 x i64> zeroinitializer, i64 5, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ insertelement (<vscale x 2 x i64> zeroinitializer, i64 5, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP17]], [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
@@ -57,43 +55,42 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[N]], 2
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
-; CHECK-NEXT: [[TMP29]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x i64> [ [[TMP22]], [[VEC_EPILOG_PH]] ], [ [[TMP26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4
+; CHECK-NEXT: [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: 
vec.epilog.middle.block: -; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP29]]) +; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP26]]) ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP32]], [[SUM]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add i64 [[TMP29]], [[SUM]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[ADD_LCSSA]] ; entry: @@ -102,8 +99,8 @@ entry: for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] %sum = phi i64 [ 5, %entry ], [ %add, %for.body ] - %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv - %0 = load i64, i64* %arrayidx + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv + %0 = load i64, ptr %arrayidx %add = add i64 %0, %sum %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, %N diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll index 6220034dcd788..1648a2f6bd1bf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll @@ -4,7 +4,7 @@ ; ; Strict fadd reduction with interleaving ; -define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { +define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) { ; CHECK-LABEL: @fadd_strict( ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 @@ -22,30 +22,28 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: 
[[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP18]], <vscale x 4 x float> [[WIDE_LOAD2]])
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP16]], <vscale x 4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -54,41 +52,40 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_MAIN_LOOP_ITER_CHECK]] 
], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[N]], [[N_MOD_VF3]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4 -; CHECK-NEXT: [[TMP27]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI7]], <2 x float> [[WIDE_LOAD8]]) -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[TMP24]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI7]], <2 x float> [[WIDE_LOAD8]]) +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; 
CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd float [[TMP29]], [[SUM_07]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = fadd float [[TMP26]], [[SUM_07]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[ADD_LCSSA]]
;
entry:
@@ -97,8 +94,8 @@ entry:
for.body:
 %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
 %sum.07 = phi float [ 0xFFFFFFFFE0000000, %entry ], [ %add, %for.body ]
- %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
- %0 = load float, float* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
 %add = fadd float %0, %sum.07
 %iv.next = add nuw nsw i64 %iv, 1
 %exitcond.not = icmp eq i64 %iv.next, %n
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index 50f1f7cdc1833..a110cb0e48498 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -3,7 +3,7 @@

target triple = "aarch64-unknown-linux-gnu"

-define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
+define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N) #0 {
; CHECK-LABEL: @inv_store_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -19,20 +19,19 @@ define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
-; CHECK-NEXT: store i16 [[TMP11]], i16* [[DST:%.*]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP9]]
+; CHECK-NEXT: store i16 [[TMP10]], ptr [[DST:%.*]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
@@ -41,9 +40,9 @@ define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N
; CHECK-NEXT: br label [[FOR_BODY14:%.*]]
; CHECK: for.body14:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY14]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LD:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT: store i16 [[LD]], i16* [[DST]], align 2
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[LD:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT: store i16 [[LD]], ptr [[DST]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -55,9 +54,9 @@ entry:

for.body14: ; preds = %for.body14, %entry
 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body14 ]
- %arrayidx = getelementptr inbounds i16, i16* %src, i64 %indvars.iv
- %ld = load i16, i16* %arrayidx
- store i16 %ld, i16* %dst, align 2
+ %arrayidx = getelementptr inbounds i16, ptr %src, i64 %indvars.iv
+ %ld = load i16, ptr %arrayidx
+ store i16 %ld, ptr %dst, align 2
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond.not = icmp eq i64 %indvars.iv.next, %N
 br i1 %exitcond.not, label %for.inc24, label %for.body14, !llvm.loop !0
@@ -67,7 +66,7 @@ for.inc24: ; preds = %for.body14, %for.bo
}


-define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i64 %N) #0 {
+define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64 %N) #0 {
; CHECK-LABEL: @cond_inv_store_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -79,23 +78,22 @@ define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -104,12 +102,12 @@ define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_09:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[I_09]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[I_09]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK: if.then:
-; CHECK-NEXT: store i32 [[TMP12]], i32* [[DST]], align 4
+; CHECK-NEXT: store i32 [[TMP11]], ptr [[DST]], align 4
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_09]], 1
@@ -123,13 +121,13 @@ entry:

for.body: ; preds = %entry, %for.inc
 %i.09 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, i32* %src, i64 %i.09
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %src, i64 %i.09
+ %0 = load i32, ptr %arrayidx, align 4
 %cmp1 = icmp sgt i32 %0, 0
 br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
- store i32 %0, i32* %dst, align 4
+ store i32 %0, ptr %dst, align 4
 br label %for.inc

for.inc: ; preds = %for.body, %if.then
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 4b0e6eea3a1fa..2c186cecf0806 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -30,14 +30,14 @@ target triple = "aarch64-unknown-linux-gnu"
; VPLANS-NEXT: No successors
; VPLANS-NEXT: }

-define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
@@ -48,35 +48,46 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body while.body: ; preds = %while.body, %entry %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ] - %gep = getelementptr i32, i32* %ptr, i64 %index - store i32 %val, i32* %gep + %gep = getelementptr i32, ptr %ptr, i64 %index + store i32 %val, ptr %gep %index.next = add nsw i64 %index, 1 %cmp10 = icmp ult i64 %index.next, %n br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index b6d6135ca13e3..000c338b93513 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -4,7 +4,7 @@ target triple = "aarch64-unknown-linux-gnu" -define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 { +define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-LABEL: @simple_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) @@ -30,17 +30,16 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 { ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr 
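The tail-folding pattern checked above repeats through the rest of this patch, so a compact model may help readers who do not work with FileCheck output daily. Below is a minimal C sketch of what the tail-folded vector body of @simple_memset computes; VL, the function name, and the scalar emulation of @llvm.get.active.lane.mask / @llvm.masked.store are illustrative stand-ins, not part of the test.

#include <stdint.h>

/* Model of the tail-folded memset: VL is a placeholder for the
   scalable factor (vscale x 4 lanes). Each vector iteration builds
   an active-lane mask (lane i is active iff index + i < n) and does
   a masked store, so no scalar epilogue is needed for the tail. */
enum { VL = 4 };

void simple_memset_model(int32_t val, int32_t *ptr, uint64_t n) {
  for (uint64_t index = 0; index < n; index += VL) {
    for (int lane = 0; lane < VL; ++lane) {
      int active = (index + (uint64_t)lane) < n; /* active lane mask */
      if (active)
        ptr[index + lane] = val; /* masked store leaves inactive lanes untouched */
    }
  }
}

Because the mask already covers the tail, the vector loop handles all iterations and the middle.block branch to the exit is the constant 'br i1 true' seen in the checks.
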
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index b6d6135ca13e3..000c338b93513 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -4,7 +4,7 @@
 target triple = "aarch64-unknown-linux-gnu"
 
-define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -30,17 +30,16 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -48,8 +47,8 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -61,8 +60,8 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep = getelementptr i32, i32* %ptr, i64 %index
- store i32 %val, i32* %gep
+ %gep = getelementptr i32, ptr %ptr, i64 %index
+ store i32 %val, ptr %gep
 %index.next = add nsw i64 %index, 1
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -72,7 +71,7 @@ while.end.loopexit: ; preds = %while.body
 }
 
-define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 {
+define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -89,15 +88,14 @@ define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]],
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]],
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -105,8 +103,8 @@ define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -118,8 +116,8 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep = getelementptr i32, i32* %ptr, i64 %index
- store i32 %val, i32* %gep
+ %gep = getelementptr i32, ptr %ptr, i64 %index
+ store i32 %val, ptr %gep
 %index.next = add nsw i64 %index, 1
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !3
@@ -129,7 +127,7 @@ while.end.loopexit: ; preds = %while.body
 }
 
-define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memcpy(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -153,21 +151,19 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP15]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -175,10 +171,10 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP2]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -190,10 +186,10 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep1 = getelementptr i32, i32* %src, i64 %index
- %val = load i32, i32* %gep1
- %gep2 = getelementptr i32, i32* %dst, i64 %index
- store i32 %val, i32* %gep2
+ %gep1 = getelementptr i32, ptr %src, i64 %index
+ %val = load i32, ptr %gep1
+ %gep2 = getelementptr i32, ptr %dst, i64 %index
+ store i32 %val, ptr %gep2
 %index.next = add nsw i64 %index, 1
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
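The @copy_stride4 test that follows differs from @simple_memcpy in one important way: the induction variable advances by 4, so consecutive vector lanes touch non-adjacent addresses and the vectorizer must use llvm.masked.gather/llvm.masked.scatter instead of contiguous masked loads and stores. A hedged C model of that loop is below; VL and the function name are illustrative only.

#include <stdint.h>

/* Model of the strided copy: lane addresses are index + 4*lane, so a
   vector iteration reads/writes VL non-contiguous elements, i.e. one
   gather and one scatter per iteration. VL stands in for vscale x 4. */
enum { VL = 4 };

void copy_stride4_model(int32_t *dst, const int32_t *src, uint64_t n) {
  for (uint64_t index = 0; index < n; index += 4 * VL) {
    for (int lane = 0; lane < VL; ++lane) {
      uint64_t i = index + 4 * (uint64_t)lane; /* per-lane address, stride 4 */
      if (i < n)                               /* active lane mask */
        dst[i] = src[i];                       /* one gather lane + one scatter lane */
    }
  }
}
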
@@ -203,7 +199,7 @@ while.end.loopexit: ; preds = %while.body
 }
 
-define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @copy_stride4(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 4)
@@ -240,10 +236,10 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP18]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC:%.*]], [[VEC_IND]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP18]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
@@ -259,10 +255,10 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP2]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -274,10 +270,10 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep1 = getelementptr i32, i32* %src, i64 %index
- %val = load i32, i32* %gep1
- %gep2 = getelementptr i32, i32* %dst, i64 %index
- store i32 %val, i32* %gep2
+ %gep1 = getelementptr i32, ptr %src, i64 %index
+ %val = load i32, ptr %gep1
+ %gep2 = getelementptr i32, ptr %dst, i64 %index
+ store i32 %val, ptr %gep2
 %index.next = add nsw i64 %index, 4
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -287,7 +283,7 @@ while.end.loopexit: ; preds = %while.body
 }
 
-define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {
+define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noalias %ind, i64 %n) #0 {
 ; CHECK-LABEL: @simple_gather_scatter(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -311,21 +307,20 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC:%.*]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP15]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -333,12 +328,12 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[IND]], i64 [[INDEX]]
-; CHECK-NEXT: [[IND_VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[SRC]], i32 [[IND_VAL]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP2]], align 4
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IND_VAL]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP3]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[IND]], i64 [[INDEX]]
+; CHECK-NEXT: [[IND_VAL:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IND_VAL]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IND_VAL]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP3]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -350,12 +345,12 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep1 = getelementptr i32, i32* %ind, i64 %index
- %ind_val = load i32, i32* %gep1
- %gep2 = getelementptr i32, i32* %src, i32 %ind_val
- %val = load i32, i32* %gep2
- %gep3 = getelementptr i32, i32* %dst, i32 %ind_val
- store i32 %val, i32* %gep3
+ %gep1 = getelementptr i32, ptr %ind, i64 %index
+ %ind_val = load i32, ptr %gep1
+ %gep2 = getelementptr i32, ptr %src, i32 %ind_val
+ %val = load i32, ptr %gep2
+ %gep3 = getelementptr i32, ptr %dst, i32 %ind_val
+ store i32 %val, ptr %gep3
 %index.next = add nsw i64 %index, 1
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -367,7 +362,7 @@ while.end.loopexit: ; preds = %while.body
 
 ; The original loop had an unconditional uniform load. Let's make sure
 ; we don't artificially create new predicated blocks for the load.
-define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
+define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
@@ -390,20 +385,19 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n)
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[BROADCAST_SPLAT]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
-; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -411,9 +405,9 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n)
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -426,9 +420,9 @@ entry:
 for.body: ; preds = %entry, %for.body
 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %val = load i32, i32* %src, align 4
- %arrayidx = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
- store i32 %val, i32* %arrayidx, align 4
+ %val = load i32, ptr %src, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+ store i32 %val, ptr %arrayidx, align 4
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond.not = icmp eq i64 %indvars.iv.next, %n
 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
@@ -442,7 +436,7 @@ for.end: ; preds = %for.body, %entry
 ; do need to perform conditional loads and so we end up using a gather instead.
 ; However, we at least ensure the mask is the overlap of the loop predicate
 ; and the original condition.
-define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
+define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr noalias readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
@@ -460,35 +454,33 @@ define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i3
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[SRC:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison)
-; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[PREDPHI]], * [[TMP20]], i32 4, [[TMP18]])
-; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP14]], poison)
+; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], zeroinitializer, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = or [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP18]], i32 4, [[TMP17]])
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[N]])
-; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -496,17 +488,17 @@ define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i3
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP25]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP23]], 0
 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
 ; CHECK: if.then:
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[SRC]], align 4
 ; CHECK-NEXT: br label [[IF_END]]
 ; CHECK: if.end:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP26]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP24]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -519,19 +511,19 @@ entry:
 for.body: ; preds = %entry, %if.end
 %index = phi i64 [ %index.next, %if.end ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %index
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %cond, i64 %index
+ %0 = load i32, ptr %arrayidx, align 4
 %tobool.not = icmp eq i32 %0, 0
 br i1 %tobool.not, label %if.end, label %if.then
 
 if.then: ; preds = %for.body
- %1 = load i32, i32* %src, align 4
+ %1 = load i32, ptr %src, align 4
 br label %if.end
 
 if.end: ; preds = %if.then, %for.body
 %val.0 = phi i32 [ %1, %if.then ], [ 0, %for.body ]
- %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 %index
- store i32 %val.0, i32* %arrayidx1, align 4
+ %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %index
+ store i32 %val.0, ptr %arrayidx1, align 4
 %index.next = add nuw i64 %index, 1
 %exitcond.not = icmp eq i64 %index.next, %n
 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
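The comment above @cond_uniform_load says the gather mask must be the overlap of the loop predicate and the original branch condition; the 'select [[ACTIVE_LANE_MASK]], ..., zeroinitializer' lines in its checks are exactly that AND. A hedged C model of the predication follows; VL and the names are illustrative.

#include <stdint.h>

/* Model of the predication in cond_uniform_load: the uniform load of
   *src runs under (loop predicate AND cond[i] != 0); the store mask
   is the OR of the taken and not-taken predicates, matching the
   'or' feeding the masked store. VL stands in for vscale x 4. */
enum { VL = 4 };

void cond_uniform_load_model(int32_t *dst, const int32_t *src,
                             const int32_t *cond, uint64_t n) {
  for (uint64_t index = 0; index < n; index += VL) {
    for (int lane = 0; lane < VL; ++lane) {
      uint64_t i = index + (uint64_t)lane;
      int loop_pred = i < n;                     /* active lane mask */
      int take_if   = loop_pred && cond[i] != 0; /* gather mask: the overlap */
      int take_else = loop_pred && cond[i] == 0; /* zero-value mask */
      int32_t val = 0;
      if (take_if)
        val = *src;               /* uniform address, done as a gather */
      if (take_if || take_else)   /* store mask is the OR of both */
        dst[i] = val;
    }
  }
}
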
@@ -543,7 +535,7 @@ for.end: ; preds = %for.inc, %entry
 
 ; The original loop had an unconditional uniform store. Let's make sure
 ; we don't artificially create new predicated blocks for the store.
-define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
+define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_store(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
@@ -561,25 +553,24 @@ define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
-; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -587,9 +578,9 @@ define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: store i32 [[VAL]], i32* [[DST]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: store i32 [[VAL]], ptr [[DST]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -602,9 +593,9 @@ entry:
 for.body: ; preds = %entry, %for.body
 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
- %val = load i32, i32* %arrayidx, align 4
- store i32 %val, i32* %dst, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+ %val = load i32, ptr %arrayidx, align 4
+ store i32 %val, ptr %dst, align 4
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond.not = icmp eq i64 %indvars.iv.next, %n
 br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
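In @uniform_store above, every active lane scatters to the same splatted pointer, which preserves the scalar loop's "last write wins" behaviour because masked scatters commit overlapping lanes in lane order. The C model below is an illustrative sketch of that semantics, not the test itself; VL and the name are assumptions.

#include <stdint.h>

/* Model of uniform_store: all lanes target the same address (*dst),
   so after each vector iteration *dst holds the value written by the
   highest active lane, matching the scalar final-value semantics.
   VL stands in for vscale x 4. */
enum { VL = 4 };

void uniform_store_model(int32_t *dst, const int32_t *src, uint64_t n) {
  for (uint64_t index = 0; index < n; index += VL) {
    for (int lane = 0; lane < VL; ++lane) { /* lanes commit in order */
      uint64_t i = index + (uint64_t)lane;
      if (i < n)       /* active lane mask */
        *dst = src[i]; /* every lane scatters to the splat of dst */
    }
  }
}
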
@@ -614,7 +605,7 @@ for.end: ; preds = %for.body, %entry
 }
 
-define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
+define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_fdiv(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -638,24 +629,21 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP16:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP16]], * [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP14:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP14]], ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -663,12 +651,12 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL1:%.*]] = load float, float* [[GEP1]], align 4
-; CHECK-NEXT: [[VAL2:%.*]] = load float, float* [[GEP2]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[VAL1:%.*]] = load float, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[VAL2:%.*]] = load float, ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[RES:%.*]] = fdiv float [[VAL1]], [[VAL2]]
-; CHECK-NEXT: store float [[RES]], float* [[GEP2]], align 4
+; CHECK-NEXT: store float [[RES]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -680,12 +668,12 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep1 = getelementptr float, float* %src, i64 %index
- %gep2 = getelementptr float, float* %dst, i64 %index
- %val1 = load float, float* %gep1
- %val2 = load float, float* %gep2
+ %gep1 = getelementptr float, ptr %src, i64 %index
+ %gep2 = getelementptr float, ptr %dst, i64 %index
+ %val1 = load float, ptr %gep1
+ %val2 = load float, ptr %gep2
 %res = fdiv float %val1, %val2
- store float %res, float* %gep2
+ store float %res, ptr %gep2
 %index.next = add nsw i64 %index, 1
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -696,7 +684,7 @@ while.end.loopexit: ; preds = %while.body
 
 ; Integer divides can throw exceptions; if we vectorize, we must ensure
 ; that speculated lanes don't fault.
-define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_idiv(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -720,25 +708,22 @@ define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP14]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP17]], * [[TMP18]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP20]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP14]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP15]], ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -746,12 +731,12 @@ define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL1:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[VAL2:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[RES:%.*]] = udiv i32 [[VAL1]], [[VAL2]]
-; CHECK-NEXT: store i32 [[RES]], i32* [[GEP2]], align 4
+; CHECK-NEXT: store i32 [[RES]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
@@ -763,12 +748,12 @@ entry:
 while.body: ; preds = %while.body, %entry
 %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
- %gep1 = getelementptr i32, i32* %src, i64 %index
- %gep2 = getelementptr i32, i32* %dst, i64 %index
- %val1 = load i32, i32* %gep1
- %val2 = load i32, i32* %gep2
+ %gep1 = getelementptr i32, ptr %src, i64 %index
+ %gep2 = getelementptr i32, ptr %dst, i64 %index
+ %val1 = load i32, ptr %gep1
+ %val2 = load i32, ptr %gep2
 %res = udiv i32 %val1, %val2
- store i32 %res, i32* %gep2
+ store i32 %res, ptr %gep2
 %index.next = add nsw i64 %index, 1
 %cmp10 = icmp ult i64 %index.next, %n
 br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
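The comment above @simple_idiv explains why the checks contain a select before the udiv: unlike fdiv, an integer divide can trap, so speculated (inactive) lanes must be given a harmless divisor. A hedged C model of that safe-divisor pattern follows; VL and the names are illustrative stand-ins.

#include <stdint.h>

/* Model of the safe-divisor pattern: inactive lanes get divisor 1 via
   the 'select %mask, %val2, splat(1)', so the full-width divide cannot
   fault on lanes past the trip count; the masked store then discards
   those dummy results. VL stands in for vscale x 4. */
enum { VL = 4 };

void simple_idiv_model(uint32_t *dst, const uint32_t *src, uint64_t n) {
  for (uint64_t index = 0; index < n; index += VL) {
    for (int lane = 0; lane < VL; ++lane) {
      uint64_t i = index + (uint64_t)lane;
      int active = i < n;                              /* active lane mask */
      uint32_t divisor = active ? dst[i] : 1u;         /* safe divisor select */
      uint32_t res = (active ? src[i] : 0u) / divisor; /* never traps on inactive lanes */
      if (active)
        dst[i] = res;                                  /* masked store */
    }
  }
}
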
[[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> , [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> , [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -39,11 +37,11 @@ define void @test_stride1_4i32(i32* readonly %data, i32* noalias nocapture %dst, ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 1 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP11]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]] -; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]] +; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -56,11 +54,11 @@ for.body: ; preds = %for.body.preheader, %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %mul = mul nuw nsw i32 
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %exitcond.not = icmp eq i32 %inc, %n
 br i1 %exitcond.not, label %end, label %for.body
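; --- Editor's note (illustrative sketch, not part of the committed test) ---
; The hunks above are the mechanical core of this migration: with typed
; pointers every vector access needed a cast,
;
;   %p = getelementptr inbounds i32, i32* %base, i32 %off
;   %c = bitcast i32* %p to <4 x i32>*
;   %v = load <4 x i32>, <4 x i32>* %c, align 4
;
; while with opaque pointers the same access is simply
;
;   %p = getelementptr inbounds i32, ptr %base, i32 %off
;   %v = load <4 x i32>, ptr %p, align 4
;
; One instruction disappears per access, which is why the [[TMP<n>]]
; numbering in the CHECK lines shifts down throughout these files.
; ---------------------------------------------------------------------------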
@@ -68,7 +66,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride-1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride-1_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -82,20 +80,18 @@ define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -3
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[REVERSE]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[REVERSE]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -106,11 +102,11 @@ define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], -1
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP10]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -123,11 +119,11 @@ for.body: ; preds = %for.body.preheader,
 %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %mul = mul nuw nsw i32 %i.023, -1
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %exitcond.not = icmp eq i32 %inc, %n
 br i1 %exitcond.not, label %end, label %for.body
@@ -135,7 +131,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride2_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ;
 ; CHECK-LABEL: @test_stride2_4i32(
 ; CHECK-NEXT: entry:
@@ -152,19 +148,17 @@ define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw i32 [[TMP2]], 2
 ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -174,11 +168,11 @@ define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 2
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP13]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -191,11 +185,11 @@ for.body: ; preds = %for.body.preheader,
 %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %mul = mul nuw nsw i32 %i.023, 2
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %exitcond.not = icmp eq i32 %inc, %n
 br i1 %exitcond.not, label %end, label %for.body
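; --- Editor's note (illustrative) ------------------------------------------
; test_stride2_4i32 above shows how a stride-2 read (data[2*i + 2]) is
; vectorized: one wide 8-element load followed by a deinterleaving
; shufflevector that keeps the even lanes, roughly
;
;   %wide.vec = load <8 x i32>, ptr %p, align 4
;   %even = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
;
; which is exactly the [[WIDE_VEC]]/[[STRIDED_VEC]] pair the CHECK lines
; expect.
; ---------------------------------------------------------------------------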
@@ -203,7 +197,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride3_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -219,17 +213,16 @@ define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP2]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -239,11 +232,11 @@ define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 3
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP8]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -256,11 +249,11 @@ for.body: ; preds = %for.body.preheader,
 %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %mul = mul nuw nsw i32 %i.023, 3
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %exitcond.not = icmp eq i32 %inc, %n
 br i1 %exitcond.not, label %end, label %for.body
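; --- Editor's note (illustrative) ------------------------------------------
; For strides of 3 and 4 a single wide load no longer covers the lanes, so
; the vectorizer falls back to a masked gather over a vector of addresses:
;
;   %ptrs = getelementptr inbounds i32, ptr %data, <4 x i32> %offsets
;   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> poison)
;
; The tail-predicated tests pass the active lane mask as %mask; the
; non-predicated ones pass an all-true constant.
; ---------------------------------------------------------------------------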
@@ -268,7 +261,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride4_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride4_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -284,17 +277,16 @@ define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP2]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -304,11 +296,11 @@ define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 4
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP8]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -321,11 +313,11 @@ for.body: ; preds = %for.body.preheader,
 %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %mul = mul nuw nsw i32 %i.023, 4
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %exitcond.not = icmp eq i32 %inc, %n
 br i1 %exitcond.not, label %end, label %for.body
@@ -333,7 +325,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n, i32 %stride) {
+define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n, i32 %stride) {
 ; CHECK-LABEL: @test_stride_loopinvar_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -351,18 +343,16 @@ define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], [[STRIDE]]
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -372,11 +362,11 @@ define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -389,11 +379,11 @@ for.body: ; preds = %for.body.preheader,
 %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %mul = mul nuw nsw i32 %i.023, %stride
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %exitcond.not = icmp eq i32 %inc, %n
 br i1 %exitcond.not, label %end, label %for.body
@@ -401,7 +391,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride_noninvar_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride_noninvar_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -419,18 +409,17 @@ define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocaptu
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP3]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP3]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 32, i32 32, i32 32, i32 32>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -443,11 +432,11 @@ define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocaptu
 ; CHECK-NEXT: [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP10]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[NEXT_STRIDE]] = add nuw nsw i32 [[STRIDE]], 8
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -462,11 +451,11 @@ for.body: ; preds = %for.body.preheader,
 %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
 %mul = mul nuw nsw i32 %i.023, %stride
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %next.stride = add nuw nsw i32 %stride, 8
 %exitcond.not = icmp eq i32 %inc, %n
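; --- Editor's note (illustrative) ------------------------------------------
; Besides rewriting i32* to ptr in the IR itself, the migration renames the
; overloaded intrinsics, because the mangled name can no longer encode a
; pointee type:
;
;   declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
;
; becomes
;
;   declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
;
; and the same .p0v4i32 -> .p0 change applies to llvm.masked.load,
; llvm.masked.store and llvm.masked.scatter in the declarations below.
; ---------------------------------------------------------------------------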
@@ -475,7 +464,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride_noninvar2_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride_noninvar2_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride_noninvar2_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -484,11 +473,11 @@ define void @test_stride_noninvar2_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT: [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ 3, [[ENTRY]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP0]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[NEXT_STRIDE]] = mul nuw nsw i32 [[STRIDE]], 8
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
@@ -503,11 +492,11 @@ for.body: ; preds = %for.body.preheader,
 %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
 %mul = mul nuw nsw i32 %i.023, %stride
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %next.stride = mul nuw nsw i32 %stride, 8
 %exitcond.not = icmp eq i32 %inc, %n
@@ -516,7 +505,7 @@ end: ; preds = %end, %entry
 ret void
 }
 
-define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n, i32 %x) {
+define void @test_stride_noninvar3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n, i32 %x) {
 ; CHECK-LABEL: @test_stride_noninvar3_4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -541,18 +530,17 @@ define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
 ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -565,11 +553,11 @@ define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT: [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT: [[NEXT_STRIDE]] = add nuw nsw i32 [[STRIDE]], [[X]]
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -584,11 +572,11 @@ for.body: ; preds = %for.body.preheader,
 %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
 %mul = mul nuw nsw i32 %i.023, %stride
 %add5 = add nuw nsw i32 %mul, 2
- %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
- %0 = load i32, i32* %arrayidx6, align 4
+ %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+ %0 = load i32, ptr %arrayidx6, align 4
 %add7 = add nsw i32 5, %0
- %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
- store i32 %add7, i32* %arrayidx9, align 4
+ %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+ store i32 %add7, ptr %arrayidx9, align 4
 %inc = add nuw nsw i32 %i.023, 1
 %next.stride = add nuw nsw i32 %stride, %x
 %exitcond.not = icmp eq i32 %inc, %n
@@ -598,7 +586,7 @@ end: ; preds = %end, %entry
 ret void
 }
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 
x i32>) -declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index 74b47292b31d2..3f1320d90c640 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-none-none-eabi" -define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 { +define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 { ; CHECK-LABEL: @mla_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -18,48 +18,46 @@ define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture rea ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]]) -; CHECK-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> 
@llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[RES_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP14]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_011]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_011]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP13]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] ; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[RES_010]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -72,11 +70,11 @@ entry: for.body: ; preds = %entry, %for.body %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %res.010 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011 - %0 = load i8, i8* %arrayidx, align 1 + %arrayidx = 
getelementptr inbounds i8, ptr %A, i32 %i.011 + %0 = load i8, ptr %arrayidx, align 1 %conv = sext i8 %0 to i32 - %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011 - %1 = load i8, i8* %arrayidx1, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.011 + %1 = load i8, ptr %arrayidx1, align 1 %conv2 = sext i8 %1 to i32 %mul = mul nsw i32 %conv2, %conv %add = add nsw i32 %mul, %res.010 @@ -89,7 +87,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %res.0.lcssa } -define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 { +define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 { ; CHECK-LABEL: @mla_i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -103,48 +101,46 @@ define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture read ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]]) -; CHECK-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw 
<16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[RES_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP14]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_011]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_011]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP13]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] ; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[RES_010]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -157,11 +153,11 @@ entry: for.body: ; preds = %entry, %for.body %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %res.010 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011 - %0 = load i8, i8* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.011 + %0 = load i8, ptr %arrayidx, align 1 %conv = sext i8 %0 to i32 - %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011 - %1 = load i8, i8* %arrayidx1, align 1 + 
%arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.011 + %1 = load i8, ptr %arrayidx1, align 1 %conv2 = sext i8 %1 to i32 %mul = mul nsw i32 %conv2, %conv %add = add nsw i32 %mul, %res.010 @@ -174,7 +170,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %res.0.lcssa } -define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @add_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -188,36 +184,35 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr 
inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -230,8 +225,8 @@ entry:
 for.body: ; preds = %entry, %for.body
 %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+ %0 = load i32, ptr %arrayidx, align 4
 %add = add nsw i32 %0, %r.07
 %inc = add nuw nsw i32 %i.08, 1
 %exitcond = icmp eq i32 %inc, %n
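; --- Editor's note (illustrative) ------------------------------------------
; The reduction tests in this file come in two shapes. An integer add
; reduction is folded in-loop: the inactive lanes are zeroed with a select
; and the partial sum is reduced every iteration,
;
;   %live = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
;   %part = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %live)
;
; whereas mul_i32 below (and and_i32 after it) keep a <4 x i32> accumulator
; phi seeded with the operation's identity (1 for mul, -1 for and) and
; reduce it once in middle.block - which is what the restored splat
; constants in the [[VEC_PHI]] lines say.
; ---------------------------------------------------------------------------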
@@ -242,7 +237,7 @@ for.cond.cleanup: ; preds = %for.body, %entry
 ret i32 %r.0.lcssa
 }
 
-define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @mul_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -256,36 +251,35 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = mul nsw i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = mul nsw i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -298,8 +292,8 @@ entry:
 for.body: ; preds = %entry, %for.body
 %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 %r.07 = phi i32 [ %add, %for.body ], [ 1, %entry ]
- %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+ %0 = load i32, ptr %arrayidx, align 4
 %add = mul nsw i32 %0, %r.07
 %inc = add nuw nsw i32 %i.08, 1
 %exitcond = icmp eq i32 %inc, %n
@@ -310,7 +304,7 @@ for.cond.cleanup: ; preds = %for.body, %entry
 ret i32 %r.0.lcssa
 }
 
-define i32 @and_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @and_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @and_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -324,36 +318,35 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = and i32 [[TMP8]], [[R_07]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = and i32 [[TMP7]], [[R_07]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi 
i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -366,8 +359,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %add = and i32 %0, %r.07 %inc = add nuw nsw i32 %i.08, 1 %exitcond = icmp eq i32 %inc, %n @@ -378,7 +371,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @or_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @or_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @or_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -392,36 +385,35 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = 
phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = or i32 [[TMP8]], [[R_07]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = or i32 [[TMP7]], [[R_07]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -434,8 +426,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %add = or i32 %0, %r.07 %inc = add nuw nsw i32 %i.08, 1 %exitcond = icmp eq i32 %inc, %n @@ -446,7 +438,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @xor_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -460,36 +452,35 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr 
[[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = xor i32 [[TMP8]], [[R_07]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = xor i32 [[TMP7]], [[R_07]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -502,8 +493,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %add = xor i32 %0, %r.07 %inc = add nuw nsw i32 %i.08, 1 %exitcond = icmp eq i32 %inc, %n @@ -514,7 +505,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define float @fadd_f32(float* nocapture readonly %x, i32 %n) #0 { +define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @fadd_f32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -528,36 +519,35 @@ define float 
@fadd_f32(float* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP8]], [[R_07]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; 
CHECK-NEXT: [[ADD]] = fadd fast float [[TMP7]], [[R_07]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -570,8 +560,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi float [ %add, %for.body ], [ 0.0, %entry ] - %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08 - %0 = load float, float* %arrayidx, align 4 + %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08 + %0 = load float, ptr %arrayidx, align 4 %add = fadd fast float %0, %r.07 %inc = add nuw nsw i32 %i.08, 1 %exitcond = icmp eq i32 %inc, %n @@ -582,7 +572,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret float %r.0.lcssa } -define float @fmul_f32(float* nocapture readonly %x, i32 %n) #0 { +define float @fmul_f32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @fmul_f32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -596,36 +586,35 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT:
[[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = fmul fast float [[TMP8]], [[R_07]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = fmul fast float [[TMP7]], [[R_07]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -638,8 +627,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi float [ %add, %for.body ], [ 1.0, %entry ] - %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08 - %0 = load float, float* %arrayidx, align 4 + %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08 + %0 = load float, ptr %arrayidx, align 4 %add = fmul fast float %0, %r.07 %inc = add nuw nsw i32 %i.08, 1 %exitcond = icmp eq i32 %inc, %n @@ -650,7 +639,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret float %r.0.lcssa } -define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @smin_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -664,37 +653,36 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32
[[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -707,8 
+695,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 2147483647, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp slt i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -720,7 +708,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @smax_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -734,37 +722,36 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [
[[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -777,8 +764,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ -2147483648, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp sgt i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -790,7 +777,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @umin_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -804,37 +791,36 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32
[[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -847,8 +833,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp ult i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -860,7 +846,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @umax_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -874,37 +860,36 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], 
[[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -917,8 +902,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp ugt i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -930,7 +915,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define float @fmin_f32(float* nocapture readonly %x, i32 %n) #0 { +define float @fmin_f32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @fmin_f32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -940,8 +925,8 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ 1.000000e+03, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[C:%.*]] = fcmp ult float [[R_07]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = select i1 [[C]], float [[R_07]], float [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -961,8 +946,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi float [ %add, %for.body ], [ 1000.0, %entry ] - %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08 - %0 = load float, float* %arrayidx, align 4 + %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08 + %0 = load float, ptr %arrayidx, align 4 %c = fcmp ult float %r.07, %0 %add = select i1 %c, float %r.07, float %0 %inc = add nuw nsw i32 %i.08, 1 @@ -974,7 +959,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret float %r.0.lcssa } -define float @fmax_f32(float* nocapture readonly %x, i32 %n) #0 { +define float @fmax_f32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @fmax_f32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -984,8 +969,8 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[C:%.*]] = fcmp fast ugt float [[R_07]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = select i1 [[C]], float [[R_07]], float [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -1005,8 +990,8 @@ entry: for.body: ; preds = %entry, %for.body 
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi float [ %add, %for.body ], [ 0.0, %entry ] - %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08 - %0 = load float, float* %arrayidx, align 4 + %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08 + %0 = load float, ptr %arrayidx, align 4 %c = fcmp fast ugt float %r.07, %0 %add = select i1 %c, float %r.07, float %0 %inc = add nuw nsw i32 %i.08, 1 @@ -1018,7 +1003,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret float %r.0.lcssa } -define i64 @loopinvariant_mla(i32* nocapture readonly %x, i32 %y, i32 %n) #0 { +define i64 @loopinvariant_mla(ptr nocapture readonly %x, i32 %y, i32 %n) #0 { ; CHECK-LABEL: @loopinvariant_mla( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -1035,8 +1020,8 @@ define i64 @loopinvariant_mla(i32* nocapture readonly %x, i32 %y, i32 %n) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[S_08:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_09]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV1]] ; CHECK-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[S_08]] @@ -1059,8 +1044,8 @@ for.cond.cleanup: ; preds = %for.body, %entry for.body: ; preds = %for.body.lr.ph, %for.body %i.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] %s.08 = phi i64 [ 0, %for.body.lr.ph ], [ %add, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.09 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.09 + %0 = load i32, ptr %arrayidx, align 4 %conv = sext i32 %0 to i64 %mul = mul nsw i64 %conv, %conv1 %add = add nsw i64 %mul, %s.08 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll index bba064213b872..3ffd424f0cb89 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll @@ -10,29 +10,29 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" @a = external global i32 @v = external global i32 -@mm = external global float** -@vv = external global float** -@ll = external global float* +@mm = external global ptr +@vv = external global ptr +@ll = external global ptr -define i32 @test(float* nocapture readonly %x) { +define i32 @test(ptr nocapture readonly %x) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T:%.*]] = load i32, i32* @v, align 8 -; CHECK-NEXT: [[T1:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[T:%.*]] = load i32, ptr @v, align 8 +; CHECK-NEXT: [[T1:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: br label [[OUTERLOOP:%.*]] ; CHECK: outerloop: ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ] -; CHECK-NEXT: [[T3:%.*]] = load float**, float*** @mm, align 4 -; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]] -; 
CHECK-NEXT: [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4 -; CHECK-NEXT: [[T5:%.*]] = load float**, float*** @vv, align 4 -; CHECK-NEXT: [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]] -; CHECK-NEXT: [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4 -; CHECK-NEXT: [[T7:%.*]] = load float*, float** @ll, align 4 -; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]] -; CHECK-NEXT: [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load ptr, ptr @mm, align 4 +; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds ptr, ptr [[T3]], i32 [[T2]] +; CHECK-NEXT: [[T4:%.*]] = load ptr, ptr [[ARRAYIDX109]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load ptr, ptr @vv, align 4 +; CHECK-NEXT: [[ARRAYIDX111:%.*]] = getelementptr inbounds ptr, ptr [[T5]], i32 [[T2]] +; CHECK-NEXT: [[T6:%.*]] = load ptr, ptr [[ARRAYIDX111]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load ptr, ptr @ll, align 4 +; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, ptr [[T7]], i32 [[T2]] +; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[ARRAYIDX113]], align 4 ; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -43,100 +43,97 @@ define i32 @test(float* nocapture readonly %x) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double> -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, 
ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[T4]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP6]] to <2 x double> +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[T6]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double> +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]]) +; CHECK-NEXT: [[TMP15:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP13]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[INNERLOOP:%.*]] ; CHECK: innerloop: ; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ] ; CHECK-NEXT: [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ] -; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]] -; CHECK-NEXT: [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4 -; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]] -; CHECK-NEXT: [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4 +; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_2132]] +; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[ARRAYIDX119]], align 4 +; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr inbounds float, ptr [[T4]], i32 [[I_2132]] +; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[ARRAYIDX120]], align 4 ; CHECK-NEXT: [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]] ; CHECK-NEXT: [[CONV122:%.*]] = fpext float [[SUB121]] to double ; CHECK-NEXT: [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]] -; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]] -; CHECK-NEXT: [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4 +; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds float, ptr [[T6]], i32 [[I_2132]] +; 
CHECK-NEXT: [[T11:%.*]] = load float, ptr [[ARRAYIDX124]], align 4 ; CHECK-NEXT: [[CONV125:%.*]] = fpext float [[T11]] to double ; CHECK-NEXT: [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]] ; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]] ; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1 ; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]] -; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: outerend: -; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32 ; CHECK-NEXT: [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]] ; CHECK-NEXT: [[INC144]] = add nuw nsw i32 [[J_0136]], 1 -; CHECK-NEXT: [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]] -; CHECK-NEXT: [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4 +; CHECK-NEXT: [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, ptr @a, i32 [[INC144]] +; CHECK-NEXT: [[V17]] = load i32, ptr [[ARRAYIDX102]], align 4 ; CHECK-NEXT: [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1 ; CHECK-NEXT: br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 [[CALL142]] ; entry: - %t = load i32, i32* @v, align 8 - %t1 = load i32, i32* @a, align 4 + %t = load i32, ptr @v, align 8 + %t1 = load i32, ptr @a, align 4 br label %outerloop outerloop: %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ] %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ] %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ] - %t3 = load float**, float*** @mm, align 4 - %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2 - %t4 = load float*, float** %arrayidx109, align 4 - %t5 = load float**, float*** @vv, align 4 - %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2 - %t6 = load float*, float** %arrayidx111, align 4 - %t7 = load float*, float** @ll, align 4 - %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2 - %t8 = load float, float* %arrayidx113, align 4 + %t3 = load ptr, ptr @mm, align 4 + %arrayidx109 = getelementptr inbounds ptr, ptr %t3, i32 %t2 + %t4 = load ptr, ptr %arrayidx109, align 4 + %t5 = load ptr, ptr @vv, align 4 + %arrayidx111 = getelementptr inbounds ptr, ptr %t5, i32 %t2 + %t6 = load ptr, ptr %arrayidx111, align 4 + %t7 = load ptr, ptr @ll, align 4 + %arrayidx113 = getelementptr inbounds float, ptr %t7, i32 %t2 + %t8 = load float, ptr %arrayidx113, align 4 %conv114 = fpext float %t8 to double br label %innerloop innerloop: %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ] %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ] - %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132 - %t9 = load float, float* %arrayidx119, align 4 - %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132 - %t10 = load float, float* %arrayidx120, align 4 + %arrayidx119 = getelementptr inbounds float, ptr %x, i32 %i.2132 + %t9 = load float, ptr %arrayidx119, align 4 + %arrayidx120 = getelementptr inbounds float, ptr %t4, i32 %i.2132 + %t10 = load float, ptr %arrayidx120, align 4 %sub121 = fsub fast float %t9, %t10 %conv122 = fpext float %sub121 to double %mul123 
= fmul fast double %conv122, %conv122 - %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132 - %t11 = load float, float* %arrayidx124, align 4 + %arrayidx124 = getelementptr inbounds float, ptr %t6, i32 %i.2132 + %t11 = load float, ptr %arrayidx124, align 4 %conv125 = fpext float %t11 to double %mul126 = fmul fast double %mul123, %conv125 %sub127 = fsub fast double %dval1.4131, %mul126 @@ -149,8 +146,8 @@ outerend: %conv138 = fptosi double %sub127.lcssa to i32 %call142 = add nuw nsw i32 %score.1135, %conv138 %inc144 = add nuw nsw i32 %j.0136, 1 - %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144 - %v17 = load i32, i32* %arrayidx102, align 4 + %arrayidx102 = getelementptr inbounds i32, ptr @a, i32 %inc144 + %v17 = load i32, ptr %arrayidx102, align 4 %cmp103 = icmp sgt i32 %v17, -1 br i1 %cmp103, label %outerloop, label %exit diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll index 77a154e629fb3..6d3ffab0a5980 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" -define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 { +define void @trunc_not_allowed_different_vec_elemns(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture %D) #0 { ; CHECK-LABEL: @trunc_not_allowed_different_vec_elemns( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -14,28 +14,24 @@ define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i16> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[D:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[TMP12]], <4 x i16>* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = 
getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i16> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[D:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP11]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -46,17 +42,17 @@ define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_021:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_021]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_021]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_021]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_021]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_021]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_021]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD_TR:%.*]] = trunc i32 [[ADD]] to i16 ; CHECK-NEXT: [[CONV7:%.*]] = shl i16 [[ADD_TR]], 1 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[D]], i32 [[I_021]] -; CHECK-NEXT: store i16 [[CONV7]], i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, ptr [[D]], i32 [[I_021]] +; CHECK-NEXT: store i16 [[CONV7]], ptr [[ARRAYIDX8]], align 2 ; CHECK-NEXT: [[ADD9]] = add nuw nsw i32 [[I_021]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD9]], 431 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], 
label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -69,23 +65,23 @@ for.cond.cleanup: for.body: %i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021 - %1 = load i32, i32* %arrayidx1, align 4 + %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.021 + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.021 + %1 = load i32, ptr %arrayidx1, align 4 %add = add nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021 - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.021 + store i32 %add, ptr %arrayidx2, align 4 %add.tr = trunc i32 %add to i16 %conv7 = shl i16 %add.tr, 1 - %arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021 - store i16 %conv7, i16* %arrayidx8, align 2 + %arrayidx8 = getelementptr inbounds i16, ptr %D, i32 %i.021 + store i16 %conv7, ptr %arrayidx8, align 2 %add9 = add nuw nsw i32 %i.021, 1 %exitcond = icmp eq i32 %add9, 431 br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 { +define void @unsupported_i64_type(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { ; CHECK-LABEL: @unsupported_i64_type( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -93,13 +89,13 @@ define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapt ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i32 [[I_09]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[C:%.*]], i32 [[I_09]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[C:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i32 [[I_09]] -; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i32 [[I_09]] +; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] @@ -112,19 +108,19 @@ for.cond.cleanup: for.body: %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09 - %0 = load i64, i64* %arrayidx, align 8 - %arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09 - %1 = load i64, i64* %arrayidx1, align 8 + %arrayidx = getelementptr inbounds i64, ptr %B, i32 %i.09 + %0 = load i64, ptr %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds i64, ptr %C, i32 %i.09 + %1 = load i64, ptr %arrayidx1, align 8 %add = add nsw i64 %1, %0 - %arrayidx2 = 
getelementptr inbounds i64, i64* %A, i32 %i.09 - store i64 %add, i64* %arrayidx2, align 8 + %arrayidx2 = getelementptr inbounds i64, ptr %A, i32 %i.09 + store i64 %add, ptr %arrayidx2, align 8 %add3 = add nuw nsw i32 %i.09, 1 %exitcond = icmp eq i32 %add3, 431 br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 { +define void @narrowing_load_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { ; CHECK-LABEL: @narrowing_load_not_allowed( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -133,23 +129,20 @@ define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias no ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[C:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i8> [[WIDE_LOAD1]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[TMP8]], <8 x i8>* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP8]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 424 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 424 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 424 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -160,14 +153,14 @@ define void @narrowing_load_not_allowed(i8* noalias 
nocapture %A, i8* noalias no ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[C]], i32 [[I_012]] -; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_012]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV3:%.*]] = trunc i16 [[TMP13]] to i8 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP14]], [[CONV3]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_012]] -; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[I_012]] +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_012]] +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV3:%.*]] = trunc i16 [[TMP10]] to i8 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP11]], [[CONV3]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_012]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_012]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -180,14 +173,14 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ] - %arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012 - %0 = load i16, i16* %arrayidx, align 2 - %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012 - %1 = load i8, i8* %arrayidx1, align 1 + %arrayidx = getelementptr inbounds i16, ptr %C, i32 %i.012 + %0 = load i16, ptr %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.012 + %1 = load i8, ptr %arrayidx1, align 1 %conv3 = trunc i16 %0 to i8 %add = add i8 %1, %conv3 - %arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012 - store i8 %add, i8* %arrayidx5, align 1 + %arrayidx5 = getelementptr inbounds i8, ptr %A, i32 %i.012 + store i8 %add, ptr %arrayidx5, align 1 %add6 = add nuw nsw i32 %i.012, 1 %exitcond = icmp eq i32 %add6, 431 br i1 %exitcond, label %for.cond.cleanup, label %for.body @@ -198,7 +191,7 @@ for.body: ; preds = %for.body, %entry ; loop control statements, and thus not affecting element sizes, so ; we could allow this case. 
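; A sketch of the two trunc shapes at issue, quoted from the tests in this
; file: in @trunc_not_allowed_different_vec_elemns above, the trunc changes
; the element size on a path to memory,
;
;   %add.tr = trunc i32 %add to i16
;   %conv7 = shl i16 %add.tr, 1
;   store i16 %conv7, ptr %arrayidx8, align 2
;
; whereas in @trunc_not_allowed below the only trunc feeds the exit compare,
;
;   %add.iv = trunc i32 %add3 to i16
;   %exitcond = icmp eq i16 %add.iv, 431
;
; so no load or store changes element size.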
; -define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 { +define void @trunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { ; CHECK-LABEL: @trunc_not_allowed( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -207,22 +200,19 @@ define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -233,13 +223,13 @@ define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]] -; CHECK-NEXT: [[TMP13:%.*]] = 
load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_09]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_09]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_09]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 ; CHECK-NEXT: [[ADD_IV:%.*]] = trunc i32 [[ADD3]] to i16 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[ADD_IV]], 431 @@ -253,13 +243,13 @@ for.cond.cleanup: for.body: %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09 - %1 = load i32, i32* %arrayidx1, align 4 + %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 + %1 = load i32, ptr %arrayidx1, align 4 %add = add nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09 - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 + store i32 %add, ptr %arrayidx2, align 4 %add3 = add nuw nsw i32 %i.09, 1 %add.iv = trunc i32 %add3 to i16 @@ -271,7 +261,7 @@ for.body: ; Test directions for array indices i and N-1. I.e. check strides 1 and -1, and ; force vectorisation with a loop hint. 
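; Roughly the C shape of the loop below (a sketch; the source-level form is
; an assumption, not part of this test):
;
;   for (int i = 0; i < 431; i++)
;     A[i] = B[i] + C[N-i];
;
; B is read with stride 1 and C with stride -1, so the vectorizer emits a
; reversed load for C (the shufflevector in the checks below).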
; -define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 { +define void @strides_different_direction(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) #0 { ; CHECK-LABEL: @strides_different_direction( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -280,25 +270,22 @@ define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 [[N:%.*]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[N:%.*]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[REVERSE]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[REVERSE]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -309,14 +296,14 @@ define void @strides_different_direction(i32* noalias nocapture %A, i32* 
noalias ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_09]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N]], [[I_09]] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[SUB]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[SUB]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_09]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -329,20 +316,20 @@ for.cond.cleanup: for.body: %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 + %0 = load i32, ptr %arrayidx, align 4 %sub = sub nsw i32 %N, %i.09 - %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub - %1 = load i32, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %sub + %1 = load i32, ptr %arrayidx1, align 4 %add = add nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09 - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 + store i32 %add, ptr %arrayidx2, align 4 %add3 = add nuw nsw i32 %i.09, 1 %exitcond = icmp eq i32 %add3, 431 br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 } -define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 { +define void @too_many_loop_blocks(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { ; CHECK-LABEL: @too_many_loop_blocks( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -351,22 +338,19 @@ define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapt ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -377,13 +361,13 @@ define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapt ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[LOOPINCR:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_09]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_09]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_09]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: br label [[LOOPINCR]] ; CHECK: loopincr: ; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 @@ -398,13 +382,13 @@ for.cond.cleanup: for.body: %i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ] - %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09 - %1 = load 
i32, i32* %arrayidx1, align 4 + %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 + %1 = load i32, ptr %arrayidx1, align 4 %add = add nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09 - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 + store i32 %add, ptr %arrayidx2, align 4 br label %loopincr loopincr: @@ -413,7 +397,7 @@ loopincr: br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 { +define void @double(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { ; CHECK-LABEL: @double( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -421,13 +405,13 @@ define void @double(double* noalias nocapture %A, double* noalias nocapture read ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i32 [[I_09]] -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i32 [[I_09]] -; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[C:%.*]], i32 [[I_09]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[I_09]] -; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 [[I_09]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] @@ -440,19 +424,19 @@ for.cond.cleanup: for.body: %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] - %arrayidx = getelementptr inbounds double, double* %B, i32 %i.09 - %0 = load double, double* %arrayidx, align 8 - %arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09 - %1 = load double, double* %arrayidx1, align 8 + %arrayidx = getelementptr inbounds double, ptr %B, i32 %i.09 + %0 = load double, ptr %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %C, i32 %i.09 + %1 = load double, ptr %arrayidx1, align 8 %add = fadd fast double %1, %0 - %arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09 - store double %add, double* %arrayidx2, align 8 + %arrayidx2 = getelementptr inbounds double, ptr %A, i32 %i.09 + store double %add, ptr %arrayidx2, align 8 %add3 = add nuw nsw i32 %i.09, 1 %exitcond = icmp eq i32 %add3, 431 br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 { +define void @fptrunc_not_allowed(ptr 
noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture %D) #0 { ; CHECK-LABEL: @fptrunc_not_allowed( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -461,28 +445,24 @@ define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias noc ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fptrunc <4 x float> [[TMP7]] to <4 x half> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x half> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds half, half* [[D:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds half, half* [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast half* [[TMP14]] to <4 x half>* -; CHECK-NEXT: store <4 x half> [[TMP12]], <4 x half>* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x float> [[TMP5]] to <4 x half> +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x half> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds half, ptr [[D:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store <4 x half> [[TMP9]], ptr [[TMP11]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP12]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -493,17 +473,17 @@ define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias noc ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[I_017]] -; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[C]], i32 [[I_017]] -; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[I_017]] -; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i32 [[I_017]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[C]], i32 [[I_017]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 [[I_017]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fptrunc float [[ADD]] to half ; CHECK-NEXT: [[FACTOR:%.*]] = fmul fast half [[CONV]], 0xH4000 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half* [[D]], i32 [[I_017]] -; CHECK-NEXT: store half [[FACTOR]], half* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr [[D]], i32 [[I_017]] +; CHECK-NEXT: store half [[FACTOR]], ptr [[ARRAYIDX5]], align 2 ; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_017]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -516,17 +496,17 @@ for.cond.cleanup: for.body: %i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ] - %arrayidx = getelementptr inbounds float, float* %B, i32 %i.017 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017 - %1 = load float, float* %arrayidx1, align 4 + %arrayidx = getelementptr inbounds float, ptr %B, i32 %i.017 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %C, i32 %i.017 + %1 = load float, ptr %arrayidx1, align 4 %add = fadd fast float %1, %0 - %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017 - store float %add, float* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %A, i32 %i.017 + store float %add, ptr %arrayidx2, align 4 %conv = fptrunc float %add to half %factor = fmul fast half %conv, 0xH4000 - %arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017 - store half %factor, half* %arrayidx5, align 2 + %arrayidx5 = getelementptr inbounds half, ptr %D, i32 %i.017 + store half %factor, ptr %arrayidx5, align 2 %add6 = add nuw nsw i32 %i.017, 1 %exitcond = icmp eq i32 %add6, 431 br i1 %exitcond, label %for.cond.cleanup, label %for.body @@ -537,7 +517,7 @@ for.body: ; which aren't supported by the lowoverhead loop pass, causing the 
tail-predication ; to be reverted which is expensive and what we would like to avoid. ; -define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N, i32* noalias nocapture readonly %Cond) { +define dso_local void @select_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N, ptr noalias nocapture readonly %Cond) { ; CHECK-LABEL: @select_not_allowed( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -548,31 +528,29 @@ define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalia ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[C:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32*> poison, i32* [[B:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[C:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32*> [[BROADCAST_SPLAT]], <4 x i32*> [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, <4 x i32*> [[TMP5]], <4 x i32> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x 
ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[TMP4]], <4 x i32> [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -585,14 +563,14 @@ define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalia ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i32 [[I_011]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP11]], 0 -; CHECK-NEXT: [[C_B:%.*]] = select i1 [[TOBOOL_NOT]], i32* [[C]], i32* [[B]] -; CHECK-NEXT: [[COND_IN:%.*]] = getelementptr inbounds i32, i32* [[C_B]], i32 [[I_011]] -; CHECK-NEXT: [[COND:%.*]] = load i32, i32* [[COND_IN]], align 4 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_011]] -; CHECK-NEXT: store i32 [[COND]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[I_011]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP9]], 0 +; CHECK-NEXT: [[C_B:%.*]] = select i1 [[TOBOOL_NOT]], ptr [[C]], ptr [[B]] +; CHECK-NEXT: [[COND_IN:%.*]] = getelementptr inbounds i32, ptr [[C_B]], i32 [[I_011]] +; CHECK-NEXT: [[COND:%.*]] = load i32, ptr [[COND_IN]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_011]] +; CHECK-NEXT: store i32 [[COND]], ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] @@ -612,20 +590,20 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo for.body: ; preds = %for.body.preheader, %for.body %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %Cond, i32 %i.011 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %Cond, i32 %i.011 + %0 = load i32, ptr %arrayidx, align 4 %tobool.not = icmp eq i32 %0, 0 - %C.B = select i1 %tobool.not, i32* %C, i32* %B - %cond.in = getelementptr inbounds i32, i32* %C.B, i32 %i.011 - %cond = load i32, i32* %cond.in, align 4 - %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %i.011 - store i32 %cond, 
i32* %arrayidx3, align 4 + %C.B = select i1 %tobool.not, ptr %C, ptr %B + %cond.in = getelementptr inbounds i32, ptr %C.B, i32 %i.011 + %cond = load i32, ptr %cond.in, align 4 + %arrayidx3 = getelementptr inbounds i32, ptr %A, i32 %i.011 + store i32 %cond, ptr %arrayidx3, align 4 %inc = add nuw nsw i32 %i.011, 1 %exitcond.not = icmp eq i32 %inc, %N br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body } -define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @i32_smin_reduction(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @i32_smin_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -639,37 +617,36 @@ define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds 
i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -682,8 +659,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 2147483647, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp slt i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -695,7 +672,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @i32_smax_reduction(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @i32_smax_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -709,37 +686,36 @@ define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 
[[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -752,8 +728,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ -2147483648, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp sgt i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -765,7 +741,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @i32_umin_reduction(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @i32_umin_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -779,37 +755,36 @@ define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]]
], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -822,8 +797,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp ult i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 @@ -835,7 +810,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret i32 %r.0.lcssa } -define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 { +define i32 @i32_umax_reduction(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @i32_umax_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -849,37 +824,36 @@ define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, 
[[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP8]] -; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP7]] +; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] @@ -892,8 +866,8 @@ entry: for.body: ; preds = %entry, %for.body %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 + %0 = load i32, ptr %arrayidx, align 4 %c = icmp ugt i32 %r.07, %0 %add = select i1 %c, i32 %r.07, i32 %0 %inc = add nuw nsw i32 %i.08, 1 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll index 7bb804eb73e95..b3157b75bc26f 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=loop-vectorize -mtriple=powerpc64le-unknown-unknown \ ; RUN: -force-target-max-vector-interleave=1 -mcpu=pwr9 < %s | FileCheck %s -define dso_local void @test(i32* %Arr, i32 signext %Len) { +define dso_local void @test(ptr %Arr, i32 signext %Len) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[LEN:%.*]] @@ -17,19 +17,17 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP0]] to i64 -; CHECK-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[LEN]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] @@ -43,12 +41,12 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) { ; CHECK: for.body: ; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_02]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]]) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP9]]) ; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[I_02]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[IDXPROM1]] -; CHECK-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM1]] +; CHECK-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_02]], 1 @@ -73,12 +71,12 @@ for.cond.cleanup: ; preds = %for.cond.for.cond.c for.body: ; preds = %for.body.lr.ph, %for.inc %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] %idxprom = sext i32 %i.02 to i64 - %arrayidx = getelementptr inbounds i32, i32* %Arr, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %Arr, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 %1 = call i32 @llvm.bswap.i32(i32 %0) %idxprom1 = sext i32 %i.02 to i64 - %arrayidx2 = getelementptr inbounds i32, i32* %Arr, i64 %idxprom1 - store i32 %1, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %Arr, i64 %idxprom1 + store i32 %1, ptr %arrayidx2, align 4 br label %for.inc for.inc: ; preds = %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 841d7c7dbbdcd..7fc51482acf4a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -6,7 +6,7 @@
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
-define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
+define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-LABEL: @add_i16_i32(
; OUTLOOP-NEXT: entry:
; OUTLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -24,54 +24,52 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; OUTLOOP: vector.body:
; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
; OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
; OUTLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2
; OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0
; OUTLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1
; OUTLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP4]]
-; OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[TMP9]]
-; OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; OUTLOOP-NEXT: [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <vscale x 2 x i16>*
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, <vscale x 2 x i16>* [[TMP13]], align 2
-; OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 [[TMP15]]
-; OUTLOOP-NEXT: [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <vscale x 2 x i16>*
-; OUTLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i16>, <vscale x 2 x i16>* [[TMP17]], align 2
-; OUTLOOP-NEXT: [[TMP18:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
-; OUTLOOP-NEXT: [[TMP19:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD2]] to <vscale x 2 x i32>
-; OUTLOOP-NEXT: [[TMP20]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP18]]
-; OUTLOOP-NEXT: [[TMP21]] = add <vscale x 2 x i32> [[VEC_PHI1]], [[TMP19]]
-; OUTLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 4
-; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP23]]
-; OUTLOOP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; OUTLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
+; OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]
+; OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP12]], align 2
+; OUTLOOP-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; OUTLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 [[TMP14]]
+; OUTLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i16>, ptr [[TMP15]], align 2
+; OUTLOOP-NEXT: [[TMP16:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
+; OUTLOOP-NEXT: [[TMP17:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD2]] to <vscale x 2 x i32>
+; OUTLOOP-NEXT: [[TMP18]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP16]]
+; OUTLOOP-NEXT: [[TMP19]] = add <vscale x 2 x i32> [[VEC_PHI1]], [[TMP17]]
+; OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 4
+; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP21]]
+; OUTLOOP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; OUTLOOP: middle.block:
-; OUTLOOP-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[TMP21]], [[TMP20]]
-; OUTLOOP-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; OUTLOOP-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[TMP19]], [[TMP18]]
+; OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; OUTLOOP: scalar.ph:
; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
; OUTLOOP: for.body:
; OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
-; OUTLOOP-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP26]] to i32
+; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; OUTLOOP-NEXT: [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP24]] to i32
; OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; OUTLOOP: for.cond.cleanup.loopexit:
-; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
; OUTLOOP: for.cond.cleanup:
; OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
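The hunk below switches to the INLOOP prefix, i.e. the run with -prefer-inloop-reductions: instead of keeping the two <vscale x 2 x i32> accumulators seen above and reducing once in the middle block, each iteration immediately folds its widened lanes into a scalar via llvm.vector.reduce.add. A self-contained sketch of that per-iteration step (names are illustrative, assuming the nxv4i32 overload these checks use):

    define i32 @inloop_add_step(<vscale x 4 x i32> %lanes, i32 %acc) {
      ; reduce the widened lanes to a scalar, then accumulate in scalar form
      %part = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %lanes)
      %acc.next = add i32 %part, %acc
      ret i32 %acc.next
    }
    declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)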
@@ -94,37 +92,35 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; INLOOP: vector.body:
; INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
; INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
; INLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
; INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0
; INLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1
; INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP4]]
-; INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[TMP9]]
-; INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; INLOOP-NEXT: [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <vscale x 4 x i16>*
-; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP13]], align 2
-; INLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; INLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 [[TMP15]]
-; INLOOP-NEXT: [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <vscale x 4 x i16>*
-; INLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP17]], align 2
-; INLOOP-NEXT: [[TMP18:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; INLOOP-NEXT: [[TMP19:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; INLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; INLOOP-NEXT: [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI]]
-; INLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; INLOOP-NEXT: [[TMP23]] = add i32 [[TMP22]], [[VEC_PHI1]]
-; INLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 8
-; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP25]]
-; INLOOP-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
+; INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]
+; INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2
+; INLOOP-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; INLOOP-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; INLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 [[TMP14]]
+; INLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
+; INLOOP-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; INLOOP-NEXT: [[TMP17:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; INLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; INLOOP-NEXT: [[TMP19]] = add i32 [[TMP18]], [[VEC_PHI]]
+; INLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; INLOOP-NEXT: [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI1]]
+; INLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; INLOOP-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 8
+; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP23]]
+; INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; INLOOP: middle.block:
-; INLOOP-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP21]]
+; INLOOP-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP19]]
; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; INLOOP: scalar.ph:
@@ -134,9 +130,9 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
; INLOOP: for.body:
; INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
-; INLOOP-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP27]] to i32
+; INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; INLOOP-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP25]] to i32
; INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -155,8 +151,8 @@ entry:
for.body: ; preds = %entry, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
- %0 = load i16, i16* %arrayidx, align 2
+ %arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.08
+ %0 = load i16, ptr %arrayidx, align 2
%conv = sext i16 %0 to i32
%add = add nsw i32 %r.07, %conv
%inc = add nuw nsw i32 %i.08, 1
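The low-trip-count.ll hunks that follow also show the second, easy-to-miss consequence of opaque pointers: overloaded intrinsic names are remangled. The pointer parameter no longer encodes a pointee type, so a suffix like .p0nxv8i8 collapses to .p0. Illustrative declarations of the two manglings (not copied verbatim from the test):

    ; before: pointer operand typed as <vscale x 8 x i8>*
    declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>*, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i8>)

    ; after: plain ptr operand, so only the address space is mangled
    declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i8>)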
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 0c914d670f554..1046f4cda4cdf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -3,7 +3,7 @@
target triple = "riscv64"
-define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
+define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip5_i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -24,21 +24,18 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP8]], i64 5)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* [[TMP11]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* [[TMP15]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 8 x i8> [[TMP12]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0nxv8i8(<vscale x 8 x i8> [[TMP16]], <vscale x 8 x i8>* [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP13]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 8 x i8> [[TMP11]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP13]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -47,13 +44,13 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP20]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP21]]
-; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP17]], 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP18]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -65,13 +62,13 @@ entry:
for.body: ; preds = %entry, %for.body
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
- %arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
- %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
%mul = shl i8 %0, 1
- %arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
- %1 = load i8, i8* %arrayidx1, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
%add = add i8 %mul, %1
- store i8 %add, i8* %arrayidx1, align 1
+ store i8 %add, ptr %arrayidx1, align 1
%inc = add nuw nsw i64 %i.08, 1
%exitcond.not = icmp eq i64 %inc, 5
br i1 %exitcond.not, label %for.end, label %for.body
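masked_gather_scatter.ll below exercises the same remangling for gathers and scatters, whose address operand is a vector of pointers: <vscale x 1 x double*> becomes <vscale x 1 x ptr>, and a suffix like .nxv1p0f64 becomes .nxv1p0. Sketches of the new-style signatures, inferred from the mangling rules rather than quoted from the test:

    declare <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr>, i32 immarg, <vscale x 1 x i1>, <vscale x 1 x double>)
    declare void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double>, <vscale x 1 x ptr>, i32 immarg, <vscale x 1 x i1>)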
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 24f75e0a93114..4df17b95d849d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -4,7 +4,7 @@
; The source code:
;
-;void foo4(double *A, double *B, int *trigger) {
+;void foo4(ptr A, ptr B, int *trigger) {
;
; for (int i=0; i<10000; i += 16) {
; if (trigger[i] < 100) {
@@ -13,30 +13,24 @@
; }
;}
-define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; RV32-LABEL: @foo4(
; RV32-NEXT: entry:
-; RV32-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; RV32-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; RV32-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]])
; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]]
; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; RV32: vector.memcheck:
-; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
-; RV32-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; RV32-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
-; RV32-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; RV32-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
-; RV32-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; RV32-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; RV32-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; RV32-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; RV32-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; RV32-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV32-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; RV32-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; RV32-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; RV32: vector.ph:
; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -55,16 +49,16 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
; RV32: vector.body:
; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV32-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
+; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
; RV32-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 1 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 100, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
; RV32-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], <vscale x 1 x i64> [[TMP10]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
+; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 1 x i64> [[TMP10]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
; RV32-NEXT: [[TMP12:%.*]] = sitofp <vscale x 1 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 1 x double>
-; RV32-NEXT: [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER12]], [[TMP12]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[A]], <vscale x 1 x i64> [[VEC_IND]]
-; RV32-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> [[TMP13]], <vscale x 1 x double*> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; RV32-NEXT: [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER6]], [[TMP12]]
+; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 1 x i64> [[VEC_IND]]
+; RV32-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> [[TMP13]], <vscale x 1 x ptr> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
; RV32-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -78,18 +72,18 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
; RV32-NEXT: br label [[FOR_BODY:%.*]]
; RV32: for.body:
; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV32-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; RV32-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
; RV32-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; RV32: if.then:
; RV32-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP18]]
-; RV32-NEXT: [[TMP19:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]]
+; RV32-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
-; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; RV32-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; RV32-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; RV32-NEXT: br label [[FOR_INC]]
; RV32: for.inc:
; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
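The RV64 block that follows is line-for-line the same as the RV32 block above apart from the check prefix. One detail worth noting in both: the runtime alias-check bounds are no longer a typed getelementptr plus a bitcast to i8*; the SCEV expander folds the element size into a single byte-offset GEP, so 9985 doubles become 9985 * 8 = 79880 bytes, 9985 i32s become 39940, and 19969 doubles become 159752. A minimal sketch of one such bound check (names are illustrative):

    define i1 @bound_check_sketch(ptr %a, ptr %b) {
      ; 9985 double elements * 8 bytes, folded into one i8 GEP
      %end = getelementptr i8, ptr %a, i64 79880
      %cmp = icmp ult ptr %b, %end
      ret i1 %cmp
    }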
@@ -100,27 +94,21 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
;
; RV64-LABEL: @foo4(
; RV64-NEXT: entry:
-; RV64-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; RV64-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; RV64-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]])
; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]]
; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; RV64: vector.memcheck:
-; RV64-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
-; RV64-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; RV64-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
-; RV64-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; RV64-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
-; RV64-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; RV64-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; RV64-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; RV64-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; RV64-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; RV64-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV64-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; RV64-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
; RV64-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV64-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; RV64-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; RV64-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; RV64-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; RV64-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; RV64-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; RV64: vector.ph:
; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -139,16 +127,16 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
; RV64: vector.body:
; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; RV64-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
-; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
+; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
; RV64-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 1 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 100, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
; RV64-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], <vscale x 1 x i64> [[TMP10]]
-; RV64-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 1 x i64> [[TMP10]]
+; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
; RV64-NEXT: [[TMP12:%.*]] = sitofp <vscale x 1 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 1 x double>
-; RV64-NEXT: [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER12]], [[TMP12]]
-; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[A]], <vscale x 1 x i64> [[VEC_IND]]
-; RV64-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> [[TMP13]], <vscale x 1 x double*> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; RV64-NEXT: [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER6]], [[TMP12]]
+; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 1 x i64> [[VEC_IND]]
+; RV64-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> [[TMP13]], <vscale x 1 x ptr> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
; RV64-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -162,18 +150,18 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
; RV64-NEXT: br label [[FOR_BODY:%.*]]
; RV64: for.body:
; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV64-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
; RV64-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; RV64: if.then:
; RV64-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP18]]
-; RV64-NEXT: [[TMP19:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]]
+; RV64-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
-; RV64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; RV64-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; RV64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; RV64-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; RV64-NEXT: br label [[FOR_INC]]
; RV64: for.inc:
; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
@@ -187,19 +175,19 @@ entry:
for.body: ; preds = %entry, %for.inc
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
- %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%1 = shl nuw nsw i64 %indvars.iv, 1
- %arrayidx3
= getelementptr inbounds double, double* %B, i64 %1 - %2 = load double, double* %arrayidx3, align 8 + %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %1 + %2 = load double, ptr %arrayidx3, align 8 %conv = sitofp i32 %0 to double %add = fadd double %2, %conv - %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv - store double %add, double* %arrayidx7, align 8 + %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %add, ptr %arrayidx7, align 8 br label %for.inc for.inc: ; preds = %for.body, %if.then diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll index 993d429a1760a..4e9ec86df7ca0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll @@ -5,7 +5,7 @@ ; RUN: opt < %s -passes=loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -scalable-vectorization=off -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2 ; Function Attrs: nounwind -define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) { +define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr %c, i32 %size) { ; LMUL1-LABEL: @array_add( ; LMUL1-NEXT: entry: ; LMUL1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0 @@ -23,22 +23,19 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur ; LMUL1: vector.body: ; LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; LMUL1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]] -; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; LMUL1-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 -; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]] -; LMUL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 -; LMUL1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; LMUL1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; LMUL1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]] -; LMUL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 -; LMUL1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; LMUL1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4 +; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; LMUL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]] +; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; LMUL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]] +; LMUL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; LMUL1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4 ; 
LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; LMUL1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL1: middle.block:
; LMUL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -47,13 +44,13 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL1-NEXT: br label [[FOR_BODY:%.*]]
; LMUL1: for.body:
; LMUL1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; LMUL1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; LMUL1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; LMUL1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; LMUL1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; LMUL1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; LMUL1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; LMUL1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LMUL1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; LMUL1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; LMUL1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; LMUL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; LMUL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; LMUL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
@@ -61,7 +58,7 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL1: for.end.loopexit:
; LMUL1-NEXT: br label [[FOR_END]]
; LMUL1: for.end:
-; LMUL1-NEXT: ret i32* [[C]]
+; LMUL1-NEXT: ret ptr [[C]]
;
; LMUL2-LABEL: @array_add(
; LMUL2-NEXT: entry:
@@ -80,22 +77,19 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL2: vector.body:
; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; LMUL2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
-; LMUL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; LMUL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; LMUL2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
-; LMUL2-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; LMUL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; LMUL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; LMUL2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; LMUL2-NEXT: store <8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], align 4
+; LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; LMUL2-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LMUL2-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP10]], align 4
; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; LMUL2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL2: middle.block:
; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -104,13 +98,13 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL2-NEXT: br label [[FOR_BODY:%.*]]
; LMUL2: for.body:
; LMUL2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; LMUL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; LMUL2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; LMUL2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; LMUL2-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; LMUL2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; LMUL2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; LMUL2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LMUL2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; LMUL2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; LMUL2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; LMUL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; LMUL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; LMUL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
@@ -118,7 +112,7 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL2: for.end.loopexit:
; LMUL2-NEXT: br label [[FOR_END]]
; LMUL2: for.end:
-; LMUL2-NEXT: ret i32* [[C]]
+; LMUL2-NEXT: ret ptr [[C]]
;
entry:
%cmp10 = icmp sgt i32 %size, 0
@@ -129,13 +123,13 @@ for.body.preheader: ; preds = %entry
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
- %1 = load i32, i32* %arrayidx2, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
- %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
- store i32 %add, i32* %arrayidx4, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx4, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %size
@@ -145,5 +139,5 @@ for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
- ret i32* %c
+ ret ptr %c
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 24b692c121726..357f8b4fe7519 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=riscv64 -mattr=+v -passes=loop-vectorize < %s | FileCheck %s
-define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_range(4,1024) {
+define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_range(4,1024) {
; CHECK-LABEL: @small_trip_count_min_vlen_128(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -10,13 +10,11 @@ define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_ra
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -27,10 +25,10 @@ define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_ra
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
-; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP]], align 4
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3
; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -42,10 +40,10 @@ entry:
loop:
%iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
- %gep = getelementptr inbounds i32, i32* %a, i32 %iv
- %v = load i32, i32* %gep, align 4
+ %gep = getelementptr inbounds i32, ptr %a, i32 %iv
+ %v = load i32, ptr %gep, align 4
%add = add nsw i32 %v, 1
- store i32 %add, i32* %gep, align 4
+ store i32 %add, ptr %gep, align 4
%iv.next = add i32 %iv, 1
%cond = icmp eq i32 %iv, 3
br i1 %cond, label %exit, label %loop
@@ -56,7 +54,7 @@ exit:
; Note: This test uses a vscale_range starting at 1, which is technically incompatible
; with +v. If we expose a target hook for minimum vlen, this example will need to be reworked
-define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_range(1,1024) {
+define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_range(1,1024) {
; CHECK-LABEL: @small_trip_count_min_vlen_32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
@@ -77,18 +75,16 @@ define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_ran
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP8]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to *
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0nxv2i32(* [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to *
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0nxv2i32( [[TMP12]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0( [[TMP11]], ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -96,10 +92,10 @@ define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_ran
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
-; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP]], align 4
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3
; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -111,10 +107,10 @@ entry:
loop:
%iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
- %gep = getelementptr inbounds i32, i32* %a, i32 %iv
- %v = load i32, i32* %gep, align 4
+ %gep = getelementptr inbounds i32, ptr %a, i32 %iv
+ %v = load i32, ptr %gep, align 4
%add = add nsw i32 %v, 1
- store i32 %add, i32* %gep, align 4
+ store i32 %add, ptr %gep, align 4
%iv.next = add i32 %iv, 1
%cond = icmp eq i32 %iv, 3
br i1 %cond, label %exit, label %loop
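(Editorial aside on the note in the test above: a minimal sketch of why vscale_range(1,1024) conflicts with +v, assuming the usual LLVM RISC-V mapping of vscale = VLEN/64; the function below is hypothetical and not part of this patch.)

; +v implies zvl128b, i.e. VLEN >= 128, so the smallest runtime value of
; vscale is 128/64 = 2; a declared lower bound of 1 can never be realized.
define void @vscale_example() vscale_range(2,1024) {   ; lower bound 2 matches +v
  ; llvm.vscale returns the runtime multiple of the scalable-vector base width
  %vs = call i32 @llvm.vscale.i32()
  ret void
}
declare i32 @llvm.vscale.i32()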
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index bf3d5d35f1200..648ba0e45a6cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -7,7 +7,7 @@
%rec8 = type { i16 }
@a = global [1 x %rec8] zeroinitializer
-@b = global [2 x i16*] zeroinitializer
+@b = global [2 x ptr] zeroinitializer
define void @f1() {
@@ -21,12 +21,11 @@ define void @f1() {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
-; CHECK-NEXT: store <2 x i16*> , <2 x i16*>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
+; CHECK-NEXT: store <2 x ptr> , ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
@@ -36,14 +35,13 @@ define void @f1() {
; CHECK: bb2:
; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ]
; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64
-; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]]
-; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16*
+; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], ptr @a, i16 0, i64 [[_TMP1]]
; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64
-; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]]
-; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]], align 8
+; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[_TMP6]]
+; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8
; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
@@ -54,11 +52,10 @@ bb1:
bb2:
%c.1.0 = phi i16 [ 0, %bb1 ], [ %_tmp9, %bb2 ]
%_tmp1 = zext i16 0 to i64
- %_tmp2 = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 %_tmp1
- %_tmp4 = bitcast %rec8* %_tmp2 to i16*
+ %_tmp2 = getelementptr [1 x %rec8], ptr @a, i16 0, i64 %_tmp1
%_tmp6 = sext i16 %c.1.0 to i64
- %_tmp7 = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 %_tmp6
- store i16* %_tmp4, i16** %_tmp7
+ %_tmp7 = getelementptr [2 x ptr], ptr @b, i16 0, i64 %_tmp6
+ store ptr %_tmp2, ptr %_tmp7
%_tmp9 = add nsw i16 %c.1.0, 1
%_tmp11 = icmp slt i16 %_tmp9, 2
br i1 %_tmp11, label %bb2, label %bb3
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 3bf4d6ff66324..1554ce97e2506 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost1(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -21,17 +21,16 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <32 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <32 x i8>*
-; CHECK-NEXT: store <32 x i8> [[VEC_IND1]], <32 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: store <32 x i8> [[VEC_IND]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <32 x i8> [[VEC_IND1]],
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]],
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -40,9 +39,9 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP10]], i8* [[TMP11]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV]] to i8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -58,8 +57,8 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
.lr.ph: ; preds = %0, %.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
%2 = trunc i64 %indvars.iv to i8
- %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
- store i8 %2, i8* %3, align 1
+ %3 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv
+ store i8 %2, ptr %3, align 1
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -69,7 +68,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
ret i32 undef
}
-define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -92,37 +91,33 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <2 x i64> [[STEP_ADD1]],
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 9, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[VEC_IND]],
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[STEP_ADD]],
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i64> [[STEP_ADD1]],
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i64> [[STEP_ADD2]],
-; CHECK-NEXT: [[TMP17:%.*]] = sitofp <2 x i64> [[TMP13]] to <2 x float>
-; CHECK-NEXT: [[TMP18:%.*]] = sitofp <2 x i64> [[TMP14]] to <2 x float>
-; CHECK-NEXT: [[TMP19:%.*]] = sitofp <2 x i64> [[TMP15]] to <2 x float>
-; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i64> [[TMP16]] to <2 x float>
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP17]], <2 x float>* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 2
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP18]], <2 x float>* [[TMP28]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 4
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP19]], <2 x float>* [[TMP30]], align 4
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 6
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP20]], <2 x float>* [[TMP32]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[VEC_IND]],
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD]],
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD1]],
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[STEP_ADD2]],
+; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float>
+; CHECK-NEXT: [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float>
+; CHECK-NEXT: [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float>
+; CHECK-NEXT: [[TMP16:%.*]] = sitofp <2 x i64> [[TMP12]] to <2 x float>
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 2
+; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 4
+; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP23]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 6
+; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[TMP24]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]],
-; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -133,8 +128,8 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[TOFP:%.*]] = sitofp i64 [[ADD]] to float
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TOFP]], float* [[GEP]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store float [[TOFP]], ptr [[GEP]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -151,8 +146,8 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
%add = add nsw i64 %indvars.iv, 3
%tofp = sitofp i64 %add to float
- %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
- store float %tofp, float* %gep, align 4
+ %gep = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+ store float %tofp, ptr %gep, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
index 57e50b90c3ffd..8e3af54b770e8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
@@ -11,79 +11,29 @@
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-w64-windows-gnu"
-define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
+define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 {
; CHECK-LABEL: @cff_index_load_offsets(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
; CHECK: if.then:
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 undef, i64 4)
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw <4 x i32> [[TMP4]],
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw <4 x i32> [[TMP5]],
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT5]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw <4 x i32> [[TMP9]],
-; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]],
-; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP11]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP15]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i32> [[TMP17]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i32 3
-; CHECK-NEXT: store i32 [[TMP22]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ null, [[IF_THEN]] ]
; CHECK-NEXT: br label [[FOR_BODY68:%.*]]
; CHECK: for.body68:
-; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT: [[P_359:%.*]] = phi ptr [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ null, [[IF_THEN]] ]
+; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
-; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
-; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]]
-; CHECK-NEXT: store i32 [[OR83]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, i8* [[P_359]], i64 4
-; CHECK-NEXT: [[CMP66:%.*]] = icmp ult i8* [[ADD_PTR86]], undef
-; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4
+; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef
+; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG:%.*]]
; CHECK: sw.epilog:
; CHECK-NEXT: unreachable
; CHECK: Exit:
@@ -96,21 +46,21 @@ if.then: ; preds = %entry
br label %for.body68
for.body68: ; preds = %for.body68, %if.then
- %p.359 = phi i8* [ %add.ptr86, %for.body68 ], [ null, %if.then ]
+ %p.359 = phi ptr [ %add.ptr86, %for.body68 ], [ null, %if.then ]
%conv70 = zext i8 %x to i32
%shl71 = shl nuw i32 %conv70, 24
- %0 = load i8, i8* %p, align 1, !tbaa !1
+ %0 = load i8, ptr %p, align 1, !tbaa !1
%conv73 = zext i8 %0 to i32
%shl74 = shl nuw nsw i32 %conv73, 16
%or75 = or i32 %shl74, %shl71
- %1 = load i8, i8* undef, align 1, !tbaa !1
+ %1 = load i8, ptr undef, align 1, !tbaa !1
%shl78 = shl nuw nsw i32 undef, 8
%or79 = or i32 %or75, %shl78
%conv81 = zext i8 %1 to i32
%or83 = or i32 %or79, %conv81
- store i32 %or83, i32* undef, align 4, !tbaa !4
- %add.ptr86 = getelementptr inbounds i8, i8* %p.359, i64 4
- %cmp66 = icmp ult i8* %add.ptr86, undef
+ store i32 %or83, ptr undef, align 4, !tbaa !4
+ %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4
+ %cmp66 = icmp ult ptr %add.ptr86, undef
br i1 %cmp66, label %for.body68, label %sw.epilog
sw.epilog: ; preds = %for.body68
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 9f876d7bfa8e4..259c52d65485d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -19,7 +19,7 @@ target triple = "x86_64-unknown-linux-gnu"
; }
; Function Attrs: nounwind uwtable
-define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
@@ -31,10 +31,10 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
; CHECK: for.end.us:
-; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
-; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: store i32 [[ADD10_US]], ptr [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
@@ -44,10 +44,10 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
-; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: store i32 [[ADD5_US]], ptr [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
@@ -58,7 +58,7 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]]
-; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV33]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
@@ -70,21 +70,20 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
-; CHECK-NEXT: store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
@@ -101,10 +100,10 @@ entry:
br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
for.end.us: ; preds = %for.body3.us
- %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
- %0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+ %arrayidx9.us = getelementptr inbounds i32, ptr %b, i64 %indvars.iv33
+ %0 = load i32, ptr %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%add10.us = add nsw i32 %0, 3
- store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+ store i32 %add10.us, ptr %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%indvars.iv.next34 = add i64 %indvars.iv33, 1
%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
%exitcond36 = icmp eq i32 %lftr.wideiv35, %m
@@ -115,10 +114,10 @@ for.body3.us: ; preds = %for.body3.us, %for.
%1 = trunc i64 %indvars.iv29 to i32
%add4.us = add i32 %add.us, %1
%idxprom.us = sext i32 %add4.us to i64
- %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
- %2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
+ %arrayidx.us = getelementptr inbounds i32, ptr %a, i64 %idxprom.us
+ %2 = load i32, ptr %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
%add5.us = add nsw i32 %2, 1
- store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
+ store i32 %add5.us, ptr %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
%indvars.iv.next30 = add i64 %indvars.iv29, 1
%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
%exitcond32 = icmp eq i32 %lftr.wideiv31, %m
@@ -128,7 +127,7 @@ for.body3.lr.ph.us: ; preds = %for.end.us, %entry
%indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
%3 = trunc i64 %indvars.iv33 to i32
%add.us = add i32 %3, %k
- %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+ %arrayidx7.us = getelementptr inbounds i32, ptr %a, i64 %indvars.iv33
br label %for.body3.us
for.end15: ; preds = %for.end.us, %entry
@@ -139,7 +138,7 @@ for.end15: ; preds = %for.end.us, %entry
; Here we can see the vectorizer does the mem dep checks and decides it is
; unsafe to vectorize.
-define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+define void @no-par-mem-metadata(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-LABEL: @no-par-mem-metadata(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
@@ -147,10 +146,10 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
; CHECK: for.body3.lr.ph.us.preheader:
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
; CHECK: for.end.us:
-; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3
-; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT: store i32 [[ADD10_US]], ptr [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
@@ -160,10 +159,10 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]]
; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1
-; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4
+; CHECK-NEXT: store i32 [[ADD5_US]], ptr [[ARRAYIDX7_US:%.*]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
@@ -172,7 +171,7 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]]
-; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV33]]
; CHECK-NEXT: br label [[FOR_BODY3_US]]
; CHECK: for.end15.loopexit:
; CHECK-NEXT: br label [[FOR_END15]]
@@ -184,10 +183,10 @@ entry:
br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
for.end.us: ; preds = %for.body3.us
- %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
- %0 = load i32, i32* %arrayidx9.us, align 4
+ %arrayidx9.us = getelementptr inbounds i32, ptr %b, i64 %indvars.iv33
+ %0 = load i32, ptr %arrayidx9.us, align 4
%add10.us = add nsw i32 %0, 3
- store i32 %add10.us, i32* %arrayidx9.us, align 4
+ store i32 %add10.us, ptr %arrayidx9.us, align 4
%indvars.iv.next34 = add i64 %indvars.iv33, 1
%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
%exitcond36 = icmp eq i32 %lftr.wideiv35, %m
@@ -198,10 +197,10 @@ for.body3.us: ; preds = %for.body3.us, %for.
%1 = trunc i64 %indvars.iv29 to i32
%add4.us = add i32 %add.us, %1
%idxprom.us = sext i32 %add4.us to i64
- %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
- %2 = load i32, i32* %arrayidx.us, align 4
+ %arrayidx.us = getelementptr inbounds i32, ptr %a, i64 %idxprom.us
+ %2 = load i32, ptr %arrayidx.us, align 4
%add5.us = add nsw i32 %2, 1
- store i32 %add5.us, i32* %arrayidx7.us, align 4
+ store i32 %add5.us, ptr %arrayidx7.us, align 4
%indvars.iv.next30 = add i64 %indvars.iv29, 1
%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
%exitcond32 = icmp eq i32 %lftr.wideiv31, %m
@@ -211,7 +210,7 @@ for.body3.lr.ph.us: ; preds = %for.end.us, %entry
%indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
%3 = trunc i64 %indvars.iv33 to i32
%add.us = add i32 %3, %k
- %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+ %arrayidx7.us = getelementptr inbounds i32, ptr %a, i64 %indvars.iv33
br label %for.body3.us
for.end15: ; preds = %for.end.us, %entry
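(Editorial aside: the only difference between @foo and @no-par-mem-metadata above is the parallel-loop annotations. A minimal sketch of the legacy metadata this test relies on, with hypothetical values and not taken from the patch: every memory access in the loop names the loop's ID, which is what allows the vectorizer to waive its memory dependence checks.)

loop:                                   ; hypothetical annotated loop
  %v = load i32, ptr %p, align 4, !llvm.mem.parallel_loop_access !0
  store i32 %v, ptr %q, align 4, !llvm.mem.parallel_loop_access !0
  br i1 %c, label %loop, label %exit, !llvm.loop !0

!0 = distinct !{!0}                     ; the loop's identifying metadata node

Without those annotations, as in the second function, the dependence checks run and conclude vectorization is unsafe.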
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 4efdc43f02b95..1c7f3a70d37c3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -9,15 +9,15 @@
; if (arr[i] != 42)
; tot += arr[i];
-define double @sumIfScalar(double* nocapture readonly %arr) {
+define double @sumIfScalar(ptr nocapture readonly %arr) {
; CHECK-LABEL: @sumIfScalar(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; CHECK-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
-; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[I]]
+; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; CHECK-NEXT: [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; CHECK: do.add:
@@ -41,8 +41,8 @@ loop:
%i = phi i32 [0, %entry], [%i.next, %next.iter]
%tot = phi double [0.0, %entry], [%tot.next, %next.iter]
- %addr = getelementptr double, double* %arr, i32 %i
- %nextval = load double, double* %addr
+ %addr = getelementptr double, ptr %arr, i32 %i
+ %nextval = load double, ptr %addr
%tst = fcmp une double %nextval, 42.0
br i1 %tst, label %do.add, label %no.add
@@ -64,7 +64,7 @@ done:
ret double %tot.next
}
-define double @sumIfVector(double* nocapture readonly %arr) {
+define double @sumIfVector(ptr nocapture readonly %arr) {
; SSE-LABEL: @sumIfVector(
; SSE-NEXT: entry:
; SSE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -74,30 +74,29 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; SSE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; SSE-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; SSE-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; SSE-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; SSE-NEXT: [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]],
-; SSE-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; SSE-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP4]],
-; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
+; SSE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
+; SSE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
+; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; SSE-NEXT: [[TMP3:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]],
+; SSE-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; SSE-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]],
+; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP4]], <2 x double> [[VEC_PHI]]
; SSE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; SSE-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; SSE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; SSE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE: middle.block:
-; SSE-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
+; SSE-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
; SSE-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
; SSE-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
; SSE: scalar.ph:
; SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT: br label [[LOOP:%.*]]
; SSE: loop:
; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; SSE-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
-; SSE-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
+; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
+; SSE-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; SSE-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; SSE-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; SSE: do.add:
@@ -111,7 +110,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
; SSE: done:
-; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]]
;
; AVX-LABEL: @sumIfVector(
@@ -123,30 +122,29 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; AVX-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
-; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
-; AVX-NEXT: [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]],
-; AVX-NEXT: [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]],
-; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
+; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
+; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; AVX-NEXT: [[TMP3:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]],
+; AVX-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; AVX-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]],
+; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
; AVX-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; AVX-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX: middle.block:
-; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
+; AVX-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
; AVX: scalar.ph:
; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: br label [[LOOP:%.*]]
; AVX: loop:
; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; AVX-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
-; AVX-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
+; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
+; AVX-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; AVX-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; AVX-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; AVX: do.add:
@@ -160,7 +158,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
; AVX: done:
-; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]]
;
entry:
@@ -170,8 +168,8 @@ loop:
%i = phi i32 [0, %entry], [%i.next, %next.iter]
%tot = phi double [0.0, %entry], [%tot.next, %next.iter]
- %addr = getelementptr double, double* %arr, i32 %i
- %nextval = load double, double* %addr
+ %addr = getelementptr double, ptr %arr, i32 %i
+ %nextval = load double, ptr %addr
%tst = fcmp fast une double %nextval, 42.0
br i1 %tst, label %do.add, label %no.add
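(Editorial aside: in the sumIf tests above, the conditional accumulation is vectorized without branches; the PREDPHI select keeps the previous partial sum in any lane whose element equals 42.0. A minimal sketch of one vector step at VF=2, with hypothetical value names:)

  %wide = load <2 x double>, ptr %addr, align 8
  %mask = fcmp fast une <2 x double> %wide, <double 4.200000e+01, double 4.200000e+01>
  %sum = fadd fast <2 x double> %acc, %wide                                   ; add unconditionally
  %acc.next = select <2 x i1> %mask, <2 x double> %sum, <2 x double> %acc     ; drop masked-off lanes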
[[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT4]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !0, !noalias !3
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x ptr> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3
; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT15]], <8 x i32*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[SMAX2]], 9223372036854775800
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT11]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT18]], <8 x i32>* [[TMP6]], align 4, !alias.scope !7, !noalias !10
-; CHECK-NEXT: [[INDEX_NEXT21]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC11]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX9]]
+; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT14]], ptr [[TMP5]], align 4, !alias.scope !7, !noalias !10
+; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX9]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC7]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT16]], zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER19:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT16]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison), !alias.scope !10
-; CHECK-NEXT: [[PREDPHI20:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER19]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI20]], i64 7
-; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC11]]
-; CHECK-NEXT: br i1 [[CMP_N12]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x ptr> [[BROADCAST_SPLAT12]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[BROADCAST_SPLAT12]], i32 4, <8 x i1> [[TMP7]], <8 x i32> poison), !alias.scope !10
+; CHECK-NEXT: [[PREDPHI16:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[WIDE_MASKED_GATHER15]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[PREDPHI16]], i64 7
+; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC7]]
+; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC7]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[I1:%.*]] = 
getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32* [[A]], null -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[A]], null +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[LATCH]], label [[COND_LOAD:%.*]] ; CHECK: cond_load: -; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[ALOAD:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ] @@ -89,7 +88,7 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[A_LCSSA_LCSSA]] ; entry: @@ -98,14 +97,14 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 - %cmp = icmp ne i32* %a, null - store i32 %ntrunc, i32* %i1 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 + %cmp = icmp ne ptr %a, null + store i32 %ntrunc, ptr %i1 br i1 %cmp, label %cond_load, label %latch cond_load: - %aload = load i32, i32* %a, align 4 + %aload = load i32, ptr %a, align 4 br label %latch latch: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index b45103ce4853f..1f5336d1b6f0d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -7,101 +7,97 @@ target triple = "x86_64-unknown-linux-gnu" ; first test checks that loop with a reduction and a uniform store gets ; vectorized. 
-define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) { ; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction( ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 8 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[SMAX6]], 64 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 64 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775744 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775744 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 8, !alias.scope !0 -; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr inbounds i32, i32* [[TMP0]], i64 48 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 8, !alias.scope !0 -; CHECK-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP9]] = add <16 x i32> [[VEC_PHI8]], [[WIDE_LOAD11]] -; CHECK-NEXT: [[TMP10]] = add <16 x i32> [[VEC_PHI9]], [[WIDE_LOAD12]] -; CHECK-NEXT: [[TMP11]] = add <16 x i32> [[VEC_PHI10]], [[WIDE_LOAD13]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 32 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 48 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI5]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI6]], [[WIDE_LOAD9]] +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <16 x i32> [[TMP11]], [[BIN_RDX14]] -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX15]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <16 x i32> [[TMP7]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP8]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 56 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 56 ; 
CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC17:%.*]] = and i64 [[SMAX6]], 9223372036854775800 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> , i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX2]], 9223372036854775800 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[BC_MERGE_RDX]], i64 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8, !alias.scope !7 -; CHECK-NEXT: [[TMP17]] = add <8 x i32> [[VEC_PHI20]], [[WIDE_LOAD21]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !10, !noalias !7 -; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX15]] +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope !7 +; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI16]], [[WIDE_LOAD17]] +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX15]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP17]]) -; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC17]] -; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]]) +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] 
], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX19:%.*]] = phi i32 [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8 +; CHECK-NEXT: [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX19]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[T2:%.*]] = load i32, ptr [[T1]], align 8 ; CHECK-NEXT: [[T3]] = add i32 [[T0]], [[T2]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T4]] ; entry: @@ -111,10 +107,10 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] %t0 = phi i32 [ %t3, %for.body ], [ 0, %entry ] - %t1 = getelementptr inbounds i32, i32* %b, i64 %i - %t2 = load i32, i32* %t1, align 8 + %t1 = getelementptr inbounds i32, ptr %b, i64 %i + %t2 = load i32, ptr %t1, align 8 %t3 = add i32 %t0, %t2 - store i32 %ntrunc, i32* %a + store i32 %ntrunc, ptr %a %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -126,89 +122,86 @@ for.end: ; preds = %for.body ; Conditional store ; if (b[i] == k) a = ntrunc -define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional( ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 8 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 
1 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[SMAX6]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775792 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775792 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[K:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT8]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <16 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT10]], <16 x i32*> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT4]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT6]], <16 x ptr> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !15, !noalias !18 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT9]], <16 x i32*> [[BROADCAST_SPLAT11]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !18 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !15, !noalias !18 
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[BROADCAST_SPLAT5]], <16 x ptr> [[BROADCAST_SPLAT7]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !18 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT21]], <8 x i32*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[SMAX2]], 9223372036854775800 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT17]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 8, !alias.scope !21, !noalias !24 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD16]], [[BROADCAST_SPLAT18]] -; CHECK-NEXT: [[TMP8:%.*]] = 
bitcast i32* [[TMP5]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP8]], align 4, !alias.scope !21, !noalias !24 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT20]], <8 x i32*> [[BROADCAST_SPLAT22]], i32 4, <8 x i1> [[TMP7]]), !alias.scope !24 -; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX11]] +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP4]], align 8, !alias.scope !21, !noalias !24 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD12]], [[BROADCAST_SPLAT14]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT16]], ptr [[TMP4]], align 4, !alias.scope !21, !noalias !24 +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[BROADCAST_SPLAT16]], <8 x ptr> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP5]]), !alias.scope !24 +; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N10]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8 +; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[T2:%.*]] = load i32, ptr [[T1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[T1]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[T1]], align 4 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]] ; CHECK: cond_store: -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 @@ -223,14 +216,14 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %t1 = getelementptr inbounds i32, i32* %b, i64 %i - %t2 = load i32, i32* %t1, align 8 + %t1 = getelementptr inbounds i32, ptr %b, i64 %i + %t2 = load i32, ptr %t1, align 8 %cmp = icmp eq i32 %t2, %k - store i32 
%ntrunc, i32* %t1 + store i32 %ntrunc, ptr %t1 br i1 %cmp, label %cond_store, label %latch cond_store: - store i32 %ntrunc, i32* %a + store i32 %ntrunc, ptr %a br label %latch latch: @@ -242,106 +235,101 @@ for.end: ; preds = %for.body ret void } -define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) { +define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, ptr %c, i32 %k) { ; CHECK-LABEL: @variant_val_store_to_inv_address_conditional( ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX16:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX16]], 8 +; CHECK-NEXT: [[SMAX10:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX10]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 -; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND09:%.*]] = icmp ugt i32* [[SCEVGEP7]], [[B]] -; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]] -; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] -; CHECK-NEXT: [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP7]], [[A]] -; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[C]] -; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] -; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ugt ptr [[UGLYGEP2]], [[B]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ugt ptr [[UGLYGEP]], [[C]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ugt ptr [[UGLYGEP2]], [[A]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[C]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK17:%.*]] = icmp ult i64 [[SMAX16]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK17]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: 
[[MIN_ITERS_CHECK11:%.*]] = icmp ult i64 [[SMAX10]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK11]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX16]], 9223372036854775792 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX10]], 9223372036854775792 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[K:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT12]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <16 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT14]], <16 x ptr> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !28, !noalias !31 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !28, !noalias !31 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP3]], align 4, !alias.scope !28, !noalias !31 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP5]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope !34 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !35, !noalias !34 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT13]], ptr [[TMP1]], align 4, !alias.scope !28, !noalias !31 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope !34 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x ptr> [[BROADCAST_SPLAT15]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !35, !noalias !34 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX16]], 8 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX10]], 8 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC23:%.*]] = and i64 [[SMAX16]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT27]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT29]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT33:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT32]], <8 x i32*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC17:%.*]] = and i64 [[SMAX10]], 9223372036854775800 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT21]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT23]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT26]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT34:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 8, !alias.scope !37, !noalias !40 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD26]], [[BROADCAST_SPLAT28]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT30]], <8 x i32>* [[TMP10]], align 4, !alias.scope !37, !noalias !40 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD31:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison), !alias.scope !43 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD31]], <8 x i32*> [[BROADCAST_SPLAT33]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !44, !noalias !43 -; CHECK-NEXT: [[INDEX_NEXT34]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT34]], [[N_VEC23]] -; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] +; CHECK-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX19]] +; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope !37, !noalias !40 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD20]], [[BROADCAST_SPLAT22]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT24]], ptr [[TMP5]], align 4, !alias.scope !37, !noalias !40 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX19]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope !43 +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD25]], <8 x ptr> [[BROADCAST_SPLAT27]], i32 4, <8 x i1> [[TMP6]]), !alias.scope !44, !noalias !43 +; CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX19]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC17]] +; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N24:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC23]] -; CHECK-NEXT: br i1 [[CMP_N24]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]] +; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8 +; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[T2:%.*]] = load i32, ptr [[T1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[T1]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[T1]], align 4 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]] ; CHECK: cond_store: -; CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]] -; CHECK-NEXT: [[T4:%.*]] = load i32, i32* [[T3]], align 8 -; CHECK-NEXT: store i32 [[T4]], i32* [[A]], align 4 +; 
CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[I]] +; CHECK-NEXT: [[T4:%.*]] = load i32, ptr [[T3]], align 8 +; CHECK-NEXT: store i32 [[T4]], ptr [[A]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 @@ -356,16 +344,16 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %t1 = getelementptr inbounds i32, i32* %b, i64 %i - %t2 = load i32, i32* %t1, align 8 + %t1 = getelementptr inbounds i32, ptr %b, i64 %i + %t2 = load i32, ptr %t1, align 8 %cmp = icmp eq i32 %t2, %k - store i32 %ntrunc, i32* %t1 + store i32 %ntrunc, ptr %t1 br i1 %cmp, label %cond_store, label %latch cond_store: - %t3 = getelementptr inbounds i32, i32* %c, i64 %i - %t4 = load i32, i32* %t3, align 8 - store i32 %t4, i32* %a + %t3 = getelementptr inbounds i32, ptr %c, i64 %i + %t4 = load i32, ptr %t3, align 8 + store i32 %t4, ptr %a br label %latch latch: diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index 17fafe50a7592..e5aed27a9f240 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 target triple = "x86_64-unknown-linux-gnu" ; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC. -define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { +define void @test1(ptr noalias %src, ptr noalias %dst) #0 { ; CHECK-LABEL: @test1( ; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] @@ -16,17 +16,15 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 64 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -36,19 +34,17 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64 -; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16 -; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16 +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 17, 16 ; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -57,10 +53,10 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { ; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]] ; CHECK: loop-memcpy-expansion: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] -; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, i8* [[LDADDR]], align 64 -; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I]] -; CHECK-NEXT: store i8 [[VAL]], i8* [[STADDR]], align 64 +; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64 +; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]] +; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17 
; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP_MEMCPY_EXPANSION]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -72,10 +68,10 @@ entry:
loop-memcpy-expansion:
%i = phi i64 [ 0, %entry ], [ %i.next, %loop-memcpy-expansion ]
- %ldaddr = getelementptr inbounds i8, i8 * %src, i64 %i
- %val = load i8, i8 * %ldaddr, align 64
- %staddr = getelementptr inbounds i8, i8 * %dst, i64 %i
- store i8 %val, i8 * %staddr, align 64
+ %ldaddr = getelementptr inbounds i8, ptr %src, i64 %i
+ %val = load i8, ptr %ldaddr, align 64
+ %staddr = getelementptr inbounds i8, ptr %dst, i64 %i
+ store i8 %val, ptr %staddr, align 64
%i.next = add i64 %i, 1
%is.next = icmp ult i64 %i.next, 17
br i1 %is.next, label %loop-memcpy-expansion, label %exit
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 8f9a9164de80a..40a449f711720 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu"
;; We don't need a masked.load for this due to deref facts, and can instead
;; use a plain vector load.
-declare void @init(i32* nocapture nofree)
+declare void @init(ptr nocapture nofree)
;; For ease of explanation, this one demonstrates
;; with a range check, but there are better lowering options specifically for
@@ -18,8 +18,8 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-LABEL: @test_explicit_pred(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT: call void @init(i32* [[BASE]])
+; CHECK-NEXT: call void @init(ptr [[ALLOCA]])
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[LEN:%.*]], i32 0
@@ -34,10 +33,10 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4>
@@ -49,48 +48,44 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT8]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT10]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP8]], i32 12
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP18:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI16:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD13]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD14]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI18:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD15]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]]
-; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]]
-; CHECK-NEXT: [[TMP27]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI18]]
+; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP21]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]]
+; CHECK-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]]
+; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI18]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP25]], [[TMP24]]
-; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP26]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP27]], [[BIN_RDX19]]
-; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[TMP20]]
+; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP22]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP23]], [[BIN_RDX19]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -99,8 +94,8 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[EARLYCND:%.*]] = icmp slt i64 [[IV]], [[LEN]]
; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
; CHECK: pred:
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
; CHECK-NEXT: br label [[LATCH]]
; CHECK: latch:
; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -108,13 +103,12 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
%alloca = alloca [4096 x i32]
- %base = bitcast [4096 x i32]* %alloca to i32*
- call void @init(i32* %base)
+ call void @init(ptr %alloca)
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
@@ -123,8 +117,8 @@ loop:
%earlycnd = icmp slt i64 %iv, %len
br i1 %earlycnd, label %pred, label %latch
pred:
- %addr = getelementptr inbounds i32, i32* %base, i64 %iv
- %val = load i32, i32* %addr
+ %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+ %val = load i32, ptr %addr
br label %latch
latch:
%val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -137,21 +131,20 @@ loop_exit:
}
;; 
Similiar to the above, but without an analyzeable condition. -define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) { +define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) { ; CHECK-LABEL: @test_explicit_pred_generic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -168,106 +161,102 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; 
CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], 
align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], 
align 4 +; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], +; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]] +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* 
[[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -275,24 +264,23 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -308,12 +296,11 @@ loop_exit: ; there are better lowerings, this is a test of robustness of vectorization, ; nothing more.) ; TODO: currently shows predication which can be removed -define i32 @test_invariant_address(i64 %len, i1* %test_base) { +define i32 @test_invariant_address(i64 %len, ptr %test_base) { ; CHECK-LABEL: @test_invariant_address( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -339,82 +326,82 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr 
inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: 
[[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP65:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = load i32, i32* [[BASE]], align 4 +; CHECK-NEXT: [[TMP64:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP65:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP67:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0 ; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1 ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2 ; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3 -; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[BASE]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0 ; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2 ; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3 -; CHECK-NEXT: [[TMP80:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP81:%.*]] = 
load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = load i32, i32* [[BASE]], align 4 +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[BASE]], align 4 +; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 ; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 ; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 @@ -449,11 +436,11 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[BASE]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -466,18 +453,17 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) { ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %val = load i32, i32* %base + %val = load i32, ptr %alloca br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -490,21 +476,20 @@ loop_exit: } ; Overlapping loads - Fails alignment checking, not dereferenceability -define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) { +define i32 @test_step_narrower_than_access(i64 %len, ptr 
%test_base) { ; CHECK-LABEL: @test_step_narrower_than_access( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP148:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP149:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP150:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP151:%.*]], [[PRED_LOAD_CONTINUE33]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -521,50 +506,50 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = 
load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: 
[[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -572,217 +557,183 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32* -; CHECK-NEXT: [[TMP68:%.*]] = load i32, i32* [[TMP67]], align 4 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> poison, i32 [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] +; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 +; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] ; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32* -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[TMP74]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] ; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF6:%.*]], label 
[[PRED_LOAD_CONTINUE7:%.*]] +; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 +; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] ; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32* -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[TMP81]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] ; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP83]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] ; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* -; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3 +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] ; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP84]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP90]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 +; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32* -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: 
pred.load.continue11: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP97]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 +; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32* -; CHECK-NEXT: [[TMP103:%.*]] = load i32, i32* [[TMP102]], align 4 -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP104]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP106]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32* -; CHECK-NEXT: [[TMP110:%.*]] = load i32, i32* [[TMP109]], align 4 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP111]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 +; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32* -; CHECK-NEXT: [[TMP117:%.*]] = load i32, i32* [[TMP116]], align 4 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> 
[[TMP112]], i32 [[TMP117]], i32 3 +; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP118]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 +; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32* -; CHECK-NEXT: [[TMP124:%.*]] = load i32, i32* [[TMP123]], align 4 -; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> poison, i32 [[TMP124]], i32 0 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP125]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 +; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP128:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP132]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 +; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; 
CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32* -; CHECK-NEXT: [[TMP138:%.*]] = load i32, i32* [[TMP137]], align 4 -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2 +; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 +; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP139]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 +; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32* -; CHECK-NEXT: [[TMP145:%.*]] = load i32, i32* [[TMP144]], align 4 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3 +; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP146]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 +; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32* -; CHECK-NEXT: [[TMP152:%.*]] = load i32, i32* [[TMP151]], align 4 -; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x i32> poison, i32 [[TMP152]], i32 0 +; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 +; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP153]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 
[[TMP155]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 +; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP156:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds i16, i16* [[TMP156]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32* -; CHECK-NEXT: [[TMP159:%.*]] = load i32, i32* [[TMP158]], align 4 -; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1 +; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 +; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP160]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 +; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32* -; CHECK-NEXT: [[TMP166:%.*]] = load i32, i32* [[TMP165]], align 4 -; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 +; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP167]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 +; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32* -; CHECK-NEXT: [[TMP173:%.*]] = load i32, i32* [[TMP172]], align 4 -; CHECK-NEXT: [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3 +; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 +; CHECK-NEXT: 
[[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]]
; CHECK: pred.load.continue33:
-; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP174]], [[PRED_LOAD_IF32]] ]
-; CHECK-NEXT: [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]]
-; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]]
-; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]]
+; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ]
+; CHECK-NEXT: [[TMP144:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP145:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP146:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP147:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP148]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP149]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]]
+; CHECK-NEXT: [[TMP150]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]]
+; CHECK-NEXT: [[TMP151]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP152:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP152]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]]
-; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]]
-; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]]
+; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]]
+; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[BASE_I16P:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[ADDR_I16P:%.*]] = getelementptr inbounds i16, i16* [[BASE_I16P]], i64 [[IV]] -; CHECK-NEXT: [[ADDR:%.*]] = bitcast i16* [[ADDR_I16P]] to i32* -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR_I16P:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR_I16P]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -790,26 +741,23 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %base.i16p = bitcast i32* %base to i16* - %addr.i16p = getelementptr inbounds i16, i16* %base.i16p, i64 %iv - %addr = bitcast i16* %addr.i16p to i32* - %val = load i32, i32* %addr + %addr.i16p = getelementptr inbounds i16, ptr %alloca, i64 %iv + %val = load i32, ptr %addr.i16p br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -821,12 +769,11 @@ loop_exit: ret i32 %accum.next } -define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) { +define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) { ; CHECK-LABEL: @test_max_trip_count( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: [[MIN_CMP:%.*]] = icmp ult i64 4096, [[N:%.*]] ; CHECK-NEXT: [[MIN_N:%.*]] = select i1 
[[MIN_CMP]], i64 4096, i64 [[N]] ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[MIN_N]], 2 @@ -838,10 +785,10 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -858,106 +805,102 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) { ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr 
[[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 ; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 ; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 
-; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
-; CHECK-NEXT: [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1
+; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
-; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
-; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
-; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = 
getelementptr i32, ptr [[TMP65]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP81:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP81]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP78]], [[TMP77]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP80]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr 
[[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -965,13 +908,12 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN_N]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) %min.cmp = icmp ult i64 4096, %n %min.n = select i1 %min.cmp, i64 4096, i64 %n br label %loop @@ -979,12 +921,12 @@ loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -999,21 +941,20 @@ loop_exit: -define i32 @test_non_zero_start(i64 %len, i1* %test_base) { +define i32 @test_non_zero_start(i64 %len, ptr %test_base) { ; CHECK-LABEL: @test_non_zero_start( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1024, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 @@ -1031,106 +972,102 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] 
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* 
[[TMP64]], i32 0
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], 
[[PREDPHI]] +; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 +; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]] +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -1138,24 +1075,23 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi 
i64 [ 1024, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -1167,23 +1103,22 @@ loop_exit: ret i32 %accum.next } -define i32 @neg_out_of_bounds_start(i64 %len, i1* %test_base) { +define i32 @neg_out_of_bounds_start(i64 %len, ptr %test_base) { ; CHECK-LABEL: @neg_out_of_bounds_start( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -10, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -1196,19 +1131,18 @@ define i32 @neg_out_of_bounds_start(i64 %len, i1* %test_base) { ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ -10, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -1222,12 +1156,11 @@ loop_exit: ;; TODO: handle non-unit strides -define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { +define i32 @test_non_unit_stride(i64 %len, ptr %test_base) { ; CHECK-LABEL: @test_non_unit_stride( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: 
call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1254,50 +1187,50 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr 
[[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 @@ -1305,8 +1238,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 ; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; 
CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: @@ -1314,8 +1247,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 ; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] ; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, i32* [[TMP70]], align 4 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] ; CHECK: pred.load.continue5: @@ -1323,8 +1256,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 ; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] ; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, i32* [[TMP75]], align 4 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 ; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] ; CHECK: pred.load.continue7: @@ -1332,8 +1265,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 ; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] ; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4 +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 ; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] ; CHECK: pred.load.continue9: @@ -1341,8 +1274,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 ; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load i32, i32* [[TMP85]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: pred.load.continue11: @@ -1350,8 +1283,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 ; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: 
[[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 ; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: @@ -1359,8 +1292,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 ; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: @@ -1368,8 +1301,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 ; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, i32* [[TMP100]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 ; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: @@ -1377,8 +1310,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 ; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, i32* [[TMP105]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 ; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: @@ -1386,8 +1319,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 ; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, i32* [[TMP110]], align 4 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 ; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: @@ -1395,8 +1328,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 ; CHECK-NEXT: br i1 
[[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, i32* [[TMP115]], align 4 +; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 ; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: @@ -1404,8 +1337,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 ; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4 +; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 ; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: @@ -1413,8 +1346,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 ; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, i32* [[TMP125]], align 4 +; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 ; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: @@ -1422,8 +1355,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 ; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4 +; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 ; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: @@ -1431,8 +1364,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 ; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, i32* [[TMP135]], align 4 +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 ; CHECK-NEXT: br label 
[[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: @@ -1440,8 +1373,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 ; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, i32* [[TMP140]], align 4 +; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 ; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: @@ -1476,12 +1409,12 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -1494,19 +1427,18 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) { ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 2 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -1518,21 +1450,20 @@ loop_exit: ret i32 %accum.next } -define i32 @neg_off_by_many(i64 %len, i1* %test_base) { +define i32 @neg_off_by_many(i64 %len, ptr %test_base) { ; CHECK-LABEL: @neg_off_by_many( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [1024 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1549,106 +1480,102 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 
+; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK: pred:
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+;
CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -1656,24 +1583,23 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [1024 x i32] - %base = bitcast [1024 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -1685,21 +1611,20 @@ loop_exit: ret i32 %accum.next } -define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) { +define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) { ; CHECK-LABEL: @neg_off_by_one_iteration( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4095 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4095 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1716,106 +1641,102 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: 
[[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = 
insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop
[[LOOP18:![0-9]+]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]] +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -1823,24 +1744,23 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [4095 x i32] - %base = bitcast [4095 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, 
i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -1852,21 +1772,20 @@ loop_exit: ret i32 %accum.next } -define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) { +define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) { ; CHECK-LABEL: @neg_off_by_one_byte( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16383 x i8], align 1 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [16383 x i8]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1883,106 +1802,102 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] 
= getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: 
[[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
;
CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -1990,24 +1905,23 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [16383 x i8] - %base = bitcast [16383 x i8]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -2022,12 +1936,11 @@ loop_exit: ; Show that we handle case where exit count is non-constant, but that we ; have a constant bound on it which is sufficient to show dereferenceability. 
-define i32 @test_constant_max(i64 %len, i1* %test_base) { +define i32 @test_constant_max(i64 %len, ptr %test_base) { ; CHECK-LABEL: @test_constant_max( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[LEN:%.*]], 4094 ; CHECK-NEXT: [[MIN:%.*]] = select i1 [[CMP]], i64 4094, i64 [[LEN]] ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[MIN]], 2 @@ -2039,10 +1952,10 @@ define i32 @test_constant_max(i64 %len, i1* %test_base) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -2059,106 +1972,102 @@ define i32 @test_constant_max(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: 
[[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 ; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load 
i1, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
+; CHECK-NEXT: [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
 ; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
-; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
-; CHECK-NEXT: [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1
+; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
-; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP70]], align 4
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
-; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP72]], align 4
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
-; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP76]], align 4
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP81:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP81]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP78]], [[TMP77]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP80]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK: pred:
-; CHECK-NEXT: 
[[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -2166,13 +2075,12 @@ define i32 @test_constant_max(i64 %len, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) %cmp = icmp ugt i64 %len, 4094 %min = select i1 %cmp, i64 4094, i64 %len br label %loop @@ -2180,12 +2088,12 @@ loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -2199,24 +2107,23 @@ loop_exit: ;; Model a custom allocate which allocates in chunks of 8 bytes -declare align 8 dereferenceable_or_null(8) i8* @my_alloc(i32) allocsize(0) -declare align 8 dereferenceable_or_null(8) i8* @my_array_alloc(i32, i32) allocsize(0, 1) +declare align 8 dereferenceable_or_null(8) ptr @my_alloc(i32) allocsize(0) +declare align 8 dereferenceable_or_null(8) ptr @my_array_alloc(i32, i32) allocsize(0, 1) -define i32 @test_allocsize(i64 %len, i1* %test_base) nofree nosync { +define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync { ; CHECK-LABEL: @test_allocsize( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCATION:%.*]] = call nonnull i8* @my_alloc(i32 16384) -; CHECK-NEXT: [[BASE:%.*]] = bitcast i8* [[ALLOCATION]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: [[ALLOCATION:%.*]] = call nonnull ptr @my_alloc(i32 16384) +; CHECK-NEXT: call void @init(ptr [[ALLOCATION]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2233,106 +2140,102 @@ define i32 @test_allocsize(i64 %len, i1* %test_base) nofree nosync { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = 
getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = 
getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], 
<4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]] +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -2340,24 +2243,23 @@ define i32 @test_allocsize(i64 %len, i1* %test_base) nofree nosync { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ 
[[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: - %allocation = call nonnull i8* @my_alloc(i32 16384) - %base = bitcast i8* %allocation to i32* - call void @init(i32* %base) + %allocation = call nonnull ptr @my_alloc(i32 16384) + call void @init(ptr %allocation) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %allocation, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] @@ -2370,21 +2272,20 @@ loop_exit: } -define i32 @test_allocsize_array(i64 %len, i1* %test_base) nofree nosync { +define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync { ; CHECK-LABEL: @test_allocsize_array( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCATION:%.*]] = call nonnull i8* @my_array_alloc(i32 4096, i32 4) -; CHECK-NEXT: [[BASE:%.*]] = bitcast i8* [[ALLOCATION]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: [[ALLOCATION:%.*]] = call nonnull ptr @my_array_alloc(i32 4096, i32 4) +; CHECK-NEXT: call void @init(ptr [[ALLOCATION]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2401,106 +2302,102 @@ define i32 @test_allocsize_array(i64 %len, i1* %test_base) nofree nosync { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; 
CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = 
load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; 
CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]] +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -2508,24 +2405,23 @@ define i32 @test_allocsize_array(i64 %len, i1* %test_base) nofree nosync { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: - %allocation = call nonnull i8* @my_array_alloc(i32 4096, i32 4) - %base = bitcast i8* %allocation to i32* - call void @init(i32* %base) + %allocation = call nonnull ptr @my_array_alloc(i32 4096, i32 4) + call void @init(ptr %allocation) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %allocation, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, 
%loop], [%val, %pred] @@ -2537,21 +2433,20 @@ loop_exit: ret i32 %accum.next } -declare void @my_free(i8*) +declare void @my_free(ptr) ; For the point in time variant of deref(N) semantics, show a negative ; example where hoisting without explicit predication might introduce a ; dynamic use after free. (e.g. allzero is true when all elements of the ; test vector are false and thus base is never accessed.) -define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) { +define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) { ; CHECK-LABEL: @test_allocsize_cond_deref( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCATION:%.*]] = call nonnull i8* @my_alloc(i32 16384) -; CHECK-NEXT: [[BASE:%.*]] = bitcast i8* [[ALLOCATION]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: [[ALLOCATION:%.*]] = call nonnull ptr @my_alloc(i32 16384) +; CHECK-NEXT: call void @init(ptr [[ALLOCATION]]) ; CHECK-NEXT: br i1 [[ALLZERO:%.*]], label [[FREEIT:%.*]], label [[PREHEADER:%.*]] ; CHECK: freeit: -; CHECK-NEXT: call void @my_free(i8* [[ALLOCATION]]) +; CHECK-NEXT: call void @my_free(ptr [[ALLOCATION]]) ; CHECK-NEXT: br label [[PREHEADER]] ; CHECK: preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -2559,10 +2454,10 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2579,106 +2474,102 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr 
inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i1, ptr [[TMP23]], 
align 1 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 ; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 
x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT: [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT: [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT: [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> 
[[TMP79]], [[BIN_RDX10]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] ; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] @@ -2686,16 +2577,15 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: - %allocation = call nonnull i8* @my_alloc(i32 16384) - %base = bitcast i8* %allocation to i32* - call void @init(i32* %base) + %allocation = call nonnull ptr @my_alloc(i32 16384) + call void @init(ptr %allocation) br i1 %allzero, label %freeit, label %preheader freeit: - call void @my_free(i8* %allocation) + call void @my_free(ptr %allocation) br label %preheader preheader: br label %loop @@ -2703,12 +2593,12 @@ loop: %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ] %accum = phi i32 [ 0, %preheader ], [ %accum.next, %latch ] %iv.next = add i64 %iv, 1 - %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv - %earlycnd = load i1, i1* %test_addr + %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv + %earlycnd = load i1, ptr %test_addr br i1 %earlycnd, label %pred, label %latch pred: - %addr = getelementptr inbounds i32, i32* %base, i64 %iv - %val = load i32, i32* %addr + %addr = getelementptr inbounds i32, ptr %allocation, i64 %iv + %val = load i32, ptr %addr br label %latch latch: %val.phi = phi i32 [0, %loop], [%val, %pred] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 7308afab01372..c03b0295f31d4 100644 --- 
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -17,12 +17,12 @@ target triple = "x86_64-pc_linux"
; }
;}
-define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1(
; AVX1-NEXT: entry:
-; AVX1-NEXT: [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX1-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -36,23 +36,20 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
-; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
-; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison)
-; AVX1-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
-; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP6]])
+; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
+; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -61,16 +58,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
-; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -81,9 +78,9 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
;
; AVX2-LABEL: @foo1(
; AVX2-NEXT: entry:
-; AVX2-NEXT: [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX2-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -100,65 +97,53 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
-; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
-; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24
-; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4
-; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0
-; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 8
-; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16
-; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 24
-; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX2-NEXT: [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX2-NEXT: [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
-; AVX2-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]]
-; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0
-; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP18]])
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 8
-; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP19]])
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16
-; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP36]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP20]])
-; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 24
-; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP37]], <8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP21]])
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
+; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
+; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
+; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
+; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 24
+; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4
+; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP5]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i32 8
+; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP18]], i32 16
+; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP18]], i32 24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP26:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT: [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX2-NEXT: [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX2-NEXT: [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i32 0
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i32 8
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP30]], i32 16
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <8 x i1> [[TMP16]])
+; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr [[TMP30]], i32 24
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP29]], ptr [[TMP37]], i32 4, <8 x i1> [[TMP17]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX2-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -167,16 +152,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
-; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; AVX2-NEXT: [[TMP52:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]]
-; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP40]], [[TMP39]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -187,9 +172,9 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
;
; AVX512-LABEL: @foo1(
; AVX512-NEXT: iter.check:
-; AVX512-NEXT: [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX512-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -208,65 +193,53 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48
-; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
-; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
-; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32
-; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4
-; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48
-; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4
-; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0
-; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison)
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16
-; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison)
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 32
-; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison)
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 48
-; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison)
-; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX512-NEXT: [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX512-NEXT: [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
-; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
-; AVX512-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]]
-; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0
-; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>*
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP18]])
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16
-; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP19]])
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 32
-; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP36]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP20]])
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 48
-; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP37]], <16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
+; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 32
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
+; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 48
+; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP13]], align 4
+; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP4]]
+; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP5]]
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
+; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i32 16
+; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP18]], i32 32
+; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP18]], i32 48
+; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison)
+; AVX512-NEXT: [[TMP26:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT: [[TMP27:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX512-NEXT: [[TMP28:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX512-NEXT: [[TMP29:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
+; AVX512-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i32 0
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i32 16
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP30]], i32 32
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr [[TMP30]], i32 48
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP29]], ptr [[TMP37]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; AVX512-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -276,25 +249,22 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
-; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP51]]
-; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0
-; AVX512-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <8 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP54]], align 4
-; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP51]]
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr i32, i32* [[TMP56]], i32 0
-; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison)
-; AVX512-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
-; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP51]]
-; AVX512-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP60]], i32 0
-; AVX512-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <8 x i32>*
-; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP59]], <8 x i32>* [[TMP62]], i32 4, <8 x i1> [[TMP55]])
-; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
-; AVX512-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
-; AVX512-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP39:%.*]] = add i64 [[INDEX12]], 0
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP39]]
+; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP40]], i32 0
+; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP41]], align 4
+; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP39]]
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[TMP43]], i32 0
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison)
+; AVX512-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP39]]
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[TMP46]], i32 0
+; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP45]], ptr [[TMP47]], i32 4, <8 x i1> [[TMP42]])
+; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
+; AVX512-NEXT: br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -303,16 +273,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP64:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
-; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
-; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -326,17 +296,17 @@ entry:
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
- %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
- %arrayidx3 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
- %1 = load i32, i32* %arrayidx3, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
%add = add nsw i32 %1, %0
- %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- store i32 %add, i32* %arrayidx7, align 4
+ %arrayidx7 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
@@ -350,12 +320,12 @@ for.end: ; preds = %for.inc
; The same as @foo1 but all the pointers are address space 1 pointers.
-define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1_addrspace1(
; AVX1-NEXT: entry:
-; AVX1-NEXT: [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
-; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
-; AVX1-NEXT: [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
+; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
+; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
+; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -369,23 +339,20 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP3]], i32 0
-; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)*
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4
-; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP7]], i32 0
-; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison)
-; AVX1-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP11]], i32 0
-; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
-; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP10]], <8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP6]])
+; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
+; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP6]], i32 0
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
+; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP9]], i32 0
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -394,16 +361,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
-; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
-; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
+; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -414,9 +381,9 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
;
; AVX2-LABEL: @foo1_addrspace1(
; AVX2-NEXT: entry:
-; AVX2-NEXT: [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
-; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
-; AVX2-NEXT: [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
+; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
+; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
+; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -433,65 +400,53 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]]
-; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]]
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0
-; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4
-; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 8
-; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16
-; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 24
-; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP17]], align 4
-; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0
-; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 8
-; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16
-; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 24
-; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison)
-; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX2-NEXT: [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX2-NEXT: [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]]
-; AVX2-NEXT: [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]]
-; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0
-; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP18]])
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 8
-; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP19]])
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16
-; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP36]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP20]])
-; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 24
-; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP37]], <8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP21]])
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP5]]
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 0
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP10]], align 4
+; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 8
+; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP11]], align 4
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 16
+; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP12]], align 4
+; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 24
+; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP13]], align 4
+; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP5]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 0
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 8
+; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 16
+; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison)
+; AVX2-NEXT: [[TMP26:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT: [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX2-NEXT: [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX2-NEXT: [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP5]]
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 0
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 8
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 16
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <8 x i1> [[TMP16]])
+; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 24
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <8 x i1> [[TMP17]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX2-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -500,16 +455,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
addrspace(1)* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP52:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]] -; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP40]], [[TMP39]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -520,9 +475,9 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; ; AVX512-LABEL: @foo1_addrspace1( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64 -; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64 -; AVX512-NEXT: [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64 +; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64 +; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64 +; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64 ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] @@ -541,65 +496,53 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]] -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]] -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 32 -; 
AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 48 -; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP17]], align 4 -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], -; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], -; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]] -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]] -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0 -; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16 -; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 32 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison) -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 48 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison) -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] -; AVX512-NEXT: [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] -; AVX512-NEXT: [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]] -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]] -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0 -; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* 
[[TMP43]], i32 4, <16 x i1> [[TMP18]]) -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP19]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 32 -; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP36]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP20]]) -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 48 -; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP37]], <16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP10]], align 4 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 16 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP12]], align 4 +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], +; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison) +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison) +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) +; AVX512-NEXT: 
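These capture patterns ([[TMP39:%.*]] and so on) are characteristic of autogenerated FileCheck assertions rather than hand-written ones, which is why the whole block churns whenever value numbering changes. A diff like this is presumably produced by converting the input IR to opaque pointers and then rerunning the update script, roughly as below (the test path is a placeholder, not taken from this diff):

; Rerun from an LLVM build tree; --opt-binary selects the freshly built opt.
; python3 llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
;     llvm/test/Transforms/LoopVectorize/X86/<this-test>.ll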
[[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) +; AVX512-NEXT: [[TMP26:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP27:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX512-NEXT: [[TMP28:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX512-NEXT: [[TMP29:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <16 x i1> [[TMP14]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP15]]) +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -609,25 +552,22 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0 -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP51]] -; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP52]], i32 0 -; AVX512-NEXT: [[TMP54:%.*]] = bitcast i32 addrspace(1)* [[TMP53]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP54]], align 4 -; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP51]] -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr i32, i32 addrspace(1)* 
[[TMP56]], i32 0 -; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32 addrspace(1)* [[TMP57]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison) -; AVX512-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP51]] -; AVX512-NEXT: [[TMP61:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP60]], i32 0 -; AVX512-NEXT: [[TMP62:%.*]] = bitcast i32 addrspace(1)* [[TMP61]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP59]], <8 x i32> addrspace(1)* [[TMP62]], i32 4, <8 x i1> [[TMP55]]) -; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8 -; AVX512-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 -; AVX512-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP39:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP39]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP40]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP41]], align 4 +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP39]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP43]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison) +; AVX512-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP39]] +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP46]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP45]], ptr addrspace(1) [[TMP47]], i32 4, <8 x i1> [[TMP42]]) +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 +; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000 ; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] @@ -636,16 +576,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP64:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100 +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 +; AVX512-NEXT: 
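The constants in the epilogue checks just above follow directly from the loop shape: the main AVX512 vector body advances the induction variable by 64 per iteration (VF 16, interleaved four ways, matching the four <16 x i32> loads), the epilogue body advances by 8, and the source trip count is 10000. A quick sanity check of the numbers as they appear in the branches (hedged arithmetic, not from the test):

  main-loop step = 16 x 4 = 64
  main-loop trip = 64 * floor(10000 / 64) = 9984   (the icmp eq i64 [[INDEX_NEXT]], 9984 exit)
  epilogue trip  = (10000 - 9984) / 8 = 2          (lands exactly on 10000)

Because the epilogue ends exactly at 10000, CMP_N11 folds to icmp eq i64 10000, 10000, so on a successful vector execution the scalar for.body tail below never runs.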
[[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
-; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
-; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -659,17 +599,17 @@ entry:
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
- %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %trigger, i64 %indvars.iv
- %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %trigger, i64 %indvars.iv
+ %0 = load i32, ptr addrspace(1) %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
- %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %indvars.iv
- %1 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %B, i64 %indvars.iv
+ %1 = load i32, ptr addrspace(1) %arrayidx3, align 4
%add = add nsw i32 %1, %0
- %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %indvars.iv
- store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
+ %arrayidx7 = getelementptr inbounds i32, ptr addrspace(1) %A, i64 %indvars.iv
+ store i32 %add, ptr addrspace(1) %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
@@ -683,7 +623,7 @@ for.end: ; preds = %for.inc
; The source code:
;
-;void foo2(float *A, float *B, int *trigger) {
+;void foo2(float *A, float *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
@@ -692,12 +632,12 @@ for.end: ; preds = %for.inc
; }
;}
-define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo2(
; AVX1-NEXT: entry:
-; AVX1-NEXT: [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
-; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX1-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
+; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -711,24 +651,21 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]]
], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4 -; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP7]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x float> poison) -; AVX1-NEXT: [[TMP10:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX1-NEXT: [[TMP11:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP10]] -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[TMP12]], i32 0 -; AVX1-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP11]], <8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP6]]) +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison) +; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP11]], i32 4, <8 x i1> [[TMP5]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -737,17 +674,17 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP16]], 100 +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP13:%.*]] = load i32, 
ptr [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP16]] to float -; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], [[CONV]] -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to float +; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -758,9 +695,9 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; ; AVX2-LABEL: @foo2( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64 -; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 -; AVX2-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64 +; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64 +; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: ; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] @@ -777,69 +714,57 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 ; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24 -; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4 -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], -; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> 
[[WIDE_LOAD6]], -; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0 -; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 8 -; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 16 -; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x float> poison) -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 24 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x float> poison) -; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float> -; AVX2-NEXT: [[TMP36:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float> -; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float> -; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP34]] -; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]] -; AVX2-NEXT: [[TMP40:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]] -; AVX2-NEXT: [[TMP41:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]] -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0 -; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 8 -; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP19]]) -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 16 -; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP40]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP20]]) -; AVX2-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 24 -; AVX2-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <8 x float>* -; 
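foo2 has the same control shape as foo1 but mixes element types: the lane mask still comes from an unconditional <8 x i32> load of trigger and an icmp slt against a splat of 100, while the guarded work is a masked <8 x float> load of B, a sitofp of the trigger values, an fadd, and a masked store to A. A single-step sketch of the new (+ side) form, with illustrative names and one 8-lane vector instead of the interleaved group:

define void @foo2_one_step(ptr %A, ptr %B, ptr %trigger) {
  ; Build the lane mask trigger[i] < 100 from an unconditional vector load.
  %t = load <8 x i32>, ptr %trigger, align 4
  %m = icmp slt <8 x i32> %t, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
  ; B[i] is only dereferenced on lanes where the mask is set.
  %b = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %B, i32 4, <8 x i1> %m, <8 x float> poison)
  %conv = sitofp <8 x i32> %t to <8 x float>
  %sum = fadd <8 x float> %b, %conv
  ; Likewise, only masked lanes of A are written.
  call void @llvm.masked.store.v8f32.p0(<8 x float> %sum, ptr %A, i32 4, <8 x i1> %m)
  ret void
}
declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32 immarg, <8 x i1>, <8 x float>)
declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32 immarg, <8 x i1>)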
AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP41]], <8 x float>* [[TMP53]], i32 4, <8 x i1> [[TMP21]]) +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], +; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], +; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x float> poison) +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP18]], i32 8 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x float> poison) +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[TMP18]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison) +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP18]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison) +; AVX2-NEXT: [[TMP26:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX2-NEXT: [[TMP27:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float> +; AVX2-NEXT: [[TMP28:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float> +; AVX2-NEXT: [[TMP29:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float> +; AVX2-NEXT: [[TMP30:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP26]] +; AVX2-NEXT: [[TMP31:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP27]] +; AVX2-NEXT: [[TMP32:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP28]] +; AVX2-NEXT: [[TMP33:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP29]] +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP34]], i32 0 +; AVX2-NEXT: call void 
@llvm.masked.store.v8f32.p0(<8 x float> [[TMP30]], ptr [[TMP38]], i32 4, <8 x i1> [[TMP14]]) +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP34]], i32 8 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP31]], ptr [[TMP39]], i32 4, <8 x i1> [[TMP15]]) +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP34]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP32]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP16]]) +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr float, ptr [[TMP34]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP33]], ptr [[TMP41]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX2-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -848,17 +773,17 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP55]], 100 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP43]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP56:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP55]] to float -; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP56]], [[CONV]] -; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP43]] to float +; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP44]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -869,9 +794,9 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; ; AVX512-LABEL: @foo2( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64 -; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 -; AVX512-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64 +; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64 +; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr 
[[A:%.*]] to i64 ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] @@ -890,69 +815,57 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48 -; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4 -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], -; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], -; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]] -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]] -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0 -; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 16 -; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 32 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x float> poison) -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 48 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> 
@llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x float> poison) -; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> -; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float> -; AVX512-NEXT: [[TMP36:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float> -; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float> -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP34]] -; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]] -; AVX512-NEXT: [[TMP40:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]] -; AVX512-NEXT: [[TMP41:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]] -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]] -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0 -; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP18]]) -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 16 -; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP19]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 32 -; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP40]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP20]]) -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 48 -; AVX512-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP41]], <16 x float>* [[TMP53]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4 +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], +; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, 
ptr [[B]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x float> poison) +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP18]], i32 16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x float> poison) +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[TMP18]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP18]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison) +; AVX512-NEXT: [[TMP26:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> +; AVX512-NEXT: [[TMP27:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float> +; AVX512-NEXT: [[TMP28:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float> +; AVX512-NEXT: [[TMP29:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float> +; AVX512-NEXT: [[TMP30:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP26]] +; AVX512-NEXT: [[TMP31:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP27]] +; AVX512-NEXT: [[TMP32:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP28]] +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP29]] +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP34]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP30]], ptr [[TMP38]], i32 4, <16 x i1> [[TMP14]]) +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP34]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP31]], ptr [[TMP39]], i32 4, <16 x i1> [[TMP15]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP34]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP32]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr float, ptr [[TMP34]], i32 48 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP33]], ptr [[TMP41]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX512-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -962,26 +875,23 @@ define void @foo2(float* 
nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[OFFSET_IDX]], 0 -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP55]] -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i32 0 -; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP58]], align 4 -; AVX512-NEXT: [[TMP59:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr float, float* [[B]], i64 [[TMP55]] -; AVX512-NEXT: [[TMP61:%.*]] = getelementptr float, float* [[TMP60]], i32 0 -; AVX512-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <8 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP62]], i32 4, <8 x i1> [[TMP59]], <8 x float> poison) -; AVX512-NEXT: [[TMP63:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> -; AVX512-NEXT: [[TMP64:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP63]] -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr float, float* [[A]], i64 [[TMP55]] -; AVX512-NEXT: [[TMP66:%.*]] = getelementptr float, float* [[TMP65]], i32 0 -; AVX512-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <8 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP64]], <8 x float>* [[TMP67]], i32 4, <8 x i1> [[TMP59]]) -; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8 -; AVX512-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 -; AVX512-NEXT: br i1 [[TMP68]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP43:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP43]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP45]], align 4 +; AVX512-NEXT: [[TMP46:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP43]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[TMP47]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP48]], i32 4, <8 x i1> [[TMP46]], <8 x float> poison) +; AVX512-NEXT: [[TMP49:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> +; AVX512-NEXT: [[TMP50:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP49]] +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP43]] +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, ptr [[TMP51]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP50]], ptr [[TMP52]], i32 4, <8 x i1> [[TMP46]]) +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 +; AVX512-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP53]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label 
[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -990,17 +900,17 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP54:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP54]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
-; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP70:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP69]] to float
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP70]], [[CONV]]
-; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP55:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP54]] to float
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP55]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -1014,18 +924,18 @@ entry:
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
- %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
- %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
- %arrayidx3 = getelementptr inbounds float, float* %B, i64 %indvars.iv
- %1 = load float, float* %arrayidx3, align 4
+ %arrayidx3 = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx3, align 4
%conv = sitofp i32 %0 to float
%add = fadd float %1, %conv
- %arrayidx7 = getelementptr inbounds float, float* %A, i64 %indvars.iv
- store float %add, float* %arrayidx7, align 4
+ %arrayidx7 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+ store float %add, ptr %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
@@ -1039,7 +949,7 @@ for.end: ; preds = %for.inc
; The source code:
;
-;void foo3(double *A, double *B, int *trigger) {
+;void foo3(double *A, double *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
@@ -1048,27 +958,21 @@ for.end: ; preds = %for.inc
; }
;}
-define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger)
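In the foo3 checks that follow, the runtime aliasing test also changes shape: the old form built one-past-the-end pointers with typed getelementptrs ([[SCEVGEP]] over double and i32 at index 10000) and bitcast them to i8*, while the new form computes the bounds directly in bytes with getelementptr i8 (10000 doubles = 80000 bytes, 10000 i32 = 40000 bytes; UGLYGEP is just the name SCEV expansion gives such geps). Two ranges conflict exactly when each base is ult the other range's end. A hedged sketch of one pair of those checks, names illustrative:

define i1 @may_overlap(ptr %A, ptr %trigger) {
  ; Byte-based one-past-the-end bounds: 10000 x 8-byte double, 10000 x 4-byte i32.
  %A.end = getelementptr i8, ptr %A, i64 80000
  %trigger.end = getelementptr i8, ptr %trigger, i64 40000
  ; The intervals intersect iff each base lies below the other interval's end.
  %bound0 = icmp ult ptr %A, %trigger.end
  %bound1 = icmp ult ptr %trigger, %A.end
  %conflict = and i1 %bound0, %bound1
  ret i1 %conflict
}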
local_unnamed_addr #0 { +define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX-LABEL: @foo3( ; AVX-NEXT: entry: -; AVX-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* -; AVX-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* ; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX: vector.memcheck: -; AVX-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000 -; AVX-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* -; AVX-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000 -; AVX-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* -; AVX-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] +; AVX-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] ; AVX-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] +; AVX-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX: vector.ph: ; AVX-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1078,69 +982,57 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 ; AVX-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !7 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4 -; AVX-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !7 -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 -; AVX-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !7 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds 
i32, i32* [[TMP4]], i32 12 -; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !7 -; AVX-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], -; AVX-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD14]], -; AVX-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]] -; AVX-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 -; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !10 -; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4 -; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !10 -; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8 -; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !10 -; AVX-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12 -; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !10 -; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> -; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double> -; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double> -; AVX-NEXT: [[TMP35:%.*]] = sitofp <4 x i32> [[WIDE_LOAD14]] to <4 x double> -; AVX-NEXT: [[TMP36:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX-NEXT: [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]] -; AVX-NEXT: [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]] -; AVX-NEXT: [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]] -; AVX-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]] -; AVX-NEXT: [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]] -; AVX-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] -; AVX-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 -; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !12, !noalias !14 -; AVX-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4 -; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 
x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !12, !noalias !14 -; AVX-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8 -; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !12, !noalias !14 -; AVX-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12 -; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !7 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 +; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !7 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope !7 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; AVX-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !7 +; AVX-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], +; AVX-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], +; AVX-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], +; AVX-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]] +; AVX-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope !10 +; AVX-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope !10 +; AVX-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 8 +; AVX-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope !10 +; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double> poison), !alias.scope !10 +; AVX-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> +; AVX-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> +; AVX-NEXT: [[TMP27:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x 
double> +; AVX-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP24]] +; AVX-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]] +; AVX-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]] +; AVX-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]] +; AVX-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] +; AVX-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]] +; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]] +; AVX-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !12, !noalias !14 +; AVX-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 4 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !12, !noalias !14 +; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 8 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope !12, !noalias !14 +; AVX-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 12 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP31]], ptr [[TMP39]], i32 8, <4 x i1> [[TMP15]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1149,17 +1041,17 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX-NEXT: br label [[FOR_BODY:%.*]] ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100 +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100 ; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX: if.then: -; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to double -; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]] -; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] -; AVX-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP41]] to double +; 
AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]] +; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; AVX-NEXT: br label [[FOR_INC]] ; AVX: for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -1170,24 +1062,18 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; ; AVX512-LABEL: @foo3( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* -; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX512-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000 -; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1197,69 +1083,57 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 -; AVX512-NEXT: [[TMP11:%.*]] = 
bitcast i32* [[TMP10]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 8 -; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 16 -; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 24 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !14 -; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> -; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double> -; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> -; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x double> -; AVX512-NEXT: [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX512-NEXT: [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]] -; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]] -; AVX512-NEXT: [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr 
double, double* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 8 -; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 16 -; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 24 -; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16 +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 24 +; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], +; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD8]], +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP12]], <8 x double> poison), !alias.scope !14 +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> poison), !alias.scope !14 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 16 +; AVX512-NEXT: 
[[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP14]], <8 x double> poison), !alias.scope !14 +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double> poison), !alias.scope !14 +; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> +; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double> +; AVX512-NEXT: [[TMP26:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x double> +; AVX512-NEXT: [[TMP27:%.*]] = sitofp <8 x i32> [[WIDE_LOAD8]] to <8 x double> +; AVX512-NEXT: [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP24]] +; AVX512-NEXT: [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]] +; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]] +; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]] +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP28]], ptr [[TMP36]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 8 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP29]], ptr [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP30]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP14]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP31]], ptr [[TMP39]], i32 8, <8 x i1> [[TMP15]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1268,17 +1142,17 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100 +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX512-NEXT: 
[[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: -; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to double -; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]] -; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] -; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP41]] to double +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -1292,18 +1166,18 @@ entry: for.body: ; preds = %for.inc, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] - %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %cmp1 = icmp slt i32 %0, 100 br i1 %cmp1, label %if.then, label %for.inc if.then: ; preds = %for.body - %arrayidx3 = getelementptr inbounds double, double* %B, i64 %indvars.iv - %1 = load double, double* %arrayidx3, align 8 + %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %1 = load double, ptr %arrayidx3, align 8 %conv = sitofp i32 %0 to double %add = fadd double %1, %conv - %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv - store double %add, double* %arrayidx7, align 8 + %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %add, ptr %arrayidx7, align 8 br label %for.inc for.inc: ; preds = %for.body, %if.then @@ -1317,7 +1191,7 @@ for.end: ; preds = %for.inc ; The source code: ; -;void foo4(double *A, double *B, int *trigger) { +;void foo4(ptr A, ptr B, int *trigger) { ; ; for (int i=0; i<10000; i += 16) { ; if (trigger[i] < 100) { @@ -1326,24 +1200,24 @@ for.end: ; preds = %for.inc ; } ;} -define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { +define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX-LABEL: @foo4( ; AVX-NEXT: entry: ; AVX-NEXT: br label [[FOR_BODY:%.*]] ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100 ; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX: if.then: ; AVX-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX-NEXT: [[ARRAYIDX3:%.*]] = 
getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]] -; AVX-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP1]] +; AVX-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 ; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double ; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]] -; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] -; AVX-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; AVX-NEXT: br label [[FOR_INC]] ; AVX: for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 @@ -1354,40 +1228,34 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca ; ; AVX512-LABEL: @foo4( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* -; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX512-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985 -; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 +; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 +; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]] +; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x 
i1> , <8 x i32> poison), !alias.scope !21 +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> , <8 x i32> poison), !alias.scope !21 ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope !24 +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope !24 ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> -; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]] -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 @@ -1400,18 +1268,18 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]] -; AVX512-NEXT: [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP9]] +; AVX512-NEXT: [[TMP10:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 ; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP8]] to double ; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]] -; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] -; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], ptr 
[[ARRAYIDX7]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 @@ -1425,19 +1293,19 @@ entry: for.body: ; preds = %entry, %for.inc %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] - %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %cmp1 = icmp slt i32 %0, 100 br i1 %cmp1, label %if.then, label %for.inc if.then: ; preds = %for.body %1 = shl nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds double, double* %B, i64 %1 - %2 = load double, double* %arrayidx3, align 8 + %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %1 + %2 = load double, ptr %arrayidx3, align 8 %conv = sitofp i32 %0 to double %add = fadd double %2, %conv - %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv - store double %add, double* %arrayidx7, align 8 + %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %add, ptr %arrayidx7, align 8 br label %for.inc for.inc: ; preds = %for.body, %if.then @@ -1449,11 +1317,11 @@ for.end: ; preds = %for.inc ret void } -@a = common global [1 x i32*] zeroinitializer, align 8 -@c = common global i32* null, align 8 +@a = common global [1 x ptr] zeroinitializer, align 8 +@c = common global ptr null, align 8 ; Reverse loop -;void foo6(double *in, double *out, unsigned size, int *trigger) { +;void foo6(ptr in, ptr out, unsigned size, int *trigger) { ; ; for (int i=SIZE-1; i>=0; i--) { ; if (trigger[i] > 0) { @@ -1462,22 +1330,22 @@ for.end: ; preds = %for.inc ; } ;} -define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %size, i32* nocapture readonly %trigger) local_unnamed_addr #0 { +define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo6( ; AVX1-NEXT: entry: ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN:%.*]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 ; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01 -; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 
[[INDVARS_IV]], -1 @@ -1488,24 +1356,18 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; ; AVX2-LABEL: @foo6( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8* -; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX2-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8* ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096 -; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX2-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 +; AVX2-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 +; AVX2-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] ; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]] +; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]] +; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1516,93 +1378,81 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -8 ; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -12 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 -; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !17 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr 
inbounds i32, ptr [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -3 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !17 -; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3 -; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !17 -; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 -; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !17 -; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3 -; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !20 -; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3 -; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -4 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -3 +; AVX2-NEXT: 
[[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !17 +; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -8 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -3 +; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope !17 +; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -12 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -3 +; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope !17 +; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer +; AVX2-NEXT: [[TMP18:%.*]] = icmp sgt <4 x i32> [[REVERSE9]], zeroinitializer +; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt <4 x i32> [[REVERSE11]], zeroinitializer +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0 +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -3 +; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !20 +; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -4 +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -3 +; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !20 +; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3 +; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !20 +; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12 +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3 +; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE22:%.*]] = 
shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 -; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3 -; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !20 -; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3 -; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !20 -; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], -; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], -; AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], -; AVX2-NEXT: [[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3 -; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !22, !noalias !24 -; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4 -; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3 -; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24 -; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 -; AVX2-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3 -; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !22, !noalias !24 -; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: 
[[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24 +; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], +; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[REVERSE16]], +; AVX2-NEXT: [[TMP34:%.*]] = fadd <4 x double> [[REVERSE19]], +; AVX2-NEXT: [[TMP35:%.*]] = fadd <4 x double> [[REVERSE22]], +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x double> [[TMP32]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 -3 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope !22, !noalias !24 +; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP36]], i32 -4 +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[TMP42]], i32 -3 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope !22, !noalias !24 +; AVX2-NEXT: [[REVERSE27:%.*]] = shufflevector <4 x double> [[TMP34]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[TMP36]], i32 -8 +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -3 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope !22, !noalias !24 +; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -12 +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -3 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1611,16 +1461,16 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP61:%.*]] 
= load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP49]], 0 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01 -; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP50:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 @@ -1631,24 +1481,18 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; ; AVX512-LABEL: @foo6( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8* -; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX512-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8* ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096 -; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768 +; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384 +; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]] +; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]] +; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label 
[[VECTOR_BODY:%.*]] @@ -1659,93 +1503,81 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -16 ; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -24 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7 -; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31 -; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16 -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7 -; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !31 -; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !31 -; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP26:%.*]] = 
getelementptr double, double* [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0 -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -7 -; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !34 -; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -7 -; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34 +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -8 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -7 +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !31 +; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -16 +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -7 +; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4, !alias.scope !31 +; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -24 +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -7 +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4, !alias.scope !31 +; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer +; AVX512-NEXT: [[TMP18:%.*]] = icmp sgt <8 x i32> [[REVERSE9]], zeroinitializer +; AVX512-NEXT: [[TMP19:%.*]] = icmp sgt <8 x i32> [[REVERSE11]], zeroinitializer +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0 +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -7 +; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> poison, <8 x i32> +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope !34 +; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> 
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -7
+; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope !34
+; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -16
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -7
+; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope !34
+; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24
+; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7
+; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34
 ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -16
-; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -7
-; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !34
-; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -24
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -7
-; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !34
-; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT: [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT: [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector
<8 x double> [[TMP40]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -7 -; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -7 -; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -16 -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -7 -; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -24 -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -7 -; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[REVERSE13]], +; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[REVERSE16]], +; AVX512-NEXT: [[TMP34:%.*]] = fadd <8 x double> [[REVERSE19]], +; AVX512-NEXT: [[TMP35:%.*]] = fadd <8 x double> [[REVERSE22]], +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 -7 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP36]], i32 -8 +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[TMP42]], i32 -7 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[REVERSE27:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: 
[[TMP44:%.*]] = getelementptr double, ptr [[TMP36]], i32 -16 +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -7 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP35]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -24 +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -7 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1754,16 +1586,16 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0 +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP49]], 0 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: -; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01 -; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP50:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 @@ -1777,17 +1609,17 @@ entry: for.body: ; preds = %for.inc, %entry %indvars.iv = phi i64 [ 4095, %entry ], [ %indvars.iv.next, %for.inc ] - %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %cmp1 = icmp sgt i32 %0, 0 br i1 %cmp1, label %if.then, label %for.inc if.then: ; preds = %for.body - %arrayidx3 = getelementptr 
inbounds double, double* %in, i64 %indvars.iv
-  %1 = load double, double* %arrayidx3, align 8
+  %arrayidx3 = getelementptr inbounds double, ptr %in, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx3, align 8
   %add = fadd double %1, 5.000000e-01
-  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-  store double %add, double* %arrayidx5, align 8
+  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
+  store double %add, ptr %arrayidx5, align 8
   br label %for.inc

 for.inc:                                          ; preds = %for.body, %if.then
@@ -1799,7 +1631,7 @@ for.end:                                          ; preds = %for.inc
   ret void
 }

-; void foo7 (double * __restrict__ out, double ** __restrict__ in,
+; void foo7 (ptr __restrict__ out, ptr __restrict__ in,
 ;           bool * __restrict__ trigger, unsigned size) {
 ;
 ;  for (unsigned i=0; i<size; i++)
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
-; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
-; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
-; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
-; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
-; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
-; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
-; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
-; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
-; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
-; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>*
[[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4 +; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 +; AVX1-NEXT: 
[[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 +; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX1-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX1-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX1-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer +; AVX1-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer +; AVX1-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer +; AVX1-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], +; AVX1-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], +; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], +; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], +; AVX1-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], +; AVX1-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], +; AVX1-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], +; AVX1-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 
8, <4 x i1> [[TMP44]]) +; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) +; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) +; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -1909,19 +1729,19 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX1-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: -; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null +; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: -; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -1950,81 +1770,69 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 ; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], -; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison) -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison) -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8 -; AVX2-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison) -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison) -; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> 
[[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX2-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8 -; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12 -; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4 +; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 +; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 +; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX2-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX2-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX2-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer +; AVX2-NEXT: [[TMP17:%.*]] 
= icmp eq <4 x i8> [[TMP13]], zeroinitializer
+; AVX2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer
+; AVX2-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
+; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
+; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
+; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer
+; AVX2-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer
+; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
+; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
+; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
+; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32
8, <4 x i1> [[TMP47]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2033,19 +1841,19 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX2-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX2-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 +; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 ; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX2: land.lhs.true: -; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX2-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null +; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null ; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -2074,81 +1882,69 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX512-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD1:%.*]] 
= load <8 x i8>, <8 x i8>* [[TMP11]], align 1 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1 -; AVX512-NEXT: [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], -; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], -; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], -; AVX512-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], -; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], -; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], -; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x double*> poison) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 8 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> poison) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> poison) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 24 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> poison) -; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* 
[[OUT]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], -; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], -; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], -; AVX512-NEXT: [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], -; AVX512-NEXT: [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]]) -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8 -; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16 -; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]]) -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24 -; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 +; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1 +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24 +; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 +; AVX512-NEXT: [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], +; AVX512-NEXT: [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD2]], +; AVX512-NEXT: [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD3]], +; AVX512-NEXT: [[TMP16:%.*]] = icmp eq <8 x i8> [[TMP12]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer +; AVX512-NEXT: [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer +; AVX512-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; 
AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], +; AVX512-NEXT: [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], +; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], +; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], +; AVX512-NEXT: [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], +; AVX512-NEXT: [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], +; AVX512-NEXT: [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], +; AVX512-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]]) +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; 
AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2157,19 +1953,19 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: -; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null +; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: -; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -2190,21 +1986,21 @@ for.body.preheader: ; preds = %entry for.body: ; preds = %for.inc, %for.body.preheader %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] - %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv - %0 = load i8, i8* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 %1 = and i8 %0, 1 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %for.inc, label %land.lhs.true land.lhs.true: ; preds = %for.body - %arrayidx2 = getelementptr inbounds double*, double** %in, i64 %indvars.iv - %2 = load double*, double** %arrayidx2, align 8 - %cmp3 = icmp eq double* %2, null + %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv + %2 = load ptr, ptr %arrayidx2, align 8 + %cmp3 = icmp eq ptr %2, null br i1 %cmp3, label %for.inc, label %if.then if.then: ; preds = %land.lhs.true - %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv - store double 5.000000e-01, double* %arrayidx5, align 8 + %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv + store double 5.000000e-01, ptr %arrayidx5, align 8 br label %for.inc for.inc: ; 
preds = %land.lhs.true, %for.body, %if.then
@@ -2217,14 +2013,14 @@ for.end:                                          ; preds = %for.inc, %entry
 }

 ;typedef int (*fp)();
-;void foo8 (double* __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
+;void foo8 (ptr __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
 ;
 ; for (unsigned i=0; i<size; i++)
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
-; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
-; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
-; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
-; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
-; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
-; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
-; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
-; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
-; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
-; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
-; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4
x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4 +; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 +; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 +; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX1-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX1-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX1-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], 
+; AVX1-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer
+; AVX1-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer
+; AVX1-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer
+; AVX1-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer
+; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
+; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
+; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
+; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX1-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX1-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer
+; AVX1-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer
+; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
+; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
+; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
+; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; AVX1: middle.block:
 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2326,19 +2110,19 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1-NEXT: br label [[FOR_BODY:%.*]]
 ; AVX1: for.body:
 ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX1-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX1: land.lhs.true:
-; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
+; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX1: if.then:
-; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX1-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX1-NEXT: br label [[FOR_INC]]
 ; AVX1: for.inc:
 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -2367,81 +2151,69 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX2-NEXT: 
[[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], -; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison) -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison) -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8 -; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison) -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison) -; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = 
getelementptr double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX2-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer -; AVX2-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8 -; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12 -; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4 +; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 +; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 +; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX2-NEXT: [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX2-NEXT: [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], +; AVX2-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer +; AVX2-NEXT: [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer +; AVX2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer +; AVX2-NEXT: [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 
[[TMP2]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], +; AVX2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], +; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], +; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4 +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX2-NEXT: [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], +; AVX2-NEXT: [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], +; AVX2-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], +; AVX2-NEXT: [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], +; AVX2-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]]) +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]]) +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]]) +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> , ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP30:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2450,19 +2222,19 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX2-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX2-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 +; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 ; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX2: land.lhs.true: -; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX2-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null +; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null ; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -2491,81 +2263,69 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX512-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] 
to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1 -; AVX512-NEXT: [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], -; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], -; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], -; AVX512-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], -; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], -; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], -; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x i32 ()*> poison) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> poison) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> poison) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 24 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> poison) -; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], -; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], -; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], -; AVX512-NEXT: [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], -; AVX512-NEXT: [[TMP52:%.*]] = select <8 x i1> 
[[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0 -; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]]) -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8 -; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16 -; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]]) -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24 -; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 +; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1 +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24 +; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 +; AVX512-NEXT: [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], +; AVX512-NEXT: [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD2]], +; AVX512-NEXT: [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD3]], +; AVX512-NEXT: [[TMP16:%.*]] = icmp eq <8 x i8> [[TMP12]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer +; AVX512-NEXT: [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer +; AVX512-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], +; AVX512-NEXT: [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], +; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], +; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], +; AVX512-NEXT: 
[[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX512-NEXT: [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], +; AVX512-NEXT: [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], +; AVX512-NEXT: [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], +; AVX512-NEXT: [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], +; AVX512-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]]) +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> , ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2574,19 +2334,19 @@ 
define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP54:%.*]] = and i8 [[TMP53]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: -; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null +; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: -; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -2607,21 +2367,21 @@ for.body.preheader: ; preds = %entry for.body: ; preds = %for.inc, %for.body.preheader %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] - %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv - %0 = load i8, i8* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 %1 = and i8 %0, 1 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %for.inc, label %land.lhs.true land.lhs.true: ; preds = %for.body - %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %in, i64 %indvars.iv - %2 = load i32 ()*, i32 ()** %arrayidx2, align 8 - %cmp3 = icmp eq i32 ()* %2, null + %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv + %2 = load ptr, ptr %arrayidx2, align 8 + %cmp3 = icmp eq ptr %2, null br i1 %cmp3, label %for.inc, label %if.then if.then: ; preds = %land.lhs.true - %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv - store double 5.000000e-01, double* %arrayidx5, align 8 + %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv + store double 5.000000e-01, ptr %arrayidx5, align 8 br label %for.inc for.inc: ; preds = %land.lhs.true, %for.body, %if.then diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll index 5a4bda6680e54..6a4ed2cd056aa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -16,1580 +16,1164 @@ target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %N) { ; O1-LABEL: @enabled( ; O1-NEXT: entry: ; O1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O1-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O1-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O1-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O1-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O1-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O1-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O1-NEXT: 
[[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O1-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O1-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O1-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O1-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O1-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O1-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O1-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O1-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O1-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O1-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O1-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O1-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O1-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O1-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O1-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O1-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O1-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O1-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP60]], <4 x 
i32>* [[TMP62]], align 4 -; O1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O1-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O1-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O1-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O1-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O1-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O1-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O1-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O1-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O1-NEXT: ret i32 [[TMP78]] +; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; O1-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O1-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; O1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; O1-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O1-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; O1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O1-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; O1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; O1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; O1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; O1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O1-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; O1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; O1-NEXT: 
[[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O1-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; O1-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O1-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; O1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; O1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; O1-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O1-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; O1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; O1-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O1-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; O1-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O1-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O1-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; O1-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O1-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; O1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O1-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; O1-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; O1-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O1-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; O1-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; O1-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O1-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, 
ptr [[TMP43]], align 4 +; O1-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O1-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; O1-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; O1-NEXT: ret i32 [[TMP46]] ; ; O2-LABEL: @enabled( ; O2-NEXT: entry: ; O2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* 
[[B]], i64 24 -; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O2-NEXT: [[TMP63:%.*]] 
= getelementptr inbounds i32, i32* [[B]], i64 52
-; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O2-NEXT: ret i32 [[TMP78]]
+; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O2-NEXT: ret i32 [[TMP46]]
;
; O3-LABEL: @enabled(
; O3-NEXT: entry:
; O3-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O3-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3-NEXT: ret i32 [[TMP78]]
+; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3-NEXT: ret i32 [[TMP46]]
;
; O3DEFAULT-LABEL: @enabled(
; O3DEFAULT-NEXT: entry:
; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O3DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3DEFAULT-NEXT: ret i32 [[TMP78]]
+; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3DEFAULT-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3DEFAULT-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3DEFAULT-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3DEFAULT-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3DEFAULT-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3DEFAULT-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3DEFAULT-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3DEFAULT-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT: ret i32 [[TMP46]]
;
; Os-LABEL: @enabled(
; Os-NEXT: entry:
; Os-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; Os-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Os-NEXT: ret i32 [[TMP78]]
+; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; Os-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Os-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; Os-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; Os-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; Os-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; Os-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; Os-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; Os-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; Os-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; Os-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; Os-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; Os-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Os-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; Os-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Os-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; Os-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Os-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; Os-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Os-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; Os-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; Os-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Os-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; Os-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; Os-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Os-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; Os-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Os-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; Os-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Os-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; Os-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Os-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; Os-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; Os-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Os-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; Os-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; Os-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Os-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; Os-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Os-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; Os-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; Os-NEXT: ret i32 [[TMP46]]
;
; Oz-LABEL: @enabled(
; Oz-NEXT: entry:
; Oz-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; Oz-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Oz-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Oz-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; Oz-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Oz-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Oz-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Oz-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Oz-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Oz-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Oz-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Oz-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Oz-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Oz-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Oz-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Oz-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Oz-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Oz-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Oz-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Oz-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Oz-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Oz-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Oz-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Oz-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Oz-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Oz-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Oz-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Oz-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Oz-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Oz-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Oz-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Oz-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Oz-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Oz-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Oz-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Oz-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Oz-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Oz-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Oz-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Oz-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Oz-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Oz-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Oz-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Oz-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Oz-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Oz-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Oz-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Oz-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Oz-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Oz-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Oz-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Oz-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Oz-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Oz-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Oz-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Oz-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Oz-NEXT: ret i32 [[TMP78]]
+; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; Oz-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; Oz-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; Oz-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; Oz-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; Oz-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; Oz-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; Oz-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; Oz-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; Oz-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; Oz-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; Oz-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; Oz-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Oz-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; Oz-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Oz-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; Oz-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Oz-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; Oz-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Oz-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; Oz-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; Oz-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Oz-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; Oz-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; Oz-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Oz-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; Oz-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Oz-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; Oz-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Oz-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; Oz-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Oz-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; Oz-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; Oz-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Oz-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; Oz-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; Oz-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Oz-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; Oz-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Oz-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; Oz-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; Oz-NEXT: ret i32 [[TMP46]]
;
; O1VEC2-LABEL: @enabled(
; O1VEC2-NEXT: entry:
; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O1VEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O1VEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O1VEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O1VEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O1VEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O1VEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O1VEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O1VEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O1VEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O1VEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O1VEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O1VEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O1VEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O1VEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O1VEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O1VEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O1VEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O1VEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O1VEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O1VEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O1VEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O1VEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O1VEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O1VEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O1VEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O1VEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O1VEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O1VEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O1VEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O1VEC2-NEXT: [[TMP49:%.*]] = bitcast i32*
[[TMP48]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O1VEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O1VEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O1VEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O1VEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O1VEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O1VEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O1VEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O1VEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O1VEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O1VEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O1VEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O1VEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O1VEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O1VEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O1VEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O1VEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O1VEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O1VEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O1VEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O1VEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O1VEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O1VEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O1VEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O1VEC2-NEXT: ret i32 [[TMP78]] +; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; O1VEC2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O1VEC2-NEXT: 
[[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; O1VEC2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O1VEC2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; O1VEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O1VEC2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; O1VEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; O1VEC2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O1VEC2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; O1VEC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; O1VEC2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O1VEC2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; O1VEC2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O1VEC2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; O1VEC2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O1VEC2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; O1VEC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O1VEC2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; O1VEC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; O1VEC2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O1VEC2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; O1VEC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; O1VEC2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O1VEC2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; O1VEC2-NEXT: [[TMP29:%.*]] = add 
nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O1VEC2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; O1VEC2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O1VEC2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; O1VEC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O1VEC2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; O1VEC2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; O1VEC2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O1VEC2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; O1VEC2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; O1VEC2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O1VEC2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; O1VEC2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O1VEC2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; O1VEC2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; O1VEC2-NEXT: ret i32 [[TMP46]] ; ; OzVEC2-LABEL: @enabled( ; OzVEC2-NEXT: entry: ; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; OzVEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; OzVEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; OzVEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; OzVEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; OzVEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x 
i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; OzVEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; OzVEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; OzVEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; OzVEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; OzVEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; OzVEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; OzVEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; OzVEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; OzVEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; OzVEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; OzVEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; OzVEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; OzVEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; OzVEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; OzVEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; OzVEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; OzVEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; OzVEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; OzVEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; OzVEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 
x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; OzVEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; OzVEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; OzVEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; OzVEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; OzVEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; OzVEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; OzVEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; OzVEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; OzVEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; OzVEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; OzVEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; OzVEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; OzVEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; OzVEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; OzVEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; OzVEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; OzVEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; OzVEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; OzVEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; OzVEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; OzVEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; OzVEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; OzVEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; OzVEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; OzVEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* 
[[TMP74]], align 4 -; OzVEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; OzVEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; OzVEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; OzVEC2-NEXT: ret i32 [[TMP78]] +; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; OzVEC2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; OzVEC2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; OzVEC2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; OzVEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; OzVEC2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; OzVEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; OzVEC2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; OzVEC2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; OzVEC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; OzVEC2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; OzVEC2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; OzVEC2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; OzVEC2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; OzVEC2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; OzVEC2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; OzVEC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; OzVEC2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; OzVEC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; OzVEC2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], 
[[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; OzVEC2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; OzVEC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; OzVEC2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; OzVEC2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; OzVEC2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; OzVEC2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; OzVEC2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; OzVEC2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; OzVEC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; OzVEC2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; OzVEC2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; OzVEC2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; OzVEC2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; OzVEC2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; OzVEC2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; OzVEC2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; OzVEC2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; OzVEC2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; OzVEC2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; OzVEC2-NEXT: ret i32 [[TMP46]] ; ; O3DIS-LABEL: @enabled( ; O3DIS-NEXT: entry: ; O3DIS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O3DIS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O3DIS-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3DIS-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; 
O3DIS-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O3DIS-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O3DIS-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O3DIS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O3DIS-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3DIS-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O3DIS-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O3DIS-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3DIS-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O3DIS-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O3DIS-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O3DIS-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O3DIS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O3DIS-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3DIS-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O3DIS-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O3DIS-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3DIS-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O3DIS-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O3DIS-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], 
[[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O3DIS-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O3DIS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O3DIS-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3DIS-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O3DIS-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O3DIS-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3DIS-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O3DIS-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O3DIS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O3DIS-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3DIS-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O3DIS-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O3DIS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O3DIS-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3DIS-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O3DIS-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O3DIS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O3DIS-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3DIS-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O3DIS-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O3DIS-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O3DIS-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3DIS-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O3DIS-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O3DIS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O3DIS-NEXT: [[TMP69:%.*]] = 
bitcast i32* [[TMP68]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3DIS-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O3DIS-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O3DIS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O3DIS-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O3DIS-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3DIS-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O3DIS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O3DIS-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O3DIS-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O3DIS-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O3DIS-NEXT: ret i32 [[TMP78]] +; O3DIS-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; O3DIS-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; O3DIS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O3DIS-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; O3DIS-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O3DIS-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; O3DIS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O3DIS-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O3DIS-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; O3DIS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O3DIS-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; O3DIS-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O3DIS-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; O3DIS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O3DIS-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; O3DIS-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O3DIS-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O3DIS-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; O3DIS-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O3DIS-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O3DIS-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; O3DIS-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O3DIS-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; O3DIS-NEXT: 
[[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O3DIS-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O3DIS-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; O3DIS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O3DIS-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; O3DIS-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O3DIS-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; O3DIS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O3DIS-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; O3DIS-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O3DIS-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O3DIS-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; O3DIS-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O3DIS-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O3DIS-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; O3DIS-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O3DIS-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; O3DIS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O3DIS-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O3DIS-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; O3DIS-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O3DIS-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; O3DIS-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O3DIS-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; O3DIS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O3DIS-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; O3DIS-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O3DIS-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O3DIS-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; O3DIS-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; O3DIS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O3DIS-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; O3DIS-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; O3DIS-NEXT: ret i32 [[TMP46]] ; entry: br label %for.body 
for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %add = add nsw i32 %0, %N - %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 for.end: ; preds = %for.body - %1 = load i32, i32* %a, align 4 + %1 = load i32, ptr %a, align 4 ret i32 %1 } -define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %N) { ; O1-LABEL: @nopragma( ; O1-NEXT: entry: ; O1-NEXT: br label [[FOR_BODY:%.*]] ; O1: for.body: ; O1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 ; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; O1: for.end: -; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O1-NEXT: ret i32 [[TMP1]] ; ; O2-LABEL: @nopragma( ; O2-NEXT: entry: ; O2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_2:%.*]] = 
load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; 
O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O2-NEXT: ret i32 [[TMP78]] +; O2-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; O2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; O2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; O2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; O2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; O2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; O2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; O2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; O2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; O2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; O2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; O2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; O2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; O2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; O2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; 
O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; O2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; O2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; O2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; O2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; O2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; O2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; O2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; O2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; O2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; O2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; O2-NEXT: ret i32 [[TMP46]] ; ; O3-LABEL: @nopragma( ; O3-NEXT: entry: ; O3-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O3-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 
4 -; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, 
<4 x i32>* [[TMP44]], align 4 -; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O3-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O3-NEXT: ret i32 [[TMP78]] +; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr 
[[B:%.*]], align 4 +; O3-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O3-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; O3-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O3-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; O3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O3-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; O3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; O3-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O3-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; O3-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; O3-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O3-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; O3-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O3-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; O3-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O3-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; O3-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O3-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; O3-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; O3-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O3-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; O3-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; O3-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O3-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x 
i32>, ptr [[TMP28]], align 4 +; O3-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O3-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; O3-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O3-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; O3-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O3-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; O3-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; O3-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O3-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; O3-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; O3-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O3-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; O3-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; O3-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O3-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; O3-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; O3-NEXT: ret i32 [[TMP46]] ; ; O3DEFAULT-LABEL: @nopragma( ; O3DEFAULT-NEXT: entry: ; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O3DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x 
i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; 
O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 
x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O3DEFAULT-NEXT: ret i32 [[TMP78]] +; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; O3DEFAULT-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; O3DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; O3DEFAULT-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; O3DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; O3DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; O3DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; O3DEFAULT-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; O3DEFAULT-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; O3DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> 
[[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; O3DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; O3DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; O3DEFAULT-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; O3DEFAULT-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; O3DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; O3DEFAULT-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; O3DEFAULT-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; O3DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; O3DEFAULT-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; O3DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; O3DEFAULT-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; O3DEFAULT-NEXT: ret i32 [[TMP46]] ; ; Os-LABEL: @nopragma( ; Os-NEXT: entry: ; 
Os-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; Os-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 
x i32>* -; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; Os-NEXT: 
[[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; Os-NEXT: ret i32 [[TMP78]] +; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; Os-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; Os-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 +; Os-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; Os-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; Os-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; Os-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; Os-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; Os-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 +; Os-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; Os-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; Os-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; Os-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; Os-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 +; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; Os-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; Os-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 +; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 +; Os-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], 
i64 24 +; Os-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 +; Os-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 +; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; Os-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 +; Os-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 +; Os-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; Os-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 +; Os-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 +; Os-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; Os-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 +; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 +; Os-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; Os-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 +; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 +; Os-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; Os-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 +; Os-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 +; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; Os-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 +; Os-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 +; Os-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; Os-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 +; Os-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 +; Os-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; Os-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 +; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 +; Os-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; Os-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 +; Os-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 +; Os-NEXT: ret i32 [[TMP46]] ; ; Oz-LABEL: @nopragma( ; Oz-NEXT: entry: ; Oz-NEXT: br label [[FOR_BODY:%.*]] ; Oz: 
for.body: ; Oz-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; Oz-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; Oz-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 ; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; Oz: for.end: -; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; Oz-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; Oz-NEXT: ret i32 [[TMP1]] ; ; O1VEC2-LABEL: @nopragma( @@ -1602,18 +1186,16 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly ; O1VEC2: vector.body: ; O1VEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; O1VEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; O1VEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O1VEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O1VEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 +; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; O1VEC2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O1VEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; O1VEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; O1VEC2-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 ; O1VEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; O1VEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; O1VEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; O1VEC2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; O1VEC2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; O1VEC2: middle.block: ; O1VEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 ; O1VEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1622,17 +1204,17 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly ; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] ; O1VEC2: for.body: ; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], 
[[FOR_BODY]] ] -; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; O1VEC2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]] -; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[N]] +; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 ; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; O1VEC2: for.end: -; O1VEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4 -; O1VEC2-NEXT: ret i32 [[TMP10]] +; O1VEC2-NEXT: [[TMP8:%.*]] = load i32, ptr [[A]], align 4 +; O1VEC2-NEXT: ret i32 [[TMP8]] ; ; OzVEC2-LABEL: @nopragma( ; OzVEC2-NEXT: entry: @@ -1644,18 +1226,16 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly ; OzVEC2: vector.body: ; OzVEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; OzVEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; OzVEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; OzVEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; OzVEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 +; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; OzVEC2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; OzVEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; OzVEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; OzVEC2-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 ; OzVEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; OzVEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; OzVEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; OzVEC2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; OzVEC2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; OzVEC2: middle.block: ; OzVEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 ; OzVEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1664,33 +1244,33 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly ; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] ; OzVEC2: for.body: ; OzVEC2-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; OzVEC2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]] -; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[N]] +; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 ; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; OzVEC2: for.end: -; OzVEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4 -; OzVEC2-NEXT: ret i32 [[TMP10]] +; OzVEC2-NEXT: [[TMP8:%.*]] = load i32, ptr [[A]], align 4 +; OzVEC2-NEXT: ret i32 [[TMP8]] ; ; O3DIS-LABEL: @nopragma( ; O3DIS-NEXT: entry: ; O3DIS-NEXT: br label [[FOR_BODY:%.*]] ; O3DIS: for.body: ; O3DIS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O3DIS-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O3DIS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O3DIS-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O3DIS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O3DIS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 ; O3DIS-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; O3DIS: for.end: -; O3DIS-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O3DIS-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O3DIS-NEXT: ret i32 [[TMP1]] ; entry: @@ -1698,36 +1278,36 @@ entry: for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %add = add nsw i32 %0, %N - %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body - %1 = load i32, i32* %a, align 4 + %1 = load i32, ptr %a, align 4 ret i32 %1 } -define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture 
readonly %b, i32 %N) { +define i32 @disabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %N) { ; O1-LABEL: @disabled( ; O1-NEXT: entry: ; O1-NEXT: br label [[FOR_BODY:%.*]] ; O1: for.body: ; O1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; O1: for.end: -; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O1-NEXT: ret i32 [[TMP1]] ; ; O2-LABEL: @disabled( @@ -1735,16 +1315,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly ; O2-NEXT: br label [[FOR_BODY:%.*]] ; O2: for.body: ; O2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O2-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; O2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; O2: for.end: -; O2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O2-NEXT: ret i32 [[TMP1]] ; ; O3-LABEL: @disabled( @@ -1752,122 +1332,98 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly ; O3-NEXT: br label [[FOR_BODY:%.*]] ; O3: for.body: ; O3-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O3-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O3-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O3-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; 
O3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O3-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O3-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O3-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; O3-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; O3: for.end: -; O3-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O3-NEXT: ret i32 [[TMP1]] ; ; O3DEFAULT-LABEL: @disabled( ; O3DEFAULT-NEXT: entry: -; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3DEFAULT-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 -; O3DEFAULT-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; O3DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP1]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 -; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O3DEFAULT-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 -; O3DEFAULT-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O3DEFAULT-NEXT: [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4 -; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP16:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4 -; O3DEFAULT-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP18]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP20:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* [[TMP20]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.*]] = 
getelementptr inbounds i32, i32* [[A]], i64 20 -; O3DEFAULT-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4 -; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[TMP22]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* [[TMP24]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O3DEFAULT-NEXT: [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4 -; O3DEFAULT-NEXT: [[TMP27:%.*]] = add nsw <4 x i32> [[TMP26]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* [[TMP28]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3DEFAULT-NEXT: [[TMP31:%.*]] = add nsw <4 x i32> [[TMP30]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP31]], <4 x i32>* [[TMP32]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O3DEFAULT-NEXT: [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP34:%.*]] = load <4 x i32>, <4 x i32>* [[TMP33]], align 4 -; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[TMP34]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP36:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP36]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP38:%.*]] = load <4 x i32>, <4 x i32>* [[TMP37]], align 4 -; O3DEFAULT-NEXT: [[TMP39:%.*]] = add nsw <4 x i32> [[TMP38]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP40:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP39]], <4 x i32>* [[TMP40]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O3DEFAULT-NEXT: [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>* -; O3DEFAULT-NEXT: [[TMP42:%.*]] = load <4 x i32>, <4 x i32>* [[TMP41]], align 4 -; O3DEFAULT-NEXT: [[TMP43:%.*]] = add nsw <4 x i32> [[TMP42]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP43]], <4 x i32>* [[TMP44]], align 4 -; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O3DEFAULT-NEXT: [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>* -; O3DEFAULT-NEXT: 
[[TMP46:%.*]] = load <4 x i32>, <4 x i32>* [[TMP45]], align 4 -; O3DEFAULT-NEXT: [[TMP47:%.*]] = add nsw <4 x i32> [[TMP46]], [[SHUFFLE]] -; O3DEFAULT-NEXT: [[TMP48:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP47]], <4 x i32>* [[TMP48]], align 4 -; O3DEFAULT-NEXT: [[TMP49:%.*]] = load i32, i32* [[A]], align 4 -; O3DEFAULT-NEXT: ret i32 [[TMP49]] +; O3DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; O3DEFAULT-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 +; O3DEFAULT-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; O3DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[A:%.*]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O3DEFAULT-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_4]], align 4 +; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[ARRAYIDX2_4]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O3DEFAULT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_8]], align 4 +; O3DEFAULT-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX2_8]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O3DEFAULT-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_12]], align 4 +; O3DEFAULT-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP9]], ptr [[ARRAYIDX2_12]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O3DEFAULT-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_16]], align 4 +; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], ptr [[ARRAYIDX2_16]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O3DEFAULT-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_20]], align 4 +; O3DEFAULT-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP13]], ptr [[ARRAYIDX2_20]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O3DEFAULT-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_24]], align 4 +; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], ptr [[ARRAYIDX2_24]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O3DEFAULT-NEXT: [[TMP16:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_28]], align 4 +; O3DEFAULT-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP2]] +; O3DEFAULT-NEXT: 
store <4 x i32> [[TMP17]], ptr [[ARRAYIDX2_28]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O3DEFAULT-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_32]], align 4 +; O3DEFAULT-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP18]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX2_32]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O3DEFAULT-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_36]], align 4 +; O3DEFAULT-NEXT: [[TMP21:%.*]] = add nsw <4 x i32> [[TMP20]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP21]], ptr [[ARRAYIDX2_36]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O3DEFAULT-NEXT: [[TMP22:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_40]], align 4 +; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], ptr [[ARRAYIDX2_40]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O3DEFAULT-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_44]], align 4 +; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP2]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], ptr [[ARRAYIDX2_44]], align 4 +; O3DEFAULT-NEXT: [[TMP26:%.*]] = load i32, ptr [[A]], align 4 +; O3DEFAULT-NEXT: ret i32 [[TMP26]] ; ; Os-LABEL: @disabled( ; Os-NEXT: entry: ; Os-NEXT: br label [[FOR_BODY:%.*]] ; Os: for.body: ; Os-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; Os-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; Os-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; Os-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; Os-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; Os-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; Os-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; Os-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; Os-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; Os-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; Os-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; Os-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; Os-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; Os: for.end: -; Os-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; Os-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; Os-NEXT: ret i32 [[TMP1]] ; ; Oz-LABEL: @disabled( @@ -1875,16 +1431,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly ; Oz-NEXT: br label [[FOR_BODY:%.*]] ; Oz: for.body: ; Oz-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; Oz-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr 
[[B:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; Oz-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; Oz: for.end: -; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; Oz-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; Oz-NEXT: ret i32 [[TMP1]] ; ; O1VEC2-LABEL: @disabled( @@ -1892,16 +1448,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly ; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] ; O1VEC2: for.body: ; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O1VEC2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; O1VEC2: for.end: -; O1VEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1VEC2-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O1VEC2-NEXT: ret i32 [[TMP1]] ; ; OzVEC2-LABEL: @disabled( @@ -1909,16 +1465,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly ; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] ; OzVEC2: for.body: ; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; OzVEC2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; 
OzVEC2: for.end: -; OzVEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; OzVEC2-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; OzVEC2-NEXT: ret i32 [[TMP1]] ; ; O3DIS-LABEL: @disabled( @@ -1926,16 +1482,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly ; O3DIS-NEXT: br label [[FOR_BODY:%.*]] ; O3DIS: for.body: ; O3DIS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] -; O3DIS-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O3DIS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] -; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; O3DIS-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; O3DIS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O3DIS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 ; O3DIS-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; O3DIS: for.end: -; O3DIS-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O3DIS-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 ; O3DIS-NEXT: ret i32 [[TMP1]] ; entry: @@ -1943,17 +1499,17 @@ entry: for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %add = add nsw i32 %0, %N - %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - store i32 %add, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 48 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 for.end: ; preds = %for.body - %1 = load i32, i32* %a, align 4 + %1 = load i32, ptr %a, align 4 ret i32 %1 } diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll index 70e928778c737..f8b95f3dd6fb0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -22,17 +22,15 @@ define i32 @foo_optsize() #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: 
[[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> , <64 x i8> +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -40,11 +38,11 @@ define i32 @foo_optsize() #0 { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -63,17 +61,15 @@ define i32 @foo_optsize() #0 { ; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer ; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], ; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], -; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) -; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; AUTOVF-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8> -; AUTOVF-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP1]]) +; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr 
@tab, i32 0, i32 [[TMP0]] +; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) +; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> , <32 x i8> +; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]]) ; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 -; AUTOVF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 -; AUTOVF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 +; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTOVF: middle.block: ; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AUTOVF: scalar.ph: @@ -81,11 +77,11 @@ define i32 @foo_optsize() #0 { ; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] ; AUTOVF: for.body: ; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; AUTOVF-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 +; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] +; AUTOVF-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 ; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 ; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 ; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -98,11 +94,11 @@ entry: for.body: ; preds = %for.body, %entry %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 + %arrayidx = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 %i.08 + %0 = load i8, ptr %arrayidx, align 1 %cmp1 = icmp eq i8 %0, 0 %. 
= select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 + store i8 %., ptr %arrayidx, align 1 %inc = add nsw i32 %i.08, 1 %exitcond = icmp eq i32 %i.08, 202 br i1 %exitcond, label %for.end, label %for.body @@ -126,17 +122,15 @@ define i32 @foo_minsize() #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> , <64 x i8> +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -144,11 +138,11 @@ define i32 @foo_minsize() #1 { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -167,17 +161,15 @@ define i32 @foo_minsize() #1 { ; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x 
i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer ; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], ; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], -; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) -; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; AUTOVF-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8> -; AUTOVF-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP1]]) +; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]] +; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) +; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> , <32 x i8> +; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]]) ; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 -; AUTOVF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 -; AUTOVF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 +; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTOVF: middle.block: ; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AUTOVF: scalar.ph: @@ -185,11 +177,11 @@ define i32 @foo_minsize() #1 { ; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] ; AUTOVF: for.body: ; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; AUTOVF-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 +; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] +; AUTOVF-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 ; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 ; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 ; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -202,11 +194,11 @@ entry: for.body: ; preds = %for.body, %entry %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 + %arrayidx = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 %i.08 + %0 = load i8, ptr %arrayidx, align 1 %cmp1 = icmp eq i8 %0, 0 %. 
= select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 + store i8 %., ptr %arrayidx, align 1 %inc = add nsw i32 %i.08, 1 %exitcond = icmp eq i32 %i.08, 202 br i1 %exitcond, label %for.end, label %for.body @@ -219,7 +211,7 @@ attributes #1 = { minsize } ; We can vectorize this one by refraining from versioning for stride==1. -define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { +define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %k) #2 { ; CHECK-LABEL: @scev4stride1( ; CHECK-NEXT: for.body.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -232,16 +224,15 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <64 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP1]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP2]], i32 4, <64 x i1> , <64 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <64 x i32>* -; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <64 x i32> [[TMP1]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0(<64 x ptr> [[TMP2]], i32 4, <64 x i1> , <64 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -251,10 +242,10 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read ; CHECK: for.body: ; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]] -; CHECK-NEXT: store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_07]] +; CHECK-NEXT: store i32 
[[TMP6]], ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -273,16 +264,15 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read ; AUTOVF-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; AUTOVF-NEXT: [[TMP1:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP1]] -; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , <8 x i32> poison) -; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; AUTOVF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 -; AUTOVF-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* -; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP5]], align 4 +; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <8 x i32> [[TMP1]] +; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 4, <8 x i1> , <8 x i32> poison) +; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; AUTOVF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP4]], align 4 ; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; AUTOVF-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; AUTOVF-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTOVF: middle.block: ; AUTOVF-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 ; AUTOVF-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -292,10 +282,10 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read ; AUTOVF: for.body: ; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]] -; AUTOVF-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]] -; AUTOVF-NEXT: store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4 +; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]] +; AUTOVF-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_07]] +; AUTOVF-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX1]], align 4 ; AUTOVF-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 ; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -308,10 +298,10 @@ for.body.preheader: for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, 
%for.body.preheader ] %mul = mul nsw i32 %i.07, %k - %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 - store i32 %0, i32* %arrayidx1, align 4 + %arrayidx = getelementptr inbounds i32, ptr %b, i32 %mul + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %a, i32 %i.07 + store i32 %0, ptr %arrayidx1, align 4 %inc = add nuw nsw i32 %i.07, 1 %exitcond = icmp eq i32 %inc, 256 br i1 %exitcond, label %for.end.loopexit, label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll index 5789d86c9a192..96046ddfe9a04 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -9,7 +9,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @small_tc( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -18,19 +18,16 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]] +; CHECK-NEXT: br i1 true, label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -39,15 +36,15 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP6]], [[TMP7]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -56,12 +53,12 @@ entry: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv - %0 = load float, float* %arrayidx, align 4, !llvm.access.group !5 - %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv - %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !5 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !5 + %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !5 %add = fadd fast float %0, %1 - store float %add, float* %arrayidx2, align 4, !llvm.access.group !5 + store float %add, ptr %arrayidx2, align 4, !llvm.access.group !5 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 8 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll index e2b4e4f3e1514..36dee12e4d7d3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -8,104 +8,100 @@ target triple = "x86_64-unknown-linux-gnu" @a = common local_unnamed_addr global [192 x [192 x i32]] zeroinitializer, align 16 -define i32 @main(i32* %ptr) { +define i32 @main(ptr %ptr) { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[S:%.*]] = alloca 
i16, align 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to i8* -; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]]) -; CHECK-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[S]] to i8* -; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull [[TMP1]]) -; CHECK-NEXT: [[CALL:%.*]] = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull [[I]]) -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[STOREMERGE6:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: store i16 [[STOREMERGE6]], i16* [[S]], align 2 -; CHECK-NEXT: [[CONV17:%.*]] = and i32 [[TMP2]], 65472 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[I]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[S]]) +; CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr, ...) @goo(ptr nonnull [[I]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[STOREMERGE6:%.*]] = trunc i32 [[TMP0]] to i16 +; CHECK-NEXT: store i16 [[STOREMERGE6]], ptr [[S]], align 2 +; CHECK-NEXT: [[CONV17:%.*]] = and i32 [[TMP0]], 65472 ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[CONV17]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END12:%.*]] ; CHECK: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ] +; CHECK-NEXT: [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ] ; CHECK-NEXT: [[CONV52:%.*]] = and i32 [[STOREMERGE_IN9]], 255 -; CHECK-NEXT: [[CMP63:%.*]] = icmp ult i32 [[TMP2]], [[CONV52]] +; CHECK-NEXT: [[CMP63:%.*]] = icmp ult i32 [[TMP0]], [[CONV52]] ; CHECK-NEXT: br i1 [[CMP63]], label [[FOR_BODY8_LR_PH:%.*]], label [[FOR_INC9]] ; CHECK: for.body8.lr.ph: ; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[STOREMERGE_IN9]] to i8 -; CHECK-NEXT: [[DOTPROMOTED:%.*]] = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[CONV3]], -1 -; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1 -; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[UMIN1]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 32 +; CHECK-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr @a, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[CONV3]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], [[UMIN1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: -; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[CONV3]], -1 -; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP8]]) -; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[UMIN]] -; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP9]] to i8 -; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP10]]) +; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[CONV3]], -1 +; CHECK-NEXT: 
[[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP6]], [[UMIN]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP8]]) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0 ; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub i8 [[TMP7]], [[MUL_RESULT]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i8 [[TMP11]], [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP12]], [[MUL_OVERFLOW]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i32 [[TMP9]], 255 -; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[DOTPROMOTED]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], [[TMP9]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp slt i32 [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP15]], [[TMP18]] -; CHECK-NEXT: br i1 [[TMP19]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i8 [[TMP5]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i8 [[TMP9]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i32 [[TMP7]], 255 +; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[DOTPROMOTED]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp slt i32 [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP13]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP6]], 8 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP6]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP4]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP4]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[DOTPROMOTED]], [[N_VEC]] -; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: [[IND_END3:%.*]] = sub i8 [[CONV3]], [[CAST_CRD]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: [[IND_END2:%.*]] = sub i8 [[CONV3]], [[DOTCAST]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]] -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP20]], 1 -; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP21]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP25]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[TMP26]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP32]], align 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 4 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP34]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP18]], 1 +; CHECK-NEXT: 
[[TMP21:%.*]] = add i32 [[TMP19]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP4]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY8:%.*]] ; CHECK: for.body8: ; CHECK-NEXT: [[INC5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ] -; CHECK-NEXT: [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ] +; CHECK-NEXT: [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ] ; CHECK-NEXT: [[INC]] = add i32 [[INC5]], 1 ; CHECK-NEXT: [[DEC]] = add i8 [[C_04]], -1 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[INC]] -; CHECK-NEXT: store i32 0, i32* [[GEP]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[INC]] +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[DEC]] to i32 -; CHECK-NEXT: [[CMP6:%.*]] = icmp ult i32 [[TMP2]], [[CONV5]] +; CHECK-NEXT: [[CMP6:%.*]] = icmp ult i32 [[TMP0]], [[CONV5]] ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.cond4.for.inc9_crit_edge: ; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: store i32 [[INC_LCSSA]], i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 +; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr @a, align 16 ; CHECK-NEXT: br label [[FOR_INC9]] ; CHECK: for.inc9: ; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[STOREMERGE_IN9]], 65535 @@ -116,27 +112,25 @@ define i32 @main(i32* %ptr) { ; CHECK: for.cond.for.end12_crit_edge: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC9]] ] ; CHECK-NEXT: [[STOREMERGE:%.*]] = trunc i32 [[ADD_LCSSA]] to i16 -; CHECK-NEXT: store i16 [[STOREMERGE]], i16* [[S]], align 2 +; CHECK-NEXT: store i16 [[STOREMERGE]], ptr [[S]], align 2 ; CHECK-NEXT: br label [[FOR_END12]] ; 
CHECK: for.end12: -; CHECK-NEXT: [[CALL13:%.*]] = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull [[S]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull [[TMP1]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[CALL13:%.*]] = call i32 (ptr, ...) @foo(ptr nonnull [[S]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[S]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[I]]) ; CHECK-NEXT: ret i32 0 ; entry: %i = alloca i32, align 4 %s = alloca i16, align 2 - %0 = bitcast i32* %i to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3 - store i32 0, i32* %i, align 4 - %1 = bitcast i16* %s to i8* - call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull %1) #3 - %call = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i) #3 - %2 = load i32, i32* %i, align 4 - %storemerge6 = trunc i32 %2 to i16 - store i16 %storemerge6, i16* %s, align 2 - %conv17 = and i32 %2, 65472 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %i) #3 + store i32 0, ptr %i, align 4 + call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %s) #3 + %call = call i32 (ptr, ...) @goo(ptr nonnull %i) #3 + %0 = load i32, ptr %i, align 4 + %storemerge6 = trunc i32 %0 to i16 + store i16 %storemerge6, ptr %s, align 2 + %conv17 = and i32 %0, 65472 %cmp8 = icmp eq i32 %conv17, 0 br i1 %cmp8, label %for.body.lr.ph, label %for.end12 @@ -144,14 +138,14 @@ for.body.lr.ph: ; preds = %entry br label %for.body for.body: ; preds = %for.body.lr.ph, %for.inc9 - %storemerge.in9 = phi i32 [ %2, %for.body.lr.ph ], [ %add, %for.inc9 ] + %storemerge.in9 = phi i32 [ %0, %for.body.lr.ph ], [ %add, %for.inc9 ] %conv52 = and i32 %storemerge.in9, 255 - %cmp63 = icmp ult i32 %2, %conv52 + %cmp63 = icmp ult i32 %0, %conv52 br i1 %cmp63, label %for.body8.lr.ph, label %for.inc9 for.body8.lr.ph: ; preds = %for.body %conv3 = trunc i32 %storemerge.in9 to i8 - %.promoted = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 + %.promoted = load i32, ptr @a, align 16 br label %for.body8 for.body8: ; preds = %for.body8.lr.ph, %for.body8 @@ -159,15 +153,15 @@ for.body8: ; preds = %for.body8.lr.ph, %f %c.04 = phi i8 [ %conv3, %for.body8.lr.ph ], [ %dec, %for.body8 ] %inc = add i32 %inc5, 1 %dec = add i8 %c.04, -1 - %gep = getelementptr inbounds i32, i32* %ptr, i32 %inc - store i32 0, i32* %gep + %gep = getelementptr inbounds i32, ptr %ptr, i32 %inc + store i32 0, ptr %gep %conv5 = zext i8 %dec to i32 - %cmp6 = icmp ult i32 %2, %conv5 + %cmp6 = icmp ult i32 %0, %conv5 br i1 %cmp6, label %for.body8, label %for.cond4.for.inc9_crit_edge for.cond4.for.inc9_crit_edge: ; preds = %for.body8 %inc.lcssa = phi i32 [ %inc, %for.body8 ] - store i32 %inc.lcssa, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 + store i32 %inc.lcssa, ptr @a, align 16 br label %for.inc9 for.inc9: ; preds = %for.cond4.for.inc9_crit_edge, %for.body @@ -180,22 +174,22 @@ for.inc9: ; preds = %for.cond4.for.inc9_ for.cond.for.end12_crit_edge: ; preds = %for.inc9 %add.lcssa = phi i32 [ %add, %for.inc9 ] %storemerge = trunc i32 %add.lcssa to i16 - store i16 %storemerge, i16* %s, align 2 + store i16 %storemerge, ptr %s, align 2 br label %for.end12 for.end12: ; preds = %for.cond.for.end12_crit_edge, %entry - %call13 = call i32 (i16*, ...) 
bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull %s) #3
- call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull %1) #3
- call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
+ %call13 = call i32 (ptr, ...) @foo(ptr nonnull %s) #3
+ call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %s) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i) #3
 ret i32 0
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
 
 declare i32 @goo(...) local_unnamed_addr #2
 
 declare i32 @foo(...) local_unnamed_addr #2
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index f133265cce496..7ace7fd2c099d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
 ; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE2
 
-define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture readonly %s1, i16* noalias nocapture readonly %s2, i32 %n) {
+define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
 ; SSE2-LABEL: @test_muladd(
 ; SSE2-NEXT: entry:
 ; SSE2-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -22,32 +22,29 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SSE2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; SSE2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
-; SSE2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP1]]
-; SSE2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; SSE2-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
-; SSE2-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
+; SSE2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP1]]
+; SSE2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
+; SSE2-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
 ; SSE2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE2-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
-; SSE2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP1]]
-; SSE2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
-; SSE2-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <8 x i16>*
-; SSE2-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 2
+; SSE2-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; SSE2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]]
+; SSE2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; SSE2-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2
 ; SSE2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE2-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; SSE2-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP5]]
-; SSE2-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; SSE2-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE2-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP13]]
-; SSE2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP10]]
-; SSE2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; SSE2-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <4 x i32>*
-; SSE2-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP20]], align 4
+; SSE2-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
+; SSE2-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP4]]
+; SSE2-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
+; SSE2-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE2-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]]
+; SSE2-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP8]]
+; SSE2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
+; SSE2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; SSE2-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
 ; SSE2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SSE2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SSE2: middle.block:
 ; SSE2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; SSE2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -56,25 +53,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE2-NEXT: br label [[FOR_BODY:%.*]]
 ; SSE2: for.body:
 ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE2-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; SSE2-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP23]] to i32
-; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; SSE2-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
+; SSE2-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]]
+; SSE2-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP17]] to i32
+; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]]
+; SSE2-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
+; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP18]] to i32
 ; SSE2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE2-NEXT: [[TMP25:%.*]] = or i64 [[TMP22]], 1
-; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
-; SSE2-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; 
SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP26]] to i32 -; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]] -; SSE2-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 -; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP27]] to i32 +; SSE2-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], 1 +; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]] +; SSE2-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP20]] to i32 +; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]] +; SSE2-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP21]] to i32 ; SSE2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; SSE2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] -; SSE2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]] -; SSE2-NEXT: store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4 +; SSE2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] +; SSE2-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; SSE2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -101,55 +98,49 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; SSE41-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; SSE41-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP0]], 1 ; SSE41-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -; SSE41-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP2]] -; SSE41-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP3]] -; SSE41-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0 -; SSE41-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <8 x i16>* -; SSE41-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; SSE41-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <8 x i16>* -; SSE41-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP7]], align 2 -; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP9]], align 2 +; SSE41-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP2]] +; SSE41-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP3]] +; SSE41-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 +; SSE41-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; SSE41-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2 +; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 ; SSE41-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; SSE41-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32> -; SSE41-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP2]] -; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 
[[TMP3]] -; SSE41-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP12]], i32 0 -; SSE41-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <8 x i16>* -; SSE41-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0 -; SSE41-NEXT: [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <8 x i16>* -; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP15]], align 2 -; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, <8 x i16>* [[TMP17]], align 2 +; SSE41-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> +; SSE41-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32> +; SSE41-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]] +; SSE41-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]] +; SSE41-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 +; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 ; SSE41-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> -; SSE41-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> -; SSE41-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP10]] -; SSE41-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP11]] -; SSE41-NEXT: [[TMP26:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32> -; SSE41-NEXT: [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; SSE41-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> -; SSE41-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; SSE41-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP26]] -; SSE41-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]] -; SSE41-NEXT: [[TMP34:%.*]] = add nsw <4 x i32> [[TMP32]], [[TMP20]] -; SSE41-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[TMP33]], [[TMP21]] -; SSE41-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]] -; SSE41-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]] -; SSE41-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0 -; SSE41-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; SSE41-NEXT: store <4 x i32> [[TMP34]], <4 x i32>* [[TMP39]], align 4 -; SSE41-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 4 -; SSE41-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>* -; SSE41-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP41]], align 4 +; SSE41-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> +; SSE41-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> +; SSE41-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP8]] +; SSE41-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP9]] +; SSE41-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32> +; SSE41-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> +; SSE41-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> +; SSE41-NEXT: [[TMP21:%.*]] 
= sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> +; SSE41-NEXT: [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP18]] +; SSE41-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP19]] +; SSE41-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP16]] +; SSE41-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP17]] +; SSE41-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; SSE41-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] +; SSE41-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0 +; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4 +; SSE41-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4 +; SSE41-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP29]], align 4 ; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SSE41-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SSE41-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SSE41-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SSE41-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE41: middle.block: ; SSE41-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; SSE41-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -158,25 +149,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; SSE41-NEXT: br label [[FOR_BODY:%.*]] ; SSE41: for.body: ; SSE41-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; SSE41-NEXT: [[TMP43:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP43]] -; SSE41-NEXT: [[TMP44:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP44]] to i32 -; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP43]] -; SSE41-NEXT: [[TMP45:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 -; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP45]] to i32 +; SSE41-NEXT: [[TMP31:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP31]] +; SSE41-NEXT: [[TMP32:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP32]] to i32 +; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP31]] +; SSE41-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP33]] to i32 ; SSE41-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; SSE41-NEXT: [[TMP46:%.*]] = or i64 [[TMP43]], 1 -; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP46]] -; SSE41-NEXT: [[TMP47:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP47]] to i32 -; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP46]] -; SSE41-NEXT: [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 -; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP48]] to i32 +; SSE41-NEXT: [[TMP34:%.*]] = or i64 [[TMP31]], 1 +; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP34]] +; SSE41-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP35]] to i32 +; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 
[[TMP34]] +; SSE41-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP36]] to i32 ; SSE41-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; SSE41-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] -; SSE41-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]] -; SSE41-NEXT: store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4 +; SSE41-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] +; SSE41-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; SSE41-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE41-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; SSE41-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -207,22 +198,18 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; AVX1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP1]], 1 ; AVX1-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP2]], 1 ; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP3]], 1 -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP4]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>* -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <8 x i16>* -; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0 -; AVX1-NEXT: [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <8 x i16>* -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP11]], i32 0 -; AVX1-NEXT: [[TMP19:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>* -; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP13]], align 2 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP15]], align 2 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP17]], align 2 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, <8 x i16>* [[TMP19]], align 2 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP4]] +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]] +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 +; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 +; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = 
shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> @@ -231,26 +218,22 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; AVX1-NEXT: [[TMP21:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; AVX1-NEXT: [[TMP22:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> -; AVX1-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP4]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP5]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP6]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP7]] -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP29:%.*]] = bitcast i16* [[TMP28]] to <8 x i16>* -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i16, i16* [[TMP25]], i32 0 -; AVX1-NEXT: [[TMP31:%.*]] = bitcast i16* [[TMP30]] to <8 x i16>* -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP26]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast i16* [[TMP32]] to <8 x i16>* -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i16, i16* [[TMP27]], i32 0 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast i16* [[TMP34]] to <8 x i16>* -; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, <8 x i16>* [[TMP29]], align 2 -; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, <8 x i16>* [[TMP31]], align 2 -; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP33]], align 2 -; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, <8 x i16>* [[TMP35]], align 2 +; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> +; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> +; AVX1-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> +; AVX1-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]] +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]] +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] +; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]] +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0 +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 +; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2 +; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 +; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 +; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 ; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> 
[[WIDE_VEC13]], <8 x i16> poison, <4 x i32> @@ -259,49 +242,45 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> -; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> -; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> -; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> -; AVX1-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP36]], [[TMP20]] -; AVX1-NEXT: [[TMP41:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP21]] -; AVX1-NEXT: [[TMP42:%.*]] = mul nsw <4 x i32> [[TMP38]], [[TMP22]] -; AVX1-NEXT: [[TMP43:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP23]] -; AVX1-NEXT: [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> -; AVX1-NEXT: [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> -; AVX1-NEXT: [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> -; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; AVX1-NEXT: [[TMP60:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> -; AVX1-NEXT: [[TMP61:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> -; AVX1-NEXT: [[TMP62:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> -; AVX1-NEXT: [[TMP63:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> -; AVX1-NEXT: [[TMP64:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP52]] -; AVX1-NEXT: [[TMP65:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP53]] -; AVX1-NEXT: [[TMP66:%.*]] = mul nsw <4 x i32> [[TMP62]], [[TMP54]] -; AVX1-NEXT: [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP63]], [[TMP55]] -; AVX1-NEXT: [[TMP68:%.*]] = add nsw <4 x i32> [[TMP64]], [[TMP40]] -; AVX1-NEXT: [[TMP69:%.*]] = add nsw <4 x i32> [[TMP65]], [[TMP41]] -; AVX1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[TMP66]], [[TMP42]] -; AVX1-NEXT: [[TMP71:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP43]] -; AVX1-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 0 -; AVX1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; AVX1-NEXT: store <4 x i32> [[TMP68]], <4 x i32>* [[TMP77]], align 4 -; AVX1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 4 -; AVX1-NEXT: [[TMP79:%.*]] = bitcast i32* [[TMP78]] to <4 x i32>* -; AVX1-NEXT: store <4 x i32> [[TMP69]], <4 x i32>* [[TMP79]], align 4 -; AVX1-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 8 -; AVX1-NEXT: [[TMP81:%.*]] = bitcast i32* [[TMP80]] to <4 x i32>* -; AVX1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP81]], align 4 -; AVX1-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 12 -; AVX1-NEXT: [[TMP83:%.*]] = bitcast i32* [[TMP82]] to <4 x i32>* -; AVX1-NEXT: store <4 x i32> [[TMP71]], <4 x i32>* [[TMP83]], align 4 +; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> +; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to 
<4 x i32> +; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> +; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> +; AVX1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]] +; AVX1-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]] +; AVX1-NEXT: [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]] +; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP19]] +; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> +; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> +; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> +; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> +; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> +; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> +; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> +; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> +; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]] +; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]] +; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]] +; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]] +; AVX1-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP44]], [[TMP32]] +; AVX1-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[TMP45]], [[TMP33]] +; AVX1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP34]] +; AVX1-NEXT: [[TMP51:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP35]] +; AVX1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP3]] +; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0 +; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4 +; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4 +; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4 +; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8 +; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4 +; AVX1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12 +; AVX1-NEXT: store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -310,25 +289,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; AVX1-NEXT: [[TMP85:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP85]] -; AVX1-NEXT: 
[[TMP86:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP86]] to i32 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP85]] -; AVX1-NEXT: [[TMP87:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 -; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP87]] to i32 +; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]] +; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP62]] to i32 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP61]] +; AVX1-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP63]] to i32 ; AVX1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; AVX1-NEXT: [[TMP88:%.*]] = or i64 [[TMP85]], 1 -; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP88]] -; AVX1-NEXT: [[TMP89:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP89]] to i32 -; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP88]] -; AVX1-NEXT: [[TMP90:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 -; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP90]] to i32 +; AVX1-NEXT: [[TMP64:%.*]] = or i64 [[TMP61]], 1 +; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP64]] +; AVX1-NEXT: [[TMP65:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP65]] to i32 +; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP64]] +; AVX1-NEXT: [[TMP66:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP66]] to i32 ; AVX1-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; AVX1-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] -; AVX1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4 +; AVX1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -353,32 +332,29 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0 -; AVX2-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <16 x i16>* -; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, <16 x i16>* [[TMP4]], align 2 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0 +; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 ; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> 
[[WIDE_VEC]], <16 x i16> poison, <8 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <16 x i16>* -; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, <16 x i16>* [[TMP8]], align 2 +; AVX2-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP6]], align 2 ; AVX2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> -; AVX2-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32> -; AVX2-NEXT: [[TMP10:%.*]] = mul nsw <8 x i32> [[TMP9]], [[TMP5]] -; AVX2-NEXT: [[TMP13:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32> -; AVX2-NEXT: [[TMP15:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32> -; AVX2-NEXT: [[TMP16:%.*]] = mul nsw <8 x i32> [[TMP15]], [[TMP13]] -; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[TMP16]], [[TMP10]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0 -; AVX2-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <8 x i32>* -; AVX2-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP20]], align 4 +; AVX2-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = mul nsw <8 x i32> [[TMP10]], [[TMP9]] +; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[TMP11]], [[TMP8]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP14]], align 4 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -387,25 +363,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; AVX2-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]] -; AVX2-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP23]] to i32 -; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* 
[[S2]], i64 [[TMP22]] -; AVX2-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 -; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP24]] to i32 +; AVX2-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]] +; AVX2-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP17]] to i32 +; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]] +; AVX2-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 +; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP18]] to i32 ; AVX2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] -; AVX2-NEXT: [[TMP25:%.*]] = or i64 [[TMP22]], 1 -; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]] -; AVX2-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP26]] to i32 -; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]] -; AVX2-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 -; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP27]] to i32 +; AVX2-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], 1 +; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]] +; AVX2-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2 +; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP20]] to i32 +; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]] +; AVX2-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2 +; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP21]] to i32 ; AVX2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]] ; AVX2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]] -; AVX2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4 +; AVX2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; AVX2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -425,24 +401,24 @@ for.body.preheader: for.body: %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] %0 = shl nuw nsw i64 %indvars.iv, 1 - %arrayidx = getelementptr inbounds i16, i16* %s1, i64 %0 - %1 = load i16, i16* %arrayidx, align 2 + %arrayidx = getelementptr inbounds i16, ptr %s1, i64 %0 + %1 = load i16, ptr %arrayidx, align 2 %conv = sext i16 %1 to i32 - %arrayidx4 = getelementptr inbounds i16, i16* %s2, i64 %0 - %2 = load i16, i16* %arrayidx4, align 2 + %arrayidx4 = getelementptr inbounds i16, ptr %s2, i64 %0 + %2 = load i16, ptr %arrayidx4, align 2 %conv5 = sext i16 %2 to i32 %mul6 = mul nsw i32 %conv5, %conv %3 = or i64 %0, 1 - %arrayidx10 = getelementptr inbounds i16, i16* %s1, i64 %3 - %4 = load i16, i16* %arrayidx10, align 2 + %arrayidx10 = getelementptr inbounds i16, ptr %s1, i64 %3 + %4 = load i16, ptr %arrayidx10, align 2 %conv11 = sext i16 %4 to i32 - %arrayidx15 = getelementptr inbounds i16, i16* %s2, i64 %3 - %5 = load i16, i16* %arrayidx15, align 2 + %arrayidx15 = getelementptr inbounds i16, ptr %s2, i64 %3 + %5 = load i16, ptr %arrayidx15, align 2 %conv16 = sext i16 %5 to i32 %mul17 = mul nsw i32 %conv16, %conv11 %add18 
= add nsw i32 %mul17, %mul6
- %arrayidx20 = getelementptr inbounds i32, i32* %d1, i64 %indvars.iv
- store i32 %add18, i32* %arrayidx20, align 4
+ %arrayidx20 = getelementptr inbounds i32, ptr %d1, i64 %indvars.iv
+ store i32 %add18, ptr %arrayidx20, align 4
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
 br i1 %exitcond.not, label %for.end.loopexit, label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
index c1ae5c879e6e9..b77ac10925e42 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -7,46 +7,122 @@ target triple = "x86_64-unknown-linux-gnu"
 %0 = type { i32 }
 %1 = type { i64 }
 
-define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
+define void @foo(ptr %p, ptr %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
-; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
+; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint ptr [[P_LAST:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P3]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 1024
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 1024, i64 2048, i64 3072>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 4096, i64 5120, i64 6144, i64 7168>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 8192, i64 9216, i64 10240, i64 11264>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 12288, i64 13312, i64 14336, i64 15360>
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP5]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP8]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16384
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[P2:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, ptr [[P2]], i64 128
+; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P2]], align 8
+; CHECK-NEXT: [[B:%.*]] = icmp eq ptr [[P_INC]], [[P_LAST]]
+; CHECK-NEXT: br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop
 
 loop:
- %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
- %p.inc = getelementptr inbounds i64, i64* %p2, i64 128
- %p3 = bitcast i64* %p2 to %0**
- %v = load %0*, %0** %p3, align 8
- %b = icmp eq i64* %p.inc, %p.last
+ %p2 = phi ptr [ %p, %entry ], [ %p.inc, %loop ]
+ %p.inc = getelementptr inbounds i64, ptr %p2, i64 128
+ %v = load ptr, ptr %p2, align 8
+ %b = icmp eq ptr %p.inc, %p.last
 br i1 %b, label %exit, label %loop
 
 exit:
 ret void
 }
 
-define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
+define void @bar(ptr %p, ptr %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @bar(
-; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
+; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint ptr [[P_LAST:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P3]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 1024
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 1024, i64 2048, i64 3072>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 4096, i64 5120, i64 6144, i64 7168>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 8192, i64 9216, i64 10240, i64 11264>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 12288, i64 13312, i64 14336, i64 15360>
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP5]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP8]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16384
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[P2:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, ptr [[P2]], i64 128
+; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[P2]], align 8
+; CHECK-NEXT: [[B:%.*]] = icmp eq ptr [[P_INC]], [[P_LAST]]
+; CHECK-NEXT: br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop
 
 loop:
- %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
- %p.inc = getelementptr inbounds i64, i64* %p2, i64 128
- %p3 = bitcast i64* %p2 to %1**
- %v = load %1*, %1** %p3, align 8
- %b = icmp eq i64* %p.inc, %p.last
+ %p2 = phi ptr [ %p, %entry ], [ %p.inc, %loop ]
+ %p.inc = getelementptr inbounds i64, ptr %p2, i64 128
+ %v = load ptr, ptr %p2, align 8
+ %b = icmp eq ptr %p.inc, %p.last
 br i1 %b, label %exit, label %loop
 
 exit:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 9e65096e5fa60..60c30a5b2ae78 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -3,138 +3,131 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
 target triple = "x86_64-unknown-linux-gnu"
 
-@jlplt_ijl_alloc_array_1d_10294_got = external dso_local local_unnamed_addr global void ()*
+@jlplt_ijl_alloc_array_1d_10294_got = external dso_local local_unnamed_addr global ptr
 
-define {} addrspace(10)* @japi1_vect_42283({} addrspace(10)** nocapture readonly %0, i32 %1) local_unnamed_addr #0 {
+define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: @japi1_vect_42283(
 ; CHECK-NEXT: top:
 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1:%.*]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = load atomic {} addrspace(10)* ({} addrspace(10)*, i64)*, {} addrspace(10)* ({} addrspace(10)*, i64)** bitcast (void ()** @jlplt_ijl_alloc_array_1d_10294_got to {} addrspace(10)* ({} addrspace(10)*, i64)**) unordered, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = tail call {} 
addrspace(10)* [[TMP3]]({} addrspace(10)* null, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast {} addrspace(10)** [[TMP0:%.*]] to { {} addrspace(10)*, i64 } addrspace(10)** -; CHECK-NEXT: [[TMP6:%.*]] = load { {} addrspace(10)*, i64 } addrspace(10)*, { {} addrspace(10)*, i64 } addrspace(10)** [[TMP5]], align 8, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast {} addrspace(10)* [[TMP4]] to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)* -; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)* [[TMP7]] to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)* -; CHECK-NEXT: [[TMP9:%.*]] = load { {} addrspace(10)*, i64 } addrspace(13)*, { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)* [[TMP8]], align 8, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]] to i8 addrspace(13)* -; CHECK-NEXT: [[DOTELT:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 0 -; CHECK-NEXT: [[DOTUNPACK:%.*]] = load {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[DOTELT]], align 8, !tbaa [[TBAA8:![0-9]+]] -; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 1 -; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, i64 addrspace(10)* [[DOTELT1]], align 8, !tbaa [[TBAA8]] -; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 28 +; CHECK-NEXT: [[TMP3:%.*]] = load atomic ptr, ptr @jlplt_ijl_alloc_array_1d_10294_got unordered, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr addrspace(10) [[TMP3]](ptr addrspace(10) null, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(10), ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(10) [[TMP4]] to ptr addrspace(11) +; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[TMP6]], align 8, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[DOTELT:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 0 +; CHECK-NEXT: [[DOTUNPACK:%.*]] = load ptr addrspace(10), ptr addrspace(10) [[DOTELT]], align 8, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1 +; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]] +; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 28 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]]) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 ; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(13)* [[TMP10]], i64 [[MUL_RESULT]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i8 addrspace(13)* [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]] -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], i64 0, i32 1 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i64 addrspace(13)* 
[[SCEVGEP]] to { {} addrspace(10)*, i64 } addrspace(13)* -; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]]) -; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 -; CHECK-NEXT: [[SCEVGEP15:%.*]] = bitcast { {} addrspace(10)*, i64 } addrspace(13)* [[SCEVGEP1]] to i8 addrspace(13)* -; CHECK-NEXT: [[TMP16:%.*]] = sub i64 0, [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, i8 addrspace(13)* [[SCEVGEP15]], i64 [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i8 addrspace(13)* [[TMP17]], [[SCEVGEP15]] -; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP18]], [[MUL_OVERFLOW4]] -; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP15]], [[TMP19]] -; CHECK-NEXT: br i1 [[TMP20]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(13) [[TMP7]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult ptr addrspace(13) [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(13) [[TMP7]], i64 8 +; CHECK-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 0, [[MUL_RESULT2]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(13) [[UGLYGEP]], i64 [[MUL_RESULT2]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult ptr addrspace(13) [[TMP14]], [[UGLYGEP]] +; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW3]] +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP11]], 16 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP11]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT9]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT11]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT13]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT7]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT9]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT11]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT13]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT15]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT21]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[STEP_ADD6:%.*]] = add <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[STEP_ADD7:%.*]] = add <4 x i64> [[STEP_ADD6]], -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD6]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD7]], i32 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT]], <4 x {} addrspace(10)* addrspace(13)*> 
[[TMP21]], i32 8, <4 x i1> ), !tbaa [[TBAA10:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT10]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP22]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT12]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP23]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT14]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP24]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[VEC_IND]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD6]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD7]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT16]], <4 x i64 addrspace(13)*> [[TMP25]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT18]], <4 x i64 addrspace(13)*> [[TMP26]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT20]], <4 x i64 addrspace(13)*> [[TMP27]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT22]], <4 x i64 addrspace(13)*> [[TMP28]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: [[STEP_ADD4:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <4 x i64> [[STEP_ADD4]], +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> ), !tbaa [[TBAA10:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT10]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT12]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT14]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT16]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT18]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT20]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> ), !tbaa [[TBAA10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD7]], -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP11]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[L26:%.*]] ; CHECK: L26: -; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP30:%.*]], [[L26]] ] -; CHECK-NEXT: [[DOTREPACK:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], i64 [[VALUE_PHI5]], i32 0 -; CHECK-NEXT: store {} addrspace(10)* [[DOTUNPACK]], {} addrspace(10)* addrspace(13)* [[DOTREPACK]], align 8, !tbaa [[TBAA10]] -; CHECK-NEXT: [[DOTREPACK4:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], i64 [[VALUE_PHI5]], i32 1 -; CHECK-NEXT: store i64 [[DOTUNPACK2]], i64 addrspace(13)* [[DOTREPACK4]], align 8, !tbaa [[TBAA10]] -; CHECK-NEXT: [[TMP30]] = add i64 [[VALUE_PHI5]], 1 +; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[L26]] ] +; CHECK-NEXT: [[DOTREPACK:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 0 +; CHECK-NEXT: store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[TBAA10]] +; CHECK-NEXT: [[DOTREPACK4:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 1 +; CHECK-NEXT: store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP27]] = add i64 [[VALUE_PHI5]], 1 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]] ; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L26]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: L44: 
-; CHECK-NEXT: ret {} addrspace(10)* null
+; CHECK-NEXT: ret ptr addrspace(10) null
 ;
 top:
 %2 = sext i32 %1 to i64
- %3 = load atomic {} addrspace(10)* ({} addrspace(10)*, i64)*, {} addrspace(10)* ({} addrspace(10)*, i64)** bitcast (void ()** @jlplt_ijl_alloc_array_1d_10294_got to {} addrspace(10)* ({} addrspace(10)*, i64)**) unordered, align 8
- %4 = tail call {} addrspace(10)* %3({} addrspace(10)* null, i64 0)
- %5 = bitcast {} addrspace(10)** %0 to { {} addrspace(10)*, i64 } addrspace(10)**
- %6 = load { {} addrspace(10)*, i64 } addrspace(10)*, { {} addrspace(10)*, i64 } addrspace(10)** %5, align 8, !tbaa !0
- %7 = bitcast {} addrspace(10)* %4 to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)*
- %8 = addrspacecast { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)* %7 to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)*
- %9 = load { {} addrspace(10)*, i64 } addrspace(13)*, { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)* %8, align 8, !tbaa !5
- %.elt = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* %6, i64 0, i32 0
- %.unpack = load {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %.elt, align 8, !tbaa !8
- %.elt1 = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* %6, i64 0, i32 1
- %.unpack2 = load i64, i64 addrspace(10)* %.elt1, align 8, !tbaa !8
+ %3 = load atomic ptr addrspace(10) (ptr addrspace(10), i64)*, ptr addrspace(10) (ptr addrspace(10), i64)** bitcast (ptr @jlplt_ijl_alloc_array_1d_10294_got to ptr addrspace(10) (ptr addrspace(10), i64)**) unordered, align 8
+ %4 = tail call ptr addrspace(10) %3(ptr addrspace(10) null, i64 0)
+ %5 = load ptr addrspace(10), ptr %0, align 8, !tbaa !0
+ %6 = addrspacecast ptr addrspace(10) %4 to ptr addrspace(11)
+ %7 = load ptr addrspace(13), ptr addrspace(11) %6, align 8, !tbaa !5
+ %.elt = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) %5, i64 0, i32 0
+ %.unpack = load ptr addrspace(10), ptr addrspace(10) %.elt, align 8, !tbaa !8
+ %.elt1 = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) %5, i64 0, i32 1
+ %.unpack2 = load i64, ptr addrspace(10) %.elt1, align 8, !tbaa !8
 br label %L26
 L26: ; preds = %L26, %top
- %value_phi5 = phi i64 [ 0, %top ], [ %10, %L26 ]
- %.repack = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* %9, i64 %value_phi5, i32 0
- store {} addrspace(10)* %.unpack, {} addrspace(10)* addrspace(13)* %.repack, align 8, !tbaa !10
- %.repack4 = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* %9, i64 %value_phi5, i32 1
- store i64 %.unpack2, i64 addrspace(13)* %.repack4, align 8, !tbaa !10
- %10 = add i64 %value_phi5, 1
+ %value_phi5 = phi i64 [ 0, %top ], [ %8, %L26 ]
+ %.repack = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) %7, i64 %value_phi5, i32 0
+ store ptr addrspace(10) %.unpack, ptr addrspace(13) %.repack, align 8, !tbaa !10
+ %.repack4 = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) %7, i64 %value_phi5, i32 1
+ store i64 %.unpack2, ptr addrspace(13) %.repack4, align 8, !tbaa !10
+ %8 = add i64 %value_phi5, 1
 %.not = icmp eq i64 %value_phi5, %2
 br i1 %.not, label %L44, label %L26
 L44: ; preds = %L26
- ret {} addrspace(10)* null
+ ret ptr addrspace(10) null
 }
 attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-avx512er,-sha,-prefetchwt1,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index aac35b578aa65..f9116c06cdd67 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
-define float @reduction_sum_float_ieee(i32 %n, float* %array) {
+define float @reduction_sum_float_ieee(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_ieee(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -14,8 +14,8 @@ define float @reduction_sum_float_ieee(i32 %n, float* %array) {
 ; CHECK: loop:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ 0.000000e+00, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[IDX]]
-; CHECK-NEXT: [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[IDX]]
+; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT: [[SUM_INC]] = fadd float [[SUM]], [[VALUE]]
 ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
@@ -34,8 +34,8 @@ entry:
 loop:
 %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
 %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
- %address = getelementptr float, float* %array, i32 %idx
- %value = load float, float* %address
+ %address = getelementptr float, ptr %array, i32 %idx
+ %value = load float, ptr %address
 %sum.inc = fadd float %sum, %value
 %idx.inc = add i32 %idx, 1
 %be.cond = icmp ne i32 %idx.inc, 4096
@@ -46,7 +46,7 @@ loop.exit:
 ret float %sum.lcssa
 }
-define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
+define float @reduction_sum_float_fastmath(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_fastmath(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -57,43 +57,41 @@ define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP9]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP7]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
-; CHECK-NEXT: [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]]
 ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: loop.exit.loopexit:
-; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP_EXIT]]
 ; CHECK: loop.exit:
 ; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
@@ -106,8 +104,8 @@ entry:
 loop:
 %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
 %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
- %address = getelementptr float, float* %array, i32 %idx
- %value = load float, float* %address
+ %address = getelementptr float, ptr %array, i32 %idx
+ %value = load float, ptr %address
 %sum.inc = fadd fast float %sum, %value
 %idx.inc = add i32 %idx, 1
 %be.cond = icmp ne i32 %idx.inc, 4096
@@ -118,7 +116,7 @@ loop.exit:
 ret float %sum.lcssa
 }
-define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
+define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_only_reassoc(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -129,43 +127,41 @@ define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP9]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP7]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
-; CHECK-NEXT: [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT: [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]]
 ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: loop.exit.loopexit:
-; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP_EXIT]]
 ; CHECK: loop.exit:
 ; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
@@ -178,8 +174,8 @@ entry:
 loop:
 %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
 %sum = phi float [ -0.000000e+00, %entry ], [ %sum.inc, %loop ]
- %address = getelementptr float, float* %array, i32 %idx
- %value = load float, float* %address
+ %address = getelementptr float, ptr %array, i32 %idx
+ %value = load float, ptr %address
 %sum.inc = fadd reassoc float %sum, %value
 %idx.inc = add i32 %idx, 1
 %be.cond = icmp ne i32 %idx.inc, 4096
@@ -190,7 +186,7 @@ loop.exit:
 ret float %sum.lcssa
 }
-define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) {
+define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_only_reassoc_and_contract(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -201,43 +197,41 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %arra
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP9]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP7]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
-; CHECK-NEXT: [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT: [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]]
 ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: loop.exit.loopexit:
-; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP_EXIT]]
 ; CHECK: loop.exit:
 ; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
@@ -250,8 +244,8 @@ entry:
 loop:
 %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
 %sum = phi float [ -0.000000e+00, %entry ], [ %sum.inc, %loop ]
- %address = getelementptr float, float* %array, i32 %idx
- %value = load float, float* %address
+ %address = getelementptr float, ptr %array, i32 %idx
+ %value = load float, ptr %address
 %sum.inc = fadd reassoc contract float %sum, %value
 %idx.inc = add i32 %idx, 1
 %be.cond = icmp ne i32 %idx.inc, 4096
@@ -265,7 +259,7 @@ loop.exit:
 ; New instructions should have the same FMF as the original code.
 ; Note that the select inherits FMF from its fcmp condition.
-define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
+define float @PR35538(ptr nocapture readonly %a, i32 %N) #0 {
 ; CHECK-LABEL: @PR35538(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -280,37 +274,35 @@ define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = call nnan ninf nsz float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP8]], <4 x float> [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = call nnan ninf nsz float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi float [ -1.000000e+00, [[ENTRY:%.*]] ], [ [[MAX_0__LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -318,13 +310,13 @@ define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[MAX_013:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX_0_]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp nnan ninf nsz oge float [[TMP14]], [[MAX_013]]
-; CHECK-NEXT: [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp nnan ninf nsz oge float [[TMP12]], [[MAX_013]]
+; CHECK-NEXT: [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP12]], float [[MAX_013]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
 %cmp12 = icmp sgt i32 %N, 0
@@ -341,8 +333,8 @@ for.cond.cleanup:
 for.body:
 %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
 %max.013 = phi float [ -1.000000e+00, %for.body.lr.ph ], [ %max.0., %for.body ]
- %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
- %0 = load float, float* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
 %cmp1.inv = fcmp nnan ninf nsz oge float %0, %max.013
 %max.0. = select i1 %cmp1.inv, float %0, float %max.013
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -352,7 +344,7 @@ for.body:
 ; Same as above, but this time the select already has matching FMF with its condition.
-define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
+define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 {
 ; CHECK-LABEL: @PR35538_more_FMF(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -367,37 +359,35 @@ define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP8]], <4 x float> [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi float [ -1.000000e+00, [[ENTRY:%.*]] ], [ [[MAX_0__LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -405,13 +395,13 @@ define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[MAX_013:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX_0_]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp nnan ninf oge float [[TMP14]], [[MAX_013]]
-; CHECK-NEXT: [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp nnan ninf oge float [[TMP12]], [[MAX_013]]
+; CHECK-NEXT: [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP12]], float [[MAX_013]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
 %cmp12 = icmp sgt i32 %N, 0
@@ -428,8 +418,8 @@ for.cond.cleanup:
 for.body:
 %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
 %max.013 = phi float [ -1.000000e+00, %for.body.lr.ph ], [ %max.0., %for.body ]
- %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
- %0 = load float, float* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
 %cmp1.inv = fcmp nnan ninf oge float %0, %max.013
 %max.0. = select nnan ninf i1 %cmp1.inv, float %0, float %max.013
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index d6c43e61e09b9..e27e44b4fd91f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -8,7 +8,7 @@
 target triple = "x86_64-unknown-linux-gnu"
 ; even if it contains scalarized load (gather on AVX2)
 ; Function Attrs: norecurse nounwind readonly uwtable
-define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
 ; CHECK-LABEL: @matrix_row_col(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
@@ -18,7 +18,7 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -27,59 +27,58 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7
-; CHECK-NEXT: [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], 
-; CHECK-NEXT: [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP13]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP17]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP19]], i32 1
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 2
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 3
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 4
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 5
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 6
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 7
+; CHECK-NEXT: [[TMP34:%.*]] = mul nsw <8 x i32> [[TMP33]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP35:%.*]] = add <8 x i32> [[VEC_PHI]], 
+; CHECK-NEXT: [[TMP36]] = add <8 x i32> [[TMP35]], [[TMP34]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]])
+; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP36]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP38]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP38]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
+; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], [[TMP39]]
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4
 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -98,11 +97,11 @@ entry:
 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
 ; first consecutive load as vector load
- %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
- %0 = load i32, i32* 
%arrayidx2, align 4, !tbaa !1 + %arrayidx2 = getelementptr inbounds [100 x i32], ptr %data, i64 %idxprom, i64 %indvars.iv + %0 = load i32, ptr %arrayidx2, align 4, !tbaa !1 ; second strided load scalarized - %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5 - %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1 + %arrayidx6 = getelementptr inbounds [100 x i32], ptr %data, i64 %indvars.iv, i64 %idxprom5 + %1 = load i32, ptr %arrayidx6, align 4, !tbaa !1 %mul = mul nsw i32 %1, %0 %add = add i32 %sum.015, 4 %add7 = add i32 %add, %mul diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index 103226e4af9c6..d5a649697a03e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +define dso_local void @tail_folding_enabled(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) local_unnamed_addr #0 { ; CHECK-LABEL: @tail_folding_enabled( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -18,22 +18,19 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -43,16 +40,16 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; entry: br label %for.body @@ -62,13 +59,13 @@ for.cond.cleanup: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv - %1 = load i32, i32* %arrayidx2, align 4 + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 %add = add nsw i32 %1, %0 - %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - store i32 %add, i32* %arrayidx4, align 4 + %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 430 br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6 @@ -76,7 +73,7 @@ for.body: ; Marking function as optsize turns tail folding on, as if explicit tail folding ; flag was enabled. -define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +define dso_local void @tail_folding_disabled(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) local_unnamed_addr #0 { ; CHECK-LABEL: @tail_folding_disabled( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -89,22 +86,19 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -114,13 +108,13 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -133,13 +127,13 @@ for.cond.cleanup: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv - %1 = load i32, i32* %arrayidx2, align 4 + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 %add = add nsw i32 %1, %0 - %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - store i32 %add, i32* %arrayidx4, align 4 + %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 430 br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 @@ -155,7 +149,7 @@ for.body: ; } ; -define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 { +define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) #0 { ; CHECK-LABEL: @reduction_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 @@ -172,48 +166,46 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] 
], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], ; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) -; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP12]] = add <8 x i32> [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP10]] = add <8 x i32> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: 
for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[SUM_0:%.*]] = phi i32 [ [[SUM_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4 -; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4 +; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]] ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]] ; entry: @@ -223,10 +215,10 @@ for.body: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] %sum.0 = phi i32 [ %sum.1, %for.body ], [ 0, %entry ] %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %0 = load i32, i32* %arrayidxA, align 4 - %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv - %1 = load i32, i32* %arrayidxB, align 4 + %arrayidxA = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %0 = load i32, ptr %arrayidxA, align 4 + %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidxB, align 4 %add = add nsw i32 %1, %0 %sum.1 = add nuw nsw i32 %add, %sum.0 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 @@ -237,12 +229,6 @@ for.cond.cleanup: ret i32 %sum.1 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[MD_IS_VEC:![0-9]+]]} -; CHECK-NEXT: [[MD_IS_VEC:![0-9]+]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-NEXT: [[LOOP3]] = distinct !{[[LOOP3]], [[MD_RT_UNROLL_DIS:![0-9]+]], [[MD_IS_VEC]]} -; CHECK-NEXT: [[MD_RT_UNROLL_DIS]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-NEXT: [[LOOP4]] = distinct !{[[LOOP4]], [[MD_IS_VEC]]} -; CHECK-NEXT: [[LOOP5]] = distinct !{[[LOOP5]], [[MD_RT_UNROLL_DIS]], [[MD_IS_VEC]]} attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index d49047c5ff76c..090d5d1371a3b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define i32 @uniform_load(i32* align(4) %addr) { +define i32 @uniform_load(ptr 
align(4) %addr) { ; CHECK-LABEL: @uniform_load( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -16,7 +16,7 @@ define i32 @uniform_load(i32* align(4) %addr) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -28,7 +28,7 @@ define i32 @uniform_load(i32* align(4) %addr) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -41,7 +41,7 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - %load = load i32, i32* %addr + %load = load i32, ptr %addr %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 br i1 %exitcond, label %loopexit, label %for.body @@ -50,7 +50,7 @@ loopexit: ret i32 %load } -define i32 @uniform_load2(i32* align(4) %addr) { +define i32 @uniform_load2(ptr align(4) %addr) { ; CHECK-LABEL: @uniform_load2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -62,7 +62,7 @@ define i32 @uniform_load2(i32* align(4) %addr) { ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]] @@ -86,7 +86,7 @@ define i32 @uniform_load2(i32* align(4) %addr) { ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[ADDR]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4 ; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 @@ -101,7 +101,7 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] %accum = phi i32 [%accum.next, %for.body], [0, %entry] - %load = load i32, i32* %addr + %load = load i32, ptr %addr %accum.next = add i32 %accum, %load 
%iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 @@ -111,7 +111,7 @@ loopexit: ret i32 %accum.next } -define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) { +define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) { ; CHECK-LABEL: @uniform_address( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -123,14 +123,14 @@ define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) { ; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -143,8 +143,8 @@ define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) { ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[OFFSET:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[OFFSET]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[OFFSET]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -158,8 +158,8 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] %offset = udiv i32 %byte_offset, 4 - %gep = getelementptr i32, i32* %addr, i32 %offset - %load = load i32, i32* %gep + %gep = getelementptr i32, ptr %addr, i32 %offset + %load = load i32, ptr %gep %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 br i1 %exitcond, label %loopexit, label %for.body @@ -170,7 +170,7 @@ loopexit: -define void @uniform_store_uniform_value(i32* align(4) %addr) { +define void @uniform_store_uniform_value(ptr align(4) %addr) { ; CHECK-LABEL: @uniform_store_uniform_value( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -178,7 +178,7 @@ define void @uniform_store_uniform_value(i32* align(4) 
%addr) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: store i32 0, i32* [[ADDR:%.*]], align 4 +; CHECK-NEXT: store i32 0, ptr [[ADDR:%.*]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -190,7 +190,7 @@ define void @uniform_store_uniform_value(i32* align(4) %addr) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -202,7 +202,7 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - store i32 0, i32* %addr + store i32 0, ptr %addr %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 br i1 %exitcond, label %loopexit, label %for.body @@ -211,15 +211,15 @@ loopexit: ret void } -define void @uniform_store_varying_value(i32* align(4) %addr) { +define void @uniform_store_varying_value(ptr align(4) %addr) { ; CHECK-LABEL: @uniform_store_varying_value( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 @@ -236,8 +236,8 @@ define void @uniform_store_varying_value(i32* align(4) %addr) { ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP0]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP0]], 15 -; CHECK-NEXT: store i32 [[TMP16]], i32* [[ADDR:%.*]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 16 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -249,7 +249,7 @@ define void @uniform_store_varying_value(i32* align(4) %addr) { ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IV_I32:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: store i32 [[IV_I32]], i32* [[ADDR]], align 4 +; CHECK-NEXT: store i32 [[IV_I32]], ptr [[ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] @@ -262,7 +262,7 @@ entry: for.body: %iv = phi i64 [ %iv.next, 
%for.body ], [ 0, %entry ] %iv.i32 = trunc i64 %iv to i32 - store i32 %iv.i32, i32* %addr + store i32 %iv.i32, ptr %addr %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 br i1 %exitcond, label %loopexit, label %for.body @@ -271,15 +271,15 @@ loopexit: ret void } -define void @uniform_rw(i32* align(4) %addr) { +define void @uniform_rw(ptr align(4) %addr) { ; CHECK-LABEL: @uniform_rw( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add i32 [[LOAD]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[ADDR]], align 4 +; CHECK-NEXT: store i32 [[INC]], ptr [[ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]] @@ -291,9 +291,9 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - %load = load i32, i32* %addr + %load = load i32, ptr %addr %inc = add i32 %load, 1 - store i32 %inc, i32* %addr + store i32 %inc, ptr %addr %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 br i1 %exitcond, label %loopexit, label %for.body @@ -302,27 +302,23 @@ loopexit: ret void } -define void @uniform_copy(i32* %A, i32* %B) { +define void @uniform_copy(ptr %A, ptr %B) { ; CHECK-LABEL: @uniform_copy( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 1 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 4 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[UGLYGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope !12 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope !15, !noalias !12 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] @@ -334,8 +330,8 @@ define void @uniform_copy(i32* %A, i32* %B) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi 
i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A]], align 4 -; CHECK-NEXT: store i32 [[LOAD]], i32* [[B]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: store i32 [[LOAD]], ptr [[B]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -347,8 +343,8 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - %load = load i32, i32* %A - store i32 %load, i32* %B + %load = load i32, ptr %A + store i32 %load, ptr %B %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 br i1 %exitcond, label %loopexit, label %for.body @@ -358,16 +354,15 @@ loopexit: } -declare void @init(i32*) +declare void @init(ptr) ;; Count the number of bits set in a bit vector -- key point of relevance is ;; that the byte load is uniform across 8 iterations at a time. -define i32 @test_count_bits(i8* %test_base) { +define i32 @test_count_bits(ptr %test_base) { ; CHECK-LABEL: @test_count_bits( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32], align 4 -; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* -; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -393,26 +388,26 @@ define i32 @test_count_bits(i8* %test_base) { ; CHECK-NEXT: [[TMP13:%.*]] = udiv i64 [[TMP5]], 8 ; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP6]], 8 ; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP7]], 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[TMP19]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr 
[[TMP16]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i8> poison, i8 [[TMP24]], i32 0 ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i8> [[TMP28]], i8 [[TMP25]], i32 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i8> [[TMP29]], i8 [[TMP26]], i32 2 ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 3 -; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP23]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2 @@ -447,8 +442,8 @@ define i32 @test_count_bits(i8* %test_base) { ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[BYTE:%.*]] = udiv i64 [[IV]], 8 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[BYTE]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i8, i8* [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[BYTE]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 ; CHECK-NEXT: [[BIT:%.*]] = urem i64 [[IV]], 8 ; CHECK-NEXT: [[BIT_TRUNC:%.*]] = trunc i64 [[BIT]] to i8 ; CHECK-NEXT: [[MASK:%.*]] = lshr i8 [[EARLYCND]], [[BIT_TRUNC]] @@ -463,16 +458,15 @@ define i32 @test_count_bits(i8* %test_base) { ; entry: %alloca = alloca [4096 x i32] - %base = bitcast [4096 x i32]* %alloca to i32* - call void @init(i32* %base) + call void @init(ptr %alloca) br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] %accum = phi i32 [ 0, %entry ], [ %accum.next, %loop ] %iv.next = add i64 %iv, 1 %byte = udiv i64 %iv, 8 - %test_addr = getelementptr inbounds i8, i8* %test_base, i64 %byte - %earlycnd = load i8, i8* %test_addr + %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %byte + %earlycnd = load i8, ptr %test_addr %bit = urem i64 %iv, 8 %bit.trunc = trunc i64 %bit to i8 %mask = lshr i8 %earlycnd, %bit.trunc @@ -501,7 +495,7 @@ define i32 @uniform_load_global() { ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @GAddr, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @GAddr, align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1]] = add <4 
x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]] @@ -525,7 +519,7 @@ define i32 @uniform_load_global() { ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* @GAddr, align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr @GAddr, align 4 ; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 @@ -540,7 +534,7 @@ entry: for.body: %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] %accum = phi i32 [%accum.next, %for.body], [0, %entry] - %load = load i32, i32* @GAddr + %load = load i32, ptr @GAddr %accum.next = add i32 %accum, %load %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 @@ -563,7 +557,7 @@ define i32 @uniform_load_constexpr() { ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]] @@ -587,7 +581,7 @@ define i32 @uniform_load_constexpr() { ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4 ; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096 @@ -602,7 +596,7 @@ entry: for.body: ; preds = %for.body, %entry %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] %accum = phi i32 [ %accum.next, %for.body ], [ 0, %entry ] - %load = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4 + %load = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4 %accum.next = add i32 %accum, %load %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv, 4096 diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 7cfaedef32482..64378d501e59b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux" ; ; The source code for the test: ; -; void foo(float* restrict A, float* restrict B) +; void foo(ptr restrict A, ptr restrict B) ; { ; for (int i = 0; i < 20; ++i) A[i] += B[i]; ; } @@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux" ; 
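A note on the test that follows: the scalar loop's latch branch carries !llvm.loop !1, and it is that metadata, not the cost model, that makes the vectorizer fire at a trip count of 20. A minimal sketch of such a forcing annotation, with illustrative node numbering (the actual metadata definitions sit outside these hunks):

  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1

  !1 = distinct !{!1, !2}                         ; self-referential loop ID
  !2 = !{!"llvm.loop.vectorize.enable", i1 true}  ; force vectorization regardless of profitability

At the C level this typically originates from #pragma omp simd or #pragma clang loop vectorize(enable) on the loop.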
; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata. ; -define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -25,20 +25,17 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -47,15 +44,15 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = 
load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -64,12 +61,12 @@ entry: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv - %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11 - %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv - %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !11 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !11 + %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !11 %add = fadd fast float %0, %1 - store float %add, float* %arrayidx2, align 4, !llvm.access.group !11 + store float %add, ptr %arrayidx2, align 4, !llvm.access.group !11 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 20 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 @@ -86,7 +83,7 @@ for.end: ; This loop will be vectorized as the trip count is below the threshold but no ; scalar iterations are needed thanks to folding its tail. 
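Concretely: with VF = 8 and a trip count of 20, folding the tail rounds the vector trip count up to 24 (three vector iterations) and masks off the lanes for i > 19, which is why every memory access in the checks below goes through llvm.masked.load/llvm.masked.store. A simplified sketch of the masking pattern, with assumed value names:

  declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32 immarg, <8 x i1>, <8 x float>)

  ; lane l is live in the iteration starting at %index while %index + l <= 19
  %vec.iv = add <8 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %mask   = icmp ule <8 x i64> %vec.iv, <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
  %vals   = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %gep.b, i32 4, <8 x i1> %mask, <8 x float> poison)

Dead lanes of the load yield the poison passthrough and dead lanes of the store write nothing, so the last four lanes of the third iteration (i = 20..23) are harmless.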
;
-define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -97,22 +94,19 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group !6
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group !6
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label
[[SCALAR_PH]] ; CHECK: scalar.ph: @@ -120,15 +114,15 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]] -; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -137,12 +131,12 @@ entry: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv - %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13 - %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv - %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !13 + %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !13 %add = fadd fast float %0, %1 - store float %add, float* %arrayidx2, align 4, !llvm.access.group !13 + store float %add, ptr %arrayidx2, align 4, !llvm.access.group !13 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 20 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3 @@ -158,7 +152,7 @@ for.end: ; This loop will be vectorized as the trip count is below the threshold but no ; scalar iterations are needed. 
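;
; By contrast with @vectorized1 above, the trip count here is 16, which VF=8
; divides evenly, so the vector body below needs neither a lane mask nor a
; scalar remainder. A minimal sketch of the expected shape with opaque
; pointers (illustrative names); note the typed-pointer bitcasts the old
; checks carried are gone, the wide load and store take the ptr directly:
;
;   %wide.b = load <8 x float>, ptr %gep.b, align 4
;   %wide.a = load <8 x float>, ptr %gep.a, align 4
;   %sum = fadd fast <8 x float> %wide.b, %wide.a
;   store <8 x float> %sum, ptr %gep.a, align 4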
; -define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -167,20 +161,17 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -189,15 +180,15 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] 
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -206,12 +197,12 @@ entry: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv - %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13 - %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv - %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !13 + %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !13 %add = fadd fast float %0, %1 - store float %add, float* %arrayidx2, align 4, !llvm.access.group !13 + store float %add, ptr %arrayidx2, align 4, !llvm.access.group !13 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 16 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index c0e375481dc19..a63a7915346e9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0" ; can be vectorized with predication, scalarizing it would cause its pointer ; operand to become non-uniform. 
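;
; Because sdiv can trap, it cannot simply be executed for masked-off lanes;
; the checks below expect the vectorizer to predicate it lane by lane while
; the loads stay wide (one plain, one masked). A rough sketch of a single
; lane, assuming VF=2 as in this test (illustrative names):
;
;   %lane0.on = extractelement <2 x i1> %broadcast.splat, i32 0
;   br i1 %lane0.on, label %pred.sdiv.if, label %pred.sdiv.continue
; pred.sdiv.if:
;   %elt0 = extractelement <2 x i32> %wide.masked.load, i32 0
;   %div0 = sdiv i32 %elt0, %x
;   %ins0 = insertelement <2 x i32> poison, i32 %div0, i32 0
;   br label %pred.sdiv.continue
; pred.sdiv.continue:
;   %res0 = phi <2 x i32> [ poison, %vector.body ], [ %ins0, %pred.sdiv.if ]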
; -define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) { +define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; CHECK-LABEL: @predicated_sdiv_masked_load( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i32 0 @@ -17,54 +17,52 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[PRED_SDIV_CONTINUE2]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_SDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP6]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP4]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; CHECK: pred.sdiv.if: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = sdiv i32 [[TMP8]], [[X:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = sdiv i32 [[TMP6]], [[X:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]] ; CHECK: pred.sdiv.continue: -; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_SDIV_IF1:%.*]], label 
[[PRED_SDIV_CONTINUE2]] ; CHECK: pred.sdiv.if1: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP13]], [[X]] -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[X]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 ; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE2]] ; CHECK: pred.sdiv.continue2: -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP11]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_SDIV_IF1]] ] -; CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[TMP16]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP19]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[TMP14]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 10000, [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[T7:%.*]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] -; CHECK-NEXT: [[T1:%.*]] = load i32, i32* [[T0]], align 4 +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[T7:%.*]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[T1:%.*]] = load i32, ptr [[T0]], align 4 ; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: -; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[T2]], align 4 +; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[T3:%.*]] = load i32, ptr [[T2]], align 4 ; CHECK-NEXT: [[T4:%.*]] = sdiv i32 [[T3]], [[X]] ; CHECK-NEXT: [[T5:%.*]] = add nsw i32 [[T4]], [[T1]] ; CHECK-NEXT: br label [[FOR_INC]] @@ -75,7 +73,7 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.end: -; 
CHECK-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T8]] ; ; SINK-GATHER-LABEL: @predicated_sdiv_masked_load( @@ -87,112 +85,110 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) { ; SINK-GATHER-NEXT: br label [[VECTOR_BODY:%.*]] ; SINK-GATHER: vector.body: ; SINK-GATHER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE14:%.*]] ] -; SINK-GATHER-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_SDIV_CONTINUE14]] ] +; SINK-GATHER-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_SDIV_CONTINUE14]] ] ; SINK-GATHER-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SINK-GATHER-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; SINK-GATHER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; SINK-GATHER-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; SINK-GATHER-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 -; SINK-GATHER-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP0]] -; SINK-GATHER-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0 -; SINK-GATHER-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* -; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP6]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison) -; SINK-GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0 -; SINK-GATHER-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; SINK-GATHER-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; SINK-GATHER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; SINK-GATHER-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 +; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; SINK-GATHER-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison) +; SINK-GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0 +; SINK-GATHER-NEXT: br i1 [[TMP5]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.sdiv.if: -; SINK-GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0 -; SINK-GATHER-NEXT: [[TMP9:%.*]] = sdiv i32 [[TMP8]], [[X:%.*]] -; SINK-GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[TMP9]], i32 0 +; SINK-GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0 +; SINK-GATHER-NEXT: [[TMP7:%.*]] = sdiv i32 [[TMP6]], [[X:%.*]] +; SINK-GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i32 0 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE]] ; SINK-GATHER: pred.sdiv.continue: -; SINK-GATHER-NEXT: [[TMP11:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ] -; SINK-GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1 -; SINK-GATHER-NEXT: br i1 [[TMP12]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]] +; SINK-GATHER-NEXT: [[TMP9:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], 
[[PRED_SDIV_IF]] ] +; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1 +; SINK-GATHER-NEXT: br i1 [[TMP10]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]] ; SINK-GATHER: pred.sdiv.if1: -; SINK-GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 1 -; SINK-GATHER-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP13]], [[X]] -; SINK-GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP14]], i32 1 +; SINK-GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 1 +; SINK-GATHER-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[X]] +; SINK-GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP12]], i32 1 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE2]] ; SINK-GATHER: pred.sdiv.continue2: -; SINK-GATHER-NEXT: [[TMP16:%.*]] = phi <8 x i32> [ [[TMP11]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_SDIV_IF1]] ] -; SINK-GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2 -; SINK-GATHER-NEXT: br i1 [[TMP17]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]] +; SINK-GATHER-NEXT: [[TMP14:%.*]] = phi <8 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ] +; SINK-GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2 +; SINK-GATHER-NEXT: br i1 [[TMP15]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]] ; SINK-GATHER: pred.sdiv.if3: -; SINK-GATHER-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 2 -; SINK-GATHER-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP18]], [[X]] -; SINK-GATHER-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP19]], i32 2 +; SINK-GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 2 +; SINK-GATHER-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP16]], [[X]] +; SINK-GATHER-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP17]], i32 2 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE4]] ; SINK-GATHER: pred.sdiv.continue4: -; SINK-GATHER-NEXT: [[TMP21:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP20]], [[PRED_SDIV_IF3]] ] -; SINK-GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3 -; SINK-GATHER-NEXT: br i1 [[TMP22]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]] +; SINK-GATHER-NEXT: [[TMP19:%.*]] = phi <8 x i32> [ [[TMP14]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP18]], [[PRED_SDIV_IF3]] ] +; SINK-GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3 +; SINK-GATHER-NEXT: br i1 [[TMP20]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]] ; SINK-GATHER: pred.sdiv.if5: -; SINK-GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 3 -; SINK-GATHER-NEXT: [[TMP24:%.*]] = sdiv i32 [[TMP23]], [[X]] -; SINK-GATHER-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP24]], i32 3 +; SINK-GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 3 +; SINK-GATHER-NEXT: [[TMP22:%.*]] = sdiv i32 [[TMP21]], [[X]] +; SINK-GATHER-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP22]], i32 3 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE6]] ; SINK-GATHER: pred.sdiv.continue6: -; SINK-GATHER-NEXT: [[TMP26:%.*]] = phi <8 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP25]], [[PRED_SDIV_IF5]] ] -; SINK-GATHER-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4 -; SINK-GATHER-NEXT: br i1 [[TMP27]], label 
[[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]] +; SINK-GATHER-NEXT: [[TMP24:%.*]] = phi <8 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP23]], [[PRED_SDIV_IF5]] ] +; SINK-GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4 +; SINK-GATHER-NEXT: br i1 [[TMP25]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]] ; SINK-GATHER: pred.sdiv.if7: -; SINK-GATHER-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 4 -; SINK-GATHER-NEXT: [[TMP29:%.*]] = sdiv i32 [[TMP28]], [[X]] -; SINK-GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP29]], i32 4 +; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 4 +; SINK-GATHER-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP26]], [[X]] +; SINK-GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP27]], i32 4 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE8]] ; SINK-GATHER: pred.sdiv.continue8: -; SINK-GATHER-NEXT: [[TMP31:%.*]] = phi <8 x i32> [ [[TMP26]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP30]], [[PRED_SDIV_IF7]] ] -; SINK-GATHER-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5 -; SINK-GATHER-NEXT: br i1 [[TMP32]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]] +; SINK-GATHER-NEXT: [[TMP29:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP28]], [[PRED_SDIV_IF7]] ] +; SINK-GATHER-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5 +; SINK-GATHER-NEXT: br i1 [[TMP30]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]] ; SINK-GATHER: pred.sdiv.if9: -; SINK-GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 5 -; SINK-GATHER-NEXT: [[TMP34:%.*]] = sdiv i32 [[TMP33]], [[X]] -; SINK-GATHER-NEXT: [[TMP35:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP34]], i32 5 +; SINK-GATHER-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 5 +; SINK-GATHER-NEXT: [[TMP32:%.*]] = sdiv i32 [[TMP31]], [[X]] +; SINK-GATHER-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP32]], i32 5 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE10]] ; SINK-GATHER: pred.sdiv.continue10: -; SINK-GATHER-NEXT: [[TMP36:%.*]] = phi <8 x i32> [ [[TMP31]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP35]], [[PRED_SDIV_IF9]] ] -; SINK-GATHER-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6 -; SINK-GATHER-NEXT: br i1 [[TMP37]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]] +; SINK-GATHER-NEXT: [[TMP34:%.*]] = phi <8 x i32> [ [[TMP29]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP33]], [[PRED_SDIV_IF9]] ] +; SINK-GATHER-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6 +; SINK-GATHER-NEXT: br i1 [[TMP35]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]] ; SINK-GATHER: pred.sdiv.if11: -; SINK-GATHER-NEXT: [[TMP38:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 6 -; SINK-GATHER-NEXT: [[TMP39:%.*]] = sdiv i32 [[TMP38]], [[X]] -; SINK-GATHER-NEXT: [[TMP40:%.*]] = insertelement <8 x i32> [[TMP36]], i32 [[TMP39]], i32 6 +; SINK-GATHER-NEXT: [[TMP36:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 6 +; SINK-GATHER-NEXT: [[TMP37:%.*]] = sdiv i32 [[TMP36]], [[X]] +; SINK-GATHER-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP37]], i32 6 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE12]] ; SINK-GATHER: pred.sdiv.continue12: -; SINK-GATHER-NEXT: [[TMP41:%.*]] = phi <8 x i32> [ [[TMP36]], [[PRED_SDIV_CONTINUE10]] 
], [ [[TMP40]], [[PRED_SDIV_IF11]] ] -; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7 -; SINK-GATHER-NEXT: br i1 [[TMP42]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]] +; SINK-GATHER-NEXT: [[TMP39:%.*]] = phi <8 x i32> [ [[TMP34]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP38]], [[PRED_SDIV_IF11]] ] +; SINK-GATHER-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7 +; SINK-GATHER-NEXT: br i1 [[TMP40]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]] ; SINK-GATHER: pred.sdiv.if13: -; SINK-GATHER-NEXT: [[TMP43:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 7 -; SINK-GATHER-NEXT: [[TMP44:%.*]] = sdiv i32 [[TMP43]], [[X]] -; SINK-GATHER-NEXT: [[TMP45:%.*]] = insertelement <8 x i32> [[TMP41]], i32 [[TMP44]], i32 7 +; SINK-GATHER-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 7 +; SINK-GATHER-NEXT: [[TMP42:%.*]] = sdiv i32 [[TMP41]], [[X]] +; SINK-GATHER-NEXT: [[TMP43:%.*]] = insertelement <8 x i32> [[TMP39]], i32 [[TMP42]], i32 7 ; SINK-GATHER-NEXT: br label [[PRED_SDIV_CONTINUE14]] ; SINK-GATHER: pred.sdiv.continue14: -; SINK-GATHER-NEXT: [[TMP46:%.*]] = phi <8 x i32> [ [[TMP41]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP45]], [[PRED_SDIV_IF13]] ] -; SINK-GATHER-NEXT: [[TMP47:%.*]] = add nsw <8 x i32> [[TMP46]], [[WIDE_LOAD]] -; SINK-GATHER-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP47]], <8 x i32> [[WIDE_LOAD]] -; SINK-GATHER-NEXT: [[TMP49]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] +; SINK-GATHER-NEXT: [[TMP44:%.*]] = phi <8 x i32> [ [[TMP39]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP43]], [[PRED_SDIV_IF13]] ] +; SINK-GATHER-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[TMP44]], [[WIDE_LOAD]] +; SINK-GATHER-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]] +; SINK-GATHER-NEXT: [[TMP47]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SINK-GATHER-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; SINK-GATHER-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SINK-GATHER-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; SINK-GATHER-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SINK-GATHER: middle.block: -; SINK-GATHER-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP49]]) +; SINK-GATHER-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP47]]) ; SINK-GATHER-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; SINK-GATHER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-GATHER: scalar.ph: ; SINK-GATHER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: br label [[FOR_BODY:%.*]] ; SINK-GATHER: for.body: ; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] ; SINK-GATHER-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T7:%.*]], [[FOR_INC]] ] -; SINK-GATHER-NEXT: [[T0:%.*]] = 
getelementptr inbounds i32, i32* [[A]], i64 [[I]] -; SINK-GATHER-NEXT: [[T1:%.*]] = load i32, i32* [[T0]], align 4 +; SINK-GATHER-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; SINK-GATHER-NEXT: [[T1:%.*]] = load i32, ptr [[T0]], align 4 ; SINK-GATHER-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; SINK-GATHER: if.then: -; SINK-GATHER-NEXT: [[T2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; SINK-GATHER-NEXT: [[T3:%.*]] = load i32, i32* [[T2]], align 4 +; SINK-GATHER-NEXT: [[T2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; SINK-GATHER-NEXT: [[T3:%.*]] = load i32, ptr [[T2]], align 4 ; SINK-GATHER-NEXT: [[T4:%.*]] = sdiv i32 [[T3]], [[X]] ; SINK-GATHER-NEXT: [[T5:%.*]] = add nsw i32 [[T4]], [[T1]] ; SINK-GATHER-NEXT: br label [[FOR_INC]] @@ -203,7 +199,7 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) { ; SINK-GATHER-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 ; SINK-GATHER-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; SINK-GATHER: for.end: -; SINK-GATHER-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; SINK-GATHER-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] ; SINK-GATHER-NEXT: ret i32 [[T8]] ; entry: @@ -212,13 +208,13 @@ entry: for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] %r = phi i32 [ 0, %entry ], [ %t7, %for.inc ] - %t0 = getelementptr inbounds i32, i32* %a, i64 %i - %t1 = load i32, i32* %t0, align 4 + %t0 = getelementptr inbounds i32, ptr %a, i64 %i + %t1 = load i32, ptr %t0, align 4 br i1 %c, label %if.then, label %for.inc if.then: - %t2 = getelementptr inbounds i32, i32* %b, i64 %i - %t3 = load i32, i32* %t2, align 4 + %t2 = getelementptr inbounds i32, ptr %b, i64 %i + %t3 = load i32, ptr %t2, align 4 %t4 = sdiv i32 %t3, %x %t5 = add nsw i32 %t4, %t1 br label %for.inc @@ -238,7 +234,7 @@ for.end: ; This test ensures that a load, which would have been widened otherwise is ; instead scalarized if Cost-Model so decided as part of its ; sink-scalar-operands optimization for predicated instructions. 
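;
; Instead of emitting a wide gather for the strided address (the index is
; multiplied by 777), the cost model may scalarize the load and sink it into
; the predicated block, so it executes only for active lanes. A per-lane
; sketch of the pattern the checks below expect (illustrative names):
;
;   %off0 = extractelement <2 x i64> %offsets, i32 0
;   %gep0 = getelementptr inbounds i32, ptr %a, i64 %off0
;   %val0 = load i32, ptr %gep0, align 4
;   %div0 = udiv i32 %val0, %x
;   %ins0 = insertelement <2 x i32> poison, i32 %div0, i32 0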
-define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { +define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK-LABEL: @scalarize_and_sink_gather( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) @@ -261,8 +257,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] @@ -273,8 +269,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.if1: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] @@ -302,8 +298,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: [[I7:%.*]] = mul i64 [[I]], 777 ; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I7]] -; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T0]], align 4 +; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I7]] +; CHECK-NEXT: [[T2:%.*]] = load i32, ptr [[T0]], align 4 ; CHECK-NEXT: [[T4:%.*]] = udiv i32 [[T2]], [[X]] ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -338,8 +334,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.udiv.if: ; SINK-GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 -; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP2]] -; SINK-GATHER-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] +; SINK-GATHER-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 ; SINK-GATHER-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]] ; SINK-GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 0 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE]] @@ -350,8 +346,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]] ; SINK-GATHER: pred.udiv.if1: ; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 -; SINK-GATHER-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] -; SINK-GATHER-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4 +; SINK-GATHER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; SINK-GATHER-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; SINK-GATHER-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] ; SINK-GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP13]], i32 1 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE2]] @@ -362,8 +358,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] ; SINK-GATHER: pred.udiv.if3: ; SINK-GATHER-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 -; SINK-GATHER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]] -; SINK-GATHER-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +; SINK-GATHER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; SINK-GATHER-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; SINK-GATHER-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[X]] ; SINK-GATHER-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP21]], i32 2 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE4]] @@ -374,8 +370,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP25]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] ; SINK-GATHER: pred.udiv.if5: ; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 -; SINK-GATHER-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]] -; SINK-GATHER-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +; SINK-GATHER-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] +; SINK-GATHER-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 ; SINK-GATHER-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[X]] ; SINK-GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP29]], i32 3 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE6]] @@ -386,8 +382,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP33]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] ; SINK-GATHER: pred.udiv.if7: ; SINK-GATHER-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 -; SINK-GATHER-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP34]] -; SINK-GATHER-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; SINK-GATHER-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] +; SINK-GATHER-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; SINK-GATHER-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP36]], [[X]] ; SINK-GATHER-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP37]], i32 4 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE8]] @@ -398,8 +394,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP41]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] ; SINK-GATHER: pred.udiv.if9: ; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 -; SINK-GATHER-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP42]] -; SINK-GATHER-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +; SINK-GATHER-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] +; SINK-GATHER-NEXT: 
[[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 ; SINK-GATHER-NEXT: [[TMP45:%.*]] = udiv i32 [[TMP44]], [[X]] ; SINK-GATHER-NEXT: [[TMP46:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP45]], i32 5 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE10]] @@ -410,8 +406,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP49]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] ; SINK-GATHER: pred.udiv.if11: ; SINK-GATHER-NEXT: [[TMP50:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 -; SINK-GATHER-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP50]] -; SINK-GATHER-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +; SINK-GATHER-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]] +; SINK-GATHER-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 ; SINK-GATHER-NEXT: [[TMP53:%.*]] = udiv i32 [[TMP52]], [[X]] ; SINK-GATHER-NEXT: [[TMP54:%.*]] = insertelement <8 x i32> [[TMP48]], i32 [[TMP53]], i32 6 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE12]] @@ -422,8 +418,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: br i1 [[TMP57]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14]] ; SINK-GATHER: pred.udiv.if13: ; SINK-GATHER-NEXT: [[TMP58:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 -; SINK-GATHER-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP58]] -; SINK-GATHER-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP59]], align 4 +; SINK-GATHER-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP58]] +; SINK-GATHER-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 ; SINK-GATHER-NEXT: [[TMP61:%.*]] = udiv i32 [[TMP60]], [[X]] ; SINK-GATHER-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP56]], i32 [[TMP61]], i32 7 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE14]] @@ -451,8 +447,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: [[I7:%.*]] = mul i64 [[I]], 777 ; SINK-GATHER-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; SINK-GATHER: if.then: -; SINK-GATHER-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I7]] -; SINK-GATHER-NEXT: [[T2:%.*]] = load i32, i32* [[T0]], align 4 +; SINK-GATHER-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I7]] +; SINK-GATHER-NEXT: [[T2:%.*]] = load i32, ptr [[T0]], align 4 ; SINK-GATHER-NEXT: [[T4:%.*]] = udiv i32 [[T2]], [[X]] ; SINK-GATHER-NEXT: br label [[FOR_INC]] ; SINK-GATHER: for.inc: @@ -475,8 +471,8 @@ for.body: br i1 %c, label %if.then, label %for.inc if.then: - %t0 = getelementptr inbounds i32, i32* %a, i64 %i7 - %t2 = load i32, i32* %t0, align 4 + %t0 = getelementptr inbounds i32, ptr %a, i64 %i7 + %t2 = load i32, ptr %t0, align 4 %t4 = udiv i32 %t2, %x br label %for.inc diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll index 5b73e6222efc1..cfdb2c81428d6 100644 --- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll +++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 ; Make sure the loop is vectorized and unrolled under -Os without folding its ; tail based on its trip-count being provably divisible by chosen VFxIC. 
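;
; The trip count below is the constant 1800 and the vector body advances by
; 6 per iteration (VF=2 interleaved by 3, judging by the three <2 x i32>
; stores and the add of 6 to the index in the checks); 1800 is divisible by
; 6, so no tail folding is required even under -Os. A sketch of the
; interleaved stores, with the splat of the scalar loop's constant 13
; written out (illustrative names):
;
;   store <2 x i32> <i32 13, i32 13>, ptr %gep0, align 1
;   store <2 x i32> <i32 13, i32 13>, ptr %gep2, align 1
;   store <2 x i32> <i32 13, i32 13>, ptr %gep4, align 1
;   %index.next = add nuw i32 %index, 6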
-define dso_local void @constTC(i32* noalias nocapture %A) optsize { +define dso_local void @constTC(ptr noalias nocapture %A) optsize { ; CHECK-LABEL: @constTC( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -17,21 +17,18 @@ define dso_local void @constTC(i32* noalias nocapture %A) optsize { ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP11]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <2 x i32> , ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 +; CHECK-NEXT: store <2 x i32> , ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4 +; CHECK-NEXT: store <2 x i32> , ptr [[TMP8]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1800, 1800 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -40,11 +37,11 @@ define dso_local void @constTC(i32* noalias nocapture %A) optsize { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]] -; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] +; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -53,8 +50,8 @@ entry: loop: %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ] - %arrayidx = getelementptr inbounds 
i32, i32* %A, i32 %riv - store i32 13, i32* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv + store i32 13, ptr %arrayidx, align 1 %rivPlus1 = add nuw nsw i32 %riv, 1 %cond = icmp eq i32 %rivPlus1, 1800 br i1 %cond, label %exit, label %loop diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll index baeab28e5ad87..44066a9ca8793 100644 --- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll +++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 ; Make sure the loop is vectorized under -Os without folding its tail based on ; its trip-count's lower bits known to be zero. -define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize { +define dso_local void @alignTC(ptr noalias nocapture %A, i32 %n) optsize { ; CHECK-LABEL: @alignTC( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALIGNEDTC:%.*]] = and i32 [[N:%.*]], -8 @@ -19,13 +19,12 @@ define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[TMP2]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[ALIGNEDTC]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -34,11 +33,11 @@ define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]] -; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] +; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[ALIGNEDTC]] -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -48,8 +47,8 @@ entry: loop: %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv - store i32 13, i32* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i32, 
ptr %A, i32 %riv + store i32 13, ptr %arrayidx, align 1 %rivPlus1 = add nuw nsw i32 %riv, 1 %cond = icmp eq i32 %rivPlus1, %alignedTC br i1 %cond, label %exit, label %loop @@ -61,7 +60,7 @@ exit: ; Make sure the loop is vectorized under -Os without folding its tail based on ; its trip-count's lower bits assumed to be zero. -define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize { +define dso_local void @assumeAlignedTC(ptr noalias nocapture %A, i32 %p, i32 %q) optsize { ; CHECK-LABEL: @assumeAlignedTC( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3 @@ -85,13 +84,12 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[TMP2]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -100,11 +98,11 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]] -; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] +; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -126,8 +124,8 @@ guarded: loop: %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv - store i32 13, i32* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv + store i32 13, ptr %arrayidx, align 1 %rivPlus1 = add nuw nsw i32 %riv, 1 %cond = icmp eq i32 %rivPlus1, %n br i1 %cond, label %exit, label %loop @@ -139,7 +137,7 @@ exit: ; Make sure the loop's tail is folded when vectorized under -Os based on its trip-count's ; not being provably divisible by chosen VF. 
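;
; Here the trip count depends on %p and %q and cannot be proven divisible by
; the chosen VF of 4, so under -Os the tail is folded rather than running a
; scalar remainder loop: each lane's store is guarded by its mask bit. A
; per-lane sketch of the pattern the checks below expect (illustrative
; names):
;
;   %bit0 = extractelement <4 x i1> %mask, i32 0
;   br i1 %bit0, label %pred.store.if, label %pred.store.continue
; pred.store.if:
;   %iv0 = add i32 %index, 0
;   %gep0 = getelementptr inbounds i32, ptr %A, i32 %iv0
;   store i32 13, ptr %gep0, align 1
;   br label %pred.store.continue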
-define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize { +define dso_local void @cannotProveAlignedTC(ptr noalias nocapture %A, i32 %p, i32 %q) optsize { ; CHECK-LABEL: @cannotProveAlignedTC( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3 @@ -171,38 +169,38 @@ define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]] -; CHECK-NEXT: store i32 13, i32* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP2]] +; CHECK-NEXT: store i32 13, ptr [[TMP3]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 ; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP5]] -; CHECK-NEXT: store i32 13, i32* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]] +; CHECK-NEXT: store i32 13, ptr [[TMP6]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 ; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP8]] -; CHECK-NEXT: store i32 13, i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP8]] +; CHECK-NEXT: store i32 13, ptr [[TMP9]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.if5: ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP11]] -; CHECK-NEXT: store i32 13, i32* [[TMP12]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]] +; CHECK-NEXT: store i32 13, ptr [[TMP12]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -210,11 +208,11 @@ define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]] 
-; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
+; CHECK-NEXT:    store i32 13, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -236,8 +234,8 @@ guarded:
 loop:
   %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
-  store i32 13, i32* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv
+  store i32 13, ptr %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1
   %cond = icmp eq i32 %rivPlus1, %n
   br i1 %cond, label %exit, label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 0ca133d9e25ac..57e3fa87c3cf7 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -4,7 +4,7 @@
 ;
 ; Integer reduction with a start value of 5
 ;
-define i64 @int_reduction_add(i64* %a, i64 %N) {
+define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-LABEL: @int_reduction_add(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -18,18 +18,17 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 0, i64 0, i64 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 0, i64 0, i64 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -37,43 +36,42 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12]] = add <4 x i64> [[WIDE_LOAD7]], [[VEC_PHI6]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP6]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10]] = add <4 x i64> [[WIDE_LOAD7]], [[VEC_PHI6]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP10]])
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add i64 [[TMP15]], [[SUM]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add i64 [[TMP13]], [[SUM]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[ADD_LCSSA]]
 ;
 entry:
@@ -82,8 +80,8 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %sum = phi i64 [ 5, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
-  %0 = load i64, i64* %arrayidx
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+  %0 = load i64, ptr %arrayidx
   %add = add i64 %0, %sum
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N
@@ -96,7 +94,7 @@ for.end:
 ;
 ; Floating point max reduction
 ;
-define float @fp_reduction_max(float* noalias %a, i64 %N) {
+define float @fp_reduction_max(ptr noalias %a, i64 %N) {
 ; CHECK-LABEL: @fp_reduction_max(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -110,19 +108,18 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -130,7 +127,7 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
@@ -138,38 +135,37 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x float> [[MINMAX_IDENT_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI6]], [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[TMP13]] = select <4 x i1> [[TMP12]], <4 x float> [[VEC_PHI6]], <4 x float> [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI6]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI6]], <4 x float> [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[C0:%.*]] = fcmp fast ogt float [[RESULT_08]], [[L0]]
 ; CHECK-NEXT:    [[V0]] = select fast i1 [[C0]], float [[RESULT_08]], float [[L0]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[V0_LCSSA:%.*]] = phi float [ [[V0]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[V0_LCSSA:%.*]] = phi float [ [[V0]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[V0_LCSSA]]
 ;
 entry:
@@ -178,8 +174,8 @@ entry:
 for.body:
   %indvars.iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %result.08 = phi float [ %v0, %for.body ], [ 0.000000e+00, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %l0 = load float, float* %arrayidx
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  %l0 = load float, ptr %arrayidx
   %c0 = fcmp fast ogt float %result.08, %l0
   %v0 = select fast i1 %c0, float %result.08, float %l0
   %iv.next = add i64 %indvars.iv, 1
@@ -193,7 +189,7 @@ for.end:
 ;
 ; Extension is required before the reduction operation & result is truncated
 ;
-define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
+define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
 ; CHECK-LABEL: @reduction_or_trunc(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
@@ -203,72 +199,70 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
+; CHECK-NEXT:    [[TMP8]] = zext <4 x i16> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]])
-; CHECK-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI3]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[TMP17]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32>
-; CHECK-NEXT:    br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP12]], [[VEC_EPILOG_PH]] ], [ [[TMP21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and <4 x i32> [[VEC_PHI3]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i16>, ptr [[TMP16]], align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
+; CHECK-NEXT:    [[TMP21]] = zext <4 x i16> [[TMP20]] to <4 x i32>
+; CHECK-NEXT:    br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
-; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i32> [[TMP21]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP22]])
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP23]] to i32
 ; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i32 256, 256
 ; CHECK-NEXT:    br i1 [[CMP_N1]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX6:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX6:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX6]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i16, ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[EXT:%.*]] = zext i16 [[LOAD]] to i32
 ; CHECK-NEXT:    [[XOR]] = or i32 [[SUM_02]], [[EXT]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[XOR_LCSSA]] to i16
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
@@ -279,8 +273,8 @@ for.body:
   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
   %sum.02p = phi i32 [ %xor, %for.body ], [ 0, %entry ]
   %sum.02 = and i32 %sum.02p, 65535
-  %gep = getelementptr inbounds i16, i16* %ptr, i32 %iv
-  %load = load i16, i16* %gep
+  %gep = getelementptr inbounds i16, ptr %ptr, i32 %iv
+  %load = load i16, ptr %gep
   %ext = zext i16 %load to i32
   %xor = or i32 %sum.02, %ext
   %iv.next = add i32 %iv, 1
@@ -295,7 +289,7 @@ for.end:
 ;
 ; More than one reduction in the loop
 ;
-define float @multiple_fp_rdx(float* %A, i64 %N) {
+define float @multiple_fp_rdx(ptr %A, i64 %N) {
 ; CHECK-LABEL: @multiple_fp_rdx(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -309,21 +303,20 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.500000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ <float 1.000000e+01, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.500000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ <float 1.000000e+01, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = fadd fast <4 x float> [[VEC_PHI2]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = fadd fast <4 x float> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -331,52 +324,51 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ 1.000000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ 1.000000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15]] = fadd fast <4 x float> [[VEC_PHI9]], [[WIDE_LOAD10]]
-; CHECK-NEXT:    [[TMP16]] = fmul fast <4 x float> [[VEC_PHI8]], [[WIDE_LOAD10]]
-; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <4 x float> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13]] = fadd fast <4 x float> [[VEC_PHI9]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    [[TMP14]] = fmul fast <4 x float> [[VEC_PHI8]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
-; CHECK-NEXT:    [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]])
+; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP14]])
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi float [ 1.500000e+01, [[ITER_CHECK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi float [ 1.000000e+01, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi float [ 1.500000e+01, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi float [ 1.000000e+01, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[PROD:%.*]] = phi float [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[SUM]], [[TMP20]]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[PROD]], [[TMP20]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = fadd fast float [[SUM]], [[TMP18]]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[PROD]], [[TMP18]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv float [[MUL_LCSSA]], [[ADD_LCSSA]]
 ; CHECK-NEXT:    ret float [[DIV]]
 ;
 entry:
@@ -387,8 +379,8 @@ for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %prod = phi float [ 15.000000e+00, %entry ], [ %mul, %for.body ]
   %sum = phi float [ 10.000000e+00, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
-  %0 = load float, float* %arrayidx
+  %arrayidx = getelementptr inbounds float, ptr %A, i64 %iv
+  %0 = load float, ptr %arrayidx
   %add = fadd fast float %sum, %0
   %mul = fmul fast float %prod, %0
   %iv.next = add nuw nsw i64 %iv, 1
@@ -403,7 +395,7 @@ for.end:
 ;
 ; Start value of the reduction is a Phi node from the outer loop
 ;
-define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
+define i32 @reduction_phi_start_val(ptr %A, i64 %N) {
 ; CHECK-LABEL: @reduction_phi_start_val(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[ITER_CHECK:%.*]]
@@ -422,18 +414,17 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -441,43 +432,42 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP13]] = sub <4 x i32> [[VEC_PHI6]], [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11]] = sub <4 x i32> [[VEC_PHI6]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_COND]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[START_SUM]], [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[START_SUM]], [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SUB:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       for.cond:
-; CHECK-NEXT:    [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
 ; CHECK-NEXT:    [[OUTER_EXITCOND_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[OUTER_EXITCOND_NOT]], label [[FOR_END:%.*]], label [[ITER_CHECK]]
@@ -496,8 +486,8 @@ for.cond.preheader:
 for.body:
   %iv = phi i64 [ 0, %for.cond.preheader ], [ %iv.next, %for.body ]
   %sum = phi i32 [ %start.sum, %for.cond.preheader ], [ %sub, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
-  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
+  %load = load i32, ptr %arrayidx, align 4
   %sub = sub nsw i32 %sum, %load
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
index 51afa12c0e0e5..850178538cbf9 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
@@ -47,7 +47,7 @@ for.cond1.for.end_crit_edge: ; preds = %for.body
 
 ; The 'previous' instruction of %for.2 is in a separate block.
 ; PR54195.
-define void @multiple_recurrences_with_previous_in_different_block(i32 %a, i8 %b, i64* %ptr) {
+define void @multiple_recurrences_with_previous_in_different_block(i32 %a, i8 %b, ptr %ptr) {
 ; CHECK-LABEL: @multiple_recurrences_with_previous_in_different_block(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
@@ -62,8 +62,8 @@ define void @multiple_recurrences_with_previous_in_different_block(i32 %a, i8 %b
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[PTR_GEP:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[SUB]], i64* [[PTR_GEP]], align 4
+; CHECK-NEXT:    [[PTR_GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[SUB]], ptr [[PTR_GEP]], align 4
 ; CHECK-NEXT:    [[FOR_2_NEXT]] = zext i32 [[A:%.*]] to i64
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       exit:
@@ -84,8 +84,8 @@ loop.header:
   br i1 %exitcond, label %exit, label %loop.latch
 
 loop.latch:
-  %ptr.gep = getelementptr inbounds i64, i64* %ptr, i64 %iv
-  store i64 %sub, i64* %ptr.gep
+  %ptr.gep = getelementptr inbounds i64, ptr %ptr, i64 %iv
+  store i64 %sub, ptr %ptr.gep
   %for.2.next = zext i32 %a to i64
   br label %loop.header
 
@@ -93,7 +93,7 @@ exit:
   ret void
 }
 
-define void @test_pr54223_sink_after_insertion_order(float* noalias %a, float* noalias %b, float* noalias %dst) {
+define void @test_pr54223_sink_after_insertion_order(ptr noalias %a, ptr noalias %b, ptr noalias %dst) {
 ; CHECK-LABEL: @test_pr54223_sink_after_insertion_order(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -104,23 +104,22 @@ define void @test_pr54223_sink_after_insertion_order(float* noalias %a, float* n
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x float> [ <float poison, float poison, float poison, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x float> [ <float poison, float poison, float poison, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR1]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fneg <4 x float> [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP7]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT]], i32 3
@@ -139,11 +138,11 @@ define void @test_pr54223_sink_after_insertion_order(float* noalias %a, float* n
 ; CHECK-NEXT:    [[SCALAR_RECUR7:%.*]] = phi float [ [[SCALAR_RECUR_INIT6]], [[SCALAR_PH]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[NEG:%.*]] = fneg float [[SCALAR_RECUR7]]
 ; CHECK-NEXT:    [[MULADD:%.*]] = call float @llvm.fmuladd.f32(float [[SCALAR_RECUR]], float [[NEG]], float 0.000000e+00)
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[IV]]
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load float, float* [[A]], align 4
-; CHECK-NEXT:    [[FOR_2_NEXT]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    store float [[MULADD]], float* [[DST_GEP]], align 4
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[FOR_2_NEXT]] = load float, ptr [[B]], align 4
+; CHECK-NEXT:    store float [[MULADD]], ptr [[DST_GEP]], align 4
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
@@ -158,11 +157,11 @@ loop:
   %for.2 = phi float [ 0.0, %entry ], [ %for.2.next, %loop ]
   %neg = fneg float %for.2
   %muladd = call float @llvm.fmuladd.f32(float %for.1, float %neg, float 0.000000e+00)
-  %dst.gep = getelementptr inbounds float, float* %dst, i64 %iv
+  %dst.gep = getelementptr inbounds float, ptr %dst, i64 %iv
   %iv.next = add nuw nsw i64 %iv, 1
-  %for.1.next = load float, float* %a, align 4
-  %for.2.next = load float, float* %b, align 4
-  store float %muladd, float* %dst.gep
+  %for.1.next = load float, ptr %a, align 4
+  %for.2.next = load float, ptr %b, align 4
+  store float %muladd, ptr %dst.gep
   %exitcond.not = icmp eq i64 %iv.next, 10000
   br i1 %exitcond.not, label %exit, label %loop
 
@@ -172,7 +171,7 @@ exit:
   ret void
 }
 
 declare float @llvm.fmuladd.f32(float, float, float) #1
 
-define void @test_pr54227(i32* noalias %a, i32* noalias %b) {
+define void @test_pr54227(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: @test_pr54227(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -186,10 +185,10 @@ define void @test_pr54227(i32* noalias %a, i32* noalias %b) {
 ; CHECK-NEXT:    [[AND1]] = and i32 [[E_0]], [[F_0]]
 ; CHECK-NEXT:    [[MUL]] = shl nsw i32 [[OR]], 1
 ; CHECK-NEXT:    [[ADD]] = or i32 [[AND1]], 1
-; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[A_GEP]], align 4
-; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[B_GEP]], align 4
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[A_GEP]], align 4
+; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[B_GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
@@ -209,10 +208,10 @@ loop:
   %and1 = and i32 %e.0, %f.0
   %mul = shl nsw i32 %or, 1
   %add = or i32 %and1, 1
-  %a.gep = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %add, i32* %a.gep, align 4
-  %b.gep = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %mul, i32* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %a.gep, align 4
+  %b.gep = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %mul, ptr %b.gep, align 4
   %iv.next = add nuw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 1000
   br i1 %exitcond, label %exit, label %loop
@@ -221,7 +220,7 @@ exit:
   ret void
 }
 
-define void @test_pr54233_for_depend_on_each_other(i32* noalias %a, i32* noalias %b) {
+define void @test_pr54233_for_depend_on_each_other(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: @test_pr54233_for_depend_on_each_other(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -234,9 +233,9 @@ define void @test_pr54233_for_depend_on_each_other(i32* noalias %a, i32* noalias
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[SHL]], 255
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[XOR]], [[OR]]
 ; CHECK-NEXT:    [[FOR_1_NEXT]] = xor i32 12, [[FOR_2]]
-; CHECK-NEXT:    [[FOR_2_NEXT]] = load i32, i32* [[B:%.*]], align 4
-; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[AND]], i32* [[A_GEP]], align 4
+; CHECK-NEXT:    [[FOR_2_NEXT]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[AND]], ptr [[A_GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
@@ -255,9 +254,9 @@ loop:
   %xor = xor i32 %shl, 255
   %and = and i32 %xor, %or
   %for.1.next = xor i32 12, %for.2
-  %for.2.next = load i32, i32* %b
-  %a.gep = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %and, i32* %a.gep, align 4
+  %for.2.next = load i32, ptr %b
+  %a.gep = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %and, ptr %a.gep, align 4
   %iv.next = add nuw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 1000
   br i1 %exitcond, label %exit, label %loop
@@ -266,7 +265,7 @@ exit:
   ret void
 }
 
-define void @pr54218(i8* %dst, i32* noalias %d) {
+define void @pr54218(ptr %dst, ptr noalias %d) {
 ; CHECK-LABEL: @pr54218(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
@@ -279,10 +278,10 @@ define void @pr54218(i8* %dst, i32* noalias %d) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load i32, i32* [[D:%.*]], align 4
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i32, ptr [[D:%.*]], align 4
 ; CHECK-NEXT:    [[P_NEXT]] = add i8 [[SEL]], -1
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i8 [[SEL]], i8* [[DST_GEP]], align 1
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i8 [[SEL]], ptr [[DST_GEP]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       exit:
@@ -301,10 +300,10 @@ loop.header:
   br i1 %exitcond, label %exit, label %loop.latch
 
 loop.latch:
-  %for.1.next = load i32, i32* %d, align 4
+  %for.1.next = load i32, ptr %d, align 4
   %p.next = add i8 %sel, -1
-  %dst.gep = getelementptr inbounds i8, i8* %dst, i64 %iv
-  store i8 %sel, i8* %dst.gep
+  %dst.gep = getelementptr inbounds i8, ptr %dst, i64 %iv
+  store i8 %sel, ptr %dst.gep
   %iv.next = add nuw i64 %iv, 1
   br label %loop.header
 
@@ -312,7 +311,7 @@ exit:
   ret void
 }
 
-define void @pr54254_fors_depend_on_each_other(i32* %dst) {
+define void @pr54254_fors_depend_on_each_other(ptr %dst) {
 ; CHECK-LABEL: @pr54254_fors_depend_on_each_other(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -325,9 +324,9 @@ define void @pr54254_fors_depend_on_each_other(i32* %dst) {
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[F_0]], [[NEG]]
 ; CHECK-NEXT:    [[XOR]] = xor i32 [[C_0]], 1
 ; CHECK-NEXT:    [[XOR1]] = xor i32 [[REM]], 1
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[REM_NEXT]] = add i32 [[REM]], 10
-; CHECK-NEXT:    store i32 [[REM]], i32* [[DST_GEP]], align 4
+; CHECK-NEXT:    store i32 [[REM]], ptr [[DST_GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
@@ -346,9 +345,9 @@ loop:
   %rem = srem i32 %f.0, %neg
   %xor = xor i32 %c.0, 1
   %xor1 = xor i32 %rem, 1
-  %dst.gep = getelementptr inbounds i32, i32* %dst, i64 %iv
+  %dst.gep = getelementptr inbounds i32, ptr %dst, i64 %iv
   %rem.next = add i32 %rem, 10
-  store i32 %rem, i32* %dst.gep
+  store i32 %rem, ptr %dst.gep
   %iv.next = add nuw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 1000
   br i1 %exitcond, label %exit, label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
index 1e58679acca98..43229dc83d52e 100644
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -4,16 +4,16 @@
 
 ; The function finds the smallest value from a float vector.
 ; Check if vectorization is enabled by instruction flag `fcmp nnan`.
-define float @minloop(float* nocapture readonly %arg) {
+define float @minloop(ptr nocapture readonly %arg) {
; CHECK-LABEL: @minloop(
; CHECK-NEXT: top:
-; CHECK-NEXT: [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT: [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[TOP:%.*]] ]
; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[TOP]] ]
-; CHECK-NEXT: [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT: [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT: [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[T3]], align 4
; CHECK-NEXT: [[T5:%.*]] = fcmp nnan olt float [[T2]], [[T4]]
; CHECK-NEXT: [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
; CHECK-NEXT: [[T7]] = add i64 [[T1]], 1
@@ -24,14 +24,14 @@ define float @minloop(float* nocapture readonly %arg) {
; CHECK-NEXT: ret float [[T6_LCSSA]]
;
top:
- %t = load float, float* %arg
+ %t = load float, ptr %arg
br label %loop
loop: ; preds = %loop, %top
%t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
%t2 = phi float [ %t6, %loop ], [ %t, %top ]
- %t3 = getelementptr float, float* %arg, i64 %t1
- %t4 = load float, float* %t3, align 4
+ %t3 = getelementptr float, ptr %arg, i64 %t1
+ %t4 = load float, ptr %t3, align 4
%t5 = fcmp nnan olt float %t2, %t4
%t6 = select i1 %t5, float %t2, float %t4
%t7 = add i64 %t1, 1
@@ -44,10 +44,10 @@ out: ; preds = %loop
; Check if vectorization is still enabled by function attribute.
-define float @minloopattr(float* nocapture readonly %arg) #0 {
+define float @minloopattr(ptr nocapture readonly %arg) #0 {
; CHECK-LABEL: @minloopattr(
; CHECK-NEXT: top:
-; CHECK-NEXT: [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT: [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[T]], i32 0
@@ -55,49 +55,48 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[ARG]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP4]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT: [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT: [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[T3]], align 4
; CHECK-NEXT: [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
; CHECK-NEXT: [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
; CHECK-NEXT: [[T7]] = add i64 [[T1]], 1
; CHECK-NEXT: [[T8:%.*]] = icmp eq i64 [[T7]], 65537
-; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: out:
-; CHECK-NEXT: [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[T6_LCSSA]]
;
top:
- %t = load float, float* %arg
+ %t = load float, ptr %arg
br label %loop
loop: ; preds = %loop, %top
%t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
%t2 = phi float [ %t6, %loop ], [ %t, %top ]
- %t3 = getelementptr float, float* %arg, i64 %t1
- %t4 = load float, float* %t3, align 4
+ %t3 = getelementptr float, ptr %arg, i64 %t1
+ %t4 = load float, ptr %t3, align 4
%t5 = fcmp olt float %t2, %t4
%t6 = select i1 %t5, float %t2, float %t4
%t7 = add i64 %t1, 1
@@ -110,16 +109,16 @@ out: ; preds = %loop
; Check if vectorization is prevented without the flag or attribute.
-define float @minloopnovec(float* nocapture readonly %arg) {
+define float @minloopnovec(ptr nocapture readonly %arg) {
; CHECK-LABEL: @minloopnovec(
; CHECK-NEXT: top:
-; CHECK-NEXT: [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT: [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[TOP:%.*]] ]
; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[TOP]] ]
-; CHECK-NEXT: [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT: [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT: [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[T3]], align 4
; CHECK-NEXT: [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
; CHECK-NEXT: [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
; CHECK-NEXT: [[T7]] = add i64 [[T1]], 1
@@ -130,14 +129,14 @@ define float @minloopnovec(float* nocapture readonly %arg) {
; CHECK-NEXT: ret float [[T6_LCSSA]]
;
top:
- %t = load float, float* %arg
+ %t = load float, ptr %arg
br label %loop
loop: ; preds = %loop, %top
%t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
%t2 = phi float [ %t6, %loop ], [ %t, %top ]
- %t3 = getelementptr float, float* %arg, i64 %t1
- %t4 = load float, float* %t3, align 4
+ %t3 = getelementptr float, ptr %arg, i64 %t1
+ %t4 = load float, ptr %t3, align 4
%t5 = fcmp olt float %t2, %t4
%t6 = select i1 %t5, float %t2, float %t4
%t7 = add i64 %t1, 1
@@ -150,16 +149,16 @@ out: ; preds = %loop
; This test is checking that we don't vectorize when only one of the required attributes is set.
; Note that this test should not vectorize even after switching to IR-level FMF.
-define float @minloopmissingnsz(float* nocapture readonly %arg) #1 {
+define float @minloopmissingnsz(ptr nocapture readonly %arg) #1 {
; CHECK-LABEL: @minloopmissingnsz(
; CHECK-NEXT: top:
-; CHECK-NEXT: [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT: [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[TOP:%.*]] ]
; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[TOP]] ]
-; CHECK-NEXT: [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT: [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT: [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[T3]], align 4
; CHECK-NEXT: [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
; CHECK-NEXT: [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
; CHECK-NEXT: [[T7]] = add i64 [[T1]], 1
@@ -170,14 +169,14 @@ define float @minloopmissingnsz(float* nocapture readonly %arg) #1 {
; CHECK-NEXT: ret float [[T6_LCSSA]]
;
top:
- %t = load float, float* %arg
+ %t = load float, ptr %arg
br label %loop
loop: ; preds = %loop, %top
%t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
%t2 = phi float [ %t6, %loop ], [ %t, %top ]
- %t3 = getelementptr float, float* %arg, i64 %t1
- %t4 = load float, float* %t3, align 4
+ %t3 = getelementptr float, ptr %arg, i64 %t1
+ %t4 = load float, ptr %t3, align 4
%t5 = fcmp olt float %t2, %t4
%t6 = select i1 %t5, float %t2, float %t4
%t7 = add i64 %t1, 1
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 265f49c40a18b..a72ab450a9094 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
+define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -17,11 +17,12 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 4
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -29,26 +30,23 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !3
-; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]],
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> , <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]],
-; CHECK-NEXT: [[TMP16:%.*]] = and <4 x i1> [[TMP10]], [[TMP15]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> , <4 x i32>
-; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[TMP13]], <4 x i32> [[PREDPHI]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP17]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]],
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]],
+; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i1> [[TMP9]], [[TMP14]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> , <4 x i32>
+; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP7]], align 4, !alias.scope !0, !noalias !3
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -57,22 +55,22 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]]
; CHECK: if.then:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP19]], 19
+; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], 19
; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]]
; CHECK: if.else:
-; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP20]], 4
+; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP18]], 4
; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5
; CHECK-NEXT: br label [[IF_END14]]
; CHECK: if.end14:
; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[DOT]], [[IF_ELSE]] ]
-; CHECK-NEXT: store i32 [[X_0]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: store i32 [[X_0]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -88,10 +86,10 @@ entry:
for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
- %1 = load i32, i32* %arrayidx2, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
%cmp3 = icmp sgt i32 %0, %1
br i1 %cmp3, label %if.then, label %if.end14
@@ -106,7 +104,7 @@ if.else:
if.end14:
%x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %., %if.else ] ; <------------- A PHI with 3 entries that we can still vectorize.
- store i32 %x.0, i32* %arrayidx, align 4
+ store i32 %x.0, ptr %arrayidx, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
index 8aabbb2ba09e6..0310f70773922 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -7,139 +7,123 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Test predication of non-void instructions, specifically (i) that these
; instructions permit vectorization and (ii) the creation of an insertelement
; and a Phi node. We check the full 2-element sequence for all predicate instructions.
-define void @test(i32* nocapture %asd, i32* nocapture %aud,
+define void @test(ptr nocapture %asd, ptr nocapture %aud,
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT: [[AUD3:%.*]] = bitcast i32* [[AUD:%.*]] to i8*
-; CHECK-NEXT: [[ASR6:%.*]] = bitcast i32* [[ASR:%.*]] to i8*
-; CHECK-NEXT: [[AUR9:%.*]] = bitcast i32* [[AUR:%.*]] to i8*
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[AUD]], i64 128
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[ASR]], i64 128
-; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr i32, i32* [[AUR]], i64 128
-; CHECK-NEXT: [[SCEVGEP1011:%.*]] = bitcast i32* [[SCEVGEP10]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[AUD:%.*]], i64 512
+; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[ASR:%.*]], i64 512
+; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[AUR:%.*]], i64 512
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]]
-; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP2]]
+; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
+; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP3]]
+; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]]
+; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]]
+; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP2]]
+; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP1]]
+; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]]
+; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]]
+; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP3]]
+; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]]
-; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]]
-; CHECK-NEXT: [[BOUND019:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND120:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP45]]
+; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]]
+; CHECK-NEXT: [[BOUND019:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP3]]
+; CHECK-NEXT: [[BOUND120:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP2]]
; CHECK-NEXT: [[FOUND_CONFLICT21:%.*]] = and i1 [[BOUND019]], [[BOUND120]]
; CHECK-NEXT: [[CONFLICT_RDX22:%.*]] = or i1 [[CONFLICT_RDX18]], [[FOUND_CONFLICT21]]
-; CHECK-NEXT: [[BOUND023:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND124:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT25:%.*]] = and i1 [[BOUND023]], [[BOUND124]]
-; CHECK-NEXT: [[CONFLICT_RDX26:%.*]] = or i1 [[CONFLICT_RDX22]], [[FOUND_CONFLICT25]]
-; CHECK-NEXT: [[BOUND027:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND128:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]]
-; CHECK-NEXT: [[CONFLICT_RDX30:%.*]] = or i1 [[CONFLICT_RDX26]], [[FOUND_CONFLICT29]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX30]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX22]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE35:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE27:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !5, !noalias !8
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !alias.scope !12, !noalias !13
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4, !alias.scope !14, !noalias !15
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4, !alias.scope !15
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i32> [[WIDE_LOAD31]],
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[WIDE_LOAD32]],
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i32> [[WIDE_LOAD33]],
-; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP17]], i32 0
-; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4, !alias.scope !5, !noalias !8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope !12, !noalias !13
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4, !alias.scope !14, !noalias !15
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope !15
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[WIDE_LOAD23]],
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i32> [[WIDE_LOAD24]],
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i32> [[WIDE_LOAD25]],
+; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
; CHECK: pred.urem.if:
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP19]], [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD31]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = udiv i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = srem i32 [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> poison, i32 [[TMP25]], i32 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[WIDE_LOAD32]], i32 0
-; CHECK-NEXT: [[TMP29:%.*]] = srem i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = urem i32 [[TMP27]], [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD33]], i32 0
-; CHECK-NEXT: [[TMP33:%.*]] = urem i32 [[TMP31]], [[TMP32]]
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TMP33]], i32 0
; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]]
; CHECK: pred.urem.continue:
-; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP37:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP38:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP34]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP17]], i32 1
-; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_UREM_IF34:%.*]], label [[PRED_UREM_CONTINUE35]]
-; CHECK: pred.urem.if34:
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP42:%.*]] = sdiv i32 [[TMP40]], [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP42]], i32 1
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[WIDE_LOAD31]], i32 1
-; CHECK-NEXT: [[TMP46:%.*]] = udiv i32 [[TMP44]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP36]], i32 [[TMP46]], i32 1
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[WIDE_LOAD32]], i32 1
-; CHECK-NEXT: [[TMP50:%.*]] = srem i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP37]], i32 [[TMP50]], i32 1
-; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i32> [[WIDE_LOAD33]], i32 1
-; CHECK-NEXT: [[TMP54:%.*]] = urem i32 [[TMP52]], [[TMP53]]
-; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP54]], i32 1
-; CHECK-NEXT: br label [[PRED_UREM_CONTINUE35]]
-; CHECK: pred.urem.continue35:
-; CHECK-NEXT: [[TMP56:%.*]] = phi <2 x i32> [ [[TMP35]], [[PRED_UREM_CONTINUE]] ], [ [[TMP43]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT: [[TMP57:%.*]] = phi <2 x i32> [ [[TMP36]], [[PRED_UREM_CONTINUE]] ], [ [[TMP47]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT: [[TMP58:%.*]] = phi <2 x i32> [ [[TMP37]], [[PRED_UREM_CONTINUE]] ], [ [[TMP51]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT: [[TMP59:%.*]] = phi <2 x i32> [ [[TMP38]], [[PRED_UREM_CONTINUE]] ], [ [[TMP55]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT: [[TMP60:%.*]] = xor <2 x i1> [[TMP17]],
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP13]], <2 x i32> [[TMP56]]
-; CHECK-NEXT: [[PREDPHI36:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP14]], <2 x i32> [[TMP57]]
-; CHECK-NEXT: [[PREDPHI37:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP15]], <2 x i32> [[TMP58]]
-; CHECK-NEXT: [[PREDPHI38:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP16]], <2 x i32> [[TMP59]]
-; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP62]], align 4, !alias.scope !5, !noalias !8
-; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI36]], <2 x i32>* [[TMP64]], align 4, !alias.scope !12, !noalias !13
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[TMP65]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI37]], <2 x i32>* [[TMP66]], align 4, !alias.scope !14, !noalias !15
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP68:%.*]] = bitcast i32* [[TMP67]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI38]], <2 x i32>* [[TMP68]], align 4, !alias.scope !15
+; CHECK-NEXT: [[TMP31:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP18]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_UREM_IF26:%.*]], label [[PRED_UREM_CONTINUE27]]
+; CHECK: pred.urem.if26:
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP38:%.*]] = sdiv i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[TMP38]], i32 1
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 1
+; CHECK-NEXT: [[TMP42:%.*]] = udiv i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP42]], i32 1
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 1
+; CHECK-NEXT: [[TMP46:%.*]] = srem i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP46]], i32 1
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 1
+; CHECK-NEXT: [[TMP50:%.*]] = urem i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TMP50]], i32 1
+; CHECK-NEXT: br label [[PRED_UREM_CONTINUE27]]
+; CHECK: pred.urem.continue27:
+; CHECK-NEXT: [[TMP52:%.*]] = phi <2 x i32> [ [[TMP31]], [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF26]] ]
+; CHECK-NEXT: [[TMP53:%.*]] = phi <2 x i32> [ [[TMP32]], [[PRED_UREM_CONTINUE]] ], [ [[TMP43]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[TMP54:%.*]] = phi <2 x i32> [ [[TMP33]], [[PRED_UREM_CONTINUE]] ], [ [[TMP47]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[TMP55:%.*]] = phi <2 x i32> [ [[TMP34]], [[PRED_UREM_CONTINUE]] ], [ [[TMP51]], [[PRED_UREM_IF26]] ] +; CHECK-NEXT: [[TMP56:%.*]] = xor <2 x i1> [[TMP13]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP9]], <2 x i32> [[TMP52]] +; CHECK-NEXT: [[PREDPHI28:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP10]], <2 x i32> [[TMP53]] +; CHECK-NEXT: [[PREDPHI29:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP11]], <2 x i32> [[TMP54]] +; CHECK-NEXT: [[PREDPHI30:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP12]], <2 x i32> [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP57]], align 4, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI28]], ptr [[TMP58]], align 4, !alias.scope !12, !noalias !13 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI29]], ptr [[TMP59]], align 4, !alias.scope !14, !noalias !15 +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI30]], ptr [[TMP60]], align 4, !alias.scope !15 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -150,14 +134,14 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud, ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[IUD:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[ISR:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[IUR:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4 -; CHECK-NEXT: [[LUD:%.*]] = load i32, i32* [[IUD]], align 4 -; CHECK-NEXT: [[LSR:%.*]] = load i32, i32* [[ISR]], align 4 -; CHECK-NEXT: [[LUR:%.*]] = load i32, i32* [[IUR]], align 4 +; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IUD:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ISR:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IUR:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[LSD:%.*]] = load i32, ptr [[ISD]], align 4 +; CHECK-NEXT: [[LUD:%.*]] = load i32, ptr [[IUD]], align 4 +; CHECK-NEXT: [[LSR:%.*]] = load i32, ptr [[ISR]], align 4 +; CHECK-NEXT: [[LUR:%.*]] = load i32, ptr [[IUR]], align 4 ; CHECK-NEXT: 
[[PSD:%.*]] = add nsw i32 [[LSD]], 23 ; CHECK-NEXT: [[PUD:%.*]] = add nsw i32 [[LUD]], 24 ; CHECK-NEXT: [[PSR:%.*]] = add nsw i32 [[LSR]], 25 @@ -175,131 +159,123 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud, ; CHECK-NEXT: [[YUD_0:%.*]] = phi i32 [ [[RUD]], [[IF_THEN]] ], [ [[PUD]], [[FOR_BODY]] ] ; CHECK-NEXT: [[YSR_0:%.*]] = phi i32 [ [[RSR]], [[IF_THEN]] ], [ [[PSR]], [[FOR_BODY]] ] ; CHECK-NEXT: [[YUR_0:%.*]] = phi i32 [ [[RUR]], [[IF_THEN]] ], [ [[PUR]], [[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4 -; CHECK-NEXT: store i32 [[YUD_0]], i32* [[IUD]], align 4 -; CHECK-NEXT: store i32 [[YSR_0]], i32* [[ISR]], align 4 -; CHECK-NEXT: store i32 [[YUR_0]], i32* [[IUR]], align 4 +; CHECK-NEXT: store i32 [[YSD_0]], ptr [[ISD]], align 4 +; CHECK-NEXT: store i32 [[YUD_0]], ptr [[IUD]], align 4 +; CHECK-NEXT: store i32 [[YSR_0]], ptr [[ISR]], align 4 +; CHECK-NEXT: store i32 [[YUR_0]], ptr [[IUR]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @test( ; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[AUD3:%.*]] = bitcast i32* [[AUD:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[ASR6:%.*]] = bitcast i32* [[ASR:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[AUR9:%.*]] = bitcast i32* [[AUR:%.*]] to i8* ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[AUD]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[ASR]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP10:%.*]] = getelementptr i32, i32* [[AUR]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP1011:%.*]] = bitcast i32* [[SCEVGEP10]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2]] +; UNROLL-NO-VF-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[AUD:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[ASR:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[AUR:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]] +; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP]] ; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-VF-NEXT: [[BOUND012:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP78]] -; UNROLL-NO-VF-NEXT: [[BOUND113:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]] -; UNROLL-NO-VF-NEXT: [[BOUND015:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1011]] -; UNROLL-NO-VF-NEXT: [[BOUND116:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2]] +; UNROLL-NO-VF-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[ASD]], 
[[UGLYGEP2]] +; UNROLL-NO-VF-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP]] +; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; UNROLL-NO-VF-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP3]] +; UNROLL-NO-VF-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP]] +; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; UNROLL-NO-VF-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP2]] +; UNROLL-NO-VF-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP1]] +; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; UNROLL-NO-VF-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP3]] +; UNROLL-NO-VF-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP1]] ; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]] -; UNROLL-NO-VF-NEXT: [[BOUND019:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP78]] -; UNROLL-NO-VF-NEXT: [[BOUND120:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP45]] +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; UNROLL-NO-VF-NEXT: [[BOUND019:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP3]] +; UNROLL-NO-VF-NEXT: [[BOUND120:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP2]] ; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT21:%.*]] = and i1 [[BOUND019]], [[BOUND120]] ; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX22:%.*]] = or i1 [[CONFLICT_RDX18]], [[FOUND_CONFLICT21]] -; UNROLL-NO-VF-NEXT: [[BOUND023:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1011]] -; UNROLL-NO-VF-NEXT: [[BOUND124:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT25:%.*]] = and i1 [[BOUND023]], [[BOUND124]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX26:%.*]] = or i1 [[CONFLICT_RDX22]], [[FOUND_CONFLICT25]] -; UNROLL-NO-VF-NEXT: [[BOUND027:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1011]] -; UNROLL-NO-VF-NEXT: [[BOUND128:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP78]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX30:%.*]] = or i1 [[CONFLICT_RDX26]], [[FOUND_CONFLICT29]] -; UNROLL-NO-VF-NEXT: br i1 [[CONFLICT_RDX30]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: br i1 [[CONFLICT_RDX22]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE33:%.*]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION31:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, i32* [[ASR]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDUCTION31]] -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !15 -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !15 -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = add nsw i32 [[TMP8]], 23 -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP9]], 23 -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP10]], 24 -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP11]], 24 -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP12]], 25 -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = add nsw i32 [[TMP13]], 25 -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = add nsw i32 [[TMP14]], 26 -; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP15]], 26 -; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = icmp slt i32 [[TMP8]], 100 -; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = icmp slt i32 [[TMP9]], 100 -; UNROLL-NO-VF-NEXT: br i1 [[TMP24]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE24:%.*]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope !15 +; 
UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP10]], 23 +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = add nsw i32 [[TMP11]], 23 +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP12]], 24 +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = add nsw i32 [[TMP13]], 24 +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = add nsw i32 [[TMP14]], 25 +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP15]], 25 +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = add nsw i32 [[TMP16]], 26 +; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = add nsw i32 [[TMP17]], 26 +; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = icmp slt i32 [[TMP10]], 100 +; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = icmp slt i32 [[TMP11]], 100 +; UNROLL-NO-VF-NEXT: br i1 [[TMP26]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.urem.if: -; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = sdiv i32 [[TMP16]], [[TMP8]] -; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP18]], [[TMP10]] -; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = srem i32 [[TMP20]], [[TMP12]] -; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = urem i32 [[TMP22]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = sdiv i32 [[TMP18]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP20]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = srem i32 [[TMP22]], [[TMP14]] +; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = urem i32 [[TMP24]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE]] ; UNROLL-NO-VF: pred.urem.continue: -; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP27]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP28]], [[PRED_UREM_IF]] ] ; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP29]], [[PRED_UREM_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP25]], label [[PRED_UREM_IF32:%.*]], label [[PRED_UREM_CONTINUE33]] -; UNROLL-NO-VF: pred.urem.if31: -; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = sdiv i32 [[TMP17]], [[TMP9]] -; UNROLL-NO-VF-NEXT: [[TMP35:%.*]] = udiv i32 [[TMP19]], [[TMP11]] -; UNROLL-NO-VF-NEXT: [[TMP36:%.*]] = srem i32 [[TMP21]], [[TMP13]] -; UNROLL-NO-VF-NEXT: [[TMP37:%.*]] = urem i32 [[TMP23]], [[TMP15]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE33]] -; UNROLL-NO-VF: pred.urem.continue32: -; UNROLL-NO-VF-NEXT: [[TMP38:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP34]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP39:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP35]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF32]] ] -; UNROLL-NO-VF-NEXT: [[TMP42:%.*]] = xor i1 [[TMP24]], true -; UNROLL-NO-VF-NEXT: [[TMP43:%.*]] = xor i1 [[TMP25]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP42]], i32 [[TMP16]], i32 [[TMP30]] -; UNROLL-NO-VF-NEXT: [[PREDPHI34:%.*]] = select i1 [[TMP43]], i32 [[TMP17]], i32 [[TMP38]] -; UNROLL-NO-VF-NEXT: [[PREDPHI35:%.*]] = select i1 [[TMP42]], i32 [[TMP18]], i32 [[TMP31]] -; UNROLL-NO-VF-NEXT: [[PREDPHI36:%.*]] = select i1 [[TMP43]], i32 [[TMP19]], i32 [[TMP39]] -; UNROLL-NO-VF-NEXT: [[PREDPHI37:%.*]] = select i1 [[TMP42]], i32 [[TMP20]], i32 [[TMP32]] -; UNROLL-NO-VF-NEXT: [[PREDPHI38:%.*]] = select i1 [[TMP43]], i32 [[TMP21]], i32 [[TMP40]] -; UNROLL-NO-VF-NEXT: [[PREDPHI39:%.*]] = select i1 [[TMP42]], i32 [[TMP22]], i32 
[[TMP33]] -; UNROLL-NO-VF-NEXT: [[PREDPHI40:%.*]] = select i1 [[TMP43]], i32 [[TMP23]], i32 [[TMP41]] -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI34]], i32* [[TMP1]], align 4, !alias.scope !5, !noalias !8 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI35]], i32* [[TMP2]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI36]], i32* [[TMP3]], align 4, !alias.scope !12, !noalias !13 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI37]], i32* [[TMP4]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI38]], i32* [[TMP5]], align 4, !alias.scope !14, !noalias !15 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI39]], i32* [[TMP6]], align 4, !alias.scope !15 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI40]], i32* [[TMP7]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP35:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP31]], [[PRED_UREM_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP27]], label [[PRED_UREM_IF23:%.*]], label [[PRED_UREM_CONTINUE24]] +; UNROLL-NO-VF: pred.urem.if23: +; UNROLL-NO-VF-NEXT: [[TMP36:%.*]] = sdiv i32 [[TMP19]], [[TMP11]] +; UNROLL-NO-VF-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP21]], [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP38:%.*]] = srem i32 [[TMP23]], [[TMP15]] +; UNROLL-NO-VF-NEXT: [[TMP39:%.*]] = urem i32 [[TMP25]], [[TMP17]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UREM_CONTINUE24]] +; UNROLL-NO-VF: pred.urem.continue24: +; UNROLL-NO-VF-NEXT: [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF23]] ] +; UNROLL-NO-VF-NEXT: [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF23]] ] +; UNROLL-NO-VF-NEXT: [[TMP42:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP38]], [[PRED_UREM_IF23]] ] +; UNROLL-NO-VF-NEXT: [[TMP43:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF23]] ] +; UNROLL-NO-VF-NEXT: [[TMP44:%.*]] = xor i1 [[TMP26]], true +; UNROLL-NO-VF-NEXT: [[TMP45:%.*]] = xor i1 [[TMP27]], true +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP44]], i32 [[TMP18]], i32 [[TMP32]] +; UNROLL-NO-VF-NEXT: [[PREDPHI25:%.*]] = select i1 [[TMP45]], i32 [[TMP19]], i32 [[TMP40]] +; UNROLL-NO-VF-NEXT: [[PREDPHI26:%.*]] = select i1 [[TMP44]], i32 [[TMP20]], i32 [[TMP33]] +; UNROLL-NO-VF-NEXT: [[PREDPHI27:%.*]] = select i1 [[TMP45]], i32 [[TMP21]], i32 [[TMP41]] +; UNROLL-NO-VF-NEXT: [[PREDPHI28:%.*]] = select i1 [[TMP44]], i32 [[TMP22]], i32 [[TMP34]] +; UNROLL-NO-VF-NEXT: [[PREDPHI29:%.*]] = select i1 [[TMP45]], i32 [[TMP23]], i32 [[TMP42]] +; UNROLL-NO-VF-NEXT: [[PREDPHI30:%.*]] = select i1 [[TMP44]], i32 [[TMP24]], i32 [[TMP35]] +; UNROLL-NO-VF-NEXT: [[PREDPHI31:%.*]] = select i1 [[TMP45]], i32 [[TMP25]], i32 [[TMP43]] +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI25]], ptr [[TMP3]], align 4, !alias.scope !5, !noalias !8 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI26]], ptr [[TMP4]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI27]], ptr [[TMP5]], align 4, !alias.scope !12, !noalias !13 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI28]], ptr [[TMP6]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI29]], ptr [[TMP7]], align 4, !alias.scope !14, !noalias !15 +; UNROLL-NO-VF-NEXT: store i32 
[[PREDPHI30]], ptr [[TMP8]], align 4, !alias.scope !15 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI31]], ptr [[TMP9]], align 4, !alias.scope !15 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NO-VF-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NO-VF-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -310,14 +286,14 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud, ; UNROLL-NO-VF-NEXT: ret void ; UNROLL-NO-VF: for.body: ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ] -; UNROLL-NO-VF-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[IUD:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[ISR:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[IUR:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4 -; UNROLL-NO-VF-NEXT: [[LUD:%.*]] = load i32, i32* [[IUD]], align 4 -; UNROLL-NO-VF-NEXT: [[LSR:%.*]] = load i32, i32* [[ISR]], align 4 -; UNROLL-NO-VF-NEXT: [[LUR:%.*]] = load i32, i32* [[IUR]], align 4 +; UNROLL-NO-VF-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[IUD:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[ISR:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[IUR:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[LSD:%.*]] = load i32, ptr [[ISD]], align 4 +; UNROLL-NO-VF-NEXT: [[LUD:%.*]] = load i32, ptr [[IUD]], align 4 +; UNROLL-NO-VF-NEXT: [[LSR:%.*]] = load i32, ptr [[ISR]], align 4 +; UNROLL-NO-VF-NEXT: [[LUR:%.*]] = load i32, ptr [[IUR]], align 4 ; UNROLL-NO-VF-NEXT: [[PSD:%.*]] = add nsw i32 [[LSD]], 23 ; UNROLL-NO-VF-NEXT: [[PUD:%.*]] = add nsw i32 [[LUD]], 24 ; UNROLL-NO-VF-NEXT: [[PSR:%.*]] = add nsw i32 [[LSR]], 25 @@ -335,15 +311,15 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud, ; UNROLL-NO-VF-NEXT: [[YUD_0:%.*]] = phi i32 [ [[RUD]], [[IF_THEN]] ], [ [[PUD]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[YSR_0:%.*]] = phi i32 [ [[RSR]], [[IF_THEN]] ], [ [[PSR]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[YUR_0:%.*]] = phi i32 [ [[RUR]], [[IF_THEN]] ], [ [[PUR]], [[FOR_BODY]] ] -; UNROLL-NO-VF-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[YUD_0]], i32* [[IUD]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[YSR_0]], i32* [[ISR]], align 4 -; UNROLL-NO-VF-NEXT: store i32 [[YUR_0]], i32* [[IUR]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[YSD_0]], ptr [[ISD]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[YUD_0]], ptr [[IUD]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[YSR_0]], ptr [[ISR]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[YUR_0]], ptr [[IUR]], align 4 ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i64 
[[INDVARS_IV_NEXT]], 128 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; - i32* nocapture %asr, i32* nocapture %aur) { + ptr nocapture %asr, ptr nocapture %aur) { entry: br label %for.body @@ -352,14 +328,14 @@ for.cond.cleanup: ; preds = %if.end for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] - %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv - %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv - %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv - %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv - %lsd = load i32, i32* %isd, align 4 - %lud = load i32, i32* %iud, align 4 - %lsr = load i32, i32* %isr, align 4 - %lur = load i32, i32* %iur, align 4 + %isd = getelementptr inbounds i32, ptr %asd, i64 %indvars.iv + %iud = getelementptr inbounds i32, ptr %aud, i64 %indvars.iv + %isr = getelementptr inbounds i32, ptr %asr, i64 %indvars.iv + %iur = getelementptr inbounds i32, ptr %aur, i64 %indvars.iv + %lsd = load i32, ptr %isd, align 4 + %lud = load i32, ptr %iud, align 4 + %lsr = load i32, ptr %isr, align 4 + %lur = load i32, ptr %iur, align 4 %psd = add nsw i32 %lsd, 23 %pud = add nsw i32 %lud, 24 %psr = add nsw i32 %lsr, 25 @@ -379,75 +355,68 @@ if.end: ; preds = %if.then, %for.body %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ] %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ] %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ] - store i32 %ysd.0, i32* %isd, align 4 - store i32 %yud.0, i32* %iud, align 4 - store i32 %ysr.0, i32* %isr, align 4 - store i32 %yur.0, i32* %iur, align 4 + store i32 %ysd.0, ptr %isd, align 4 + store i32 %yud.0, ptr %iud, align 4 + store i32 %ysr.0, ptr %isr, align 4 + store i32 %yur.0, ptr %iur, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 128 br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) { +define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) { ; CHECK-LABEL: @test_scalar2scalar( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8* -; CHECK-NEXT: [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8* -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128 -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE4:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr 
inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4, !alias.scope !19, !noalias !22
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !22
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope !19, !noalias !22
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope !22
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
; CHECK: pred.sdiv.if:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = sdiv i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]]
; CHECK: pred.sdiv.continue:
-; CHECK-NEXT: [[TMP16:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]]
-; CHECK: pred.sdiv.if7:
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 1
-; CHECK-NEXT: [[TMP23:%.*]] = sdiv i32 [[TMP22]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP23]], i32 1
-; CHECK-NEXT: br label 
[[PRED_SDIV_CONTINUE8]]
-; CHECK: pred.sdiv.continue8:
-; CHECK-NEXT: [[TMP25:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP21]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ [[TMP17]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP24]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = xor <2 x i1> [[TMP8]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[TMP7]], <2 x i32> [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP29]], align 4, !alias.scope !19, !noalias !22
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]]
+; CHECK: pred.sdiv.if3:
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP21]], i32 1
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]]
+; CHECK: pred.sdiv.continue4:
+; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP22]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[TMP5]], <2 x i32> [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP26]], align 4, !alias.scope !19, !noalias !22
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -458,10 +427,10 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4
+; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; CHECK-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: 
[[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4 ; CHECK-NEXT: [[PSD:%.*]] = add nsw i32 [[LSD]], 23 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END]] @@ -471,68 +440,64 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) { ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4 +; CHECK-NEXT: store i32 [[YSD_0]], ptr [[ISD]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @test_scalar2scalar( ; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8* ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]] +; UNROLL-NO-VF-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]] +; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]] ; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NO-VF-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !19, !noalias !22 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !19, !noalias !22 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], 23 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP3]], 23 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP2]], 100 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp slt i32 [[TMP3]], 100 -; UNROLL-NO-VF-NEXT: br i1 [[TMP6]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE3:%.*]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]] +; 
UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], 23 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP5]], 23 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP4]], 100 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP5]], 100 +; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.sdiv.if: -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !alias.scope !22 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sdiv i32 [[TMP4]], [[TMP2]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP9]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope !22 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sdiv i32 [[TMP6]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sdiv i32 [[TMP11]], [[TMP12]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE]] ; UNROLL-NO-VF: pred.sdiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]] -; UNROLL-NO-VF: pred.sdiv.if6: -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !alias.scope !22 -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP5]], [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = sdiv i32 [[TMP15]], [[TMP16]] -; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE8]] -; UNROLL-NO-VF: pred.sdiv.continue7: -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = xor i1 [[TMP6]], true -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = xor i1 [[TMP7]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP20]], i32 [[TMP4]], i32 [[TMP13]] -; UNROLL-NO-VF-NEXT: [[PREDPHI9:%.*]] = select i1 [[TMP21]], i32 [[TMP5]], i32 [[TMP19]] -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !19, !noalias !22 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI9]], i32* [[TMP1]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP9]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]] +; UNROLL-NO-VF: pred.sdiv.if2: +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !alias.scope !22 +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP7]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]] +; 
UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE3]] +; UNROLL-NO-VF: pred.sdiv.continue3: +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP8]], true +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = xor i1 [[TMP9]], true +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP22]], i32 [[TMP6]], i32 [[TMP15]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[TMP23]], i32 [[TMP7]], i32 [[TMP21]] +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !19, !noalias !22 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI4]], ptr [[TMP3]], align 4, !alias.scope !19, !noalias !22 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NO-VF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NO-VF-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -543,10 +508,10 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) { ; UNROLL-NO-VF-NEXT: ret void ; UNROLL-NO-VF: for.body: ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ] -; UNROLL-NO-VF-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4 -; UNROLL-NO-VF-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4 +; UNROLL-NO-VF-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[LSD:%.*]] = load i32, ptr [[ISD]], align 4 +; UNROLL-NO-VF-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4 ; UNROLL-NO-VF-NEXT: [[PSD:%.*]] = add nsw i32 [[LSD]], 23 ; UNROLL-NO-VF-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100 ; UNROLL-NO-VF-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END]] @@ -556,7 +521,7 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) { ; UNROLL-NO-VF-NEXT: br label [[IF_END]] ; UNROLL-NO-VF: if.end: ; UNROLL-NO-VF-NEXT: [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ] -; UNROLL-NO-VF-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[YSD_0]], ptr [[ISD]], align 4 ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] @@ -570,10 +535,10 @@ for.cond.cleanup: ; preds = %if.end for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] - %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv - %lsd = load i32, i32* %isd, align 4 - %isd.b = getelementptr inbounds i32, 
i32* %bsd, i64 %indvars.iv
- %lsd.b = load i32, i32* %isd.b, align 4
+ %isd = getelementptr inbounds i32, ptr %asd, i64 %indvars.iv
+ %lsd = load i32, ptr %isd, align 4
+ %isd.b = getelementptr inbounds i32, ptr %bsd, i64 %indvars.iv
+ %lsd.b = load i32, ptr %isd.b, align 4
 %psd = add nsw i32 %lsd, 23
 %cmp1 = icmp slt i32 %lsd, 100
 br i1 %cmp1, label %if.then, label %if.end
@@ -585,77 +550,70 @@ if.then: ; preds = %for.body
if.end: ; preds = %if.then, %for.body
 %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
- store i32 %ysd.0, i32* %isd, align 4
+ store i32 %ysd.0, ptr %isd, align 4
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond = icmp eq i64 %indvars.iv.next, 128
 br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

-define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
+define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
; CHECK-LABEL: @pr30172(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT: [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8*
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4, !alias.scope !28, !noalias !31
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !31
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], <i32 200, i32 200>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP8]], <i1 true, i1 true>, !dbg [[DBG33:![0-9]+]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x i1> [[TMP9]], <2 x i1> zeroinitializer, !dbg [[DBG34:![0-9]+]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <2 x i1> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope !28, !noalias !31
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope !31
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], <i32 200, i32 200>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>, !dbg [[DBG33:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP7]], <2 x i1> zeroinitializer, !dbg [[DBG34:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i1> [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
; CHECK: pred.sdiv.if:
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP17]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i32 0
; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]]
; CHECK: pred.sdiv.continue:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP16]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]]
-; CHECK: pred.sdiv.if7:
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP25:%.*]] = sdiv i32 [[TMP23]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 1
-; CHECK-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP26]], [[TMP25]]
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP27]], i32 1
-; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE8]]
-; CHECK: pred.sdiv.continue8:
-; CHECK-NEXT: [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP28]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = xor <2 x i1> [[TMP9]], <i1 true, i1 true>, !dbg [[DBG34]]
-; CHECK-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP10]], <2 x i1> [[TMP31]], <2 x i1> zeroinitializer, !dbg [[DBG34]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP32]], <2 x i32> [[TMP7]], <2 x i32> [[TMP30]]
-; CHECK-NEXT: [[TMP33:%.*]] = 
getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP34]], align 4, !alias.scope !28, !noalias !31
+; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]]
+; CHECK: pred.sdiv.if3:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP23:%.*]] = sdiv i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; CHECK-NEXT: [[TMP25:%.*]] = sdiv i32 [[TMP24]], [[TMP23]]
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP25]], i32 1
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]]
+; CHECK: pred.sdiv.continue4:
+; CHECK-NEXT: [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP23]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>, !dbg [[DBG34]]
+; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer, !dbg [[DBG34]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP5]], <2 x i32> [[TMP28]]
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope !28, !noalias !31
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -666,10 +624,10 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4
+; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; CHECK-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4
; CHECK-NEXT: [[PSD:%.*]] = add nsw i32 [[LSD]], 23
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[LSD]], 200
@@ -681,78 +639,74 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
; CHECK-NEXT: br label 
[[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4 +; CHECK-NEXT: store i32 [[YSD_0]], ptr [[ISD]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @pr30172( ; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8* ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128 -; UNROLL-NO-VF-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]] +; UNROLL-NO-VF-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512 +; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]] +; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]] ; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NO-VF-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !28, !noalias !31 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !28, !noalias !31 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], 23 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP3]], 23 -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP2]], 100 -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp slt i32 [[TMP3]], 100 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp sge i32 [[TMP2]], 200 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp sge i32 [[TMP3]], 200 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = xor i1 [[TMP6]], true, !dbg [[DBG33:![0-9]+]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = xor i1 [[TMP7]], true, !dbg [[DBG33]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP10]], i1 [[TMP8]], i1 false, !dbg [[DBG34:![0-9]+]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP11]], i1 [[TMP9]], i1 false, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP6]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP7]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE3:%.*]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], 23 +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP5]], 23 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP4]], 100 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP5]], 100 +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp sge i32 [[TMP4]], 200 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp sge i32 [[TMP5]], 200 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = xor i1 [[TMP8]], true, !dbg [[DBG33:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = xor i1 [[TMP9]], true, !dbg [[DBG33]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = select i1 [[TMP12]], i1 [[TMP10]], i1 false, !dbg [[DBG34:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i1 [[TMP11]], i1 false, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP8]] +; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP9]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.sdiv.if: -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !alias.scope !31 -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = sdiv i32 [[TMP4]], [[TMP2]] -; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]] +; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !alias.scope !31 +; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = sdiv i32 [[TMP6]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]] ; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE]] ; UNROLL-NO-VF: pred.sdiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP18]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_SDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP15]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]] -; UNROLL-NO-VF: pred.sdiv.if6: -; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION6]] -; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !alias.scope !31 -; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = sdiv i32 [[TMP5]], [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = sdiv i32 [[TMP23]], [[TMP24]] -; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE8]] -; UNROLL-NO-VF: pred.sdiv.continue7: -; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP24]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ] -; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = xor i1 [[TMP8]], true, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = xor i1 [[TMP9]], true, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = select i1 [[TMP10]], i1 [[TMP28]], i1 false, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: 
[[TMP31:%.*]] = select i1 [[TMP11]], i1 [[TMP29]], i1 false, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP30]], i32 [[TMP4]], i32 [[TMP21]] -; UNROLL-NO-VF-NEXT: [[PREDPHI9:%.*]] = select i1 [[TMP31]], i32 [[TMP5]], i32 [[TMP27]] -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !28, !noalias !31 -; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI9]], i32* [[TMP1]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: [[TMP22:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: [[TMP23:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP21]], [[PRED_SDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]] +; UNROLL-NO-VF: pred.sdiv.if2: +; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4, !alias.scope !31 +; UNROLL-NO-VF-NEXT: [[TMP26:%.*]] = sdiv i32 [[TMP7]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP27:%.*]] = sdiv i32 [[TMP25]], [[TMP26]] +; UNROLL-NO-VF-NEXT: br label [[PRED_SDIV_CONTINUE3]] +; UNROLL-NO-VF: pred.sdiv.continue3: +; UNROLL-NO-VF-NEXT: [[TMP28:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP27]], [[PRED_SDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP30:%.*]] = xor i1 [[TMP10]], true, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP31:%.*]] = xor i1 [[TMP11]], true, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = select i1 [[TMP12]], i1 [[TMP30]], i1 false, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP33:%.*]] = select i1 [[TMP13]], i1 [[TMP31]], i1 false, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP32]], i32 [[TMP6]], i32 [[TMP23]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[TMP33]], i32 [[TMP7]], i32 [[TMP29]] +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !28, !noalias !31 +; UNROLL-NO-VF-NEXT: store i32 [[PREDPHI4]], ptr [[TMP3]], align 4, !alias.scope !28, !noalias !31 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NO-VF-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NO-VF-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -763,10 +717,10 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {; ; UNROLL-NO-VF-NEXT: ret void ; UNROLL-NO-VF: for.body: ; UNROLL-NO-VF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ] -; UNROLL-NO-VF-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4 -; UNROLL-NO-VF-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]] -; UNROLL-NO-VF-NEXT: [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4 +; UNROLL-NO-VF-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[LSD:%.*]] = load i32, ptr 
[[ISD]], align 4 +; UNROLL-NO-VF-NEXT: [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]] +; UNROLL-NO-VF-NEXT: [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4 ; UNROLL-NO-VF-NEXT: [[PSD:%.*]] = add nsw i32 [[LSD]], 23 ; UNROLL-NO-VF-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100 ; UNROLL-NO-VF-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[CHECKBB:%.*]], !dbg [[DBG33]] @@ -779,7 +733,7 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {; ; UNROLL-NO-VF-NEXT: br label [[IF_END]] ; UNROLL-NO-VF: if.end: ; UNROLL-NO-VF-NEXT: [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[CHECKBB]] ] -; UNROLL-NO-VF-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[YSD_0]], ptr [[ISD]], align 4 ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] @@ -792,10 +746,10 @@ for.cond.cleanup: ; preds = %if.end for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] - %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv - %lsd = load i32, i32* %isd, align 4 - %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv - %lsd.b = load i32, i32* %isd.b, align 4 + %isd = getelementptr inbounds i32, ptr %asd, i64 %indvars.iv + %lsd = load i32, ptr %isd, align 4 + %isd.b = getelementptr inbounds i32, ptr %bsd, i64 %indvars.iv + %lsd.b = load i32, ptr %isd.b, align 4 %psd = add nsw i32 %lsd, 23 %cmp1 = icmp slt i32 %lsd, 100 br i1 %cmp1, label %if.then, label %checkbb, !dbg !7 @@ -811,13 +765,13 @@ if.then: ; preds = %checkbb, %for.body if.end: ; preds = %if.then, %checkbb %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %checkbb ] - store i32 %ysd.0, i32* %isd, align 4 + store i32 %ysd.0, ptr %isd, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 128 br i1 %exitcond, label %for.cond.cleanup, label %for.body } -define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { +define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK-LABEL: @predicated_udiv_scalarized_operand( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) @@ -831,53 +785,52 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[PRED_UDIV_CONTINUE2]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = 
getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[X:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]]
; CHECK: pred.udiv.continue:
-; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
; CHECK: pred.udiv.if1:
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], [[X]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP15:%.*]] = udiv i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP15]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP11]], [[X]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP14]], i32 1
; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]]
; CHECK: pred.udiv.continue2:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP19]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT: 
[[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP18]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T6:%.*]], [[FOR_INC]] ] -; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] -; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T0]], align 4 +; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[T2:%.*]] = load i32, ptr [[T0]], align 4 ; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[T3:%.*]] = add nsw i32 [[T2]], [[X]] @@ -890,7 +843,7 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP38:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T7]] ; ; UNROLL-NO-VF-LABEL: @predicated_udiv_scalarized_operand( @@ -903,40 +856,40 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDUCTION]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION2]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE3:%.*]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UDIV_CONTINUE3]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UDIV_CONTINUE3]] ] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = 
add i64 [[INDEX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 ; UNROLL-NO-VF-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], [[X:%.*]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP2]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP4]], [[TMP6]] ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.udiv.if2: -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP3]], [[X]] -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP3]], [[TMP7]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE4]] +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], [[X]] +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP5]], [[TMP9]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE3]] ; UNROLL-NO-VF: pred.udiv.continue3: -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ] -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = xor i1 [[C]], true -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = xor i1 [[C]], true -; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP6]], i32 [[TMP2]] -; UNROLL-NO-VF-NEXT: [[PREDPHI5:%.*]] = select i1 [[C]], i32 [[TMP9]], i32 [[TMP3]] -; UNROLL-NO-VF-NEXT: [[TMP12]] = add i32 [[VEC_PHI]], [[PREDPHI]] -; UNROLL-NO-VF-NEXT: [[TMP13]] = add i32 [[VEC_PHI1]], [[PREDPHI5]] +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF2]] ] +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = xor i1 [[C]], true +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = xor i1 [[C]], true +; UNROLL-NO-VF-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP8]], i32 [[TMP4]] +; UNROLL-NO-VF-NEXT: [[PREDPHI4:%.*]] = select i1 [[C]], i32 [[TMP11]], i32 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP14]] = add i32 [[VEC_PHI]], [[PREDPHI]] +; UNROLL-NO-VF-NEXT: [[TMP15]] = add i32 [[VEC_PHI1]], [[PREDPHI4]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP14]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -946,8 +899,8 @@ 
define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { ; UNROLL-NO-VF: for.body: ; UNROLL-NO-VF-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] ; UNROLL-NO-VF-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T6:%.*]], [[FOR_INC]] ] -; UNROLL-NO-VF-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] -; UNROLL-NO-VF-NEXT: [[T2:%.*]] = load i32, i32* [[T0]], align 4 +; UNROLL-NO-VF-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; UNROLL-NO-VF-NEXT: [[T2:%.*]] = load i32, ptr [[T0]], align 4 ; UNROLL-NO-VF-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL-NO-VF: if.then: ; UNROLL-NO-VF-NEXT: [[T3:%.*]] = add nsw i32 [[T2]], [[X]] @@ -972,8 +925,8 @@ entry: for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] %r = phi i32 [ 0, %entry ], [ %t6, %for.inc ] - %t0 = getelementptr inbounds i32, i32* %a, i64 %i - %t2 = load i32, i32* %t0, align 4 + %t0 = getelementptr inbounds i32, ptr %a, i64 %i + %t2 = load i32, ptr %t0, align 4 br i1 %c, label %if.then, label %for.inc if.then: diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index 6cb0bc6623987..a742e28dea711 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -6,47 +6,47 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Test predication of stores. -define i32 @test(i32* nocapture %f) #0 { +define i32 @test(ptr nocapture %f) #0 { ; UNROLL-LABEL: @test( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] -; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]] -; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDUCTION1]] -; UNROLL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 -; UNROLL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; UNROLL-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 100 -; UNROLL-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 100 -; UNROLL-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[TMP0]] +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP1]] +; UNROLL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 +; UNROLL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 +; UNROLL-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 100 +; UNROLL-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 100 +; UNROLL-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP2]], 20 -; UNROLL-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4 +; UNROLL-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP4]], 20 +; UNROLL-NEXT: store i32 [[TMP8]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL: pred.store.continue: -; 
UNROLL-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; UNROLL-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; UNROLL: pred.store.if1: -; UNROLL-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP3]], 20 -; UNROLL-NEXT: store i32 [[TMP7]], i32* [[TMP1]], align 4 -; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] +; UNROLL-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], 20 +; UNROLL-NEXT: store i32 [[TMP9]], ptr [[TMP3]], align 4 +; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE2]] ; UNROLL: pred.store.continue2: ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 128, [[MIDDLE_BLOCK]] ] -; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] -; UNROLL-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; UNROLL-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 100 +; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] +; UNROLL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; UNROLL-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100 ; UNROLL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL: if.then: -; UNROLL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 20 -; UNROLL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +; UNROLL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], 20 +; UNROLL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 ; UNROLL-NEXT: br label [[FOR_INC]] ; UNROLL: for.inc: ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -61,30 +61,30 @@ define i32 @test(i32* nocapture %f) #0 { ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDUCTION1]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 100 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 100 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[TMP0]] 
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 100 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 100 +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP2]], 20 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP4]], 20 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP8]], ptr [[TMP2]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; UNROLL-NOSIMPLIFY: pred.store.if1: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP3]], 20 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP7]], i32* [[TMP1]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP5]], 20 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP9]], ptr [[TMP3]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE2]] ; UNROLL-NOSIMPLIFY: pred.store.continue2: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -93,13 +93,13 @@ define i32 @test(i32* nocapture %f) #0 { ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: for.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 100 +; UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: if.then: -; UNROLL-NOSIMPLIFY-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 20 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], 20 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: for.inc: ; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 
[[INDVARS_IV]], 1 @@ -114,45 +114,44 @@ define i32 @test(i32* nocapture %f) #0 { ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[TMP0]] -; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; VEC-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; VEC-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100> -; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 -; VEC-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[TMP0]] +; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; VEC-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100> +; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; VEC-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: pred.store.if: -; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; VEC-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP6]], 20 -; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP0]] -; VEC-NEXT: store i32 [[TMP7]], i32* [[TMP8]], align 4 +; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP5]], 20 +; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP0]] +; VEC-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 -; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; VEC-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if1: -; VEC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; VEC-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP11]], 20 -; VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP10]] -; VEC-NEXT: store i32 [[TMP12]], i32* [[TMP13]], align 4 +; VEC-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP10]], 20 +; VEC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP9]] +; VEC-NEXT: store i32 [[TMP11]], ptr [[TMP12]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; VEC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; VEC-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; VEC: for.body: ; VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]]
], [ 128, [[MIDDLE_BLOCK]] ] -; VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] -; VEC-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; VEC-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 100 +; VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] +; VEC-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; VEC-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP14]], 100 ; VEC-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; VEC: if.then: -; VEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 20 -; VEC-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +; VEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], 20 +; VEC-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 ; VEC-NEXT: br label [[FOR_INC]] ; VEC: for.inc: ; VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -168,14 +167,14 @@ entry: for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] - %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %f, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 %cmp1 = icmp sgt i32 %0, 100 br i1 %cmp1, label %if.then, label %for.inc if.then: %add = add nsw i32 %0, 20 - store i32 %add, i32* %arrayidx, align 4 + store i32 %add, ptr %arrayidx, align 4 br label %for.inc for.inc: @@ -192,7 +191,7 @@ for.end: ; vectorized loop body. ; PR18724 -define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { +define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; UNROLL-LABEL: @bug18724( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: [[TMP0:%.*]] = xor i1 [[COND:%.*]], true @@ -210,36 +209,36 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v. 
; UNROLL-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ] -; UNROLL-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ] +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE3]] ] +; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_STORE_CONTINUE3]] ] ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] -; UNROLL-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]] +; UNROLL-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 -; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]] -; UNROLL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -; UNROLL-NEXT: store i32 [[TMP6]], i32* [[TMP5]], align 4 -; UNROLL-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1 -; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]] -; UNROLL-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -; UNROLL-NEXT: store i32 [[TMP8]], i32* [[TMP7]], align 4 -; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE4]] +; UNROLL-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP5]] +; UNROLL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; UNROLL-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; UNROLL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1 +; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP8]] +; UNROLL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; UNROLL-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.continue3: -; UNROLL-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI]], 1 -; UNROLL-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI2]], 1 -; UNROLL-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP11]], i32 [[VEC_PHI]], i32 [[TMP9]] -; UNROLL-NEXT: [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI2]], i32 [[TMP10]] +; UNROLL-NEXT: [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1 +; UNROLL-NEXT: [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1 +; UNROLL-NEXT: [[TMP13:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NEXT: [[TMP14:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]] +; UNROLL-NEXT: [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]] +; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI4]], [[PREDPHI]] ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] -; UNROLL-NEXT: [[TMP14:%.*]] = xor i1 [[CMP_N]], true -; UNROLL-NEXT: call void @llvm.assume(i1 [[TMP14]]) +; UNROLL-NEXT: [[TMP16:%.*]] = xor i1 [[CMP_N]], true +; UNROLL-NEXT: call void @llvm.assume(i1 [[TMP16]]) ; UNROLL-NEXT: br label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[V_1]], [[ENTRY:%.*]] ] @@ -248,11 +247,11 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v. ; UNROLL: for.body14: ; UNROLL-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[INEWCHUNKS_120:%.*]] = phi i32 [ [[INEWCHUNKS_2:%.*]], [[FOR_INC23]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDVARS_IV3]] -; UNROLL-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4 +; UNROLL-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]] +; UNROLL-NEXT: [[TMP:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4 ; UNROLL-NEXT: br i1 [[COND_2]], label [[IF_THEN18:%.*]], label [[FOR_INC23]] ; UNROLL: if.then18: -; UNROLL-NEXT: store i32 [[TMP]], i32* [[ARRAYIDX16]], align 4 +; UNROLL-NEXT: store i32 [[TMP]], ptr [[ARRAYIDX16]], align 4 ; UNROLL-NEXT: [[INC21:%.*]] = add nsw i32 [[INEWCHUNKS_120]], 1 ; UNROLL-NEXT: br label [[FOR_INC23]] ; UNROLL: for.inc23: @@ -282,37 +281,37 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v. 
; UNROLL-NOSIMPLIFY-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE3]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_STORE_CONTINUE3]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP5]], i32* [[TMP4]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP4]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.if2: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP7]], i32* [[TMP6]], align 4 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP7]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = add i32 [[VEC_PHI]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI2]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true -; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP10]], i32 [[VEC_PHI]], i32 [[TMP8]] -; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI2]], i32 [[TMP9]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] 
= add i32 [[VEC_PHI1]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NOSIMPLIFY-NEXT: [[TMP13:%.*]] = xor i1 [[COND_2]], true +; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]] +; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]] ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: -; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]] +; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI4]], [[PREDPHI]] ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_INC26_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NOSIMPLIFY: scalar.ph: @@ -322,11 +321,11 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v. ; UNROLL-NOSIMPLIFY: for.body14: ; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_120:%.*]] = phi i32 [ [[INEWCHUNKS_2:%.*]], [[FOR_INC23]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDVARS_IV3]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[IF_THEN18:%.*]], label [[FOR_INC23]] ; UNROLL-NOSIMPLIFY: if.then18: -; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP]], i32* [[ARRAYIDX16]], align 4 +; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP]], ptr [[ARRAYIDX16]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: [[INC21:%.*]] = add nsw i32 [[INEWCHUNKS_120]], 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC23]] ; UNROLL-NOSIMPLIFY: for.inc23: @@ -366,51 +365,50 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v. 
; VEC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE2]] ] ; VEC-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] ; VEC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[TMP6]] -; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 -; VEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* -; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4 -; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP6]] +; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: pred.store.if: -; VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP6]] -; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; VEC-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 4 +; VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]] +; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; VEC-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if1: -; VEC-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VEC-NEXT: [[TMP15:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP14]] -; VEC-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; VEC-NEXT: store i32 [[TMP16]], i32* [[TMP15]], align 4 +; VEC-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1 +; VEC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP13]] +; VEC-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: -; VEC-NEXT: [[TMP17:%.*]] = add <2 x i32> [[VEC_PHI]], <i32 1, i32 1> -; VEC-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true> -; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP18]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP17]] +; VEC-NEXT: [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], <i32 1, i32 1> +; VEC-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true> +; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], +; VEC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]],
!llvm.loop [[LOOP4:![0-9]+]] ; VEC: middle.block: -; VEC-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) +; VEC-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] -; VEC-NEXT: [[TMP21:%.*]] = xor i1 [[CMP_N]], true -; VEC-NEXT: call void @llvm.assume(i1 [[TMP21]]) +; VEC-NEXT: [[TMP20:%.*]] = xor i1 [[CMP_N]], true +; VEC-NEXT: call void @llvm.assume(i1 [[TMP20]]) ; VEC-NEXT: br label [[SCALAR_PH]] ; VEC: scalar.ph: ; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[V_1]], [[ENTRY:%.*]] ] -; VEC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[V_2]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[V_2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; VEC-NEXT: br label [[FOR_BODY14:%.*]] ; VEC: for.body14: ; VEC-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; VEC-NEXT: [[INEWCHUNKS_120:%.*]] = phi i32 [ [[INEWCHUNKS_2:%.*]], [[FOR_INC23]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; VEC-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDVARS_IV3]] -; VEC-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4 +; VEC-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]] +; VEC-NEXT: [[TMP:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4 ; VEC-NEXT: br i1 [[COND_2]], label [[IF_THEN18:%.*]], label [[FOR_INC23]] ; VEC: if.then18: -; VEC-NEXT: store i32 [[TMP]], i32* [[ARRAYIDX16]], align 4 +; VEC-NEXT: store i32 [[TMP]], ptr [[ARRAYIDX16]], align 4 ; VEC-NEXT: [[INC21:%.*]] = add nsw i32 [[INEWCHUNKS_120]], 1 ; VEC-NEXT: br label [[FOR_INC23]] ; VEC: for.inc23: @@ -430,12 +428,12 @@ for.body9: for.body14: %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc23 ], [ %v.1, %for.body9 ] %iNewChunks.120 = phi i32 [ %iNewChunks.2, %for.inc23 ], [ %v.2, %for.body9 ] - %arrayidx16 = getelementptr inbounds [768 x i32], [768 x i32]* %ptr, i64 0, i64 %indvars.iv3 - %tmp = load i32, i32* %arrayidx16, align 4 + %arrayidx16 = getelementptr inbounds [768 x i32], ptr %ptr, i64 0, i64 %indvars.iv3 + %tmp = load i32, ptr %arrayidx16, align 4 br i1 %cond.2, label %if.then18, label %for.inc23 if.then18: - store i32 %tmp, i32* %arrayidx16, align 4 + store i32 %tmp, ptr %arrayidx16, align 4 %inc21 = add nsw i32 %iNewChunks.120, 1 br label %for.inc23 @@ -460,39 +458,39 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]] +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]] -; UNROLL-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; UNROLL-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; UNROLL-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1 -; 
UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION4]] -; UNROLL-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; UNROLL-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1 -; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE6]] +; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; UNROLL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 +; UNROLL-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; UNROLL-NEXT: store i8 [[TMP4]], ptr [[TMP1]], align 1 +; UNROLL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]] +; UNROLL-NEXT: [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1 +; UNROLL-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; UNROLL-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; UNROLL-NEXT: store i8 [[TMP9]], ptr [[TMP6]], align 1 +; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL: pred.store.continue3: ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ undef, [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ undef, [[MIDDLE_BLOCK]] ] -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; UNROLL-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; UNROLL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL: if.then: ; UNROLL-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; UNROLL-NEXT: store i8 [[TMP5]], i8* [[TMP2]], align 1 +; UNROLL-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 ; UNROLL-NEXT: br label [[FOR_INC]] ; UNROLL: for.inc: ; UNROLL-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 @@ -508,30 +506,30 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = trunc i32 
[[TMP2]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP4]], ptr [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.if2: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP9]], ptr [[TMP6]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -542,13 +540,13 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: for.body: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: if.then: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP5]], i8* [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 ; 
UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: for.inc: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 @@ -566,47 +564,46 @@ define void @minimal_bit_widths(i1 %c) { ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0 -; VEC-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>* -; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1 -; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; VEC-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 +; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: pred.store.if: -; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 -; VEC-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 -; VEC-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 -; VEC-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP9]], align 1 +; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32 +; VEC-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8 +; VEC-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; VEC-NEXT: store i8 [[TMP6]], ptr [[TMP7]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; VEC: pred.store.if2: -; VEC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 -; VEC-NEXT: [[TMP13:%.*]] = zext i8 [[TMP12]] to i32 -; VEC-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8 -; VEC-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* undef, i64 [[TMP11]] -; VEC-NEXT: store i8 [[TMP14]], i8* [[TMP15]], align 1 +; VEC-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32 +; VEC-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; VEC-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr undef, i64 [[TMP9]] +; VEC-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]] ; VEC: pred.store.continue3: ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; VEC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VEC-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef +; VEC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; VEC-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; VEC: for.body: ; VEC-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ undef, [[MIDDLE_BLOCK]] ] ; VEC-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ undef, [[MIDDLE_BLOCK]] ] -; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] -; VEC-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1 +; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]] +; VEC-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 ; VEC-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; VEC: if.then: ; VEC-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; VEC-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; VEC-NEXT: store i8 [[TMP5]], i8* [[TMP2]], align 1 +; VEC-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 ; VEC-NEXT: br label [[FOR_INC]] ; VEC: for.inc: ; VEC-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 @@ -622,14 +619,14 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] %tmp1 = phi i64 [ %tmp7, %for.inc ], [ undef, %entry ] - %tmp2 = getelementptr i8, i8* undef, i64 %tmp0 - %tmp3 = load i8, i8* %tmp2, align 1 + %tmp2 = getelementptr i8, ptr undef, i64 %tmp0 + %tmp3 = load i8, ptr %tmp2, align 1 br i1 %c, label %if.then, label %for.inc if.then: %tmp4 = zext i8 %tmp3 to i32 %tmp5 = trunc i32 %tmp4 to i8 - store i8 %tmp5, i8* %tmp2, align 1 + store i8 %tmp5, ptr %tmp2, align 1 br label %for.inc for.inc: @@ -642,21 +639,21 @@ for.end: ret void } -define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) { +define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; UNROLL-LABEL: @minimal_bit_widths_with_aliasing_store( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; UNROLL-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[TMP0]] -; UNROLL-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1 -; UNROLL-NEXT: store i8 0, i8* [[TMP2]], align 1 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] +; UNROLL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; UNROLL-NEXT: store i8 0, ptr [[TMP2]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL: if.then: ; UNROLL-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; UNROLL-NEXT: store i8 [[TMP5]], i8* [[TMP2]], align 1 +; UNROLL-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 ; UNROLL-NEXT: br label [[FOR_INC]] ; UNROLL: for.inc: ; UNROLL-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 @@ -672,32 +669,32 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) { ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[INDUCTION2]] -; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP1]], align 1 +; 
UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: store i8 0, ptr [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: store i8 0, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP4]], i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP6]], ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.if2: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i8, ptr [[TMP3]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP9]], ptr [[TMP3]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE3]] ; UNROLL-NOSIMPLIFY: pred.store.continue3: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, 0 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -708,14 +705,14 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) { ; UNROLL-NOSIMPLIFY: for.body: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP0]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = 
getelementptr i8, ptr [[PTR]], i64 [[TMP0]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: store i8 0, ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: if.then: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP5]], i8* [[TMP2]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]] ; UNROLL-NOSIMPLIFY: for.inc: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 @@ -731,14 +728,14 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) { ; VEC: for.body: ; VEC-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; VEC-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] -; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[TMP0]] -; VEC-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1 -; VEC-NEXT: store i8 0, i8* [[TMP2]], align 1 +; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] +; VEC-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; VEC-NEXT: store i8 0, ptr [[TMP2]], align 1 ; VEC-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; VEC: if.then: ; VEC-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; VEC-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; VEC-NEXT: store i8 [[TMP5]], i8* [[TMP2]], align 1 +; VEC-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 ; VEC-NEXT: br label [[FOR_INC]] ; VEC: for.inc: ; VEC-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 @@ -754,15 +751,15 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] - %tmp2 = getelementptr i8, i8* %ptr, i64 %tmp0 - %tmp3 = load i8, i8* %tmp2, align 1 - store i8 0, i8* %tmp2 + %tmp2 = getelementptr i8, ptr %ptr, i64 %tmp0 + %tmp3 = load i8, ptr %tmp2, align 1 + store i8 0, ptr %tmp2 br i1 %c, label %if.then, label %for.inc if.then: %tmp4 = zext i8 %tmp3 to i32 %tmp5 = trunc i32 %tmp4 to i8 - store i8 %tmp5, i8* %tmp2, align 1 + store i8 %tmp5, ptr %tmp2, align 1 br label %for.inc for.inc: diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 6b7f09a37a480..c496642e499e6 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -9,7 +9,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Make sure that we can handle multiple integer induction variables. 
; -define void @multi_int_induction(i32* %A, i32 %N) { +define void @multi_int_induction(ptr %A, i32 %N) { ; CHECK-LABEL: @multi_int_induction( ; CHECK-NEXT: for.body.lr.ph: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 @@ -20,21 +20,20 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END:%.*]] = add i32 190, [[CAST_VTC]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = add i32 190, [[DOTCAST]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2> -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -45,8 +44,8 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 @@ -64,19 +63,18 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; IND-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; IND-NEXT: [[IND_END:%.*]] = add i32 [[CAST_VTC]], 190 +; IND-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; IND-NEXT: [[IND_END:%.*]] = add i32 [[DOTCAST]], 190 ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0,
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; IND-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* -; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4 +; IND-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2> -; IND-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IND-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -87,8 +85,8 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; IND: for.body: ; IND-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; IND-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; IND-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; IND-NEXT: store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4 +; IND-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IND-NEXT: store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4 ; IND-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; IND-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; IND-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 @@ -106,23 +104,21 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4 -; UNROLL-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; UNROLL-NEXT: [[IND_END:%.*]] = add i32 [[CAST_VTC]], 190 +; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; UNROLL-NEXT: [[IND_END:%.*]] = add i32 [[DOTCAST]], 190 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2> -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4 -; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 2 -; UNROLL-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP6]], align 4 +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 +; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4 ; UNROLL-NEXT:
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -133,8 +129,8 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; UNROLL: for.body: ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; UNROLL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; UNROLL-NEXT: store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4 +; UNROLL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; UNROLL-NEXT: store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4 ; UNROLL-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; UNROLL-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 @@ -153,8 +149,8 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; UNROLL-NO-IC-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i32 190, [[CAST_VTC]] +; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i32 190, [[DOTCAST]] ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -162,18 +158,16 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]] -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP8]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP10]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr 
[[TMP8]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -184,8 +178,8 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; UNROLL-NO-IC-NEXT: store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4 +; UNROLL-NO-IC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; UNROLL-NO-IC-NEXT: store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4 ; UNROLL-NO-IC-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; UNROLL-NO-IC-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; UNROLL-NO-IC-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 @@ -203,23 +197,21 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -8 -; INTERLEAVE-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i32 [[CAST_VTC]], 190 +; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i32 [[DOTCAST]], 190 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP4]], align 4 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 4 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP6]], align 4 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -230,8 +222,8 @@ define void @multi_int_induction(i32* %A, i32 %N) { ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; INTERLEAVE-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; INTERLEAVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; INTERLEAVE-NEXT: store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4 +; INTERLEAVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; INTERLEAVE-NEXT: store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4 ; INTERLEAVE-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; INTERLEAVE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 @@ -246,8 +238,8 @@ for.body.lr.ph: for.body: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] %count.09 = phi i32 [ 190, %for.body.lr.ph ], [ %inc, %for.body ] - %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - store i32 %count.09, i32* %arrayidx2, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %count.09, ptr %arrayidx2, align 4 %inc = add nsw i32 %count.09, 1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 @@ -266,24 +258,23 @@ for.end: ; Vectorized induction variable. 
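;
; A rough C equivalent of the loop below (illustrative sketch only, derived
; from the scalar IR at the end of the test):
;
;   for (i = 0; i < n; i++)
;     a[i + offset] += b * a[i + offset2];
;
; The two accesses into %a may overlap at runtime, so every prefix expects
; the overlap checks in vector.memcheck before entering the vector loop.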
-define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) { +define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; CHECK-LABEL: @scalar_use( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]] -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]] -; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP67]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP45]], [[SCEVGEP23]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -294,21 +285,18 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[OFFSET]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4, !alias.scope !4, !noalias !7 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP2]], [[OFFSET2]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4, !alias.scope !7 -; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD8]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP5]] to <2 x float>* 
-; CHECK-NEXT: store <2 x float> [[TMP12]], <2 x float>* [[TMP13]], align 4, !alias.scope !4, !noalias !7 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], [[OFFSET]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP8]], align 4, !alias.scope !4, !noalias !7 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP5]], [[OFFSET2]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope !7 +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP12]] +; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP8]], align 4, !alias.scope !4, !noalias !7 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -321,14 +309,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]] -; CHECK-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]] -; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4 +; CHECK-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]] +; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4 ; CHECK-NEXT: [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]] -; CHECK-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]] -; CHECK-NEXT: [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4 +; CHECK-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]] +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 ; CHECK-NEXT: [[M:%.*]] = fmul fast float [[B]], [[L2]] ; CHECK-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] -; CHECK-NEXT: store float [[AD]], float* [[ARR_IDX]], align 4 +; CHECK-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -340,14 +328,17 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; IND: vector.memcheck: -; IND-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]] -; IND-NEXT: [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]] -; IND-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; IND-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]] -; IND-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]] -; IND-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] -; IND-NEXT: [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]] -; 
IND-NEXT: [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]] +; IND-NEXT: [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2 +; IND-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]] +; IND-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2 +; IND-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; IND-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; IND-NEXT: [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2 +; IND-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; IND-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]] +; IND-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; IND-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; IND-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] ; IND-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; IND-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: @@ -357,18 +348,15 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], [[OFFSET]] -; IND-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; IND-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4, !alias.scope !4, !noalias !7 -; IND-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET2]] -; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] -; IND-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* -; IND-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !alias.scope !7 -; IND-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD8]] -; IND-NEXT: [[TMP9:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP8]] -; IND-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; IND-NEXT: store <2 x float> [[TMP9]], <2 x float>* [[TMP10]], align 4, !alias.scope !4, !noalias !7 +; IND-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]] +; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 +; IND-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[OFFSET2]] +; IND-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; IND-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP8]], align 4, !alias.scope !7 +; IND-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]] +; IND-NEXT: [[TMP10:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP9]] +; IND-NEXT: store <2 x float> [[TMP10]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -381,14 +369,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; IND: for.body: ; IND-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IND-NEXT: [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]] -; IND-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]] -; IND-NEXT: [[L1:%.*]] = load float, float* 
[[ARR_IDX]], align 4 +; IND-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]] +; IND-NEXT: [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4 ; IND-NEXT: [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]] -; IND-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]] -; IND-NEXT: [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4 +; IND-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]] +; IND-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 ; IND-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] ; IND-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] -; IND-NEXT: store float [[AD]], float* [[ARR_IDX]], align 4 +; IND-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; IND-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IND-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; IND-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -400,50 +388,47 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL: vector.memcheck: -; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]] -; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]] -; UNROLL-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; UNROLL-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]] -; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]] -; UNROLL-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] -; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]] -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]] +; UNROLL-NEXT: [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2 +; UNROLL-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]] +; UNROLL-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2 +; UNROLL-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; UNROLL-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; UNROLL-NEXT: [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2 +; UNROLL-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; UNROLL-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]] +; UNROLL-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] ; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0 -; UNROLL-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT11]], <2 x float> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0 +; UNROLL-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT7]], <2 x 
float> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], [[OFFSET]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; UNROLL-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4, !alias.scope !4, !noalias !7 -; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP3]], i64 2 -; UNROLL-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <2 x float>* -; UNROLL-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4, !alias.scope !4, !noalias !7 -; UNROLL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[OFFSET2]] -; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; UNROLL-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>* -; UNROLL-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !alias.scope !7 -; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 2 -; UNROLL-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <2 x float>* -; UNROLL-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4, !alias.scope !7 -; UNROLL-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]] -; UNROLL-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT12]], [[WIDE_LOAD10]] -; UNROLL-NEXT: [[TMP14:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP12]] -; UNROLL-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[WIDE_LOAD8]], [[TMP13]] -; UNROLL-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; UNROLL-NEXT: store <2 x float> [[TMP14]], <2 x float>* [[TMP16]], align 4, !alias.scope !4, !noalias !7 -; UNROLL-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP5]] to <2 x float>* -; UNROLL-NEXT: store <2 x float> [[TMP15]], <2 x float>* [[TMP17]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]] +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[OFFSET2]] +; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; UNROLL-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope !7 +; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope !7 +; UNROLL-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] +; UNROLL-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]] +; UNROLL-NEXT: [[TMP13:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP11]] +; UNROLL-NEXT: [[TMP14:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP12]] +; UNROLL-NEXT: store <2 x float> [[TMP13]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NEXT: store <2 x float> [[TMP14]], ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; 
UNROLL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -453,14 +438,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; UNROLL: for.body: ; UNROLL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NEXT: [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]] -; UNROLL-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]] -; UNROLL-NEXT: [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4 +; UNROLL-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]] +; UNROLL-NEXT: [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4 ; UNROLL-NEXT: [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]] -; UNROLL-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]] -; UNROLL-NEXT: [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4 +; UNROLL-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]] +; UNROLL-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 ; UNROLL-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] ; UNROLL-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] -; UNROLL-NEXT: store float [[AD]], float* [[ARR_IDX]], align 4 +; UNROLL-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; UNROLL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; UNROLL-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -472,18 +457,17 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-IC: vector.memcheck: -; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8* -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8* -; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP67]] -; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP45]], [[SCEVGEP23]] +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2 +; UNROLL-NO-IC-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]] +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; UNROLL-NO-IC-NEXT: 
[[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2 +; UNROLL-NO-IC-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]] +; UNROLL-NO-IC-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] ; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NO-IC-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: @@ -491,44 +475,38 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT11]], <2 x float> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0 +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT7]], <2 x float> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], [[OFFSET]] -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], [[OFFSET]] -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !alias.scope !4, !noalias !7 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <2 x float>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4, !alias.scope !4, !noalias !7 -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i64 [[TMP2]], [[OFFSET2]] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i64 [[TMP3]], [[OFFSET2]] -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <2 x float>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP17]], align 4, !alias.scope !7 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 2 -; 
UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, <2 x float>* [[TMP19]], align 4, !alias.scope !7 -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]] -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT12]], [[WIDE_LOAD10]] -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP20]] -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = fadd fast <2 x float> [[WIDE_LOAD8]], [[TMP21]] -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP8]] to <2 x float>* -; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP22]], <2 x float>* [[TMP24]], align 4, !alias.scope !4, !noalias !7 -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP10]] to <2 x float>* -; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP23]], <2 x float>* [[TMP25]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], [[OFFSET]] +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], [[OFFSET]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP12]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[OFFSET2]] +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], [[OFFSET2]] +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope !7 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope !7 +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]] +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP19]] +; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP20]] +; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP21]], ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7 +; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP22]], ptr [[TMP12]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -538,14 +516,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]] -; UNROLL-NO-IC-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]] -; UNROLL-NO-IC-NEXT: [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4 +; UNROLL-NO-IC-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]] +; UNROLL-NO-IC-NEXT: [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4 ; UNROLL-NO-IC-NEXT: [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]] -; UNROLL-NO-IC-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]] -; UNROLL-NO-IC-NEXT: [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4 +; UNROLL-NO-IC-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]] +; UNROLL-NO-IC-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 ; UNROLL-NO-IC-NEXT: [[M:%.*]] = fmul fast float [[B]], [[L2]] ; UNROLL-NO-IC-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] -; UNROLL-NO-IC-NEXT: store float [[AD]], float* [[ARR_IDX]], align 4 +; UNROLL-NO-IC-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -557,50 +535,47 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; INTERLEAVE: vector.memcheck: -; INTERLEAVE-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]] -; INTERLEAVE-NEXT: [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]] -; INTERLEAVE-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; INTERLEAVE-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]] -; INTERLEAVE-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] -; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]] -; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]] +; INTERLEAVE-NEXT: [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2 +; INTERLEAVE-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]] +; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; INTERLEAVE-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2 +; INTERLEAVE-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]] +; INTERLEAVE-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] ; INTERLEAVE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; INTERLEAVE-NEXT: br i1 [[FOUND_CONFLICT]], 
label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0 -; INTERLEAVE-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT11]], <4 x float> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT7]], <4 x float> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], [[OFFSET]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !4, !noalias !7 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP3]], i64 4 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* -; INTERLEAVE-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !4, !noalias !7 -; INTERLEAVE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[OFFSET2]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* -; INTERLEAVE-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !7 -; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 4 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>* -; INTERLEAVE-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4, !alias.scope !7 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]] -; INTERLEAVE-NEXT: [[TMP13:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT12]], [[WIDE_LOAD10]] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP12]] -; INTERLEAVE-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[TMP13]] -; INTERLEAVE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP3]] to <4 x float>* -; INTERLEAVE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !alias.scope !4, !noalias !7 -; INTERLEAVE-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP5]] to <4 x float>* -; INTERLEAVE-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP17]], align 4, !alias.scope !4, !noalias !7 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add i64 
[[INDEX]], [[OFFSET2]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; INTERLEAVE-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !7 +; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !alias.scope !7 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]] +; INTERLEAVE-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP11]] +; INTERLEAVE-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[TMP12]] +; INTERLEAVE-NEXT: store <4 x float> [[TMP13]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 +; INTERLEAVE-NEXT: store <4 x float> [[TMP14]], ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -610,14 +585,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; INTERLEAVE-NEXT: [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]] -; INTERLEAVE-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]] -; INTERLEAVE-NEXT: [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4 +; INTERLEAVE-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]] +; INTERLEAVE-NEXT: [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4 ; INTERLEAVE-NEXT: [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]] -; INTERLEAVE-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]] -; INTERLEAVE-NEXT: [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4 +; INTERLEAVE-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]] +; INTERLEAVE-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 ; INTERLEAVE-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] ; INTERLEAVE-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] -; INTERLEAVE-NEXT: store float [[AD]], float* [[ARR_IDX]], align 4 +; INTERLEAVE-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -630,14 +605,14 @@ entry: for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] %ind.sum = add i64 %iv, %offset - %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum - %l1 = load float, float* %arr.idx, align 4 + %arr.idx = getelementptr inbounds float, ptr %a, i64 %ind.sum + %l1 = load float, ptr %arr.idx, align 4 %ind.sum2 = add i64 %iv, %offset2 - %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2 - 
%l2 = load float, float* %arr.idx2, align 4 + %arr.idx2 = getelementptr inbounds float, ptr %a, i64 %ind.sum2 + %l2 = load float, ptr %arr.idx2, align 4 %m = fmul fast float %b, %l2 %ad = fadd fast float %l1, %m - store float %ad, float* %arr.idx, align 4 + store float %ad, ptr %arr.idx, align 4 %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv.next, %n br i1 %exitcond, label %loopexit, label %for.body @@ -656,7 +631,7 @@ loopexit: ; ; -define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { +define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; CHECK-LABEL: @scalarize_induction_variable_01( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) @@ -668,36 +643,35 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP3]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8 -; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[SUM]] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP8:%.*]], [[FOR_BODY]] ], [ 
[[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[SUM]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[TMP10]] +; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[TMP9]] ; ; IND-LABEL: @scalarize_induction_variable_01( ; IND-NEXT: entry: @@ -709,34 +683,33 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]] -; IND-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* -; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; IND-NEXT: [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; IND-NEXT: [[TMP1]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; IND-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; IND-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]]) +; IND-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP1]]) ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; IND-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; IND-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]] -; IND-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8 -; IND-NEXT: [[TMP7]] = add i64 [[TMP6]], [[SUM]] +; IND-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; IND-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]] +; IND-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8 +; IND-NEXT: [[TMP6]] = add i64 [[TMP5]], [[SUM]] ; 
IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; IND-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; IND-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; IND: for.end: -; IND-NEXT: [[TMP8:%.*]] = phi i64 [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] -; IND-NEXT: ret i64 [[TMP8]] +; IND-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP6]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: ret i64 [[TMP7]] ; ; UNROLL-LABEL: @scalarize_induction_variable_01( ; UNROLL-NEXT: entry: @@ -748,40 +721,38 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; UNROLL-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <2 x i64>* -; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 -; UNROLL-NEXT: [[TMP4]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; UNROLL-NEXT: [[TMP5]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] +; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 +; UNROLL-NEXT: [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; UNROLL-NEXT: [[TMP3]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; UNROLL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; UNROLL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP5]], [[TMP4]] -; UNROLL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP3]], [[TMP2]] +; UNROLL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; 
UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP10:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]] -; UNROLL-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8 -; UNROLL-NEXT: [[TMP10]] = add i64 [[TMP9]], [[SUM]] +; UNROLL-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]] +; UNROLL-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8 +; UNROLL-NEXT: [[TMP8]] = add i64 [[TMP7]], [[SUM]] ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; UNROLL: for.end: -; UNROLL-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] -; UNROLL-NEXT: ret i64 [[TMP11]] +; UNROLL-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: ret i64 [[TMP9]] ; ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01( ; UNROLL-NO-IC-NEXT: entry: @@ -794,44 +765,42 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]] -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP8]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP9]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x 
i64>, ptr [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP6]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; UNROLL-NO-IC-NEXT: [[TMP7]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP7]], [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP14:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP14]] = add i64 [[TMP13]], [[SUM]] +; UNROLL-NO-IC-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP12:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]] +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP12]] = add i64 [[TMP11]], [[SUM]] ; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; UNROLL-NO-IC: for.end: -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi i64 [ [[TMP14]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] -; UNROLL-NO-IC-NEXT: ret i64 [[TMP15]] +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = phi i64 [ [[TMP12]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: ret i64 [[TMP13]] ; ; INTERLEAVE-LABEL: @scalarize_induction_variable_01( ; INTERLEAVE-NEXT: entry: @@ -843,40 +812,38 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, 
i64* [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP1]], align 8 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 4 -; INTERLEAVE-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <4 x i64>* -; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8 -; INTERLEAVE-NEXT: [[TMP4]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; INTERLEAVE-NEXT: [[TMP5]] = add <4 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] +; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 +; INTERLEAVE-NEXT: [[TMP2]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; INTERLEAVE-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP5]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP3]], [[TMP2]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]]) ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP10:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8 -; INTERLEAVE-NEXT: [[TMP10]] = add i64 [[TMP9]], [[SUM]] +; INTERLEAVE-NEXT: [[SUM:%.*]] = phi i64 [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8 +; INTERLEAVE-NEXT: [[TMP8]] = add i64 [[TMP7]], [[SUM]] ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], 
label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; INTERLEAVE: for.end: -; INTERLEAVE-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] -; INTERLEAVE-NEXT: ret i64 [[TMP11]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret i64 [[TMP9]] ; entry: br label %for.body @@ -884,8 +851,8 @@ entry: for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] %sum = phi i64 [ %2, %for.body ], [ 0, %entry ] - %0 = getelementptr inbounds i64, i64* %a, i64 %i - %1 = load i64, i64* %0, align 8 + %0 = getelementptr inbounds i64, ptr %a, i64 %i + %1 = load i64, ptr %0, align 8 %2 = add i64 %1, %sum %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n @@ -907,7 +874,7 @@ for.end: ; ; -define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { +define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: @scalarize_induction_variable_02( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 8) @@ -927,16 +894,16 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1 ; CHECK-NEXT: [[TMP17:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00> @@ -956,10 +923,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]] -; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = 
getelementptr inbounds float, float* [[B]], i64 [[I]] -; CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = fadd fast float [[S]], 1.000000e+00 ; CHECK-NEXT: [[TMP27:%.*]] = fadd fast float [[TMP26]], [[TMP23]] ; CHECK-NEXT: [[TMP28]] = fadd fast float [[TMP27]], [[TMP25]] @@ -987,16 +954,16 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; IND-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 8 -; IND-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]] -; IND-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] -; IND-NEXT: [[TMP6:%.*]] = load float, float* [[TMP4]], align 4 -; IND-NEXT: [[TMP7:%.*]] = load float, float* [[TMP5]], align 4 +; IND-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; IND-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; IND-NEXT: [[TMP6:%.*]] = load float, ptr [[TMP4]], align 4 +; IND-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP5]], align 4 ; IND-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 ; IND-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i64 1 -; IND-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]] -; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] -; IND-NEXT: [[TMP12:%.*]] = load float, float* [[TMP10]], align 4 -; IND-NEXT: [[TMP13:%.*]] = load float, float* [[TMP11]], align 4 +; IND-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] +; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; IND-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP10]], align 4 +; IND-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4 ; IND-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 ; IND-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 ; IND-NEXT: [[TMP16:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00> @@ -1016,10 +983,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] ; IND-NEXT: [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ] -; IND-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]] -; IND-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4 -; IND-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]] -; IND-NEXT: [[TMP24:%.*]] = load float, float* [[TMP23]], align 4 +; IND-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; IND-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4 +; IND-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]] +; IND-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4 ; IND-NEXT: [[TMP25:%.*]] = fadd fast float [[S]], 1.000000e+00 ;
IND-NEXT: [[TMP26:%.*]] = fadd fast float [[TMP25]], [[TMP22]] ; IND-NEXT: [[TMP27]] = fadd fast float [[TMP26]], [[TMP24]] @@ -1050,28 +1017,28 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 8 ; UNROLL-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 16 ; UNROLL-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 24 -; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]] -; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] -; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] -; UNROLL-NEXT: [[TMP10:%.*]] = load float, float* [[TMP6]], align 4 -; UNROLL-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; UNROLL-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; UNROLL-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 ; UNROLL-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 ; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1 -; UNROLL-NEXT: [[TMP14:%.*]] = load float, float* [[TMP8]], align 4 -; UNROLL-NEXT: [[TMP15:%.*]] = load float, float* [[TMP9]], align 4 +; UNROLL-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP8]], align 4 +; UNROLL-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4 ; UNROLL-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0 ; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP15]], i64 1 -; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]] -; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] -; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] -; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]] -; UNROLL-NEXT: [[TMP22:%.*]] = load float, float* [[TMP18]], align 4 -; UNROLL-NEXT: [[TMP23:%.*]] = load float, float* [[TMP19]], align 4 +; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] +; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] +; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; UNROLL-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP18]], align 4 +; UNROLL-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4 ; UNROLL-NEXT: [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0 ; UNROLL-NEXT: [[TMP25:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP23]], i64 1 -; UNROLL-NEXT: [[TMP26:%.*]] = load float, float* [[TMP20]], align 4 -; UNROLL-NEXT: [[TMP27:%.*]] = load float, float* [[TMP21]], align 4 +; UNROLL-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP20]], align 4 +; UNROLL-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4 ; UNROLL-NEXT: [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0 ; UNROLL-NEXT: 
[[TMP29:%.*]] = insertelement <2 x float> [[TMP28]], float [[TMP27]], i64 1 ; UNROLL-NEXT: [[TMP30:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00> @@ -1095,10 +1062,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NEXT: [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP44:%.*]], [[FOR_BODY]] ] -; UNROLL-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]] -; UNROLL-NEXT: [[TMP39:%.*]] = load float, float* [[TMP38]], align 4 -; UNROLL-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]] -; UNROLL-NEXT: [[TMP41:%.*]] = load float, float* [[TMP40]], align 4 +; UNROLL-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; UNROLL-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4 +; UNROLL-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]] +; UNROLL-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4 ; UNROLL-NEXT: [[TMP42:%.*]] = fadd fast float [[S]], 1.000000e+00 ; UNROLL-NEXT: [[TMP43:%.*]] = fadd fast float [[TMP42]], [[TMP39]] ; UNROLL-NEXT: [[TMP44]] = fadd fast float [[TMP43]], [[TMP41]] @@ -1131,28 +1098,28 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 24 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP3]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = load float, float* [[TMP7]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = load float, float* [[TMP8]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP3]] +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP12]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = load float, float* [[TMP9]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = load float, float* [[TMP10]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]] -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds 
float, float* [[B]], i64 [[TMP5]] -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]] -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load float, float* [[TMP19]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load float, float* [[TMP20]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP3]] +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP20]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load float, float* [[TMP21]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load float, float* [[TMP22]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP22]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00> @@ -1176,10 +1143,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP45:%.*]], [[FOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]] -; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]] -; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP39]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]] +; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP41]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = fadd fast float [[S]], 1.000000e+00 ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = fadd fast float [[TMP43]], [[TMP40]] ; UNROLL-NO-IC-NEXT: [[TMP45]] = fadd fast float [[TMP44]], [[TMP42]] @@ -1207,58 +1174,54 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]],
[[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 32 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP6]] to <32 x float>* -; INTERLEAVE-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP7]] to <32 x float>* -; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, <32 x float>* [[TMP8]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <32 x float>, <32 x float>* [[TMP9]], align 4 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 +; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x float> [[WIDE_VEC]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> ; INTERLEAVE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x float> [[WIDE_VEC2]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> -; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]] -; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]] -; INTERLEAVE-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP10]] to <32 x float>* -; INTERLEAVE-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP11]] to <32 x float>* -; INTERLEAVE-NEXT: [[WIDE_VEC4:%.*]] = load <32 x float>, <32 x float>* [[TMP12]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, <32 x float>* [[TMP13]], align 4 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; INTERLEAVE-NEXT: [[WIDE_VEC4:%.*]] = load <32 x float>, ptr [[TMP8]], align 4 +; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <32 x float> [[WIDE_VEC4]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> ; INTERLEAVE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <32 x float> [[WIDE_VEC5]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24> -; INTERLEAVE-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> -; INTERLEAVE-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> -; INTERLEAVE-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[TMP14]], [[STRIDED_VEC]] -; INTERLEAVE-NEXT: [[TMP17:%.*]] = fadd fast <4 x float> [[TMP15]], [[STRIDED_VEC3]] -; INTERLEAVE-NEXT: [[TMP18]] = fadd fast <4 x float> [[TMP16]], [[STRIDED_VEC6]] -; INTERLEAVE-NEXT: [[TMP19]] = fadd fast <4 x float> [[TMP17]], [[STRIDED_VEC7]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; INTERLEAVE-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; INTERLEAVE-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[STRIDED_VEC]] +; INTERLEAVE-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[STRIDED_VEC3]] +; INTERLEAVE-NEXT: [[TMP14]] = fadd fast <4 x float> [[TMP12]], [[STRIDED_VEC6]] +; INTERLEAVE-NEXT: [[TMP15]] = fadd fast <4 x float> [[TMP13]], [[STRIDED_VEC7]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP13:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[TMP18]] -; INTERLEAVE-NEXT: [[TMP21:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]] +; INTERLEAVE-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) ; INTERLEAVE-NEXT: br label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] -; INTERLEAVE-NEXT: [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]] -; INTERLEAVE-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]] -; INTERLEAVE-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = fadd fast float [[S]], 1.000000e+00 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = fadd fast float [[TMP26]], [[TMP23]] -; INTERLEAVE-NEXT: [[TMP28]] = fadd fast float [[TMP27]], [[TMP25]] +; INTERLEAVE-NEXT: [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP24:%.*]], [[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]] +; INTERLEAVE-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]] +; INTERLEAVE-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = fadd fast float [[S]], 1.000000e+00 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = fadd fast float [[TMP22]], [[TMP19]] +; INTERLEAVE-NEXT: [[TMP24]] = fadd fast float [[TMP23]], [[TMP21]] ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 8 ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; INTERLEAVE: for.end: -; INTERLEAVE-NEXT: ret float [[TMP28]] +; INTERLEAVE-NEXT: ret float [[TMP24]] ; entry: br label %for.body @@ -1266,10 +1229,10 @@ entry: for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %s = phi float [ 0.0, %entry ], [ %6, %for.body ] - %0 = getelementptr inbounds float, float* %a, i64 %i - %1 = load float, float* %0, align 4 - %2 = getelementptr inbounds float, float* %b, i64 %i - %3 = load float, float* %2, align 4 + %0 = getelementptr inbounds float, ptr %a, i64 %i + %1 = load float, ptr %0, align 4 + %2 = getelementptr inbounds float, ptr %b, i64 %i + %3 = load float, ptr %2, align 4 %4 = fadd fast float %s, 1.0 %5 = fadd fast float %4, %1 %6 = fadd fast float %5, %3 @@ -1290,7 +1253,7 @@ for.end: ; %pair.i32 = type { i32, i32 } -define 
void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { +define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; CHECK-LABEL: @scalarize_induction_variable_03( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) @@ -1306,17 +1269,17 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: store i32 [[TMP9]], i32* [[TMP2]], align 8 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 -; CHECK-NEXT: store i32 [[TMP10]], i32* [[TMP3]], align 8 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] @@ -1328,10 +1291,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[F]], align 8 +; CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[F]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = xor i32 [[TMP12]], [[Y]] -; CHECK-NEXT: store i32 [[TMP13]], i32* [[F]], align 8 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[F]], align 8 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1351,17 +1314,17 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 1 -; IND-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP0]], i32 1 -; 
IND-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 8 -; IND-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 8 +; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; IND-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP0]], i32 1 +; IND-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 8 +; IND-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8 ; IND-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 ; IND-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 ; IND-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[TMP6]], [[BROADCAST_SPLAT]] ; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0 -; IND-NEXT: store i32 [[TMP8]], i32* [[TMP1]], align 8 +; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1 -; IND-NEXT: store i32 [[TMP9]], i32* [[TMP2]], align 8 +; IND-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] @@ -1373,10 +1336,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; IND-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; IND-NEXT: [[TMP11:%.*]] = load i32, i32* [[F]], align 8 +; IND-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; IND-NEXT: [[TMP11:%.*]] = load i32, ptr [[F]], align 8 ; IND-NEXT: [[TMP12:%.*]] = xor i32 [[TMP11]], [[Y]] -; IND-NEXT: store i32 [[TMP12]], i32* [[F]], align 8 +; IND-NEXT: store i32 [[TMP12]], ptr [[F]], align 8 ; IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; IND-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; IND-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1400,28 +1363,28 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; UNROLL-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 1 -; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP0]], i32 1 -; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP1]], i32 1 -; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 -; UNROLL-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 8 -; UNROLL-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 8 +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP0]], i32 1 +; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP1]], i32 1 +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1 +; UNROLL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 8 +; UNROLL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8 ; UNROLL-NEXT: 
[[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 ; UNROLL-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i64 1 -; UNROLL-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP5]], align 8 -; UNROLL-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP6]], align 8 +; UNROLL-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP5]], align 8 +; UNROLL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8 ; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 ; UNROLL-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 ; UNROLL-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT2]] ; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0 -; UNROLL-NEXT: store i32 [[TMP17]], i32* [[TMP3]], align 8 +; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1 -; UNROLL-NEXT: store i32 [[TMP18]], i32* [[TMP4]], align 8 +; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i64 0 -; UNROLL-NEXT: store i32 [[TMP19]], i32* [[TMP5]], align 8 +; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP16]], i64 1 -; UNROLL-NEXT: store i32 [[TMP20]], i32* [[TMP6]], align 8 +; UNROLL-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] @@ -1433,10 +1396,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; UNROLL-NEXT: [[TMP22:%.*]] = load i32, i32* [[F]], align 8 +; UNROLL-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[F]], align 8 ; UNROLL-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], [[Y]] -; UNROLL-NEXT: store i32 [[TMP23]], i32* [[F]], align 8 +; UNROLL-NEXT: store i32 [[TMP23]], ptr [[F]], align 8 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1462,28 +1425,28 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[TMP0]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP1]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = load i32, 
i32* [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[TMP0]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP1]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP9]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP6]], align 8 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP7]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP7]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT2]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], i32* [[TMP4]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], i32* [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], i32* [[TMP6]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP21]], i32* [[TMP7]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP21]], ptr [[TMP7]], align 8 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] @@ -1495,10 +1458,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, i32* [[F]], align 8 +; UNROLL-NO-IC-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[F]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = xor i32 [[TMP23]], [[Y]] -; UNROLL-NO-IC-NEXT: store i32 [[TMP24]], i32* [[F]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP24]], ptr [[F]], align 8 ; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1529,41 
+1492,39 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 5 ; INTERLEAVE-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 6 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 7 -; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 1 -; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1 -; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1 -; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP6]], i32 1 -; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1 -; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1 -; INTERLEAVE-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; INTERLEAVE-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP13]] to <8 x i32>* -; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 8 -; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 8 +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1 +; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1 +; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1 +; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP6]], i32 1 +; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP8]], i32 1 +; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 8 +; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> -; INTERLEAVE-NEXT: [[TMP19:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; INTERLEAVE-NEXT: [[TMP20:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT4]] -; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP21]], i32* [[TMP9]], align 8 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP22]], i32* [[TMP10]], align 8 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP23]], i32* [[TMP11]], align 8 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i64 3 -; INTERLEAVE-NEXT: store i32 [[TMP24]], i32* [[TMP12]], align 8 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP25]], i32* 
[[TMP13]], align 8 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP20]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP26]], i32* [[TMP14]], align 8 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP20]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP27]], i32* [[TMP15]], align 8 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP20]], i64 3 -; INTERLEAVE-NEXT: store i32 [[TMP28]], i32* [[TMP16]], align 8 +; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT4]] +; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0 +; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 +; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 +; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2 +; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3 +; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0 +; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1 +; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 +; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2 +; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 +; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3 +; INTERLEAVE-NEXT: store i32 [[TMP26]], ptr [[TMP16]], align 8 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: br label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: @@ -1571,10 +1532,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; INTERLEAVE-NEXT: [[TMP30:%.*]] = load i32, i32* [[F]], align 8 -; INTERLEAVE-NEXT: [[TMP31:%.*]] = xor i32 [[TMP30]], [[Y]] -; INTERLEAVE-NEXT: store i32 [[TMP31]], i32* [[F]], align 8 +; INTERLEAVE-NEXT: [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = load i32, ptr [[F]], align 8 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = xor i32 [[TMP28]], [[Y]] +; INTERLEAVE-NEXT: store i32 [[TMP29]], ptr [[F]], align 8 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1586,10 +1547,10 @@ entry: for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %f = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 - %0 = 
load i32, i32* %f, align 8 + %f = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1 + %0 = load i32, ptr %f, align 8 %1 = xor i32 %0, %y - store i32 %1, i32* %f, align 8 + store i32 %1, ptr %f, align 8 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -1605,29 +1566,26 @@ for.end: ; p[i].f = a[i * 4] ; -define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { +define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; CHECK-LABEL: @scalarize_induction_variable_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast %pair.i32* [[SCEVGEP2]] to i8* -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A4]], [[SCEVGEP23]] +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 4 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -1637,23 +1595,23 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 1, 
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]], align 1, !alias.scope !17
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
-; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP17]], align 1, !alias.scope !20, !noalias !17
-; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP18]], align 1, !alias.scope !20, !noalias !17
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP13]], align 1, !alias.scope !17
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope !17
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP18]], align 1, !alias.scope !20, !noalias !17
+; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP19]], align 1, !alias.scope !20, !noalias !17
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1662,14 +1620,14 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = shl nsw i64 [[I]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP23]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = shl nsw i64 [[I]], 2
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 1
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 1
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[I_NEXT]] to i32
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP24]], [[N]]
+; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[I_NEXT]] to i32
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP25]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
@@ -1682,16 +1640,17 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0
; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; IND: vector.memcheck:
-; IND-NEXT: [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1
+; IND-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
; IND-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1
; IND-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; IND-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; IND-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; IND-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1
-; IND-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP7]]
-; IND-NEXT: [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP5]]
-; IND-NEXT: [[TMP8:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
-; IND-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP8]], [[A]]
+; IND-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; IND-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8
+; IND-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]]
+; IND-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; IND-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 4
+; IND-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]]
+; IND-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; IND-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
; IND-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; IND-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; IND: vector.ph:
@@ -1703,15 +1662,15 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
; IND-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1
; IND-NEXT: [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
; IND-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0
-; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
+; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
; IND-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1
-; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP13]]
-; IND-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 1, !alias.scope !17
-; IND-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]], align 1, !alias.scope !17
-; IND-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
-; IND-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
-; IND-NEXT: store i32 [[TMP15]], i32* [[TMP17]], align 1, !alias.scope !20, !noalias !17
-; IND-NEXT: store i32 [[TMP16]], i32* [[TMP18]], align 1, !alias.scope !20, !noalias !17
+; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope !17
+; IND-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope !17
+; IND-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
+; IND-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
+; IND-NEXT: store i32 [[TMP15]], ptr [[TMP17]], align 1, !alias.scope !20, !noalias !17
+; IND-NEXT: store i32 [[TMP16]], ptr [[TMP18]], align 1, !alias.scope !20, !noalias !17
; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; IND-NEXT: [[VEC_IND_NEXT]] = 
add <2 x i64> [[VEC_IND]], ; IND-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1725,10 +1684,10 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IND-NEXT: [[TMP20:%.*]] = shl nsw i64 [[I]], 2 -; IND-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]] -; IND-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1 -; IND-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; IND-NEXT: store i32 [[TMP22]], i32* [[TMP23]], align 1 +; IND-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] +; IND-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 1 +; IND-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; IND-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 1 ; IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; IND-NEXT: [[TMP24:%.*]] = trunc i64 [[I_NEXT]] to i32 ; IND-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP24]], [[N]] @@ -1744,16 +1703,17 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL: vector.memcheck: -; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1 +; UNROLL-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4 ; UNROLL-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; UNROLL-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; UNROLL-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; UNROLL-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1 -; UNROLL-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP7]] -; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP5]] -; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0 -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP8]], [[A]] +; UNROLL-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; UNROLL-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8 +; UNROLL-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] +; UNROLL-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; UNROLL-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 4 +; UNROLL-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] +; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]] +; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] ; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: @@ -1769,25 +1729,25 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; UNROLL-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], ; UNROLL-NEXT: [[TMP13:%.*]] = add <2 x i64> [[STEP_ADD]], ; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0 -; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]] +; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] ; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1 -; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]] +; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr 
inbounds i32, ptr [[A]], i64 [[TMP16]] ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0 -; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]] +; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1 -; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]] -; UNROLL-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP15]], align 1, !alias.scope !17 -; UNROLL-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP17]], align 1, !alias.scope !17 -; UNROLL-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 1, !alias.scope !17 -; UNROLL-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP21]], align 1, !alias.scope !17 -; UNROLL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 -; UNROLL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1 -; UNROLL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP10]], i32 1 -; UNROLL-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1 -; UNROLL-NEXT: store i32 [[TMP22]], i32* [[TMP26]], align 1, !alias.scope !20, !noalias !17 -; UNROLL-NEXT: store i32 [[TMP23]], i32* [[TMP27]], align 1, !alias.scope !20, !noalias !17 -; UNROLL-NEXT: store i32 [[TMP24]], i32* [[TMP28]], align 1, !alias.scope !20, !noalias !17 -; UNROLL-NEXT: store i32 [[TMP25]], i32* [[TMP29]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] +; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope !17 +; UNROLL-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope !17 +; UNROLL-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope !17 +; UNROLL-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope !17 +; UNROLL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 +; UNROLL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1 +; UNROLL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 +; UNROLL-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 +; UNROLL-NEXT: store i32 [[TMP22]], ptr [[TMP26]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NEXT: store i32 [[TMP23]], ptr [[TMP27]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NEXT: store i32 [[TMP24]], ptr [[TMP28]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NEXT: store i32 [[TMP25]], ptr [[TMP29]], align 1, !alias.scope !20, !noalias !17 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; UNROLL-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1801,10 +1761,10 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[TMP31:%.*]] = shl nsw i64 [[I]], 2 -; UNROLL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] -; UNROLL-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 1 -; UNROLL-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; UNROLL-NEXT: store i32 [[TMP33]], i32* [[TMP34]], 
align 1 +; UNROLL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]] +; UNROLL-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 1 +; UNROLL-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 1 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[TMP35:%.*]] = trunc i64 [[I_NEXT]] to i32 ; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP35]], [[N]] @@ -1814,26 +1774,23 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_04( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-IC: vector.memcheck: -; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; UNROLL-NO-IC-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP23:%.*]] = bitcast %pair.i32* [[SCEVGEP2]] to i8* -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP7]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]] -; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A4]], [[SCEVGEP23]] +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8 +; UNROLL-NO-IC-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 4 +; UNROLL-NO-IC-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] +; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]] +; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]] ; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NO-IC-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: @@ -1844,36 +1801,36 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2 -; 
UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3 -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shl nsw <2 x i64> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]] -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]] -; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]] -; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP15]], align 1, !alias.scope !17 -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP17]], align 1, !alias.scope !17 -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 1, !alias.scope !17 -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP21]], align 1, !alias.scope !17 -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP10]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP22]], i32* [[TMP26]], align 1, !alias.scope !20, !noalias !17 -; UNROLL-NO-IC-NEXT: store i32 [[TMP23]], i32* [[TMP27]], align 1, !alias.scope !20, !noalias !17 -; UNROLL-NO-IC-NEXT: store i32 [[TMP24]], i32* [[TMP28]], align 1, !alias.scope !20, !noalias !17 -; UNROLL-NO-IC-NEXT: store i32 [[TMP25]], i32* [[TMP29]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 2 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 3 +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = shl nsw <2 x i64> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]] +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP16]], align 1, !alias.scope !17 +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope !17 +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope !17 +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope !17 +; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = 
getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1 +; UNROLL-NO-IC-NEXT: store i32 [[TMP23]], ptr [[TMP27]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NO-IC-NEXT: store i32 [[TMP24]], ptr [[TMP28]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NO-IC-NEXT: store i32 [[TMP25]], ptr [[TMP29]], align 1, !alias.scope !20, !noalias !17 +; UNROLL-NO-IC-NEXT: store i32 [[TMP26]], ptr [[TMP30]], align 1, !alias.scope !20, !noalias !17 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1882,14 +1839,14 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = shl nsw i64 [[I]], 2 -; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] -; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 1 -; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP33]], i32* [[TMP34]], align 1 +; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = shl nsw i64 [[I]], 2 +; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP32]] +; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 1 +; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NO-IC-NEXT: store i32 [[TMP34]], ptr [[TMP35]], align 1 ; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = trunc i64 [[I_NEXT]] to i32 -; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP35]], [[N]] +; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = trunc i64 [[I_NEXT]] to i32 +; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP36]], [[N]] ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: ret void @@ -1902,16 +1859,17 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 8 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; INTERLEAVE: vector.memcheck: -; INTERLEAVE-NEXT: [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1 +; INTERLEAVE-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4 ; 
INTERLEAVE-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; INTERLEAVE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; INTERLEAVE-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1 -; INTERLEAVE-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP7]] -; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP5]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0 -; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP8]], [[A]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8 +; INTERLEAVE-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 4 +; INTERLEAVE-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] +; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]] +; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] ; INTERLEAVE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; INTERLEAVE-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: @@ -1931,39 +1889,37 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; INTERLEAVE-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 7 ; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl nsw i64 [[INDEX]], 2 ; INTERLEAVE-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP14]], 2 -; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]] -; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]] -; INTERLEAVE-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>* -; INTERLEAVE-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP21]] to <16 x i32>* -; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP22]], align 1 -; INTERLEAVE-NEXT: [[WIDE_VEC7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP23]], align 1 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP12]], i32 1 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP13]], i32 1 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP14]], i32 1 -; INTERLEAVE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP15]], i32 1 -; INTERLEAVE-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP16]], i32 1 -; INTERLEAVE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP17]], i32 1 -; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP32]], i32* [[TMP24]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 -; INTERLEAVE-NEXT: store i32 [[TMP33]], i32* [[TMP25]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 -; INTERLEAVE-NEXT: store i32 [[TMP34]], i32* [[TMP26]], align 1, 
!alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 -; INTERLEAVE-NEXT: store i32 [[TMP35]], i32* [[TMP27]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP36]], i32* [[TMP28]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 4 -; INTERLEAVE-NEXT: store i32 [[TMP37]], i32* [[TMP29]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 8 -; INTERLEAVE-NEXT: store i32 [[TMP38]], i32* [[TMP30]], align 1, !alias.scope !17, !noalias !20 -; INTERLEAVE-NEXT: [[TMP39:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 12 -; INTERLEAVE-NEXT: store i32 [[TMP39]], i32* [[TMP31]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] +; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP20]], align 1 +; INTERLEAVE-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP21]], align 1 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1 +; INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP13]], i32 1 +; INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP14]], i32 1 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP15]], i32 1 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP16]], i32 1 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP17]], i32 1 +; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 +; INTERLEAVE-NEXT: store i32 [[TMP30]], ptr [[TMP22]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 +; INTERLEAVE-NEXT: store i32 [[TMP31]], ptr [[TMP23]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 +; INTERLEAVE-NEXT: store i32 [[TMP32]], ptr [[TMP24]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 +; INTERLEAVE-NEXT: store i32 [[TMP33]], ptr [[TMP25]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 +; INTERLEAVE-NEXT: store i32 [[TMP34]], ptr [[TMP26]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4 +; INTERLEAVE-NEXT: store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 +; INTERLEAVE-NEXT: store i32 [[TMP36]], ptr [[TMP28]], align 1, !alias.scope !17, !noalias !20 +; INTERLEAVE-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 +; INTERLEAVE-NEXT: store i32 [[TMP37]], ptr [[TMP29]], align 1, !alias.scope !17, !noalias !20 ; 
INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: br label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: @@ -1971,14 +1927,14 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) { ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP41:%.*]] = shl nsw i64 [[I]], 2 -; INTERLEAVE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP41]] -; INTERLEAVE-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 1 -; INTERLEAVE-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 -; INTERLEAVE-NEXT: store i32 [[TMP43]], i32* [[TMP44]], align 1 +; INTERLEAVE-NEXT: [[TMP39:%.*]] = shl nsw i64 [[I]], 2 +; INTERLEAVE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP39]] +; INTERLEAVE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 1 +; INTERLEAVE-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 +; INTERLEAVE-NEXT: store i32 [[TMP41]], ptr [[TMP42]], align 1 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; INTERLEAVE-NEXT: [[TMP45:%.*]] = trunc i64 [[I_NEXT]] to i32 -; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP45]], [[N]] +; INTERLEAVE-NEXT: [[TMP43:%.*]] = trunc i64 [[I_NEXT]] to i32 +; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; INTERLEAVE: for.end: ; INTERLEAVE-NEXT: ret void @@ -1989,10 +1945,10 @@ entry: for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry] %0 = shl nsw i64 %i, 2 - %1 = getelementptr inbounds i32, i32* %a, i64 %0 - %2 = load i32, i32* %1, align 1 - %3 = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 - store i32 %2, i32* %3, align 1 + %1 = getelementptr inbounds i32, ptr %a, i64 %0 + %2 = load i32, ptr %1, align 1 + %3 = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1 + store i32 %2, ptr %3, align 1 %i.next = add nuw nsw i64 %i, 1 %4 = trunc i64 %i.next to i32 %cond = icmp eq i32 %4, %n @@ -2018,7 +1974,7 @@ for.end: ; ; -define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { +define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; CHECK-LABEL: @scalarize_induction_variable_05( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 1) @@ -2032,50 +1988,49 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_UDIV_CONTINUE2]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] ; CHECK: pred.udiv.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.if1: -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.continue2: -; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP16]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP24:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ] -; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I]] -; CHECK-NEXT: [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4 +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]] +; CHECK-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4 ; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: ; CHECK-NEXT: [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]] @@ -2087,7 +2042,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[VAR5]] ; ; IND-LABEL: @scalarize_induction_variable_05( @@ -2101,49 +2056,48 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ] -; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UDIV_CONTINUE2]] ] +; IND-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; IND-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 -; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; IND-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* -; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 ; IND-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; IND: pred.udiv.if: -; IND-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0 -; IND-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[INDEX]] -; IND-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0 +; IND-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0 +; IND-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP2]], [[INDEX]] +; IND-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> 
poison, i32 [[TMP3]], i64 0 ; IND-NEXT: br label [[PRED_UDIV_CONTINUE]] ; IND: pred.udiv.continue: -; IND-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; IND-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ] ; IND-NEXT: br i1 [[C]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] ; IND: pred.udiv.if1: -; IND-NEXT: [[TMP7:%.*]] = or i32 [[INDEX]], 1 -; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1 -; IND-NEXT: [[TMP9:%.*]] = udiv i32 [[TMP8]], [[TMP7]] -; IND-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP9]], i64 1 +; IND-NEXT: [[TMP6:%.*]] = or i32 [[INDEX]], 1 +; IND-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1 +; IND-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP6]] +; IND-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP8]], i64 1 ; IND-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; IND: pred.udiv.continue2: -; IND-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP6]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ] -; IND-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], -; IND-NEXT: [[TMP13:%.*]] = shufflevector <2 x i1> [[TMP12]], <2 x i1> poison, <2 x i32> zeroinitializer -; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]] -; IND-NEXT: [[TMP14]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] +; IND-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ] +; IND-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], +; IND-NEXT: [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer +; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]] +; IND-NEXT: [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; IND-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; IND-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP14]]) +; IND-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP13]]) ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; IND-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ] -; IND-NEXT: [[TMP17:%.*]] = zext i32 [[I]] to i64 -; IND-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]] -; IND-NEXT: [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4 +; IND-NEXT: [[TMP16:%.*]] = zext i32 [[I]] to i64 +; IND-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; 
IND-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4 ; IND-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; IND: if.then: ; IND-NEXT: [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]] @@ -2155,7 +2109,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; IND-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; IND-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; IND: for.end: -; IND-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: ret i32 [[VAR5]] ; ; UNROLL-LABEL: @scalarize_induction_variable_05( @@ -2170,76 +2124,74 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ] -; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UDIV_CONTINUE10]] ] -; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UDIV_CONTINUE10]] ] +; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_UDIV_CONTINUE10]] ] +; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_UDIV_CONTINUE10]] ] ; UNROLL-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]] -; UNROLL-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 2 -; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* -; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 ; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL: pred.udiv.if: -; UNROLL-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0 -; UNROLL-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[INDEX]] -; UNROLL-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 +; UNROLL-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0 +; UNROLL-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[INDEX]] +; UNROLL-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0 ; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL: pred.udiv.continue: -; UNROLL-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ] +; UNROLL-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] ; UNROLL: pred.udiv.if3: -; UNROLL-NEXT: [[TMP10:%.*]] = or i32 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1 -; UNROLL-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP11]], 
[[TMP10]] -; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i64 1 +; UNROLL-NEXT: [[TMP8:%.*]] = or i32 [[INDEX]], 1 +; UNROLL-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1 +; UNROLL-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP8]] +; UNROLL-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP10]], i64 1 ; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; UNROLL: pred.udiv.continue4: -; UNROLL-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ] +; UNROLL-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF3]] ] ; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] ; UNROLL: pred.udiv.if7: -; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0 -; UNROLL-NEXT: [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP0]] -; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0 +; UNROLL-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0 +; UNROLL-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP0]] +; UNROLL-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i64 0 ; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]] ; UNROLL: pred.udiv.continue8: -; UNROLL-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ] +; UNROLL-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP15]], [[PRED_UDIV_IF7]] ] ; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]] ; UNROLL: pred.udiv.if9: -; UNROLL-NEXT: [[TMP19:%.*]] = or i32 [[INDEX]], 3 -; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1 -; UNROLL-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP19]] -; UNROLL-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[TMP21]], i64 1 +; UNROLL-NEXT: [[TMP17:%.*]] = or i32 [[INDEX]], 3 +; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1 +; UNROLL-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP17]] +; UNROLL-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP19]], i64 1 ; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE10]] ; UNROLL: pred.udiv.continue10: -; UNROLL-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ] -; UNROLL-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], +; UNROLL-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP20]], [[PRED_UDIV_IF9]] ] +; UNROLL-NEXT: [[TMP22:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], +; UNROLL-NEXT: [[TMP23:%.*]] = shufflevector <2 x i1> [[TMP22]], <2 x i1> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], ; UNROLL-NEXT: [[TMP25:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[TMP26:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], -; UNROLL-NEXT: [[TMP27:%.*]] = shufflevector <2 x i1> [[TMP26]], <2 x i1> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]] -; UNROLL-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP23]] -; UNROLL-NEXT: [[TMP28]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] -; UNROLL-NEXT: [[TMP29]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]] +; UNROLL-NEXT: 
[[PREDPHI:%.*]] = select <2 x i1> [[TMP23]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] +; UNROLL-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]] +; UNROLL-NEXT: [[TMP26]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]] +; UNROLL-NEXT: [[TMP27]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; UNROLL-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP29]], [[TMP28]] -; UNROLL-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP27]], [[TMP26]] +; UNROLL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; UNROLL-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ] -; UNROLL-NEXT: [[TMP32:%.*]] = zext i32 [[I]] to i64 -; UNROLL-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP32]] -; UNROLL-NEXT: [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4 +; UNROLL-NEXT: [[TMP30:%.*]] = zext i32 [[I]] to i64 +; UNROLL-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]] +; UNROLL-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4 ; UNROLL-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; UNROLL: if.then: ; UNROLL-NEXT: [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]] @@ -2251,7 +2203,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; UNROLL: for.end: -; UNROLL-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: ret i32 [[VAR5]] ; ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_05( @@ -2269,79 +2221,77 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE10]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE10]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] 
= phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_UDIV_CONTINUE10]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[PRED_UDIV_CONTINUE10]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]] -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]] +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-IC: pred.udiv.if: -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP0]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-IC: pred.udiv.continue: -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ] +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] ; UNROLL-NO-IC: pred.udiv.if3: -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP14]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP16]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 1 +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = 
extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]]
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP14]], i32 1
 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL-NO-IC: pred.udiv.continue4:
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
-; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ]
+; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
+; UNROLL-NO-IC-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
 ; UNROLL-NO-IC: pred.udiv.if7:
-; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP1]]
+; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-IC: pred.udiv.continue8:
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
+; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
 ; UNROLL-NO-IC: pred.udiv.if9:
-; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = add i32 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
-; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP26]], [[TMP25]]
-; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP27]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = add i32 [[INDEX]], 3
+; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = udiv i32 [[TMP24]], [[TMP23]]
+; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP25]], i32 1
 ; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]]
 ; UNROLL-NO-IC: pred.udiv.continue10:
-; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP28]], [[PRED_UDIV_IF9]] ]
-; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], <i1 true, i1 true>
-; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]]
-; UNROLL-NO-IC-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP31]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP29]]
-; UNROLL-NO-IC-NEXT: [[TMP32]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT: [[TMP33]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP26]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], <i1 true, i1 true>
+; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP16]]
+; UNROLL-NO-IC-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP29]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP27]]
+; UNROLL-NO-IC-NEXT: [[TMP30]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
+; UNROLL-NO-IC-NEXT: [[TMP31]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP33]], [[TMP32]]
-; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP31]], [[TMP30]]
+; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC: for.body:
 ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
 ; UNROLL-NO-IC-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ]
-; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I]]
-; UNROLL-NO-IC-NEXT: [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4
+; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]]
+; UNROLL-NO-IC-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4
 ; UNROLL-NO-IC-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; UNROLL-NO-IC: if.then:
 ; UNROLL-NO-IC-NEXT: [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]]
@@ -2353,7 +2303,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
 ; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
 ; UNROLL-NO-IC: for.end:
-; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT: ret i32 [[VAR5]]
 ;
 ; INTERLEAVE-LABEL: @scalarize_induction_variable_05(
@@ -2368,112 +2318,110 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
 ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi
i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ] -; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UDIV_CONTINUE18]] ] -; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UDIV_CONTINUE18]] ] +; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ] +; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; INTERLEAVE: pred.udiv.if: -; INTERLEAVE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0 -; INTERLEAVE-NEXT: [[TMP7:%.*]] = udiv i32 [[TMP6]], [[INDEX]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[INDEX]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE]] ; INTERLEAVE: pred.udiv.continue: -; INTERLEAVE-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] ; INTERLEAVE: pred.udiv.if3: -; INTERLEAVE-NEXT: [[TMP10:%.*]] = or i32 [[INDEX]], 1 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]] -; INTERLEAVE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP12]], i64 1 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i32 [[INDEX]], 1 +; INTERLEAVE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1 +; INTERLEAVE-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP8]] +; INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; INTERLEAVE: pred.udiv.continue4: -; INTERLEAVE-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF3]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF5:%.*]], label 
[[PRED_UDIV_CONTINUE6:%.*]] ; INTERLEAVE: pred.udiv.if5: -; INTERLEAVE-NEXT: [[TMP15:%.*]] = or i32 [[INDEX]], 2 -; INTERLEAVE-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2 -; INTERLEAVE-NEXT: [[TMP17:%.*]] = udiv i32 [[TMP16]], [[TMP15]] -; INTERLEAVE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP17]], i64 2 +; INTERLEAVE-NEXT: [[TMP13:%.*]] = or i32 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2 +; INTERLEAVE-NEXT: [[TMP15:%.*]] = udiv i32 [[TMP14]], [[TMP13]] +; INTERLEAVE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP15]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE6]] ; INTERLEAVE: pred.udiv.continue6: -; INTERLEAVE-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP18]], [[PRED_UDIV_IF5]] ] +; INTERLEAVE-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] ; INTERLEAVE: pred.udiv.if7: -; INTERLEAVE-NEXT: [[TMP20:%.*]] = or i32 [[INDEX]], 3 -; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = udiv i32 [[TMP21]], [[TMP20]] -; INTERLEAVE-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP22]], i64 3 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = or i32 [[INDEX]], 3 +; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = udiv i32 [[TMP19]], [[TMP18]] +; INTERLEAVE-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE8]] ; INTERLEAVE: pred.udiv.continue8: -; INTERLEAVE-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP23]], [[PRED_UDIV_IF7]] ] +; INTERLEAVE-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] ; INTERLEAVE: pred.udiv.if11: -; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = udiv i32 [[TMP25]], [[TMP0]] -; INTERLEAVE-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i64 0 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = udiv i32 [[TMP23]], [[TMP0]] +; INTERLEAVE-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP24]], i64 0 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE12]] ; INTERLEAVE: pred.udiv.continue12: -; INTERLEAVE-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ] +; INTERLEAVE-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP25]], [[PRED_UDIV_IF11]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] ; INTERLEAVE: pred.udiv.if13: -; INTERLEAVE-NEXT: [[TMP29:%.*]] = or i32 [[INDEX]], 5 -; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1 -; INTERLEAVE-NEXT: [[TMP31:%.*]] = udiv i32 [[TMP30]], [[TMP29]] -; INTERLEAVE-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i64 1 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = or i32 [[INDEX]], 5 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1 +; INTERLEAVE-NEXT: 
[[TMP29:%.*]] = udiv i32 [[TMP28]], [[TMP27]] +; INTERLEAVE-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP29]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE14]] ; INTERLEAVE: pred.udiv.continue14: -; INTERLEAVE-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ] +; INTERLEAVE-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP30]], [[PRED_UDIV_IF13]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]] ; INTERLEAVE: pred.udiv.if15: -; INTERLEAVE-NEXT: [[TMP34:%.*]] = or i32 [[INDEX]], 6 -; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2 -; INTERLEAVE-NEXT: [[TMP36:%.*]] = udiv i32 [[TMP35]], [[TMP34]] -; INTERLEAVE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i64 2 +; INTERLEAVE-NEXT: [[TMP32:%.*]] = or i32 [[INDEX]], 6 +; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2 +; INTERLEAVE-NEXT: [[TMP34:%.*]] = udiv i32 [[TMP33]], [[TMP32]] +; INTERLEAVE-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP34]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE16]] ; INTERLEAVE: pred.udiv.continue16: -; INTERLEAVE-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ] +; INTERLEAVE-NEXT: [[TMP36:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP35]], [[PRED_UDIV_IF15]] ] ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]] ; INTERLEAVE: pred.udiv.if17: -; INTERLEAVE-NEXT: [[TMP39:%.*]] = or i32 [[INDEX]], 7 -; INTERLEAVE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3 -; INTERLEAVE-NEXT: [[TMP41:%.*]] = udiv i32 [[TMP40]], [[TMP39]] -; INTERLEAVE-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i64 3 +; INTERLEAVE-NEXT: [[TMP37:%.*]] = or i32 [[INDEX]], 7 +; INTERLEAVE-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3 +; INTERLEAVE-NEXT: [[TMP39:%.*]] = udiv i32 [[TMP38]], [[TMP37]] +; INTERLEAVE-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP39]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE18]] ; INTERLEAVE: pred.udiv.continue18: -; INTERLEAVE-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ] -; INTERLEAVE-NEXT: [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], +; INTERLEAVE-NEXT: [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP40]], [[PRED_UDIV_IF17]] ] +; INTERLEAVE-NEXT: [[TMP42:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], +; INTERLEAVE-NEXT: [[TMP43:%.*]] = shufflevector <4 x i1> [[TMP42]], <4 x i1> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], ; INTERLEAVE-NEXT: [[TMP45:%.*]] = shufflevector <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], -; INTERLEAVE-NEXT: [[TMP47:%.*]] = shufflevector <4 x i1> [[TMP46]], <4 x i1> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP24]] -; INTERLEAVE-NEXT: [[PREDPHI19:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP43]] -; INTERLEAVE-NEXT: [[TMP48]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]] -; INTERLEAVE-NEXT: 
[[TMP49]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]] +; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP43]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP22]] +; INTERLEAVE-NEXT: [[PREDPHI19:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]] +; INTERLEAVE-NEXT: [[TMP46]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]] +; INTERLEAVE-NEXT: [[TMP47]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] -; INTERLEAVE-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP47]], [[TMP46]] +; INTERLEAVE-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; INTERLEAVE-NEXT: [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ] -; INTERLEAVE-NEXT: [[TMP52:%.*]] = zext i32 [[I]] to i64 -; INTERLEAVE-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP52]] -; INTERLEAVE-NEXT: [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4 +; INTERLEAVE-NEXT: [[TMP50:%.*]] = zext i32 [[I]] to i64 +; INTERLEAVE-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]] +; INTERLEAVE-NEXT: [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4 ; INTERLEAVE-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; INTERLEAVE: if.then: ; INTERLEAVE-NEXT: [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]] @@ -2485,7 +2433,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) { ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] ; INTERLEAVE: for.end: -; INTERLEAVE-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: ret i32 [[VAR5]] ; entry: @@ -2494,8 +2442,8 @@ entry: for.body: %i = phi i32 [ 0, %entry ], [ %i.next, %if.end ] %sum = phi i32 [ 0, %entry ], [ %var4, %if.end ] - %var0 = getelementptr inbounds i32, i32* %a, i32 %i - %var1 = load i32, i32* %var0, align 4 + %var0 = getelementptr inbounds i32, ptr %a, i32 %i + %var1 = load i32, ptr %var0, align 4 br i1 %c, label %if.then, label %if.end if.then: @@ -2523,7 +2471,7 @@ for.end: ; %pair.i16 = type { i16, 
i16 } -define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { +define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; CHECK-LABEL: @iv_vector_and_scalar_users( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 @@ -2544,12 +2492,12 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP7]], align 2 +; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 -; CHECK-NEXT: store i16 [[TMP10]], i16* [[TMP8]], align 2 +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2565,8 +2513,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[I]] to i32 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[A]], [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: store i16 [[TMP14]], i16* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: store i16 [[TMP14]], ptr [[TMP15]], align 2 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[I_NEXT]] to i32 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP16]], [[N]] @@ -2592,12 +2540,12 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; IND-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1 ; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; IND-NEXT: [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16> -; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1 -; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1 +; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 ; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 -; IND-NEXT: store i16 [[TMP8]], i16* [[TMP6]], align 2 +; IND-NEXT: store i16 [[TMP8]], ptr [[TMP6]], align 2 ; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 -; IND-NEXT: store i16 [[TMP9]], i16* [[TMP7]], align 2 +; IND-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], ; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2613,8 +2561,8 @@ define void 
@iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; IND-NEXT: [[TMP11:%.*]] = trunc i64 [[I]] to i32 ; IND-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[A]] ; IND-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 -; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1 -; IND-NEXT: store i16 [[TMP13]], i16* [[TMP14]], align 2 +; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; IND-NEXT: store i16 [[TMP13]], ptr [[TMP14]], align 2 ; IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; IND-NEXT: [[TMP15:%.*]] = trunc i64 [[I_NEXT]] to i32 ; IND-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP15]], [[N]] @@ -2647,18 +2595,18 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND]] ; UNROLL-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> ; UNROLL-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i16> -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1 -; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1 -; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1 -; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP5]], i32 1 +; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 +; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 +; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 ; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0 -; UNROLL-NEXT: store i16 [[TMP16]], i16* [[TMP12]], align 2 +; UNROLL-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 ; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1 -; UNROLL-NEXT: store i16 [[TMP17]], i16* [[TMP13]], align 2 +; UNROLL-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP11]], i64 0 -; UNROLL-NEXT: store i16 [[TMP18]], i16* [[TMP14]], align 2 +; UNROLL-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2 ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP11]], i64 1 -; UNROLL-NEXT: store i16 [[TMP19]], i16* [[TMP15]], align 2 +; UNROLL-NEXT: store i16 [[TMP19]], ptr [[TMP15]], align 2 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2674,8 +2622,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP21:%.*]] = trunc i64 [[I]] to i32 ; UNROLL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[A]] ; UNROLL-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 -; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1 -; UNROLL-NEXT: store i16 [[TMP23]], i16* [[TMP24]], align 2 +; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NEXT: store i16 [[TMP23]], ptr [[TMP24]], align 2 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[TMP25:%.*]] = trunc i64 [[I_NEXT]] to i32 ; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP25]], 
[[N]] @@ -2710,18 +2658,18 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT3]], [[STEP_ADD]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16> -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP5]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP6]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 -; UNROLL-NO-IC-NEXT: store i16 [[TMP15]], i16* [[TMP11]], align 2 +; UNROLL-NO-IC-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 -; UNROLL-NO-IC-NEXT: store i16 [[TMP16]], i16* [[TMP12]], align 2 +; UNROLL-NO-IC-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 -; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], i16* [[TMP13]], align 2 +; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 -; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], i16* [[TMP14]], align 2 +; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2737,8 +2685,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = trunc i64 [[I]] to i32 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = add i32 [[A]], [[TMP20]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = trunc i32 [[TMP21]] to i16 -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1 -; UNROLL-NO-IC-NEXT: store i16 [[TMP22]], i16* [[TMP23]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; UNROLL-NO-IC-NEXT: store i16 [[TMP22]], ptr [[TMP23]], align 2 ; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = trunc i64 [[I_NEXT]] to i32 ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP24]], [[N]] @@ -2775,30 +2723,30 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND]] ; INTERLEAVE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16> ; INTERLEAVE-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16> -; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1 -; INTERLEAVE-NEXT: 
[[TMP17:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1 -; INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1 -; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP5]], i32 1 -; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP6]], i32 1 -; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP7]], i32 1 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP8]], i32 1 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP9]], i32 1 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 +; INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 +; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 +; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP7]], i32 1 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP8]], i32 1 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP9]], i32 1 ; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP14]], i64 0 -; INTERLEAVE-NEXT: store i16 [[TMP24]], i16* [[TMP16]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP24]], ptr [[TMP16]], align 2 ; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP14]], i64 1 -; INTERLEAVE-NEXT: store i16 [[TMP25]], i16* [[TMP17]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP25]], ptr [[TMP17]], align 2 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP14]], i64 2 -; INTERLEAVE-NEXT: store i16 [[TMP26]], i16* [[TMP18]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP26]], ptr [[TMP18]], align 2 ; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP14]], i64 3 -; INTERLEAVE-NEXT: store i16 [[TMP27]], i16* [[TMP19]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP27]], ptr [[TMP19]], align 2 ; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP15]], i64 0 -; INTERLEAVE-NEXT: store i16 [[TMP28]], i16* [[TMP20]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP28]], ptr [[TMP20]], align 2 ; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP15]], i64 1 -; INTERLEAVE-NEXT: store i16 [[TMP29]], i16* [[TMP21]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP29]], ptr [[TMP21]], align 2 ; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <4 x i16> [[TMP15]], i64 2 -; INTERLEAVE-NEXT: store i16 [[TMP30]], i16* [[TMP22]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP30]], ptr [[TMP22]], align 2 ; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <4 x i16> [[TMP15]], i64 3 -; INTERLEAVE-NEXT: store i16 [[TMP31]], i16* [[TMP23]], align 2 +; INTERLEAVE-NEXT: store i16 [[TMP31]], ptr [[TMP23]], align 2 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2814,8 +2762,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) { ; 
INTERLEAVE-NEXT: [[TMP33:%.*]] = trunc i64 [[I]] to i32 ; INTERLEAVE-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], [[A]] ; INTERLEAVE-NEXT: [[TMP35:%.*]] = trunc i32 [[TMP34]] to i16 -; INTERLEAVE-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1 -; INTERLEAVE-NEXT: store i16 [[TMP35]], i16* [[TMP36]], align 2 +; INTERLEAVE-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 +; INTERLEAVE-NEXT: store i16 [[TMP35]], ptr [[TMP36]], align 2 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; INTERLEAVE-NEXT: [[TMP37:%.*]] = trunc i64 [[I_NEXT]] to i32 ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP37]], [[N]] @@ -2831,8 +2779,8 @@ for.body: %0 = trunc i64 %i to i32 %1 = add i32 %a, %0 %2 = trunc i32 %1 to i16 - %3 = getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %i, i32 1 - store i16 %2, i16* %3, align 2 + %3 = getelementptr inbounds %pair.i16, ptr %p, i64 %i, i32 1 + store i16 %2, ptr %3, align 2 %i.next = add nuw nsw i64 %i, 1 %4 = trunc i64 %i.next to i32 %cond = icmp eq i32 %4, %n @@ -3279,9 +3227,9 @@ exit: define i32 @testoverflowcheck() { ; CHECK-LABEL: @testoverflowcheck( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTPR_I:%.*]] = load i8, i8* @e, align 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @d, align 4 -; CHECK-NEXT: [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4 +; CHECK-NEXT: [[DOTPR_I:%.*]] = load i8, ptr @e, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @d, align 4 +; CHECK-NEXT: [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 -1, [[DOTPR_I]] ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 @@ -3290,8 +3238,8 @@ define i32 @testoverflowcheck() { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP3]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]] -; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3324,9 +3272,9 @@ define i32 @testoverflowcheck() { ; ; IND-LABEL: @testoverflowcheck( ; IND-NEXT: entry: -; IND-NEXT: [[DOTPR_I:%.*]] = load i8, i8* @e, align 1 -; IND-NEXT: [[TMP0:%.*]] = load i32, i32* @d, align 4 -; IND-NEXT: [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4 +; IND-NEXT: [[DOTPR_I:%.*]] = load i8, ptr @e, align 1 +; IND-NEXT: [[TMP0:%.*]] = load i32, ptr @d, align 4 +; IND-NEXT: [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4 ; IND-NEXT: [[TMP1:%.*]] = xor i8 [[DOTPR_I]], -1 ; IND-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 ; IND-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 @@ -3334,8 +3282,8 @@ define i32 @testoverflowcheck() { ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], -2 -; IND-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] +; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; 
IND-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; IND-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 ; IND-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3366,9 +3314,9 @@ define i32 @testoverflowcheck() { ; ; UNROLL-LABEL: @testoverflowcheck( ; UNROLL-NEXT: entry: -; UNROLL-NEXT: [[DOTPR_I:%.*]] = load i8, i8* @e, align 1 -; UNROLL-NEXT: [[TMP0:%.*]] = load i32, i32* @d, align 4 -; UNROLL-NEXT: [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4 +; UNROLL-NEXT: [[DOTPR_I:%.*]] = load i8, ptr @e, align 1 +; UNROLL-NEXT: [[TMP0:%.*]] = load i32, ptr @d, align 4 +; UNROLL-NEXT: [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4 ; UNROLL-NEXT: [[TMP1:%.*]] = xor i8 [[DOTPR_I]], -1 ; UNROLL-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 ; UNROLL-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 @@ -3376,8 +3324,8 @@ define i32 @testoverflowcheck() { ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], -4 -; UNROLL-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] +; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; UNROLL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 @@ -3410,9 +3358,9 @@ define i32 @testoverflowcheck() { ; ; UNROLL-NO-IC-LABEL: @testoverflowcheck( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: [[DOTPR_I:%.*]] = load i8, i8* @e, align 1 -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, i32* @d, align 4 -; UNROLL-NO-IC-NEXT: [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4 +; UNROLL-NO-IC-NEXT: [[DOTPR_I:%.*]] = load i8, ptr @e, align 1 +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr @d, align 4 +; UNROLL-NO-IC-NEXT: [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = sub i8 -1, [[DOTPR_I]] ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 @@ -3421,8 +3369,8 @@ define i32 @testoverflowcheck() { ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP3]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]] -; UNROLL-NO-IC-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] +; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 [[C_PROMOTED_I]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3460,9 +3408,9 @@ define i32 @testoverflowcheck() { ; ; INTERLEAVE-LABEL: @testoverflowcheck( ; INTERLEAVE-NEXT: entry: -; INTERLEAVE-NEXT: [[DOTPR_I:%.*]] = load i8, i8* @e, align 1 -; INTERLEAVE-NEXT: [[TMP0:%.*]] = load i32, i32* @d, align 4 -; INTERLEAVE-NEXT: [[C_PROMOTED_I:%.*]] = load i32, i32* 
@c, align 4 +; INTERLEAVE-NEXT: [[DOTPR_I:%.*]] = load i8, ptr @e, align 1 +; INTERLEAVE-NEXT: [[TMP0:%.*]] = load i32, ptr @d, align 4 +; INTERLEAVE-NEXT: [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = xor i8 [[DOTPR_I]], -1 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 ; INTERLEAVE-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 @@ -3470,8 +3418,8 @@ define i32 @testoverflowcheck() { ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], -8 -; INTERLEAVE-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]] +; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[C_PROMOTED_I]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 @@ -3503,9 +3451,9 @@ define i32 @testoverflowcheck() { ; INTERLEAVE-NEXT: ret i32 [[AND_I_LCSSA]] ; entry: - %.pr.i = load i8, i8* @e, align 1 - %0 = load i32, i32* @d, align 4 - %c.promoted.i = load i32, i32* @c, align 4 + %.pr.i = load i8, ptr @e, align 1 + %0 = load i32, ptr @d, align 4 + %c.promoted.i = load i32, ptr @c, align 4 br label %cond.end.i cond.end.i: @@ -3525,7 +3473,7 @@ loopexit: ; we need to convert the SCEV expression into an AddRecExpr. ; The expression gets converted to {zext i8 %t to i32,+,1}. -define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { +define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; CHECK-LABEL: @wrappingindvars1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ST:%.*]] = zext i8 [[T:%.*]] to i16 @@ -3552,8 +3500,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] -; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3562,17 +3510,16 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4 +; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = 
add i8 [[T]], [[DOTCAST4]] +; CHECK-NEXT: [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -3585,8 +3532,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]] -; CHECK-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] +; CHECK-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 ; CHECK-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; CHECK-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; CHECK-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3619,8 +3566,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 -; IND-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; IND-NEXT: [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]] +; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] ; IND-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3629,16 +3576,15 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]] -; IND-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]] -; IND-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* -; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4 +; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] +; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] +; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], 
align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; IND-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -3651,9 +3597,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; IND-NEXT: [[TMP15:%.*]] = sext i8 [[IDX]] to i64 -; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]] -; IND-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4 +; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64 +; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 ; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; IND-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; IND-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3686,8 +3632,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 -; UNROLL-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8 -; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]] +; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 +; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] ; UNROLL-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3697,19 +3643,17 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]] -; UNROLL-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]] -; UNROLL-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4 -; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2 -; UNROLL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP15]], align 4 +; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] +; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr 
inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3722,9 +3666,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NEXT: [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; UNROLL-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3761,8 +3705,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC: vector.ph:
 ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT: [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3772,22 +3716,20 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP13]]
-; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP14]]
-; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP18]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP20]], align 4
+; UNROLL-NO-IC-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0
+; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 2
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]]
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP13]]
+; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP17]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3800,8 +3742,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NO-IC-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NO-IC-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NO-IC-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3834,8 +3776,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE: vector.ph:
 ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8
-; INTERLEAVE-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
+; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -3845,19 +3787,17 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; INTERLEAVE-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP13]], align 4
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; INTERLEAVE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP15]], align 4
+; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; INTERLEAVE: middle.block:
 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3870,9 +3810,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT: [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; INTERLEAVE-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; INTERLEAVE-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; INTERLEAVE-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3895,8 +3835,8 @@ entry:
 %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
 %sphi = phi i32 [ %ext, %entry ], [%idx.inc.ext, %loop]
- %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
- store i32 %sphi, i32* %ptr
+ %ptr = getelementptr inbounds i32, ptr %A, i8 %idx
+ store i32 %sphi, ptr %ptr
 %idx.inc = add i8 %idx, 1
 %idx.inc.ext = zext i8 %idx.inc to i32
@@ -3913,7 +3853,7 @@
 ; In order to recognize %sphi as an induction PHI and vectorize this loop,
 ; we need to convert the SCEV expression into an AddRecExpr.
 ; The expression gets converted to ({4 * (zext %t to i32),+,4}).
-define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
+define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
 ; CHECK-LABEL: @wrappingindvars2(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ST:%.*]] = zext i8 [[T:%.*]] to i16
@@ -3941,8 +3881,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT: [[IND_END1:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
@@ -3952,17 +3892,16 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3975,8 +3914,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; CHECK-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; CHECK-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; CHECK-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; CHECK-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; CHECK-NEXT: [[MUL]] = mul i32 [[IDX_INC_EXT]], 4
@@ -4011,10 +3950,10 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IND: vector.ph:
 ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2
-; IND-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; IND-NEXT: [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
-; IND-NEXT: [[EXT_MUL4:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; IND-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL4]], 2
+; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
+; IND-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; IND-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
 ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4022,16 +3961,15 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND: vector.body:
 ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT: [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; IND-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; IND-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
+; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
+; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]]
+; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; IND-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; IND: middle.block:
 ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4044,9 +3982,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; IND-NEXT: [[TMP15:%.*]] = sext i8 [[IDX]] to i64
-; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; IND-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64
+; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; IND-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; IND-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2
@@ -4081,10 +4019,10 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL: vector.ph:
 ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4
-; UNROLL-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
-; UNROLL-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
+; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
+; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; UNROLL-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
 ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4093,19 +4031,17 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; UNROLL-NEXT: [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; UNROLL-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; UNROLL-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
-; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
-; UNROLL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP15]], align 4
+; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 16, i32 16>
-; UNROLL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4118,9 +4054,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NEXT: [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; UNROLL-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2
@@ -4159,8 +4095,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC: vector.ph:
 ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
 ; UNROLL-NO-IC-NEXT: [[IND_END1:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
 ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
@@ -4171,22 +4107,20 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = add i8 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP14]]
-; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP15]]
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP19]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]]
+; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 2
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]]
+; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP14]]
+; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP18]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 8, i32 8>
-; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4199,8 +4133,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NO-IC-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NO-IC-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NO-IC-NEXT: [[MUL]] = mul i32 [[IDX_INC_EXT]], 4
@@ -4235,10 +4169,10 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE: vector.ph:
 ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8
-; INTERLEAVE-NEXT: [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
-; INTERLEAVE-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
+; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
+; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
 ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
@@ -4247,19 +4181,17 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; INTERLEAVE-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP13]], align 4
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; INTERLEAVE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP15]], align 4
+; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32>
-; INTERLEAVE-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; INTERLEAVE: middle.block:
 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4272,9 +4204,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT: [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; INTERLEAVE-NEXT: store i32 [[SPHI]], i32* [[PTR]], align 4
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
 ; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
 ; INTERLEAVE-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; INTERLEAVE-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2
@@ -4300,8 +4232,8 @@ entry:
 %sphi = phi i32 [ %ext.mul, %entry ], [%mul, %loop]
 %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
- %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
- store i32 %sphi, i32* %ptr
+ %ptr = getelementptr inbounds i32, ptr %A, i8 %idx
+ store i32 %sphi, ptr %ptr
 %idx.inc = add i8 %idx, 1
 %idx.inc.ext = zext i8 %idx.inc to i32
@@ -4318,7 +4250,7 @@ entry:
 ; Check that we generate vectorized IVs in the pre-header
 ; instead of widening the scalar IV inside the loop, when
 ; we know how to do that.
-define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
 ; CHECK-LABEL: @veciv(
 ; CHECK-NEXT: for.body.preheader:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K:%.*]], 2
@@ -4331,14 +4263,13 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4347,8 +4278,8 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4366,13 +4297,12 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; IND-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
+; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; IND-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; IND: middle.block:
 ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]]
 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4381,9 +4311,9 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; IND-NEXT: br label [[FOR_BODY:%.*]]
 ; IND: for.body:
 ; IND-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT: [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; IND-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; IND-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; IND-NEXT: [[TMP3:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; IND-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; IND-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; IND-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; IND-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; IND-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4402,16 +4332,14 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
-; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 2
-; UNROLL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP4]], align 4
+; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; UNROLL-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4420,9 +4348,9 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NEXT: br label [[FOR_BODY:%.*]]
 ; UNROLL: for.body:
 ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT: [[TMP6:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; UNROLL-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT: [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4443,18 +4371,16 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4463,8 +4389,8 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC: for.body:
 ; UNROLL-NO-IC-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; UNROLL-NO-IC-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; UNROLL-NO-IC-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4483,16 +4409,14 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP2]], align 4
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP4]], align 4
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; INTERLEAVE: middle.block:
 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]]
 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4501,9 +4425,9 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]]
 ; INTERLEAVE: for.body:
 ; INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; INTERLEAVE-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; INTERLEAVE-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4515,8 +4439,8 @@ for.body.preheader:
 for.body:
 %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
- store i32 %indvars.iv, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %a, i32 %indvars.iv
+ store i32 %indvars.iv, ptr %arrayidx, align 4
 %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
 %exitcond = icmp eq i32 %indvars.iv.next, %k
 br i1 %exitcond, label %exit, label %for.body
@@ -4525,7 +4449,7 @@ exit:
 ret void
 }

-define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
 ; CHECK-LABEL: @trunciv(
 ; CHECK-NEXT: for.body.preheader:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[K:%.*]], 2
@@ -4542,18 +4466,17 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[K]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4563,8 +4486,8 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TRUNC_IV]]
-; CHECK-NEXT: store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TRUNC_IV]]
+; CHECK-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4582,17 +4505,16 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[K]], -2
 ; IND-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IND: vector.body:
-; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT: [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; IND-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; IND-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
-; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; IND-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
-; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2
 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; IND-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; IND: middle.block:
 ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]]
 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4603,9 +4525,9 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; IND-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; IND-NEXT: [[SEXT1:%.*]] = shl i64 [[INDVARS_IV]], 32
-; IND-NEXT: [[TMP4:%.*]] = ashr exact i64 [[SEXT1]], 32
-; IND-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; IND-NEXT: store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; IND-NEXT: [[TMP3:%.*]] = ashr exact i64 [[SEXT1]], 32
+; IND-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; IND-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; IND-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; IND-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; IND-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4623,21 +4545,19 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[K]], -4
 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; UNROLL-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
-; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
-; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 2
-; UNROLL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP4]], align 4
-; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
+; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; UNROLL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4648,9 +4568,9 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; UNROLL-NEXT: [[SEXT2:%.*]] = shl i64 [[INDVARS_IV]], 32
-; UNROLL-NEXT: [[TMP6:%.*]] = ashr exact i64 [[SEXT2]], 32
-; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; UNROLL-NEXT: store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT: [[TMP4:%.*]] = ashr exact i64 [[SEXT2]], 32
+; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4673,24 +4593,22 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[K]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
+; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0
 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], 2
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP7]]
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP11]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]]
+; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP7]]
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP11]], align 4
+; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4700,8 +4618,8 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NO-IC: for.body:
 ; UNROLL-NO-IC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TRUNC_IV]]
-; UNROLL-NO-IC-NEXT: store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TRUNC_IV]]
+; UNROLL-NO-IC-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4719,21 +4637,19 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[K]], -8
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
-; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP2]], align 4
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP4]], align 4
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8
 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; INTERLEAVE: middle.block:
 ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]]
 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4744,9 +4660,9 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; INTERLEAVE-NEXT: [[SEXT2:%.*]] = shl i64 [[INDVARS_IV]], 32
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = ashr exact i64 [[SEXT2]], 32
-; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; INTERLEAVE-NEXT: store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = ashr exact i64 [[SEXT2]], 32
+; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; INTERLEAVE-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4759,8 +4675,8 @@ for.body.preheader:
 for.body:
 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
 %trunc.iv = trunc i64 %indvars.iv to i32
- %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
- store i32 %trunc.iv, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds i32, ptr %a, i32 %trunc.iv
+ store i32 %trunc.iv, ptr %arrayidx, align 4
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond = icmp eq i64 %indvars.iv.next, %k
 br i1 %exitcond, label %exit, label %for.body
@@ -4771,7 +4687,7 @@ exit:
 ret void
 }

 ;
 ;
-define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
+define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-LABEL: @nonprimary(
 ; CHECK-NEXT: for.body.preheader:
 ; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[K:%.*]], [[I:%.*]]
@@ -4790,14 +4706,13 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4806,8 +4721,8 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4831,13 +4746,12 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
 ; IND-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
-; IND-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; IND-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; IND-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; IND-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; IND: middle.block:
 ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4846,9 +4760,9 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; IND-NEXT: br label [[FOR_BODY:%.*]]
 ; IND: for.body:
 ; IND-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT: [[TMP5:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; IND-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; IND-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; IND-NEXT: [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; IND-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; IND-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; IND-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; IND-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; IND-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4873,16 +4787,14 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
 ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
-; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UNROLL-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
-; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 2
-; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP5]], align 4
+; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; UNROLL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; UNROLL: middle.block:
 ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4891,9 +4803,9 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NEXT: br label [[FOR_BODY:%.*]]
 ; UNROLL: for.body:
 ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT: [[TMP7:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
-; UNROLL-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT: [[TMP5:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; UNROLL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
+; UNROLL-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4920,18 +4832,16 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]]
+; UNROLL-NO-IC-NEXT:
[[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP6]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -4940,8 +4850,8 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) { ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] -; UNROLL-NO-IC-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4 +; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]] +; UNROLL-NO-IC-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4 ; UNROLL-NO-IC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]] ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] @@ -4966,16 +4876,14 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) { ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP3]], align 4 -; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP5]], align 4 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = 
icmp eq i32 [[TMP0]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -4984,9 +4892,9 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) { ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = sext i32 [[INDVARS_IV]] to i64 -; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]] -; INTERLEAVE-NEXT: store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = sext i32 [[INDVARS_IV]] to i64 +; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]] +; INTERLEAVE-NEXT: store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4 ; INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]] ; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] @@ -4998,8 +4906,8 @@ for.body.preheader: for.body: %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ %i, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv - store i32 %indvars.iv, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %a, i32 %indvars.iv + store i32 %indvars.iv, ptr %arrayidx, align 4 %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 %exitcond = icmp eq i32 %indvars.iv.next, %k br i1 %exitcond, label %exit, label %for.body @@ -5008,7 +4916,7 @@ exit: ret void } -define void @non_primary_iv_trunc(i32* %a, i64 %n) { +define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; CHECK-LABEL: @non_primary_iv_trunc( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) @@ -5023,14 +4931,13 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -5041,9 +4948,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], 
[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; CHECK-NEXT: [[VAR1:%.*]] = trunc i64 [[J]] to i32 -; CHECK-NEXT: store i32 [[VAR1]], i32* [[VAR0]], align 4 +; CHECK-NEXT: store i32 [[VAR1]], ptr [[VAR0]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 2 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] @@ -5063,13 +4970,12 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; IND-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* -; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP1]], align 4 +; IND-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; IND-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] +; IND-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -5080,9 +4986,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IND-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; IND-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; IND-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; IND-NEXT: [[VAR1:%.*]] = trunc i64 [[J]] to i32 -; IND-NEXT: store i32 [[VAR1]], i32* [[VAR0]], align 4 +; IND-NEXT: store i32 [[VAR1]], ptr [[VAR0]], align 4 ; IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; IND-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 2 ; IND-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] @@ -5103,16 +5009,14 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP1]], align 4 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; UNROLL-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP3]], align 4 +; UNROLL-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] +; UNROLL-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -5123,9 +5027,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; UNROLL-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; UNROLL-NEXT: [[VAR1:%.*]] = trunc i64 [[J]] to i32 -; UNROLL-NEXT: store i32 [[VAR1]], i32* [[VAR0]], align 4 +; UNROLL-NEXT: store i32 [[VAR1]], ptr [[VAR0]], align 4 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 2 ; UNROLL-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] @@ -5149,18 +5053,16 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP7]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -5171,9 +5073,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; UNROLL-NO-IC-NEXT: [[VAR1:%.*]] = trunc i64 [[J]] to i32 -; UNROLL-NO-IC-NEXT: store i32 [[VAR1]], i32* [[VAR0]], align 4 +; UNROLL-NO-IC-NEXT: store i32 [[VAR1]], ptr [[VAR0]], align 4 ; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-IC-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 2 ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] @@ -5194,16 +5096,14 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP1]], align 4 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; INTERLEAVE-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP3]], align 4 +; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -5214,9 +5114,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) { ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; INTERLEAVE-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; INTERLEAVE-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; INTERLEAVE-NEXT: [[VAR1:%.*]] = trunc i64 [[J]] to i32 -; 
INTERLEAVE-NEXT: store i32 [[VAR1]], i32* [[VAR0]], align 4 +; INTERLEAVE-NEXT: store i32 [[VAR1]], ptr [[VAR0]], align 4 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; INTERLEAVE-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 2 ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] @@ -5230,9 +5130,9 @@ entry: for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ] - %var0 = getelementptr inbounds i32, i32* %a, i64 %i + %var0 = getelementptr inbounds i32, ptr %a, i64 %i %var1 = trunc i64 %j to i32 - store i32 %var1, i32* %var0, align 4 + store i32 %var1, ptr %var0, align 4 %i.next = add nuw nsw i64 %i, 1 %j.next = add nuw nsw i64 %j, 2 %cond = icmp slt i64 %i.next, %n @@ -6103,7 +6003,7 @@ loop: ; preds = %loop, %entry } ; Test case for PR52460. -define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* %dst) { +define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %dst) { ; ; CHECK-LABEL: @pr52460_first_order_recurrence_truncated_iv( ; CHECK-NEXT: entry: @@ -6111,25 +6011,24 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 100, 100 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1 @@ -6144,14 +6043,14 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[SCALAR_RECUR]] ; CHECK-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IV_TRUNC]] +; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV_TRUNC]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[IV_TRUNC]], [[MUL]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[DST_GEP]], align 4 +; CHECK-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP53:![0-9]+]] ; CHECK: exit: @@ -6163,24 +6062,23 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; IND: vector.ph: ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: -; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> -; IND-NEXT: [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; IND-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; IND-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 ; IND-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP0]] -; IND-NEXT: [[SEXT:%.*]] = shl i64 [[INDEX]], 32 +; IND-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; IND-NEXT: [[TMP3:%.*]] = ashr exact i64 [[SEXT]], 32 -; IND-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP3]] +; IND-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP3]] ; IND-NEXT: [[TMP5:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP2]] -; IND-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* -; IND-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 -; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; IND-NEXT: store <2 x i32> [[TMP5]], ptr [[TMP4]], align 4 +; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; IND-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; IND-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] +; IND-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; IND-NEXT: 
br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; IND: middle.block: ; IND-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: @@ -6196,31 +6094,29 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; UNROLL: vector.ph: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: -; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> -; UNROLL-NEXT: [[TMP2:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; UNROLL-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 ; UNROLL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP0]] ; UNROLL-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP1]] -; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[INDEX]], 32 +; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; UNROLL-NEXT: [[TMP5:%.*]] = ashr exact i64 [[SEXT]], 32 -; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]] +; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] ; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP3]] ; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP4]] -; UNROLL-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[TMP7]], <2 x i32>* [[TMP9]], align 4 -; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP6]], i64 2 -; UNROLL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP11]], align 4 -; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; UNROLL-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4 +; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; UNROLL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] +; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: @@ -6236,34 +6132,32 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] 
], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32 +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 2 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[DST]], i32 [[TMP2]] +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP1]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP7]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP8]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP13]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP8]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP11]], <2 x i32>* [[TMP15]], align 4 -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP13]], align 4 +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1 @@ -6278,14 +6172,14 @@ define void 
@pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[LV:%.*]] = load i32, i32* [[SRC]], align 4 +; UNROLL-NO-IC-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[SCALAR_RECUR]] ; UNROLL-NO-IC-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1 ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32 -; UNROLL-NO-IC-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IV_TRUNC]] +; UNROLL-NO-IC-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV_TRUNC]] ; UNROLL-NO-IC-NEXT: [[ADD:%.*]] = add i32 [[IV_TRUNC]], [[MUL]] -; UNROLL-NO-IC-NEXT: store i32 [[ADD]], i32* [[DST_GEP]], align 4 +; UNROLL-NO-IC-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP53:![0-9]+]] ; UNROLL-NO-IC: exit: @@ -6297,31 +6191,29 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: -; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> -; INTERLEAVE-NEXT: [[TMP2:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 ; INTERLEAVE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT4]], [[TMP0]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT4]], [[TMP1]] -; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[INDEX]], 32 +; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; INTERLEAVE-NEXT: [[TMP5:%.*]] = ashr exact i64 [[SEXT]], 32 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_IND]], [[TMP3]] ; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STEP_ADD]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4 -; INTERLEAVE-NEXT: [[TMP10:%.*]] = 
getelementptr i32, i32* [[TMP6]], i64 4 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4 -; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4 +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; INTERLEAVE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; INTERLEAVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3 ; INTERLEAVE-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -6334,16 +6226,16 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* ; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] -; INTERLEAVE-NEXT: [[LV:%.*]] = load i32, i32* [[SRC]], align 4 +; INTERLEAVE-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4 ; INTERLEAVE-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[SCALAR_RECUR]] ; INTERLEAVE-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1 ; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32 ; INTERLEAVE-NEXT: [[SEXT5:%.*]] = shl i64 [[IV]], 32 -; INTERLEAVE-NEXT: [[TMP13:%.*]] = ashr exact i64 [[SEXT5]], 32 -; INTERLEAVE-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP13]] +; INTERLEAVE-NEXT: [[TMP11:%.*]] = ashr exact i64 [[SEXT5]], 32 +; INTERLEAVE-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP11]] ; INTERLEAVE-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[IV_TRUNC]] -; INTERLEAVE-NEXT: store i32 [[ADD]], i32* [[DST_GEP]], align 4 +; INTERLEAVE-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100 ; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP53:![0-9]+]] ; INTERLEAVE: exit: @@ -6356,14 +6248,14 @@ loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] %trunc.iv = phi i32 [ 0, %entry ], [ %trunc.iv.next, %loop ] %recur = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] - %lv = load i32, i32* %src, align 4 + %lv = load i32, ptr %src, align 4 %mul = mul nsw i32 %lv, %recur %trunc.iv.next = add i32 %trunc.iv, 1 %iv.next = add nuw nsw i64 %iv, 1 %iv.trunc = trunc i64 %iv to i32 - %dst.gep = getelementptr i32, i32* %dst, i32 %iv.trunc + %dst.gep = getelementptr i32, ptr %dst, i32 %iv.trunc %add = add i32 %iv.trunc, %mul - store i32 %add, i32* %dst.gep + store i32 %add, ptr %dst.gep %exitcond = icmp eq i32 %trunc.iv.next, 100 br i1 %exitcond, label %exit, label %loop @@ -6374,7 +6266,7 @@ exit: ; Test case where %iv.2.ext and %iv.2.conv become redundant due to the SCEV ; predicates generated 
for the vector loop. They should be removed in the ; vector loop. -define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, i32* %ptr) { +define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, ptr %ptr) { ; ; CHECK-LABEL: @test_optimized_cast_induction_feeding_first_order_recurrence( ; CHECK-NEXT: entry: @@ -6406,8 +6298,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = mul <2 x i32> , [[DOTSPLAT]] @@ -6422,14 +6314,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; CHECK-NEXT: [[VEC_IND]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP20]], <2 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP22]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1 @@ -6446,8 +6337,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24 ; CHECK-NEXT: [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]] -; CHECK-NEXT: store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]] +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]] ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] @@ -6482,8 +6373,8 
@@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; IND-NEXT: br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -2 -; IND-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; IND-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]] +; IND-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; IND-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[TMP16:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], @@ -6496,13 +6387,12 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; IND-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND]] = phi <2 x i32> [ [[TMP16]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> -; IND-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[INDEX]] -; IND-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>* -; IND-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP20]], align 4 +; IND-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] +; IND-NEXT: store <2 x i32> [[TMP18]], ptr [[TMP19]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]] -; IND-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; IND-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i64 1 @@ -6518,8 +6408,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; IND-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; IND-NEXT: [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24 ; IND-NEXT: [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24 -; IND-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]] -; IND-NEXT: store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4 +; IND-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]] +; IND-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4 ; IND-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]] ; IND-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; IND-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] @@ -6554,8 +6444,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 -; UNROLL-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]] +; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 
[[STEP]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[TMP16:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], @@ -6570,16 +6460,14 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]] ; UNROLL-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> -; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4 -; UNROLL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 2 -; UNROLL-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <2 x i32>* -; UNROLL-NEXT: store <2 x i32> [[TMP19]], <2 x i32>* [[TMP23]], align 4 +; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: store <2 x i32> [[TMP18]], ptr [[TMP20]], align 4 +; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP19]], ptr [[TMP21]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]] -; UNROLL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; UNROLL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i64 1 @@ -6595,8 +6483,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NEXT: [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24 ; UNROLL-NEXT: [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24 -; UNROLL-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]] -; UNROLL-NEXT: store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4 +; UNROLL-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]] +; UNROLL-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4 ; UNROLL-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]] ; UNROLL-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] @@ -6634,8 +6522,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; UNROLL-NO-IC-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]] +; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i32 0 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> 
[[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = mul <2 x i32> , [[DOTSPLAT]] @@ -6653,18 +6541,16 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP19]] -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP20]] -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], <2 x i32>* [[TMP26]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 2 -; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <2 x i32>* -; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP22]], <2 x i32>* [[TMP28]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP19]] +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP20]] +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP25]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP22]], ptr [[TMP26]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]] -; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1 @@ -6681,8 +6567,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24 ; UNROLL-NO-IC-NEXT: [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24 -; UNROLL-NO-IC-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]] -; UNROLL-NO-IC-NEXT: store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4 +; UNROLL-NO-IC-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]] +; UNROLL-NO-IC-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4 ; UNROLL-NO-IC-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]] ; UNROLL-NO-IC-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] @@ -6717,8 +6603,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = 
and i64 [[N]], -8 -; INTERLEAVE-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]] +; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[DOTSPLAT]], @@ -6733,16 +6619,14 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]] ; INTERLEAVE-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> -; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP21]], align 4 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 4 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>* -; INTERLEAVE-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* [[TMP23]], align 4 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] +; INTERLEAVE-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP20]], align 4 +; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], [[DOTSPLAT3]] -; INTERLEAVE-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3 @@ -6758,8 +6642,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-NEXT: [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24 ; INTERLEAVE-NEXT: [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24 -; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]] -; INTERLEAVE-NEXT: store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4 +; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]] +; INTERLEAVE-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4 ; INTERLEAVE-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]] ; INTERLEAVE-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] @@ -6776,8 +6660,8 @@ loop: %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ] %iv.2.ext = shl i32 %iv.2, 24 %iv.2.conv = ashr exact i32 %iv.2.ext, 24 - %gep = getelementptr inbounds i32, i32* %ptr, i64 %iv.1 
- store i32 %for, i32* %gep, align 4 + %gep = getelementptr inbounds i32, ptr %ptr, i64 %iv.1 + store i32 %for, ptr %gep, align 4 %iv.2.next = add nsw i32 %iv.2.conv, %step %iv.1.next = add nuw nsw i64 %iv.1, 1 %exitcond = icmp eq i64 %iv.1.next, %n diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index 70dece3dff4a3..86b0ca2b3650a 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -2,7 +2,7 @@ ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s ; Tests for PR54266. -define i32 @one_direct_branch(i32* %src) { +define i32 @one_direct_branch(ptr %src) { ; CHECK-LABEL: @one_direct_branch( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -11,16 +11,15 @@ define i32 @one_direct_branch(i32* %src) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> , [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> , [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, 1000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -28,8 +27,8 @@ define i32 @one_direct_branch(i32* %src) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[SRC_GEP]], align 4 +; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]] ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: @@ -38,7 +37,7 @@ define i32 @one_direct_branch(i32* %src) { ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ 
[[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[XOR_LCSSA]] ; entry: @@ -46,8 +45,8 @@ entry: loop: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] - %src.gep = getelementptr inbounds i32, i32* %src, i32 %iv - %lv = load i32, i32* %src.gep + %src.gep = getelementptr inbounds i32, ptr %src, i32 %iv + %lv = load i32, ptr %src.gep %xor = xor i32 25500, %lv br label %loop.latch @@ -62,7 +61,7 @@ exit: ret i32 %xor.lcssa } -define i32 @two_direct_branch(i32* %src) { +define i32 @two_direct_branch(ptr %src) { ; CHECK-LABEL: @two_direct_branch( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -71,16 +70,15 @@ define i32 @two_direct_branch(i32* %src) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> , [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> , [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, 1000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -88,8 +86,8 @@ define i32 @two_direct_branch(i32* %src) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[SRC_GEP]], align 4 +; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: @@ -101,7 +99,7 @@ define i32 @two_direct_branch(i32* %src) { ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[XOR_LCSSA]] ; entry: @@ -109,8 +107,8 @@ entry: loop: ; preds = %for.inc3, 
%entry %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] - %src.gep = getelementptr inbounds i32, i32* %src, i32 %iv - %lv = load i32, i32* %src.gep + %src.gep = getelementptr inbounds i32, ptr %src, i32 %iv + %lv = load i32, ptr %src.gep %xor = xor i32 25500, %lv br label %bb @@ -129,7 +127,7 @@ exit: ret i32 %xor.lcssa } -define i32 @cond_branch(i32 %a, i32* %src) { +define i32 @cond_branch(i32 %a, ptr %src) { ; CHECK-LABEL: @cond_branch( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -141,20 +139,19 @@ define i32 @cond_branch(i32 %a, i32* %src) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> , [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> , <4 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> , [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, 1000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -162,8 +159,8 @@ define i32 @cond_branch(i32 %a, i32* %src) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[SRC_GEP]], align 4 +; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[IV]], [[A]] ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[THEN:%.*]] @@ -175,7 +172,7 @@ define i32 @cond_branch(i32 %a, i32* %src) { ; CHECK-NEXT: 
[[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[XOR_LCSSA]] ; entry: @@ -183,8 +180,8 @@ entry: loop: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] - %src.gep = getelementptr inbounds i32, i32* %src, i32 %iv - %lv = load i32, i32* %src.gep + %src.gep = getelementptr inbounds i32, ptr %src, i32 %iv + %lv = load i32, ptr %src.gep %xor = xor i32 25500, %lv %cmp = icmp ne i32 %iv, %a br i1 %cmp, label %loop.latch, label %then @@ -211,13 +208,13 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index 0fc560c5d1293..b3569f387b68c 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -17,54 +17,53 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Instcombine'd version of @inv_val_store_to_inv_address_conditional_diff_values. ; Now the store is no longer of invariant value. ; scalar store the value extracted from the last element of the vector value. 
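; A rough C equivalent of the loop below (an illustrative sketch; the
; variable names are hypothetical, not taken from the test):
;   for (long i = 0; i < n; i++) {
;     int t = b[i];
;     b[i] = (int)n;
;     *a = (t == k) ? (int)n : k;  // variant value, uniform address
;   }
; The vector body stores the splat of (int)n to b[i..i+3] and then
; scalar-stores lane 3 of the select into *a, as the CHECK lines verify.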
-define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) { +define void @inv_val_store_to_inv_address_conditional_diff_values_ic(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values_ic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 -; 
CHECK-NEXT: store i32 [[TMP3]], i32* [[A]], align 4, !alias.scope !3 +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> [[BROADCAST_SPLAT4]], <4 x i32> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] ; CHECK: cond_store: ; CHECK-NEXT: br label [[LATCH]] @@ -72,7 +71,7 @@ define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i6 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ] -; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[STOREVAL]], ptr [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] @@ -87,10 +86,10 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 %cmp = icmp eq i32 %i2, %k - store i32 %ntrunc, i32* %i1 + store i32 %ntrunc, ptr %i1 br i1 %cmp, label %cond_store, label %cond_store_k cond_store: @@ -101,7 +100,7 @@ cond_store_k: latch: %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] - store i32 %storeval, i32* %a + store i32 %storeval, ptr %a %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -118,50 +117,50 @@ for.end: ; preds = %for.body ; into the uniform address within the loop. ; Since the condition and the phi is loop invariant, they are LICM'ed after ; vectorization. 
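; Roughly, in C (sketch only; names are hypothetical, and a dead reload of
; b[i] in the scalar IR is omitted): the selected value is invariant, so it
; can be computed once before the loop:
;   int v = ((int)n == k) ? (int)n : k;
;   for (long i = 0; i < n; i++) { b[i] = (int)n; *a = v; }
; In the vectorized output the select and the lane-3 extract are hoisted
; into vector.ph, matching the LICM behaviour described above.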
-define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) { +define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_inv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]] -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i64 3 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT10]] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i64 3 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], align 4, !alias.scope !8, !noalias !11 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[A]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope !11 ; CHECK-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] ; CHECK: cond_store: ; CHECK-NEXT: br label [[LATCH]] @@ -169,7 +168,7 @@ define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ] -; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[STOREVAL]], ptr [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] @@ -185,9 +184,9 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 - store i32 %ntrunc, i32* %i1 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 + store i32 %ntrunc, ptr %i1 br i1 %cmp, label %cond_store, label %cond_store_k cond_store: @@ -198,7 +197,7 @@ cond_store_k: latch: %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] - store i32 %storeval, i32* %a + store i32 %storeval, ptr %a %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -210,31 +209,31 @@ for.end: ; preds = %for.body ; variant value stored to uniform address tests that the code gen extracts the ; last element from the variant vector and scalar stores it into the uniform ; address. 
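; Approximately, in C (sketch; names are hypothetical):
;   int sum = 0;
;   for (long i = 0; i < n; i++) { *a = b[i]; sum += b[i]; }
;   return sum;
; Only the last store to *a is visible after the loop, so the vector body
; extracts lane 3 of the wide load and scalar-stores it, while the add is
; kept as an ordinary vector reduction.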
-define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { +define i32 @variant_val_store_to_inv_address(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @variant_val_store_to_inv_address( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !15 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope !18, !noalias !15 ; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -242,7 +241,7 @@ define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -251,9 +250,9 @@ define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK: for.body: ; 
CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 -; CHECK-NEXT: store i32 [[I2]], i32* [[A]], align 4 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I1]], align 8 +; CHECK-NEXT: store i32 [[I2]], ptr [[A]], align 4 ; CHECK-NEXT: [[I3]] = add i32 [[I0]], [[I2]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] @@ -273,9 +272,9 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 - store i32 %i2, i32* %a + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 + store i32 %i2, ptr %a %i3 = add i32 %i0, %i2 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index f8e1931b84e25..0d82bea4c1fae 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -13,42 +13,42 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; address. -; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b) +; memory check is found.conflict = b[max(n-1,1)] > a && (ptr a)+1 > (ptr b) -define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) { ; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 ; CHECK-NEXT: br 
label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -57,10 +57,10 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I1]], align 8 ; CHECK-NEXT: [[I3]] = add i32 [[I0]], [[I2]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] @@ -78,10 +78,10 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 %i3 = add i32 %i0, %i2 - store i32 %ntrunc, i32* %a + store i32 %ntrunc, ptr %a %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -91,46 +91,46 @@ for.end: ; preds = %for.body ret i32 %i4 } -define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) { +define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) { ; CHECK-LABEL: @inv_val_store_to_inv_address( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: 
[[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !8, !noalias !11 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP1]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope !11 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4 ; 
CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] @@ -145,10 +145,10 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 - store i32 %ntrunc, i32* %a - store i32 %ntrunc, i32* %i1 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 + store i32 %ntrunc, ptr %a + store i32 %ntrunc, ptr %i1 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -167,78 +167,77 @@ for.end: ; preds = %for.body -define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x 
i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !15, !noalias !18 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK: pred.store.if5: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK: pred.store.if7: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.continue8: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.if9: -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; CHECK: pred.store.if11: -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.if13: -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.continue14: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]] ; CHECK: cond_store: -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 @@ -255,14 +254,14 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 %cmp = icmp eq i32 %i2, %k - store i32 %ntrunc, i32* %i1 + store i32 %ntrunc, ptr %i1 br i1 %cmp, label %cond_store, label %latch cond_store: - store i32 %ntrunc, i32* %a + store i32 %ntrunc, ptr %a br label %latch latch: @@ -279,17 +278,17 @@ for.end: ; preds = %for.body ; else a = k; ; TODO: We could vectorize this once we support multiple uniform stores to the ; same address. 
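; A C sketch of the loop below (illustrative; names are hypothetical):
;   for (long i = 0; i < n; i++) {
;     int t = b[i];
;     b[i] = (int)n;
;     if (t == k) *a = (int)n; else *a = k;
;   }
; Both branches store to the same uniform address *a, which is why the
; CHECK lines show no vector body for this function.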
-define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) { +define void @inv_val_store_to_inv_address_conditional_diff_values(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]] -; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I2]], [[K:%.*]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] ; CHECK: cond_store: ; CHECK-NEXT: br label [[LATCH]] @@ -297,7 +296,7 @@ define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 % ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ [[K]], [[COND_STORE_K]] ], [ [[NTRUNC]], [[COND_STORE]] ] -; CHECK-NEXT: store i32 [[STOREMERGE]], i32* [[A:%.*]], align 4 +; CHECK-NEXT: store i32 [[STOREMERGE]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]] @@ -310,18 +309,18 @@ entry: for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %i1 = getelementptr inbounds i32, i32* %b, i64 %i - %i2 = load i32, i32* %i1, align 8 + %i1 = getelementptr inbounds i32, ptr %b, i64 %i + %i2 = load i32, ptr %i1, align 8 %cmp = icmp eq i32 %i2, %k - store i32 %ntrunc, i32* %i1 + store i32 %ntrunc, ptr %i1 br i1 %cmp, label %cond_store, label %cond_store_k cond_store: - store i32 %ntrunc, i32* %a + store i32 %ntrunc, ptr %a br label %latch cond_store_k: - store i32 %k, i32 * %a + store i32 %k, ptr %a br label %latch latch: @@ -342,7 +341,7 @@ for.end: ; preds = %for.body ; } ; } -define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { +define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { ; CHECK-LABEL: @multiple_uniform_stores( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 @@ -355,17 +354,17 @@ define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonl ; CHECK-NEXT: [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]] ; CHECK-NEXT: br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[VAR1:%.*]], i64 [[INDVARS_IV23]] ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[J_022]] to i64 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load 
i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VAR2:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[ADD]], 1 -; CHECK-NEXT: store i32 [[TMP3]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]] @@ -394,20 +393,20 @@ for.cond1.preheader: ; preds = %entry, %for.inc8 br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8 for.body3.lr.ph: ; preds = %for.cond1.preheader - %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23 + %arrayidx5 = getelementptr inbounds i32, ptr %var1, i64 %indvars.iv23 %0 = zext i32 %j.022 to i64 br label %for.body3 for.body3: ; preds = %for.body3, %for.body3.lr.ph %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] - %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv - %1 = load i32, i32* %arrayidx, align 4 - %2 = load i32, i32* %arrayidx5, align 4 + %arrayidx = getelementptr inbounds i32, ptr %var2, i64 %indvars.iv + %1 = load i32, ptr %arrayidx, align 4 + %2 = load i32, ptr %arrayidx5, align 4 %add = add nsw i32 %2, %1 - store i32 %add, i32* %arrayidx5, align 4 - %3 = load i32, i32* %arrayidx5, align 4 + store i32 %add, ptr %arrayidx5, align 4 + %3 = load i32, ptr %arrayidx5, align 4 %4 = add nsw i32 %3, 1 - store i32 %4, i32* %arrayidx5, align 4 + store i32 %4, ptr %arrayidx5, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %itr @@ -426,7 +425,7 @@ for.end10: ; preds = %for.inc8, %entry ; second uniform store to the same address is conditional. ; we do not vectorize this. 
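; Roughly, in C (sketch; names and loop bounds are inferred, not taken
; verbatim from the test):
;   for (unsigned j = 0; j < itr; j++)
;     for (unsigned i = j; i < itr; i++) {
;       var1[j] += var2[i];
;       if (var1[j] > 42) var1[j] += 1;  // conditional second store
;     }
; The unconditional store plus the conditional one to the same address
; keep this loop scalar, as the CHECK lines show.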
-define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { +define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 { ; CHECK-LABEL: @multiple_uniform_stores_conditional( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 @@ -439,14 +438,14 @@ define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocap ; CHECK-NEXT: [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]] ; CHECK-NEXT: br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[VAR1:%.*]], i64 [[INDVARS_IV23]] ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[J_022]] to i64 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VAR2:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[ADD]], 42 ; CHECK-NEXT: br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[LATCH]] @@ -455,7 +454,7 @@ define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocap ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ [[TMP4]], [[COND_STORE]] ], [ [[ADD]], [[FOR_BODY3]] ] -; CHECK-NEXT: store i32 [[STOREMERGE]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: store i32 [[STOREMERGE]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]] @@ -484,24 +483,24 @@ for.cond1.preheader: ; preds = %entry, %for.inc8 br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8 for.body3.lr.ph: ; preds = %for.cond1.preheader - %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23 + %arrayidx5 = getelementptr inbounds i32, ptr %var1, i64 %indvars.iv23 %0 = zext i32 %j.022 to i64 br label %for.body3 for.body3: ; preds = %for.body3, %for.body3.lr.ph %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ] - %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv - %1 = load i32, i32* %arrayidx, align 4 - %2 = load i32, i32* %arrayidx5, align 4 + %arrayidx = getelementptr inbounds i32, ptr %var2, i64 %indvars.iv + %1 = load i32, ptr %arrayidx, align 4 + %2 = load i32, ptr %arrayidx5, align 4 %add = add nsw i32 %2, %1 - store i32 %add, i32* %arrayidx5, align 4 - %3 = load i32, i32* %arrayidx5, align 4 + store i32 %add, ptr %arrayidx5, align 4 + %3 = load i32, ptr %arrayidx5, align 4 %4 = add nsw i32 %3, 1 %5 = icmp ugt i32 %3, 42 br i1 %5, label %cond_store, label %latch cond_store: - store i32 %4, i32* %arrayidx5, align 4 + store i32 %4, ptr %arrayidx5, align 4 br label %latch latch: @@ -526,10 +525,10 @@ 
for.end10: ; preds = %for.inc8, %entry ; PR39653 ; Note: %i10 could be replaced by phi(%arg4, %i12), a potentially vectorizable ; 1st-order-recurrence -define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) { +define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, ptr %arg3, i32 %arg4, i64 %arg5) { ; CHECK-LABEL: @unsafe_dep_uniform_load_store( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i16, i16* [[ARG3:%.*]], i64 [[ARG5:%.*]] +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i16, ptr [[ARG3:%.*]], i64 [[ARG5:%.*]] ; CHECK-NEXT: br label [[BB7:%.*]] ; CHECK: bb7: ; CHECK-NEXT: [[I121:%.*]] = phi i32 [ [[ARG4:%.*]], [[BB:%.*]] ], [ [[I12:%.*]], [[BB7]] ] @@ -542,13 +541,13 @@ define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* ; CHECK-NEXT: [[I15:%.*]] = trunc i64 [[I8]] to i32 ; CHECK-NEXT: [[I16:%.*]] = add i32 [[I15]], [[ARG:%.*]] ; CHECK-NEXT: [[I17:%.*]] = zext i32 [[I16]] to i64 -; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds i16, i16* [[I6]], i64 [[I17]] -; CHECK-NEXT: store i16 [[I14]], i16* [[I18]], align 2 +; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds i16, ptr [[I6]], i64 [[I17]] +; CHECK-NEXT: store i16 [[I14]], ptr [[I18]], align 2 ; CHECK-NEXT: [[I19:%.*]] = add i32 [[I13]], [[I9]] ; CHECK-NEXT: [[I20:%.*]] = trunc i32 [[I19]] to i16 ; CHECK-NEXT: [[I21:%.*]] = and i16 [[I20]], 255 -; CHECK-NEXT: [[I22:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i64 [[I17]] -; CHECK-NEXT: store i16 [[I21]], i16* [[I22]], align 2 +; CHECK-NEXT: [[I22:%.*]] = getelementptr inbounds i16, ptr [[ARG3]], i64 [[I17]] +; CHECK-NEXT: store i16 [[I21]], ptr [[I22]], align 2 ; CHECK-NEXT: [[I23]] = add nsw i32 [[I9]], 1 ; CHECK-NEXT: [[I24]] = add nuw nsw i64 [[I8]], 1 ; CHECK-NEXT: [[I25:%.*]] = icmp eq i64 [[I24]], [[ARG2:%.*]] @@ -558,14 +557,14 @@ define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* ; bb: %i = alloca i32 - store i32 %arg4, i32* %i - %i6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5 + store i32 %arg4, ptr %i + %i6 = getelementptr inbounds i16, ptr %arg3, i64 %arg5 br label %bb7 bb7: %i8 = phi i64 [ 0, %bb ], [ %i24, %bb7 ] %i9 = phi i32 [ %arg1, %bb ], [ %i23, %bb7 ] - %i10 = load i32, i32* %i + %i10 = load i32, ptr %i %i11 = mul nsw i32 %i9, %i10 %i12 = srem i32 %i11, 65536 %i13 = add nsw i32 %i12, %i9 @@ -573,17 +572,17 @@ bb7: %i15 = trunc i64 %i8 to i32 %i16 = add i32 %arg, %i15 %i17 = zext i32 %i16 to i64 - %i18 = getelementptr inbounds i16, i16* %i6, i64 %i17 - store i16 %i14, i16* %i18, align 2 + %i18 = getelementptr inbounds i16, ptr %i6, i64 %i17 + store i16 %i14, ptr %i18, align 2 %i19 = add i32 %i13, %i9 %i20 = trunc i32 %i19 to i16 %i21 = and i16 %i20, 255 - %i22 = getelementptr inbounds i16, i16* %arg3, i64 %i17 - store i16 %i21, i16* %i22, align 2 + %i22 = getelementptr inbounds i16, ptr %arg3, i64 %i17 + store i16 %i21, ptr %i22, align 2 %i23 = add nsw i32 %i9, 1 %i24 = add nuw nsw i64 %i8, 1 %i25 = icmp eq i64 %i24, %arg2 - store i32 %i12, i32* %i + store i32 %i12, ptr %i br i1 %i25, label %bb26, label %bb7 bb26: diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index fe60ed251c524..e1816b8d4d0ff 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -define void @bottom_tested(i16* %p, i32 %n) { 
+define void @bottom_tested(ptr %p, i32 %n) { ; CHECK-LABEL: @bottom_tested( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -19,13 +19,12 @@ define void @bottom_tested(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]] @@ -35,8 +34,8 @@ define void @bottom_tested(i16* %p, i32 %n) { ; CHECK: for.cond: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]] @@ -65,16 +64,16 @@ define void @bottom_tested(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TAILFOLD: pred.store.if: ; TAILFOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; TAILFOLD-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; TAILFOLD-NEXT: store i16 0, i16* [[TMP5]], align 4 +; TAILFOLD-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; TAILFOLD-NEXT: store i16 0, ptr [[TMP5]], align 4 ; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]] ; TAILFOLD: pred.store.continue: ; TAILFOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; TAILFOLD-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; TAILFOLD: pred.store.if1: ; TAILFOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; TAILFOLD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP7]] -; TAILFOLD-NEXT: store i16 0, i16* [[TMP8]], align 4 +; TAILFOLD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP7]] +; TAILFOLD-NEXT: store i16 0, ptr [[TMP8]], align 4 ; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE2]] ; TAILFOLD: pred.store.continue2: ; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i32 
[[INDEX]], 2 @@ -89,8 +88,8 @@ define void @bottom_tested(i16* %p, i32 %n) { ; TAILFOLD: for.cond: ; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ] ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]] @@ -103,8 +102,8 @@ entry: for.cond: %i = phi i32 [ 0, %entry ], [ %inc, %for.cond ] %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp = icmp slt i32 %i, %n br i1 %cmp, label %for.cond, label %if.end @@ -113,7 +112,7 @@ if.end: ret void } -define void @early_exit(i16* %p, i32 %n) { +define void @early_exit(ptr %p, i32 %n) { ; CHECK-LABEL: @early_exit( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -130,13 +129,12 @@ define void @early_exit(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -148,8 +146,8 @@ define void @early_exit(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: if.end: @@ -164,8 +162,8 @@ define void @early_exit(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 
[[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: br label [[FOR_COND]] ; TAILFOLD: if.end: @@ -181,8 +179,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 br label %for.cond @@ -190,7 +188,7 @@ if.end: ret void } -define i32 @early_exit_with_live_out(i32* %ptr) { +define i32 @early_exit_with_live_out(ptr %ptr) { ; CHECK-LABEL: @early_exit_with_live_out( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -199,15 +197,13 @@ define i32 @early_exit_with_live_out(i32* %ptr) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: store <2 x i32> , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -215,13 +211,13 @@ define i32 @early_exit_with_live_out(i32* %ptr) { ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[GEP]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: store i32 10, i32* [[GEP]], align 4 +; CHECK-NEXT: store i32 10, ptr [[GEP]], align 4 ; CHECK-NEXT: br label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[L]], [[LOOP_HEADER]] ] @@ -232,13 +228,13 @@ define i32 @early_exit_with_live_out(i32* %ptr) { ; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]] ; TAILFOLD: loop.header: ; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, 
i32* [[PTR:%.*]], i64 [[IV]] -; TAILFOLD-NEXT: [[L:%.*]] = load i32, i32* [[GEP]], align 4 +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 ; TAILFOLD-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TAILFOLD-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; TAILFOLD-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; TAILFOLD: loop.latch: -; TAILFOLD-NEXT: store i32 10, i32* [[GEP]], align 4 +; TAILFOLD-NEXT: store i32 10, ptr [[GEP]], align 4 ; TAILFOLD-NEXT: br label [[LOOP_HEADER]] ; TAILFOLD: exit: ; TAILFOLD-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[L]], [[LOOP_HEADER]] ] @@ -249,14 +245,14 @@ entry: loop.header: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %gep = getelementptr i32, i32* %ptr, i64 %iv - %l = load i32, i32* %gep + %gep = getelementptr i32, ptr %ptr, i64 %iv + %l = load i32, ptr %gep %iv.next = add nuw nsw i64 %iv, 1 %ec = icmp eq i64 %iv.next, 1000 br i1 %ec, label %exit, label %loop.latch loop.latch: - store i32 10, i32* %gep + store i32 10, ptr %gep br label %loop.header exit: @@ -266,7 +262,7 @@ exit: ; Same as early_exit, but with optsize to prevent the use of ; a scalar epilogue. -- Can't vectorize this in either case. -define void @optsize(i16* %p, i32 %n) optsize { +define void @optsize(ptr %p, i32 %n) optsize { ; CHECK-LABEL: @optsize( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_COND:%.*]] @@ -276,8 +272,8 @@ define void @optsize(i16* %p, i32 %n) optsize { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: if.end: @@ -292,8 +288,8 @@ define void @optsize(i16* %p, i32 %n) optsize { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: br label [[FOR_COND]] ; TAILFOLD: if.end: @@ -309,8 +305,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 br label %for.cond @@ -320,7 +316,7 @@ if.end: ; multiple exit - no values inside the loop used outside -define void @multiple_unique_exit(i16* %p, i32 %n) { +define void @multiple_unique_exit(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_unique_exit( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -338,13 +334,12 @@ define void @multiple_unique_exit(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -356,8 +351,8 @@ define void @multiple_unique_exit(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP9:![0-9]+]] @@ -373,8 +368,8 @@ define void @multiple_unique_exit(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]] @@ -391,8 +386,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp2 = icmp slt i32 %i, 2096 br i1 %cmp2, label %for.cond, label %if.end @@ -402,7 +397,7 @@ if.end: } ; multiple exit - with an lcssa phi -define i32 @multiple_unique_exit2(i16* %p, i32 %n) { +define i32 @multiple_unique_exit2(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_unique_exit2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -420,13 +415,12 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, 
<2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -438,8 +432,8 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP11:![0-9]+]] @@ -456,8 +450,8 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]] @@ -475,8 +469,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp2 = icmp slt i32 %i, 2096 br i1 %cmp2, label %for.cond, label %if.end @@ -486,7 +480,7 @@ if.end: } ; multiple exit w/a non lcssa phi -define i32 @multiple_unique_exit3(i16* %p, i32 %n) { +define i32 @multiple_unique_exit3(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_unique_exit3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -504,13 +498,12 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr 
[[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -522,8 +515,8 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP13:![0-9]+]] @@ -540,8 +533,8 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]] @@ -559,8 +552,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp2 = icmp slt i32 %i, 2096 br i1 %cmp2, label %for.cond, label %if.end @@ -571,7 +564,7 @@ if.end: } ; multiple exits w/distinct target blocks -define i32 @multiple_exit_blocks(i16* %p, i32 %n) { +define i32 @multiple_exit_blocks(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_exit_blocks( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -589,13 +582,12 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -607,8 +599,8 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]], !llvm.loop [[LOOP15:![0-9]+]] @@ -626,8 +618,8 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]] @@ -646,8 +638,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp2 = icmp slt i32 %i, 2096 br i1 %cmp2, label %for.cond, label %if.end2 @@ -660,7 +652,7 @@ if.end2: } ; LCSSA, common value each exit -define i32 @multiple_exit_blocks2(i16* %p, i32 %n) { +define i32 @multiple_exit_blocks2(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_exit_blocks2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -678,13 +670,12 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br 
label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -696,8 +687,8 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]], !llvm.loop [[LOOP17:![0-9]+]] @@ -717,8 +708,8 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]] @@ -739,8 +730,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp2 = icmp slt i32 %i, 2096 br i1 %cmp2, label %for.cond, label %if.end2 @@ -753,7 +744,7 @@ if.end2: } ; LCSSA, distinct value each exit -define i32 @multiple_exit_blocks3(i16* %p, i32 %n) { +define i32 @multiple_exit_blocks3(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_exit_blocks3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0) @@ -771,13 +762,12 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -789,8 +779,8 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext 
i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]], !llvm.loop [[LOOP19:![0-9]+]] @@ -810,8 +800,8 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 ; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]] @@ -832,8 +822,8 @@ for.cond: for.body: %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 %cmp2 = icmp slt i32 %i, 2096 br i1 %cmp2, label %for.cond, label %if.end2 @@ -847,15 +837,15 @@ if.end2: ; unique exit case but with a switch as two edges between the same pair of ; blocks is an often missed edge case -define i32 @multiple_exit_switch(i16* %p, i32 %n) { +define i32 @multiple_exit_switch(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_exit_switch( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: switch i32 [[I]], label [[FOR_COND]] [ ; CHECK-NEXT: i32 2096, label [[IF_END:%.*]] @@ -871,8 +861,8 @@ define i32 @multiple_exit_switch(i16* %p, i32 %n) { ; TAILFOLD: for.cond: ; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ] ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: switch i32 [[I]], label [[FOR_COND]] [ ; TAILFOLD-NEXT: i32 2096, label [[IF_END:%.*]] @@ -888,8 +878,8 @@ entry: for.cond: %i = phi i32 [ 0, %entry ], [ %inc, %for.cond ] %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 switch i32 %i, label %for.cond [ i32 2096, label %if.end @@ -902,15 +892,15 @@ if.end: ; multiple exit case but with a switch as multiple exiting edges from ; a single 
block is a commonly missed edge case -define i32 @multiple_exit_switch2(i16* %p, i32 %n) { +define i32 @multiple_exit_switch2(ptr %p, i32 %n) { ; CHECK-LABEL: @multiple_exit_switch2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: switch i32 [[I]], label [[FOR_COND]] [ ; CHECK-NEXT: i32 2096, label [[IF_END:%.*]] @@ -927,8 +917,8 @@ define i32 @multiple_exit_switch2(i16* %p, i32 %n) { ; TAILFOLD: for.cond: ; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ] ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: switch i32 [[I]], label [[FOR_COND]] [ ; TAILFOLD-NEXT: i32 2096, label [[IF_END:%.*]] @@ -945,8 +935,8 @@ entry: for.cond: %i = phi i32 [ 0, %entry ], [ %inc, %for.cond ] %iprom = sext i32 %i to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %inc = add nsw i32 %i, 1 switch i32 %i, label %for.cond [ i32 2096, label %if.end @@ -960,7 +950,7 @@ if.end2: ret i32 1 } -define i32 @multiple_latch1(i16* %p) { +define i32 @multiple_latch1(ptr %p) { ; CHECK-LABEL: @multiple_latch1( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -971,8 +961,8 @@ define i32 @multiple_latch1(i16* %p) { ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_BACKEDGE]], label [[FOR_SECOND:%.*]] ; CHECK: for.second: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I_02]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16 ; CHECK-NEXT: br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.backedge: @@ -990,8 +980,8 @@ define i32 @multiple_latch1(i16* %p) { ; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY_BACKEDGE]], label [[FOR_SECOND:%.*]] ; TAILFOLD: for.second: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I_02]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16 ; TAILFOLD-NEXT: br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]] ; TAILFOLD: for.body.backedge: @@ -1010,8 +1000,8 @@ for.body: for.second: %iprom = sext i32 %i.02 to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = 
getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %cmps = icmp sgt i32 %inc, 16 br i1 %cmps, label %for.body.backedge, label %for.end @@ -1025,7 +1015,7 @@ for.end: ; two back branches - loop simplify will convert this to the same form ; as the previous case before the vectorizer sees it, but show that here. -define i32 @multiple_latch2(i16* %p) { +define i32 @multiple_latch2(ptr %p) { ; CHECK-LABEL: @multiple_latch2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -1038,8 +1028,8 @@ define i32 @multiple_latch2(i16* %p) { ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.second: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I_02]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; CHECK-NEXT: store i16 0, i16* [[B]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16 ; CHECK-NEXT: br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]] ; CHECK: for.end: @@ -1057,8 +1047,8 @@ define i32 @multiple_latch2(i16* %p) { ; TAILFOLD-NEXT: br label [[FOR_BODY]] ; TAILFOLD: for.second: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I_02]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16 ; TAILFOLD-NEXT: br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]] ; TAILFOLD: for.end: @@ -1075,8 +1065,8 @@ for.body: for.second: %iprom = sext i32 %i.02 to i64 - %b = getelementptr inbounds i16, i16* %p, i64 %iprom - store i16 0, i16* %b, align 4 + %b = getelementptr inbounds i16, ptr %p, i64 %iprom + store i16 0, ptr %b, align 4 %cmps = icmp sgt i32 %inc, 16 br i1 %cmps, label %for.body, label %for.end @@ -1088,7 +1078,7 @@ for.end: ; Check interaction between block predication and early exits. We need the ; condition on the early exit to remain dead (i.e. not be used when forming ; the predicate mask).
-define void @scalar_predication(float* %addr) { +define void @scalar_predication(ptr %addr) { ; CHECK-LABEL: @scalar_predication( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -1097,30 +1087,29 @@ define void @scalar_predication(float* %addr) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp oeq <2 x float> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[ADDR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP0]] -; CHECK-NEXT: store float 1.000000e+01, float* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[ADDR]], i64 [[TMP0]] +; CHECK-NEXT: store float 1.000000e+01, ptr [[TMP6]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP9]] -; CHECK-NEXT: store float 1.000000e+01, float* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[ADDR]], i64 [[TMP8]] +; CHECK-NEXT: store float 1.000000e+01, ptr [[TMP9]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1128,15 +1117,15 @@ define void @scalar_predication(float* %addr) { ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], 
[[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, float* [[ADDR]], i64 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[ADDR]], i64 [[IV]] ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: -; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[GEP]], align 4 -; CHECK-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP12]], 0.000000e+00 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP11]], 0.000000e+00 ; CHECK-NEXT: br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]] ; CHECK: then: -; CHECK-NEXT: store float 1.000000e+01, float* [[GEP]], align 4 +; CHECK-NEXT: store float 1.000000e+01, ptr [[GEP]], align 4 ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -1149,15 +1138,15 @@ define void @scalar_predication(float* %addr) { ; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]] ; TAILFOLD: loop.header: ; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[ADDR:%.*]], i64 [[IV]] ; TAILFOLD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 ; TAILFOLD-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]] ; TAILFOLD: loop.body: -; TAILFOLD-NEXT: [[TMP0:%.*]] = load float, float* [[GEP]], align 4 +; TAILFOLD-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4 ; TAILFOLD-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 ; TAILFOLD-NEXT: br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]] ; TAILFOLD: then: -; TAILFOLD-NEXT: store float 1.000000e+01, float* [[GEP]], align 4 +; TAILFOLD-NEXT: store float 1.000000e+01, ptr [[GEP]], align 4 ; TAILFOLD-NEXT: br label [[LOOP_LATCH]] ; TAILFOLD: loop.latch: ; TAILFOLD-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -1170,17 +1159,17 @@ entry: loop.header: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %gep = getelementptr float, float* %addr, i64 %iv + %gep = getelementptr float, ptr %addr, i64 %iv %exitcond.not = icmp eq i64 %iv, 200 br i1 %exitcond.not, label %exit, label %loop.body loop.body: - %0 = load float, float* %gep, align 4 + %0 = load float, ptr %gep, align 4 %pred = fcmp oeq float %0, 0.0 br i1 %pred, label %loop.latch, label %then then: - store float 10.0, float* %gep, align 4 + store float 10.0, ptr %gep, align 4 br label %loop.latch loop.latch: @@ -1191,7 +1180,7 @@ exit: ret void } -define i32 @me_reduction(i32* %addr) { +define i32 @me_reduction(ptr %addr) { ; CHECK-LABEL: @me_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -1199,32 +1188,31 @@ define i32 @me_reduction(i32* %addr) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0 -; 
CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[ADDR]], i64 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i64 [[IV]] ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[GEP]], align 4 -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP6]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400 ; CHECK-NEXT: br i1 [[EXITCOND2_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]] @@ -1238,11 +1226,11 @@ define i32 @me_reduction(i32* %addr) { ; TAILFOLD: loop.header: ; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; TAILFOLD-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ] -; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[IV]] ; TAILFOLD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 ; TAILFOLD-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; TAILFOLD: loop.latch: -; TAILFOLD-NEXT: [[TMP0:%.*]] = load i32, i32* [[GEP]], align 4 +; TAILFOLD-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4 ; TAILFOLD-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP0]] ; TAILFOLD-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TAILFOLD-NEXT: [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400 @@ -1257,12 +1245,12 @@ entry: loop.header: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] %accum = phi 
i32 [0, %entry], [%accum.next, %loop.latch] - %gep = getelementptr i32, i32* %addr, i64 %iv + %gep = getelementptr i32, ptr %addr, i64 %iv %exitcond.not = icmp eq i64 %iv, 200 br i1 %exitcond.not, label %exit, label %loop.latch loop.latch: - %0 = load i32, i32* %gep, align 4 + %0 = load i32, ptr %gep, align 4 %accum.next = add i32 %accum, %0 %iv.next = add nuw nsw i64 %iv, 1 %exitcond2.not = icmp eq i64 %iv, 400 @@ -1277,18 +1265,18 @@ exit: ; this. There's an analogous single exit case where we extract the N-1 ; value of the reduction that we can also handle. If we fix the latter, the ; multiple exit case probably falls out. -define i32 @me_reduction2(i32* %addr) { +define i32 @me_reduction2(ptr %addr) { ; CHECK-LABEL: @me_reduction2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[IV]] ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[GEP]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP0]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: br label [[LOOP_HEADER]] @@ -1302,11 +1290,11 @@ define i32 @me_reduction2(i32* %addr) { ; TAILFOLD: loop.header: ; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; TAILFOLD-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ] -; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[IV]] ; TAILFOLD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 ; TAILFOLD-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; TAILFOLD: loop.latch: -; TAILFOLD-NEXT: [[TMP0:%.*]] = load i32, i32* [[GEP]], align 4 +; TAILFOLD-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4 ; TAILFOLD-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP0]] ; TAILFOLD-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TAILFOLD-NEXT: br label [[LOOP_HEADER]] @@ -1320,12 +1308,12 @@ entry: loop.header: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] %accum = phi i32 [0, %entry], [%accum.next, %loop.latch] - %gep = getelementptr i32, i32* %addr, i64 %iv + %gep = getelementptr i32, ptr %addr, i64 %iv %exitcond.not = icmp eq i64 %iv, 200 br i1 %exitcond.not, label %exit, label %loop.latch loop.latch: - %0 = load i32, i32* %gep, align 4 + %0 = load i32, ptr %gep, align 4 %accum.next = add i32 %accum, %0 %iv.next = add nuw nsw i64 %iv, 1 br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll index 41680c933a7f6..f5a174c2e01a1 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -27,34 +27,35 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } -define void @Test(%struct.s*
nocapture %obj, i64 %z) #0 { +define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK-LABEL: @Test( -; CHECK-NEXT: [[OBJ4:%.*]] = bitcast %struct.s* [[OBJ:%.*]] to i8* -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr [[STRUCT_S:%.*]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[Z:%.*]] -; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[Z:%.*]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 256 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[OBJ:%.*]], i64 [[TMP1]] ; CHECK-NEXT: br label [[DOTOUTER_PREHEADER:%.*]] ; CHECK: .outer.preheader: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[I_NEXT:%.*]], [[DOTOUTER:%.*]] ] -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 0 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[Z]] -; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* -; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]] -; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[I]], 1 -; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[TMP1]] -; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast i32* [[SCEVGEP9]] to i8* -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[I]], 7 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 256 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[I]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 128 +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], 132 +; CHECK-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[OBJ]], i64 0, i32 1, i64 [[I]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[Z]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OBJ4]], [[SCEVGEP23]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[OBJ]], [[UGLYGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP910]] -; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult i8* [[SCEVGEP78]], [[SCEVGEP23]] -; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP4]] +; CHECK-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[UGLYGEP3]], [[UGLYGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; 
CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4 @@ -62,25 +63,22 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !0 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !3 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope !3 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !5, !noalias !7 -; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP8]], [[WIDE_LOAD14]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD8]] +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP16]], align 4, !alias.scope !5, !noalias !7 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]] @@ -95,14 +93,14 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { ; CHECK-NEXT: br i1 [[EXITCOND_OUTER]], label [[DOTEXIT:%.*]], label 
[[DOTOUTER_PREHEADER]] ; CHECK: .inner: ; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[DOTINNER]] ] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add nsw i32 [[TMP18]], [[TMP20]] -; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP19]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[J]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = add nsw i32 [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = add nsw i32 [[TMP22]], [[TMP24]] +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP23]], align 4 ; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1 ; CHECK-NEXT: [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]] ; CHECK-NEXT: br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop [[LOOP10:![0-9]+]] @@ -112,7 +110,7 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { .outer.preheader: %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ] - %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i + %1 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 1, i64 %i br label %.inner .exit: @@ -125,14 +123,14 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { .inner: %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ] - %2 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 0, i64 %j - %3 = load i32, i32* %2 - %4 = load i32, i32* %1 + %2 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 0, i64 %j + %3 = load i32, ptr %2 + %4 = load i32, ptr %1 %5 = add nsw i32 %4, %3 - %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j - %7 = load i32, i32* %6 + %6 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 2, i64 %i, i64 %j + %7 = load i32, ptr %6 %8 = add nsw i32 %5, %7 - store i32 %8, i32* %6 + store i32 %8, ptr %6 %j.next = add nuw nsw i64 %j, 1 %exitcond.inner = icmp eq i64 %j.next, %z br i1 %exitcond.inner, label %.outer, label %.inner diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll index 708cf790f8c86..4a8fda99ef486 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:e-i64:64-n32:64" ; Some limited forms of live-outs (non-reduction, non-recurrences) are supported. 
-define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) { +define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-LABEL: @f1( ; VF-TWO-CHECK-NEXT: entry: ; VF-TWO-CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -29,20 +29,18 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) { ; VF-TWO-CHECK: vector.body: ; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF-TWO-CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; VF-TWO-CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; VF-TWO-CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; VF-TWO-CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF-TWO-CHECK: middle.block: -; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-TWO-CHECK: vec.epilog.iter.check: @@ -55,22 +53,20 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF3]] ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-TWO-CHECK: vec.epilog.vector.body: -; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] -; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* -; VF-TWO-CHECK-NEXT: 
[[WIDE_LOAD7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]] -; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0 -; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>* -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4 -; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]] -; VF-TWO-CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 2 -; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; VF-TWO-CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0 +; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4 +; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]] +; VF-TWO-CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 +; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.middle.block: -; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 ; VF-TWO-CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: @@ -78,16 +74,16 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-TWO-CHECK: for.body: ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; VF-TWO-CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; VF-TWO-CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VF-TWO-CHECK-NEXT: 
[[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; VF-TWO-CHECK: for.end.loopexit: -; VF-TWO-CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; VF-TWO-CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_END]] ; VF-TWO-CHECK: for.end: ; VF-TWO-CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ] @@ -103,10 +99,10 @@ for.body.preheader: ; preds = %entry for.body: ; preds = %for.body.preheader, %for.body %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv - %1 = load i32, i32* %arrayidx2, align 4 + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 %add = add nsw i32 %0, %1 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 3f965d7eb09ed..93cf55b3cb006 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -12,7 +12,7 @@ target datalayout = "e-m:e-i64:64-n32:64-v128:128:128" -define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) { +define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 signext %N) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -31,22 +31,19 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[BB:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[CC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[AA:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr 
inbounds float, ptr [[BB:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -60,24 +57,21 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias ; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF3]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP19]], <4 x float>* [[TMP22]], align 4 -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-NEXT: 
[[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -86,13 +80,13 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP24]], [[TMP25]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] @@ -110,13 +104,13 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: for.body: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB:%.*]], i64 [[INDVARS_IV]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC:%.*]], i64 [[INDVARS_IV]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], 
align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[BB:%.*]], i64 [[INDVARS_IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[INDVARS_IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], [[TMP1]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA:%.*]], i64 [[INDVARS_IV]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store float [[ADD]], float* [[ARRAYIDX4]], align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[INDVARS_IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] @@ -135,13 +129,13 @@ for.body.preheader: ; preds = %entry for.body: ; preds = %for.body.preheader, %for.body %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv - %0 = load float, float* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv - %1 = load float, float* %arrayidx2, align 4 + %arrayidx = getelementptr inbounds float, ptr %bb, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %cc, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 %add = fadd fast float %0, %1 - %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv - store float %add, float* %arrayidx4, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %aa, i64 %indvars.iv + store float %add, ptr %arrayidx4, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond, label %for.body, label %for.end.loopexit @@ -153,7 +147,7 @@ for.end: ; preds = %for.end.loopexit, % ret void } -define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signext %n) { +define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1 @@ -190,25 +184,23 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe ; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], -1 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[N]] ; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 -3 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] 
= getelementptr inbounds float, ptr [[TMP14]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = fadd fast <4 x float> [[REVERSE]], -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* [[TMP20]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[REVERSE]], +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[TMP18]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END4:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] @@ -219,45 +211,43 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe ; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC3]] to i32 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = trunc i64 [[OFFSET_IDX9]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX8]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX9]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], -1 -; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP24]], [[N]] -; CHECK-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP27]], i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP28]], i32 -3 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP30]], align 4 -; CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = fadd fast <4 x float> [[REVERSE11]], -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <4 x float>* -; 
CHECK-NEXT: store <4 x float> [[TMP31]], <4 x float>* [[TMP34]], align 4 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[OFFSET_IDX9]], 4 -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = trunc i64 [[INDEX7]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX8]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX7]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], -1 +; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], [[N]] +; CHECK-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4 +; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x float> [[WIDE_LOAD9]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <4 x float> [[REVERSE10]], +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP28]], ptr [[TMP30]], align 4 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[I_014]], -1 -; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP36]], [[N]] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[I_014]], -1 +; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP32]], [[N]] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CONV3:%.*]] = fadd fast float 
[[TMP37]], 1.000000e+00 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP33]], 1.000000e+00 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[CONV3]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] @@ -281,11 +271,11 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = xor i32 [[I_014]], -1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[SUB2:%.*]] = add i32 [[TMP1]], [[N]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IDXPROM]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP2]], 1.000000e+00 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store float [[CONV3]], ptr [[ARRAYIDX5]], align 4 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] @@ -310,11 +300,11 @@ for.body: ; preds = %for.body.preheader, %1 = xor i32 %i.014, -1 %sub2 = add i32 %1, %n %idxprom = sext i32 %sub2 to i64 - %arrayidx = getelementptr inbounds float, float* %B, i64 %idxprom - %2 = load float, float* %arrayidx, align 4 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom + %2 = load float, ptr %arrayidx, align 4 %conv3 = fadd fast float %2, 1.000000e+00 - %arrayidx5 = getelementptr inbounds float, float* %A, i64 %indvars.iv - store float %conv3, float* %arrayidx5, align 4 + %arrayidx5 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + store float %conv3, ptr %arrayidx5, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %inc = add nuw nsw i32 %i.014, 1 %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count @@ -327,7 +317,7 @@ for.end: ; preds = %for.end.loopexit, % ret i32 0 } -define void @f3(i8* noalias %A, i64 %n) { +define void @f3(ptr noalias %A, i64 %n) { ; CHECK-LABEL: @f3( ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 @@ -342,13 +332,12 @@ define void @f3(i8* noalias %A, i64 %n) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> , <4 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i8> , ptr [[TMP2]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -362,15 +351,14 @@ define void @f3(i8* noalias %A, i64 %n) { ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> , <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <4 x i8> , ptr [[TMP6]], align 1 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -379,8 +367,8 @@ define void @f3(i8* noalias %A, i64 %n) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]] -; CHECK-NEXT: store i8 1, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; 
CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP10:![0-9]+]] @@ -403,13 +391,12 @@ define void @f3(i8* noalias %A, i64 %n) { ; CHECK-PROFITABLE-BY-DEFAULT: vector.body: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> , <4 x i8>* [[TMP3]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> , ptr [[TMP2]], align 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-PROFITABLE-BY-DEFAULT: middle.block: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -423,15 +410,14 @@ define void @f3(i8* noalias %A, i64 %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>* -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> , <2 x i8>* [[TMP8]], align 1 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 2 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP4:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr 
[[TMP5]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> , ptr [[TMP6]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -440,8 +426,8 @@ define void @f3(i8* noalias %A, i64 %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: for.body: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 1, i8* [[ARRAYIDX]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] @@ -455,8 +441,8 @@ entry: for.body: ; preds = %for.body.preheader, %for.body %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv - store i8 1, i8* %arrayidx, align 1 + %arrayidx = getelementptr inbounds i8, ptr %A, i64 %iv + store i8 1, ptr %arrayidx, align 1 %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp ne i64 %iv.next, %n br i1 %exitcond, label %for.body, label %for.end.loopexit @@ -469,7 +455,7 @@ for.end: ; preds = %for.end.loopexit, % } ; Test case for PR54745. 
-define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst) { +define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst) { ; CHECK-LABEL: @induction_resume_value_requires_non_trivial_scev_expansion( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] @@ -499,14 +485,13 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[VEC_IND]], <4 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <4 x i8> [[VEC_IND]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT3]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 84, 84 ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]] @@ -518,8 +503,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] ; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ] ; CHECK-NEXT: [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]] -; CHECK-NEXT: store i8 [[IV_2]], i8* [[GEP_DST]], align 1 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 [[IV_2]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 83 ; CHECK-NEXT: br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP12:![0-9]+]] @@ -558,14 +543,13 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP4]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[VEC_IND]], <4 x i8>* [[TMP7]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP4]] 
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-PROFITABLE-BY-DEFAULT: middle.block:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 84, 84
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]]
@@ -577,8 +561,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 [[IV_2]], i8* [[GEP_DST]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 [[IV_2]], ptr [[GEP_DST]], align 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 83
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -603,8 +587,8 @@ inner:
 %iv = phi i64 [ 1, %outer.header ], [ %iv.next, %inner ]
 %iv.2 = phi i8 [ 0, %outer.header ], [ %iv.2.next, %inner ]
 %iv.2.next = sub i8 %iv.2, %trunc.add
- %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv
- store i8 %iv.2, i8* %gep.dst
+ %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
+ store i8 %iv.2, ptr %gep.dst
 %iv.next = add nuw nsw i64 %iv, 1
 %ec = icmp ugt i64 %iv, 83
 br i1 %ec, label %outer.latch, label %inner
diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index d7a2b62e77488..91feee419622d 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -51,44 +51,43 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
 ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i8 [[TMP7]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i1 [[TMP8]], i1 [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP10]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[IDENT_CHECK]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
+; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP21]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -101,8 +100,8 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT: [[P_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[SEXT:%.*]] = shl i32 [[P_09]], 24
 ; CHECK-NEXT: [[CONV:%.*]] = ashr exact i32 [[SEXT]], 24
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[CONV]], [[STEP]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -125,8 +124,8 @@ for.body:
 %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
 %sext = shl i32 %p.09, 24
 %conv = ashr exact i32 %sext, 24
- %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
- store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+ store i32 %conv, ptr %arrayidx, align 4
 %add = add nsw i32 %conv, %step
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
@@ -176,43 +175,42 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
 ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP7]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 false
-; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i8 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
+; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[IDENT_CHECK]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
+; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP20]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -224,8 +222,8 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[P_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[CONV:%.*]] = and i32 [[P_09]], 255
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[CONV]], [[STEP]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -247,8 +245,8 @@ for.body:
 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
 %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
 %conv = and i32 %p.09, 255
- %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
- store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+ store i32 %conv, ptr %arrayidx, align 4
 %add = add nsw i32 %conv, %step
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
@@ -293,8 +291,8 @@ define void @doit3(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT: [[STEP_ADDR_010:%.*]] = phi i32 [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[STEP:%.*]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: [[SEXT:%.*]] = shl i32 [[P_012]], 24
 ; CHECK-NEXT: [[CONV:%.*]] = ashr exact i32 [[SEXT]], 24
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[CONV]], [[STEP_ADDR_010]]
 ; CHECK-NEXT: [[ADD3]] = add nsw i32 [[STEP_ADDR_010]], 2
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -319,8 +317,8 @@ for.body:
 %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ]
 %sext = shl i32 %p.012, 24
 %conv = ashr exact i32 %sext, 24
- %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
- store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+ store i32 %conv, ptr %arrayidx, align 4
 %add = add nsw i32 %conv, %step.addr.010
 %add3 = add nsw i32 %step.addr.010, 2
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -374,41 +372,40 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
 ; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP3]], i8 [[TMP4]])
 ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP2]], i1 [[TMP8]], i1 [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i8 [[TMP5]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i1 [[TMP7]], i1 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[CSTEP]], 0
 ; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[CONV]]
+; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[CONV]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[CONV]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[CONV]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[CONV]], 4
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP18]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -421,8 +418,8 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
 ; CHECK-NEXT: [[P_011:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[SEXT:%.*]] = shl i32 [[P_011]], 24
 ; CHECK-NEXT: [[CONV2:%.*]] = ashr exact i32 [[SEXT]], 24
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CONV2]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[CONV2]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -446,8 +443,8 @@ for.body:
 %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
 %sext = shl i32 %p.011, 24
 %conv2 = ashr exact i32 %sext, 24
- %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
- store i32 %conv2, i32* %arrayidx, align 4
+ %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+ store i32 %conv2, ptr %arrayidx, align 4
 %add = add nsw i32 %conv2, %conv
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index aa7df10bc8288..3a9aa6b5726cf 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -3,18 +3,18 @@
 ; Check that we can vectorize this loop without crashing.
-define i8 @widget(i8* %arr, i8 %t9) {
+define i8 @widget(ptr %arr, i8 %t9) {
 ; CHECK-LABEL: @widget(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARR1:%.*]] = ptrtoint i8* [[ARR:%.*]] to i64
+; CHECK-NEXT: [[ARR1:%.*]] = ptrtoint ptr [[ARR:%.*]] to i64
 ; CHECK-NEXT: br label [[BB6:%.*]]
 ; CHECK: bb6:
-; CHECK-NEXT: [[T1_0:%.*]] = phi i8* [ [[ARR]], [[BB:%.*]] ], [ null, [[BB6]] ]
+; CHECK-NEXT: [[T1_0:%.*]] = phi ptr [ [[ARR]], [[BB:%.*]] ], [ null, [[BB6]] ]
 ; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
 ; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]]
 ; CHECK: for.preheader:
-; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi i8* [ [[T1_0]], [[BB6]] ]
-; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint i8* [[T1_0_LCSSA]] to i64
+; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ]
+; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
 ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32
 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32
@@ -42,16 +42,15 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[ARR]], i8 [[TMP12]]
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp slt <4 x i8> [[TMP11]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i1> [[TMP14]] to <4 x i8>
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP15]], <4 x i8>* [[TMP17]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: store <4 x i8> [[TMP15]], ptr [[TMP16]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -61,11 +60,11 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[IV_NEXT]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr [[ARR]], i8 [[IV_NEXT]]
 ; CHECK-NEXT: [[T3_I:%.*]] = icmp slt i8 [[IV_NEXT]], [[T9]]
 ; CHECK-NEXT: [[T3_I8:%.*]] = zext i1 [[T3_I]] to i8
-; CHECK-NEXT: store i8 [[T3_I8]], i8* [[PTR]], align 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i8* [[T1_0_LCSSA]], [[PTR]]
+; CHECK-NEXT: store i8 [[T3_I8]], ptr [[PTR]], align 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[T1_0_LCSSA]], [[PTR]]
 ; CHECK-NEXT: br i1 [[EC]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i8 [ [[IV_NEXT]], [[FOR_BODY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
@@ -75,7 +74,7 @@ bb:
 br label %bb6
 bb6:
- %t1.0 = phi i8* [ %arr, %bb ], [ null, %bb6 ]
+ %t1.0 = phi ptr [ %arr, %bb ], [ null, %bb6 ]
 %c = call i1 @cond()
 br i1 %c, label %for.preheader, label %bb6
@@ -85,11 +84,11 @@ for.preheader:
 for.body:
 %iv = phi i8 [ %iv.next, %for.body ], [ 0, %for.preheader ]
 %iv.next = add i8 %iv, 1
- %ptr = getelementptr inbounds i8, i8* %arr, i8 %iv.next
+ %ptr = getelementptr inbounds i8, ptr %arr, i8 %iv.next
 %t3.i = icmp slt i8 %iv.next, %t9
 %t3.i8 = zext i1 %t3.i to i8
- store i8 %t3.i8, i8* %ptr
- %ec = icmp eq i8* %t1.0, %ptr
+ store i8 %t3.i8, ptr %ptr
+ %ec = icmp eq ptr %t1.0, %ptr
 br i1 %ec, label %for.exit, label %for.body
 for.exit:
diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll
index b74df3a3ee3f3..f1be6867ad942 100644
--- a/llvm/test/Transforms/LoopVectorize/pr50686.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll
@@ -1,21 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
-define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
+define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
 ; CHECK-LABEL: @m(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT: [[P23:%.*]] = bitcast i32* [[P2:%.*]] to i8*
-; CHECK-NEXT: [[ARRAYIDX9_1:%.*]] = getelementptr inbounds i32, i32* [[P2]], i64 1
-; CHECK-NEXT: [[ARRAYIDX9_2:%.*]] = getelementptr inbounds i32, i32* [[P2]], i64 2
+; CHECK-NEXT: [[ARRAYIDX9_1:%.*]] = getelementptr inbounds i32, ptr [[P2:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9_2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 2
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[P]], i64 63
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[P2]], i64 3
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[P1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[P23]], [[SCEVGEP2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 252
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P2]], i64 12
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[P]], [[UGLYGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[P2]], [[UGLYGEP]]
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
@@ -23,25 +19,24 @@ define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[P2]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope !0
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> zeroinitializer, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX9_1]], align 4, !alias.scope !0
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9_2]], align 4, !alias.scope !0
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT9]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope !0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope !0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 63, 60
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END17:%.*]], label [[SCALAR_PH]]
@@ -50,14 +45,14 @@ define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
 ; CHECK-NEXT: br label [[FOR_COND5:%.*]]
 ; CHECK: for.cond5:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND5]] ]
-; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[P2]], align 4
+; CHECK-NEXT: [[I3:%.*]] = load i32, ptr [[P2]], align 4
 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[I3]]
-; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX9_1]], align 4
+; CHECK-NEXT: [[I4:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4
 ; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[SUB]], [[I4]]
-; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX9_2]], align 4
+; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4
 ; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[SUB_1]], [[I5]]
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[SUB_2]], i32* [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[SUB_2]], ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 63
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END17]], label [[FOR_COND5]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -65,20 +60,20 @@ define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
 ; CHECK-NEXT: ret void
 ;
 entry:
- %arrayidx9.1 = getelementptr inbounds i32, i32* %p2, i64 1
- %arrayidx9.2 = getelementptr inbounds i32, i32* %p2, i64 2
+ %arrayidx9.1 = getelementptr inbounds i32, ptr %p2, i64 1
+ %arrayidx9.2 = getelementptr inbounds i32, ptr %p2, i64 2
 br label %for.cond5
 for.cond5: ; preds = %entry, %for.cond5
 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond5 ]
- %i3 = load i32, i32* %p2, align 4
+ %i3 = load i32, ptr %p2, align 4
 %sub = sub nsw i32 0, %i3
- %i4 = load i32, i32* %arrayidx9.1, align 4
+ %i4 = load i32, ptr %arrayidx9.1, align 4
 %sub.1 = sub nsw i32 %sub, %i4
- %i5 = load i32, i32* %arrayidx9.2, align 4
+ %i5 = load i32, ptr %arrayidx9.2, align 4
 %sub.2 = sub nsw i32 %sub.1, %i5
- %arrayidx14 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
- store i32 %sub.2, i32* %arrayidx14, align 4
+ %arrayidx14 = getelementptr inbounds i32, ptr %p, i64 %indvars.iv
+ store i32 %sub.2, ptr %arrayidx14, align 4
 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 %exitcond = icmp eq i64 %indvars.iv.next, 63
 br i1 %exitcond, label %for.end17, label %for.cond5
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index eb7e8316f66f3..7e5204a784a03 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
-define void @test(float* %A, i32 %x) {
+define void @test(ptr %A, i32 %x) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -11,27 +11,25 @@ define void @test(float* %A, i32 %x) {
 ; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[X]]
-; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP2]], [[X]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[X]]
+; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP1]], [[X]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP12]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -44,13 +42,13 @@ define void @test(float* %A, i32 %x) {
 ; CHECK-NEXT: [[T_IV_NEXT:%.*]] = trunc i64 [[IV_NEXT]] to i32
 ; CHECK-NEXT: [[MUL_IV_NEXT:%.*]] = mul i32 [[T_IV_NEXT]], [[X]]
 ; CHECK-NEXT: [[IDX_1:%.*]] = zext i32 [[MUL_IV_NEXT]] to i64
-; CHECK-NEXT: [[ARRAYIDX1215:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDX_1]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[ARRAYIDX1215]], align 4
+; CHECK-NEXT: [[ARRAYIDX1215:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDX_1]]
+; CHECK-NEXT: [[LV:%.*]] = load float, ptr [[ARRAYIDX1215]], align 4
 ; CHECK-NEXT: [[T_IV:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-NEXT: [[MUL_IV:%.*]] = mul i32 [[T_IV]], [[X]]
 ; CHECK-NEXT: [[IDX_2:%.*]] = zext i32 [[MUL_IV]] to i64
-; CHECK-NEXT: [[ARRAYIDX1209:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDX_2]]
-; CHECK-NEXT: store float [[LV]], float* [[ARRAYIDX1209]], align 4
+; CHECK-NEXT: [[ARRAYIDX1209:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDX_2]]
+; CHECK-NEXT: store float [[LV]], ptr [[ARRAYIDX1209]], align 4
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], undef
 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: exit:
@@ -65,14 +63,14 @@ loop: ; preds = %loop, %entry
 %t.iv.next = trunc i64 %iv.next to i32
 %mul.iv.next = mul i32 %t.iv.next, %x
 %idx.1 = zext i32 %mul.iv.next to i64
- %arrayidx1215 = getelementptr inbounds float, float* %A, i64 %idx.1
- %lv = load float, float* %arrayidx1215, align 4
+ %arrayidx1215 = getelementptr inbounds float, ptr %A, i64 %idx.1
+ %lv = load float, ptr %arrayidx1215, align 4
 %t.iv = trunc i64 %iv to i32
 %mul.iv = mul i32 %t.iv, %x
 %idx.2 = zext i32 %mul.iv to i64
- %arrayidx1209 = getelementptr inbounds float, float* %A, i64 %idx.2
- store float %lv, float* %arrayidx1209, align 4
+ %arrayidx1209 = getelementptr inbounds float, ptr %A, i64 %idx.2
+ store float %lv, ptr %arrayidx1209, align 4
 %ec = icmp eq i64 %iv.next, undef
 br i1 %ec, label %exit, label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index 621c92f29e160..cf12fe4947a55 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -10,17 +10,17 @@
 ; runtime checks.
 ; The relevant bounds for %gep.A are [%A, %A+4).
-define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
+define void @load_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @load_clamped_index(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
-; CHECK-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64
+; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
 ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
@@ -31,20 +31,18 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = urem i32 [[TMP13]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = urem i32 [[TMP3]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -54,14 +52,14 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_A]], align 4
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[GEP_A]], align 4
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV]], 10
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_B]], align 4
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_B]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -71,11 +69,11 @@ entry:
 loop:
 %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 %clamped.index = urem i32 %iv, 4
- %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
- %lv = load i32, i32* %gep.A
+ %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+ %lv = load i32, ptr %gep.A
 %add = add i32 %lv, 10
- %gep.B = getelementptr inbounds i32, i32* %B, i32 %iv
- store i32 %add, i32* %gep.B
+ %gep.B = getelementptr inbounds i32, ptr %B, i32 %iv
+ store i32 %add, ptr %gep.B
 %iv.next = add nuw nsw i32 %iv, 1
 %cond = icmp eq i32 %iv.next, %N
 br i1 %cond, label %exit, label %loop
@@ -85,17 +83,17 @@ exit:
 }
 ; The relevant bounds for %gep.A are [%A, %A+4).
-define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
+define void @store_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @store_clamped_index(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B2:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; CHECK-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
 ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[A1]], [[B2]]
 ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
@@ -106,20 +104,18 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = urem i32 [[TMP13]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = urem i32 [[TMP3]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -129,14 +125,14 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_B]], align 4
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[GEP_B]], align 4
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV]], 10
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -146,11 +142,11 @@ entry:
 loop:
 %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 %clamped.index = urem i32 %iv, 4
- %gep.B = getelementptr inbounds i32, i32* %B, i32 %iv
- %lv = load i32, i32* %gep.B
+ %gep.B = getelementptr inbounds i32, ptr %B, i32 %iv
+ %lv = load i32, ptr %gep.B
 %add = add i32 %lv, 10
- %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
- store i32 %add, i32* %gep.A
+ %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+ store i32 %add, ptr %gep.A
 %iv.next = add nuw nsw i32 %iv, 1
 %cond = icmp eq i32 %iv.next, %N
 br i1 %cond, label %exit, label %loop
@@ -159,21 +155,21 @@ exit:
 ret void
 }
-define void @clamped_index_dependence_non_clamped(i32* %A, i32* %B, i32 %N) {
+define void @clamped_index_dependence_non_clamped(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @clamped_index_dependence_non_clamped(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_B]], align 4
-; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[IV]]
-; CHECK-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A_1]], align 4
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[GEP_B]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[LV_A:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV]], [[LV_A]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV_NEXT]], 4
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
@@ -184,16 +180,16 @@ entry:
 loop:
 %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
- %gep.B = getelementptr inbounds i32, i32* %B, i32 %iv
- %lv = load i32, i32* %gep.B
- %gep.A.1 = getelementptr inbounds i32, i32* %A, i32 %iv
- %lv.A = load i32, i32* %gep.A.1
+ %gep.B = getelementptr inbounds i32, ptr %B, i32 %iv
+ %lv = load i32, ptr %gep.B
+ %gep.A.1 = getelementptr inbounds i32, ptr %A, i32 %iv
+ %lv.A = load i32, ptr %gep.A.1
 %add = add i32 %lv, %lv.A
 %iv.next = add nuw nsw i32 %iv, 1
 %clamped.index = urem i32 %iv.next, 4
- %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
- store i32 %add, i32* %gep.A
+ %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+ store i32 %add, ptr %gep.A
 %cond = icmp eq i32 %iv.next, %N
 br i1 %cond, label %exit, label %loop
@@ -201,20 +197,20 @@ exit:
 ret void
 }
-define void @clamped_index_dependence_clamped_index(i32* %A, i32* %B, i32 %N) {
+define void @clamped_index_dependence_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @clamped_index_dependence_clamped_index(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[CLAMPED_INDEX_1:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[CLAMPED_INDEX_1]]
-; CHECK-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A_1]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[CLAMPED_INDEX_1]]
+; CHECK-NEXT: [[LV_A:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV_A]], 10
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV_NEXT]], 4
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
@@ -226,14 +222,14 @@ entry:
 loop:
 %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 %clamped.index.1 = urem i32 %iv, 4
- %gep.A.1 = getelementptr inbounds i32, i32* %A, i32 %clamped.index.1
- %lv.A = load i32, i32* %gep.A.1
+ %gep.A.1 = getelementptr inbounds i32, ptr %A, i32 %clamped.index.1
+ %lv.A = load i32, ptr %gep.A.1
 %add = add i32 %lv.A, 10
 %iv.next = add nuw nsw i32 %iv, 1
 %clamped.index = urem i32 %iv.next, 4
- %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
- store i32 %add, i32* %gep.A
+ %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+ store i32 %add, ptr %gep.A
 %cond = icmp eq i32 %iv.next, %N
 br i1 %cond, label %exit, label %loop
@@ -241,33 +237,31 @@ exit:
 ret void
 }
-define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
+define void @clamped_index_equal_dependence(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @clamped_index_equal_dependence(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = urem i32 [[TMP10]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP15]], <2 x i32>* [[TMP16]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = urem i32 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -277,13 +271,13 @@ define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: [[LV_A:%.*]] = load i32, ptr [[GEP_A]], align 4
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV_A]], 10
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -293,12 +287,12 @@ entry:
 loop:
 %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 %clamped.index = urem i32 %iv, 4
- %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
- %lv.A = load i32, i32* %gep.A
+ %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+ %lv.A = load i32, ptr %gep.A
 %add = add i32 %lv.A, 10
 %iv.next = add nuw nsw i32 %iv, 1
- store i32 %add, i32* %gep.A
+ store i32 %add, ptr %gep.A
 %cond = icmp eq i32 %iv.next, %N
 br i1 %cond, label %exit, label %loop