diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index a6a216392de74..2f6b747140e59 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -25,7 +25,7 @@ static cl::opt RVVRegisterWidthLMUL( cl::desc( "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), - cl::init(1), cl::Hidden); + cl::init(2), cl::Hidden); static cl::opt SLPMaxVF( "riscv-v-slp-max-vf", diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll index d58ff5051c621..4b93ea30cf252 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll @@ -14,27 +14,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) { ; CHECK-LABEL: @vector_add( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -49,7 +52,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) { ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -74,32 +77,35 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) { ; CHECK-LABEL: @vector_add_reduce( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5]] = add [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP7]]) ; CHECK-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -111,7 +117,7 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) { ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 7d079d13dc710..119f50df5b8e3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -11,27 +11,30 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_udiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -54,26 +57,26 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> 
[[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: @@ -115,27 +118,30 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_sdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -158,26 +164,26 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: 
vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: @@ -219,27 +225,30 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_urem( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul 
i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -262,26 +271,26 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 
[ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = urem <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = urem <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: @@ -323,27 +332,30 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_srem( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store 
[[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -366,26 +378,26 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = srem <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = srem <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: @@ -427,31 +439,34 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_udiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP7]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; 
CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = udiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -480,34 +495,34 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer -; FIXED-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[BROADCAST_SPLAT3]], <2 x i64> -; FIXED-NEXT: [[TMP10:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP6]], -; 
FIXED-NEXT: [[TMP13:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP10]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI4:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP11]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT3]], <4 x i64> +; FIXED-NEXT: [[TMP10:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: @@ -561,31 +576,34 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_sdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr 
[[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP7]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -614,34 +632,34 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 
[[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer -; FIXED-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[BROADCAST_SPLAT3]], <2 x i64> -; FIXED-NEXT: [[TMP10:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP10]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI4:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP11]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT3]], <4 x i64> +; FIXED-NEXT: [[TMP10:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: @@ -695,28 +713,31 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_udiv_by_constant( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: 
vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -749,24 +770,24 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; 
FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP9:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP9]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: @@ -820,28 +841,31 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_sdiv_by_constant( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 
42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -874,24 +898,24 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP9:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP9]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI2]], 
@@ -945,12 +969,12 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @predicated_sdiv_by_minus_one(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -959,15 +983,15 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 8 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -128, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP9:%.*]] = sdiv <vscale x 8 x i8>
[[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1003,26 +1027,26 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 32 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> , <16 x i8> -; FIXED-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> , <16 x i8> -; FIXED-NEXT: [[TMP10:%.*]] = sdiv <16 x i8> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = sdiv <16 x i8> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP10]], <16 x i8> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[TMP11]], <16 x i8> [[WIDE_LOAD1]] -; FIXED-NEXT: store <16 x i8> [[PREDPHI]], ptr [[TMP4]], align 1 -; FIXED-NEXT: store <16 x i8> [[PREDPHI2]], ptr [[TMP5]], align 1 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 32 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> , <32 x i8> +; FIXED-NEXT: [[TMP9:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> , <32 x i8> +; FIXED-NEXT: [[TMP10:%.*]] = sdiv <32 x i8> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <32 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <32 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP10]], <32 x i8> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> [[TMP11]], <32 x i8> [[WIDE_LOAD1]] +; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP4]], align 1 +; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], 
ptr [[TMP5]], align 1 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; FIXED: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll index 56c8f901a668d..0f7600e9b2235 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,31 +102,31 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[START]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x ptr> poison, ptr [[START]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT2]], <16 x ptr> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x ptr> poison, ptr [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x ptr> [[BROADCAST_SPLATINSERT]], <32 x ptr> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <32 x ptr> poison, ptr [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <32 x ptr> [[BROADCAST_SPLATINSERT2]], <32 x ptr> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <16 x ptr> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <16 x ptr> [[TMP3]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <16 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <16 x ptr> [[TMP5]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 15 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <32 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] +; 
CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31
 ; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 2eb8ac4086f78..1310ed3618b2c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -13,31 +13,31 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; OUTLOOP: for.body.preheader:
 ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
+; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
 ; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
 ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; OUTLOOP: vector.ph:
 ; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
+; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
 ; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
 ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
 ; OUTLOOP: vector.body:
 ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
 ; OUTLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
 ; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP6]], align 2
-; OUTLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
-; OUTLOOP-NEXT: [[TMP8]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP7]]
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
+; OUTLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; OUTLOOP-NEXT: [[TMP8]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
 ; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2
+; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
 ; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]]
 ; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OUTLOOP: middle.block:
-; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[TMP8]])
+; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP8]])
 ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; OUTLOOP: scalar.ph:
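; Aside: the OUTLOOP hunk above and the INLOOP hunk below widen the same
; i16-to-i32 sum in two styles. A minimal sketch, not from this patch, of one
; widened step at the assumed new types: out-of-loop carries a
; <vscale x 4 x i32> accumulator and reduces once after the loop, while
; in-loop does the horizontal reduction every iteration. Names are invented.
define i32 @sketch_reduction_step(<vscale x 4 x i32> %acc, <vscale x 4 x i16> %chunk) {
  %ext = sext <vscale x 4 x i16> %chunk to <vscale x 4 x i32>
  ; out-of-loop style keeps the accumulator as a vector across iterations
  %acc.next = add <vscale x 4 x i32> %acc, %ext
  ; the single post-loop reduction (in-loop style would do this per iteration)
  %r = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %acc.next)
  ret i32 %r
}
declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)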
@@ -67,12 +67,12 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; INLOOP: for.body.preheader:
 ; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8
 ; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
 ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; INLOOP: vector.ph:
 ; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8
 ; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
 ; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -82,12 +82,12 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
 ; INLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
 ; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
-; INLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP6]], align 2
+; INLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP7]])
 ; INLOOP-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
 ; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 8
 ; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]]
 ; INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index d51f7becebeb5..827131ed19117 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -13,18 +13,18 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
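; Aside: a standalone sketch, not from this patch, of the factor-2
; deinterleave the hunk above now checks at <16 x i32>. The even/odd shuffle
; masks match the reconstructed checks; the function itself is invented for
; illustration.
define <8 x i32> @sketch_deinterleave_factor2(ptr %p) {
  %wide = load <16 x i32>, ptr %p, align 4
  ; field 0 of each pair: even lanes of the wide load
  %f0 = shufflevector <16 x i32> %wide, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ; field 1 of each pair: odd lanes of the wide load
  %f1 = shufflevector <16 x i32> %wide, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %sum = add <8 x i32> %f0, %f1
  ret <8 x i32> %sum
}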
@@ -125,46 +125,46 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
 ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP10]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32>
poison) -; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], [[TMP16]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP18]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( 
[[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -404,28 +404,28 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 4 -; CHECK-NEXT: store 
<4 x i32> [[TMP9]], ptr [[TMP13]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
+; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
@@ -479,21 +479,45 @@ exit:
 define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i64(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
 ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]]
+; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 4
 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
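; Aside: the next file, lmul.ll, pins down the default change itself: the
; un-suffixed RUN line moves from the LMUL1 prefix to LMUL2, while the RUN
; lines that pass --riscv-v-register-bit-width-lmul explicitly are unchanged.
; A downstream test that wants the old default can keep it by passing the flag
; explicitly, with the spelling used in the RUN lines below:
;   opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S \
;     --riscv-v-register-bit-width-lmul=1 | FileCheck %s -check-prefix=LMUL1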
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
index c456e0e1df7e8..9b3b90a7bc3b6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=LMUL1
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=1 | FileCheck %s -check-prefix=LMUL1
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=2 | FileCheck %s -check-prefix=LMUL2
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=4 | FileCheck %s -check-prefix=LMUL4
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=8 | FileCheck %s -check-prefix=LMUL8
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=LMUL2
 define void @load_store(ptr %p) {
 ; LMUL1-LABEL: @load_store(
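; Aside: a minimal sketch, not from this patch, of the tail-folded shape the
; trip5_i8 checks below now expect: with the assumed LMUL=2 default, an i8
; loop works on <vscale x 16 x i8> and masks lanes against the trip count of
; 5. Function and value names are invented.
define void @sketch_masked_copy(ptr %dst, ptr %src) {
entry:
  %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 5)
  %v = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %src, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> poison)
  call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %v, ptr %dst, i32 1, <vscale x 16 x i1> %mask)
  ret void
}
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)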
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index dc4522756c369..ace267d72dea0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -9,9 +9,9 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
@@ -20,18 +20,18 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP5]], i64 5)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 5)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP8:%.*]] = shl <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 8 x i8> [[TMP8]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP11]], ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 16 x i8> [[TMP8]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP11]], ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index 3d580a5671783..34b06972dab06 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -12,44 +12,44 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
 ; VLENUNK-LABEL: @test(
 ; VLENUNK-NEXT: entry:
 ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK: vector.ph:
 ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLENUNK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; VLENUNK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
-; VLENUNK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; VLENUNK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
+; VLENUNK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; VLENUNK-NEXT: [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
+; VLENUNK-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; VLENUNK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
 ; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
+; VLENUNK-NEXT:
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) ; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 -; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) -; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] -; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] +; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 +; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 ; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLENUNK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index ac56579af2d26..00cabd58de913 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,8 +17,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]]) -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]] +; RV32-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], 
label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV32: vector.memcheck: ; RV32-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 @@ -33,37 +34,40 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: -; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]] +; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV32-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV32-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV32-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; RV32-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; RV32-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; RV32-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP7:%.*]] = mul i64 16, [[TMP6]] -; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; RV32-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV32-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; RV32-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; RV32-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; RV32-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] +; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i32.nxv1p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV32-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP10]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv1f64.nxv1p0( [[TMP11]], i32 8, [[TMP9]], poison), !alias.scope !3 -; RV32-NEXT: [[TMP12:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV32-NEXT: [[TMP13:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP12]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV32-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0( [[TMP13]], [[TMP14]], i32 8, [[TMP9]]), !alias.scope !5, !noalias !7 -; RV32-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; 
RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV32-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 +; RV32-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 +; RV32-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV32-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV32-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -73,30 +77,31 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.body: ; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; RV32-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 ; RV32-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; RV32: if.then: -; RV32-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]] -; RV32-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double -; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]] +; RV32-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]] +; RV32-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double +; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]] ; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, 
ptr [[A]], i64 [[INDVARS_IV]] ; RV32-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; RV32-NEXT: br label [[FOR_INC]] ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; ; RV64-LABEL: @foo4( ; RV64-NEXT: entry: ; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]]) -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]] +; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV64: vector.memcheck: ; RV64-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 @@ -111,37 +116,40 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV64: vector.ph: -; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]] +; RV64-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV64-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV64-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV64-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; RV64-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; RV64-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; RV64-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP7:%.*]] = mul i64 16, [[TMP6]] -; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; RV64-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV64-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; RV64-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; RV64-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; RV64-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] +; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i32.nxv1p0( [[TMP8]], i32 4, shufflevector ( insertelement ( 
poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV64-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP10]] -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv1f64.nxv1p0( [[TMP11]], i32 8, [[TMP9]], poison), !alias.scope !3 -; RV64-NEXT: [[TMP12:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV64-NEXT: [[TMP13:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP12]] -; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV64-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0( [[TMP13]], [[TMP14]], i32 8, [[TMP9]]), !alias.scope !5, !noalias !7 -; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV64-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 +; RV64-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 +; RV64-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV64-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV64-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV64: middle.block: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -151,22 +159,22 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.body: ; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV64-NEXT: [[CMP1:%.*]] = icmp 
+; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100
; RV64-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; RV64: if.then:
-; RV64-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]]
-; RV64-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
-; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
+; RV64-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]]
+; RV64-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double
+; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]]
; RV64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; RV64-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; RV64-NEXT: br label [[FOR_INC]]
; RV64: for.inc:
; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; RV64: for.end:
; RV64-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll
index 5fc46c203167f..7f4eb387a1ece 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: foo
; CHECK: LV: IC is 2
-; CHECK: %{{.*}} = add <4 x i32> %{{.*}}, <i32 4, i32 4, i32 4, i32 4>
-; CHECK: %{{.*}} = add {{.*}}, 8
+; CHECK: %{{.*}} = add <8 x i32> %{{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; CHECK: %{{.*}} = add {{.*}}, 16
; Function Attrs: nofree norecurse nosync nounwind writeonly
define dso_local void @foo(i32 signext %n, ptr nocapture %A) local_unnamed_addr #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
index 4e9ec86df7ca0..e8d5dc6211b70 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
@@ -14,10 +14,10 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r
; LMUL1-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
; LMUL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LMUL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
; LMUL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; LMUL1: vector.ph:
-; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
; LMUL1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; LMUL1-NEXT: br label [[VECTOR_BODY:%.*]]
; LMUL1: vector.body:
@@ -25,15 +25,15 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r
; LMUL1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; LMUL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
-; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; LMUL1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4
-; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; LMUL1-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; LMUL1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL1: middle.block:
@@ -54,7 +54,7 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r
; LMUL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; LMUL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; LMUL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
-; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; LMUL1: for.end.loopexit:
; LMUL1-NEXT: br label [[FOR_END]]
; LMUL1: for.end:
@@ -108,7 +108,7 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r
; LMUL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; LMUL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; LMUL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
-; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; LMUL2: for.end.loopexit:
; LMUL2-NEXT: br label [[FOR_END]]
; LMUL2: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index bc4e60425ac4c..a266ae643c1af 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -11,27 +11,30 @@ define void @test(ptr %p) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 200
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 32
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 200
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP9]], align 32
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -47,7 +50,7 @@ define void @test(ptr %p) {
; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -82,12 +85,12 @@ define void @test_may_clobber(ptr %p) {
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -132,27 +135,30 @@ define void @trivial_due_max_vscale(ptr %p) {
; CHECK-LABEL: @trivial_due_max_vscale(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 8192
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 32
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 8192
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP9]], align 32
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -195,27 +201,30 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; CHECK-LABEL: @no_high_lmul_or_interleave(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1024
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 32
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 1024
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP9]], align 32
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
index 53e00ad0fee2b..f7bc4bd35f377 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -16,27 +16,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLENUNK-LABEL: @vector_add(
; VLENUNK-NEXT: entry:
; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
; VLENUNK: vector.body:
; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
-; VLENUNK-NEXT: [[TMP5:%.*]] = add <vscale x 1 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLENUNK-NEXT: store <vscale x 1 x i64> [[TMP5]], ptr [[TMP4]], align 8
-; VLENUNK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; VLENUNK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; VLENUNK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLENUNK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 8
+; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLENUNK: middle.block:
; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -58,27 +61,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLEN128-LABEL: @vector_add(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
-; VLEN128-NEXT: [[TMP5:%.*]] = add <vscale x 1 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLEN128-NEXT: store <vscale x 1 x i64> [[TMP5]], ptr [[TMP4]], align 8
-; VLEN128-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; VLEN128-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; VLEN128-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLEN128-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 8
+; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLEN128: middle.block:
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -121,27 +127,27 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
; VLENUNK-LABEL: @vector_add_i32(
; VLENUNK-NEXT: entry:
; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK: vector.ph:
; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
+; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
; VLENUNK: vector.body:
; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 4
-; VLENUNK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP7]], ptr [[TMP6]], align 4
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; VLENUNK-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLENUNK-NEXT: store <vscale x 4 x i32> [[TMP7]], ptr [[TMP6]], align 4
; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -166,27 +172,27 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
; VLEN128-LABEL: @vector_add_i32(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0
-; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
+; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 4
-; VLEN128-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLEN128-NEXT: store <vscale x 2 x i32> [[TMP7]], ptr [[TMP6]], align 4
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; VLEN128-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLEN128-NEXT: store <vscale x 4 x i32> [[TMP7]], ptr [[TMP6]], align 4
; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -287,27 +293,30 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; VLENUNK-LABEL: @indexed_store(
; VLENUNK-NEXT: entry:
; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
; VLENUNK: vector.body:
; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
-; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
-; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
-; VLENUNK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; VLENUNK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VLENUNK: middle.block:
; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -329,27 +338,30 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; VLEN128-LABEL: @indexed_store(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
-; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
-; VLEN128-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], <vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
-; VLEN128-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; VLEN128-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLEN128-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VLEN128: middle.block:
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -389,34 +401,37 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; VLENUNK-LABEL: @indexed_load(
; VLENUNK-NEXT: entry:
; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
; VLENUNK: vector.body:
; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
-; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
-; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> poison)
-; VLENUNK-NEXT: [[TMP6]] = add <vscale x 1 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; VLENUNK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; VLENUNK-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; VLENUNK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VLENUNK: middle.block:
-; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[TMP6]])
+; VLENUNK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK: scalar.ph:
; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]
; VLENUNK: for.body:
; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -430,40 +445,43 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VLENUNK: for.end:
-; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
;
; VLEN128-LABEL: @indexed_load(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
-; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 1 x i64> [[WIDE_LOAD]]
-; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP5]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> poison)
-; VLEN128-NEXT: [[TMP6]] = add <vscale x 1 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; VLEN128-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; VLEN128-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; VLEN128-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VLEN128: middle.block:
-; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[TMP6]])
+; VLEN128-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128: scalar.ph:
; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT: br label [[FOR_BODY:%.*]]
; VLEN128: for.body:
; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -477,7 +495,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; VLEN128-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VLEN128: for.end:
-; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
;
entry:
@@ -503,25 +521,28 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLENUNK-LABEL: @splat_int(
; VLENUNK-NEXT: entry:
; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
; VLENUNK: vector.body:
; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLENUNK-NEXT: store <vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
-; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLENUNK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VLENUNK: middle.block:
; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -541,25 +562,28 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLEN128-LABEL: @splat_int(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VLEN128-NEXT: store <vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
-; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; VLEN128-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; VLEN128-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VLEN128: middle.block:
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -595,25 +619,28 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
; VLENUNK-LABEL: @splat_ptr(
; VLENUNK-NEXT: entry:
; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[V:%.*]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[V:%.*]], i64 0
+; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
; VLENUNK: vector.body:
; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
-; VLENUNK-NEXT: store <vscale x 1 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
-; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
+; VLENUNK-NEXT: store <vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VLENUNK: middle.block:
; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -633,25 +660,28 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
; VLEN128-LABEL: @splat_ptr(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[V:%.*]], i64 0
-; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[V:%.*]], i64 0
+; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
-; VLEN128-NEXT: store <vscale x 1 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
-; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; VLEN128-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
+; VLEN128-NEXT: store <vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VLEN128: middle.block:
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index 8ed7b6444ec6e..c553977a83626 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -227,17 +227,17 @@ for.end:
}
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast_bfloat
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
-; CHECK: %[[LOAD2:.*]] = load <8 x bfloat>
-; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
-; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK: %[[LOAD1:.*]] = load <16 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <16 x bfloat>
+; CHECK: %[[FADD1:.*]] = fadd fast <16 x bfloat> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <16 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
-; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
+; CHECK: %[[RDX:.*]] = fadd fast <16 x bfloat> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> %[[RDX]])
entry:
br label %for.body
@@ -328,17 +328,17 @@ for.end:
; MUL
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <4 x i32>
-; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: %[[LOAD1:.*]] = load <8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <8 x i32>
+; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
entry:
br label %for.body
@@ -358,21 +358,21 @@ for.end: ; preds = %for.body, %entry
; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <4 x i32>
-; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[LOAD1:.*]] = load <8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <8 x i32>
+; CHECK: %[[LOAD3:.*]] = load <8 x i32>
+; CHECK: %[[LOAD4:.*]] = load <8 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
entry:
br label %for.body
@@ -396,19 +396,19 @@ for.end:
ret i32 %mul
}
-; CHECK-REMARK: vectorized loop (vectorization width: vscale x 2, interleaved count: 2)
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2)
define float @fmuladd(ptr %a, ptr %b, i64 %n) {
; CHECK-LABEL: @fmuladd(
; CHECK: vector.body:
-; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>
-; CHECK: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x float>
-; CHECK: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>
-; CHECK: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x float>
-; CHECK: [[MULADD1:%.*]] = call reassoc <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> [[WIDE_LOAD]], <vscale x 2 x float> [[WIDE_LOAD3]],
-; CHECK: [[MULADD2:%.*]] = call reassoc <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> [[WIDE_LOAD2]], <vscale x 2 x float> [[WIDE_LOAD4]],
+; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>
+; CHECK: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>
+; CHECK: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>
+; CHECK: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>
+; CHECK: [[MULADD1:%.*]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD3]],
+; CHECK: [[MULADD2:%.*]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD2]], <vscale x 4 x float> [[WIDE_LOAD4]],
; CHECK: middle.block:
-; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <vscale x 2 x float> [[MULADD2]], [[MULADD1]]
-; CHECK: call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float -0.000000e+00, <vscale x 2 x float> [[BIN_RDX]])
+; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <vscale x 4 x float> [[MULADD2]], [[MULADD1]]
+; CHECK: call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[BIN_RDX]])
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index 6d057f378d199..5e231da7e7b57 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -12,27 +12,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 1 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[TMP6]], ptr [[TMP5]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
@llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -75,27 +78,30 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, 
[[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -136,34 +142,37 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP9]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP8]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP10]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -177,7 +186,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -205,25 +214,28 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call 
@llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -260,26 +272,29 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void 
@llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 0d6ef7c00def8..4c994772643ef 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -57,33 +57,22 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4, 4
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 7b6e29388c759..0bdcf5b1efd01 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -13,26 +13,29 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; SCALABLE-LABEL: @uniform_load(
 ; SCALABLE-NEXT: entry:
 ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE: vector.ph:
-; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 8
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP3]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; SCALABLE-NEXT: store <vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
-; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 8
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT: [[TMP6:%.*]] = 
getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -58,17 +61,17 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXEDLEN: middle.block: @@ -93,26 +96,29 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; 
TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 8 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -138,12 +144,12 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr 
[[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -183,26 +189,29 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-LABEL: @uniform_load_outside_use( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 
1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -218,7 +227,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: for.end: -; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; SCALABLE-NEXT: ret i64 [[V_LCSSA]] ; ; FIXEDLEN-LABEL: @uniform_load_outside_use( @@ -229,17 +238,17 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXEDLEN: middle.block: @@ -284,12 +293,12 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -331,39 +340,43 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-LABEL: @conditional_uniform_load( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = add [[TMP2]], zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP4]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: 
[[TMP7:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], i32 8, [[TMP8]], poison) -; SCALABLE-NEXT: [[TMP9:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[WIDE_MASKED_GATHER]], zeroinitializer -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP11]], poison) +; SCALABLE-NEXT: [[TMP12:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[WIDE_MASKED_GATHER]], zeroinitializer +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]] +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 +; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP14]], align 8 +; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -391,33 +404,33 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> 
[[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT2]], <4 x ptr> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> [[TMP2]], <2 x i64> poison) -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> [[TMP3]], <2 x i64> poison) -; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP2]], -; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], -; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[WIDE_MASKED_GATHER]], <2 x i64> zeroinitializer -; FIXEDLEN-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[WIDE_MASKED_GATHER4]], <2 x i64> zeroinitializer +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT3]], i32 8, <4 x i1> [[TMP3]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], +; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], +; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer +; FIXEDLEN-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[WIDE_MASKED_GATHER4]], <4 x i64> zeroinitializer ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 -; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI5]], ptr [[TMP9]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 +; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI5]], ptr [[TMP9]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: 
[[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXEDLEN: middle.block: @@ -448,42 +461,46 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( 
insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], i32 8, [[TMP10]], poison) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[WIDE_MASKED_GATHER]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP12]] -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[PREDPHI]], ptr [[TMP15]], i32 8, [[TMP14]]) -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP13]], poison) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[WIDE_MASKED_GATHER]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP15]] +; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP18]], i32 8, [[TMP17]]) +; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -510,22 +527,22 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> [[TMP1]], <2 x i64> poison) -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], -; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> [[WIDE_MASKED_GATHER]], <2 x i64> zeroinitializer +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP1]], <4 x i64> poison) +; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -578,26 +595,29 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-LABEL: @uniform_load_unaligned( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: 
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 1 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -623,17 +643,17 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 1 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXEDLEN: middle.block: @@ -658,26 +678,29 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 1 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; 
TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -703,12 +726,12 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -748,26 +771,29 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-LABEL: @uniform_store( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: store i64 [[V]], 
ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -789,23 +815,23 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 
+; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXEDLEN: middle.block: @@ -830,26 +856,29 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -870,8 +899,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -879,8 +908,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -920,35 +949,40 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-LABEL: @uniform_store_of_loop_varying( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = call 
@llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP3:%.*]] = add zeroinitializer, [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT]], [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() -; SCALABLE-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; SCALABLE-NEXT: [[TMP9:%.*]] = extractelement [[TMP5]], i32 [[TMP8]] -; SCALABLE-NEXT: store i64 [[TMP9]], ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP5:%.*]] = add zeroinitializer, [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT]], [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() +; SCALABLE-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 2 +; SCALABLE-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i32 [[TMP12]] +; SCALABLE-NEXT: store i64 [[TMP13]], ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP15]], align 8 +; SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -970,33 +1004,31 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = 
insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT4]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT6]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> ) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[STEP_ADD]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> ) -; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] -; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT7]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; FIXEDLEN-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; FIXEDLEN-NEXT: store i64 [[TMP7]], ptr [[B:%.*]], align 8 +; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; FIXEDLEN-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 +; FIXEDLEN-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4 +; FIXEDLEN-NEXT: 
store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP11]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXEDLEN-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXEDLEN: middle.block: ; FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; FIXEDLEN-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1019,38 +1051,42 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: 
[[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[VEC_IND]], [[BROADCAST_SPLAT]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[VEC_IND]], [[BROADCAST_SPLAT]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1071,23 +1107,22 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], 
i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> ) -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; TF-FIXEDLEN-NEXT: store i64 [[TMP3]], ptr [[B:%.*]], align 8 +; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; TF-FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-FIXEDLEN: middle.block: ; TF-FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; TF-FIXEDLEN-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1125,39 +1160,43 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-LABEL: @conditional_uniform_store( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = add [[TMP2]], zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP4]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], 
i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP8]]) -; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 -; SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[TMP14:%.*]] = call i64 
@llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1184,33 +1223,33 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT4]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT6]], <2 x ptr> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT2]], <4 x ptr> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT6]], <4 x ptr> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: call void 
@llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> [[TMP2]]) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT7]], i32 8, <2 x i1> [[TMP3]]) +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT3]], i32 8, <4 x i1> [[TMP2]]) +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT5]], <4 x ptr> [[BROADCAST_SPLAT7]], i32 8, <4 x i1> [[TMP3]]) ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXEDLEN: middle.block: @@ -1240,43 +1279,47 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, 
zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP10]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP13]] -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP15]], i32 8, [[TMP14]]) -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 
+; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP13]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP16]] +; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, [[TMP17]]) +; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1302,22 +1345,22 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[BROADCAST_SPLAT2]], i32 
8, <2 x i1> [[TMP1]]) +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]]) ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -1368,26 +1411,29 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-LABEL: @uniform_store_unaligned( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; 
SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,23 +1455,23 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FIXEDLEN: middle.block: @@ -1450,26 +1496,29 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, 
-; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]]
+; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024)
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024)
 ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1
-; TF-SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]]
-; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; TF-SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TF-SCALABLE: middle.block:
 ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; TF-SCALABLE: scalar.ph:
@@ -1490,8 +1539,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-FIXEDLEN-NEXT: entry:
 ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; TF-FIXEDLEN: vector.ph:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
+; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-FIXEDLEN: vector.body:
 ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1499,8 +1548,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1
 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
+; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; TF-FIXEDLEN: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll
index 0e2f916ac08b6..fb2167b7f5c33 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll
@@ -12,33 +12,23 @@ define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[V:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i16> poison, i16 [[V]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT4]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[V:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], <2 x i64> [[STEP_ADD]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i16> [[WIDE_VEC2]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i16> [[STRIDED_VEC3]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP6]], <2 x ptr> [[TMP0]], i32 2, <2 x i1> <i1 true, i1 true>)
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP7]], <2 x ptr> [[TMP1]], i32 2, <2 x i1> <i1 true, i1 true>)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[TMP3]], <4 x ptr> [[TMP0]], i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -56,6 +46,7 @@ define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) {
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
+
 entry:
 br label %for.body
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll
index cb017795077f1..08bac7f788c77 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll
@@ -10,26 +10,10 @@ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
 target triple = "riscv64"
 
 define void @foo(ptr nocapture writeonly %da) {
-; CHECK-128-LABEL: @foo(
-; CHECK-128-NEXT: entry:
-; CHECK-128-NEXT: store i64 0, ptr [[DA:%.*]], align 8
-; CHECK-128-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[DA]], i64 1
-; CHECK-128-NEXT: store i64 0, ptr [[ARRAYIDX1]], align 8
-; CHECK-128-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[DA]], i64 2
-; CHECK-128-NEXT: store i64 0, ptr [[ARRAYIDX2]], align 8
-; CHECK-128-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[DA]], i64 3
-; CHECK-128-NEXT: store i64 0, ptr [[ARRAYIDX3]], align 8
-; CHECK-128-NEXT: ret void
-;
-; CHECK-256-LABEL: @foo(
-; CHECK-256-NEXT: entry:
-; CHECK-256-NEXT: store <4 x i64> zeroinitializer, ptr [[DA:%.*]], align 8
-; CHECK-256-NEXT: ret void
-;
-; CHECK-512-LABEL: @foo(
-; CHECK-512-NEXT: entry:
-; CHECK-512-NEXT: store <4 x i64> zeroinitializer, ptr [[DA:%.*]], align 8
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr [[DA:%.*]], align 8
+; CHECK-NEXT: ret void
 ;
 entry:
 store i64 0, ptr %da, align 8
@@ -58,3 +42,7 @@ entry:
 %arrayidx2 = getelementptr inbounds i8, ptr %da, i8 2
 ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-128: {{.*}}
+; CHECK-256: {{.*}}
+; CHECK-512: {{.*}}