35 changes: 32 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1701,6 +1701,11 @@ class LoopVectorizationCostModel {
private:
unsigned NumPredStores = 0;

/// Convenience function that returns the value of vscale_range iff
/// vscale_range.min == vscale_range.max, otherwise returns the value
/// returned by the corresponding TTI method.
Optional<unsigned> getVScaleForTuning() const;

/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -5600,6 +5605,18 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}

Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
auto Max = Attr.getVScaleRangeMax();
if (Max && Min == Max)
return Max;
}

return TTI.getVScaleForTuning();
}

bool LoopVectorizationCostModel::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
@@ -5624,7 +5641,7 @@ bool LoopVectorizationCostModel::isMoreProfitable(
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
if (Optional<unsigned> VScale = getVScaleForTuning()) {
if (A.Width.isScalable())
EstimatedWidthA *= VScale.getValue();
if (B.Width.isScalable())
@@ -5673,7 +5690,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(

#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
if (Optional<unsigned> VScale = getVScaleForTuning())
AssumedMinimumVscale = VScale.getValue();
unsigned Width =
Candidate.Width.isScalable()
@@ -5885,8 +5902,20 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}

// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
if (Optional<unsigned> VScale = getVScaleForTuning())
EstimatedRuntimeVF *= VScale.getValue();
}

for (auto &NextVF : ProfitableVFs)
if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
(Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
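Note: below is a minimal standalone sketch (plain C++ with simple integer types, not the real LLVM classes) of the two decisions changed above: prefer the exact vscale_range value when its min and max agree, and scale a scalable main-loop VF by that value before admitting a fixed-width epilogue VF. The names `pickVScaleForTuning`, `epilogueCandidateAllowed`, and the `VF` struct are illustrative only, and the final comparison simplifies the real `ElementCount::isKnownLT` checks.

```cpp
#include <cstdio>
#include <optional>

// Illustrative stand-in for ElementCount: a scalable VF is modelled as a
// known-minimum lane count plus a "scalable" flag.
struct VF {
  unsigned MinLanes;
  bool Scalable;
};

// Mirrors the new getVScaleForTuning(): trust vscale_range when it pins
// vscale to a single value, otherwise fall back to the target's tuning hint.
std::optional<unsigned> pickVScaleForTuning(std::optional<unsigned> AttrMin,
                                            std::optional<unsigned> AttrMax,
                                            std::optional<unsigned> TTITuning) {
  if (AttrMin && AttrMax && *AttrMin == *AttrMax)
    return *AttrMax;
  return TTITuning;
}

// Mirrors the epilogue check: estimate how many lanes the scalable main loop
// really handles per iteration, then allow a fixed-width epilogue VF that is
// smaller than that estimate.
bool epilogueCandidateAllowed(VF MainVF, VF NextVF,
                              std::optional<unsigned> VScale) {
  unsigned EstimatedRuntimeLanes = MainVF.MinLanes;
  if (MainVF.Scalable && VScale)
    EstimatedRuntimeLanes *= *VScale;
  if (!NextVF.Scalable && MainVF.Scalable)
    return NextVF.MinLanes < EstimatedRuntimeLanes;
  return NextVF.MinLanes < MainVF.MinLanes; // pre-existing rule
}

int main() {
  // vscale_range(8, 8): the attribute alone pins vscale to 8.
  auto VScale = pickVScaleForTuning(8, 8, /*TTITuning=*/std::nullopt);
  VF Main{/*MinLanes=*/2, /*Scalable=*/true};      // VF = vscale x 2
  VF Epilogue{/*MinLanes=*/8, /*Scalable=*/false}; // fixed VF = 8
  // The main loop handles 2 * 8 = 16 lanes per iteration, so VF=8 is allowed.
  std::printf("epilogue VF=8 allowed: %d\n",
              epilogueCandidateAllowed(Main, Epilogue, VScale));
}
```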
374 changes: 374 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-fcvt.ll
@@ -898,3 +898,377 @@ define <vscale x 2 x double> @ucvtf_d_nxv2i64(<vscale x 2 x i64> %a) {
%res = uitofp <vscale x 2 x i64> %a to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 4 x float> @fcvt_htos_movprfx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
; CHECK-LABEL: fcvt_htos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.s, p0/m, z1.h
; CHECK-NEXT: ret
%res = fpext <vscale x 4 x half> %b to <vscale x 4 x float>
ret <vscale x 4 x float> %res
}

define <vscale x 2 x double> @fcvt_htod_movprfx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
; CHECK-LABEL: fcvt_htod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.h
; CHECK-NEXT: ret
%res = fpext <vscale x 2 x half> %b to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 2 x double> @fcvt_stod_movprfx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
; CHECK-LABEL: fcvt_stod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%res = fpext <vscale x 2 x float> %b to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 4 x half> @fcvt_stoh_movprfx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: fcvt_stoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
%res = fptrunc <vscale x 4 x float> %b to <vscale x 4 x half>
ret <vscale x 4 x half> %res
}

define <vscale x 2 x half> @fcvt_dtoh_movprfx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: fcvt_dtoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.h, p0/m, z1.d
; CHECK-NEXT: ret
%res = fptrunc <vscale x 2 x double> %b to <vscale x 2 x half>
ret <vscale x 2 x half> %res
}

define <vscale x 2 x float> @fcvt_dtos_movprfx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: fcvt_dtos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.s, p0/m, z1.d
; CHECK-NEXT: ret
%res = fptrunc <vscale x 2 x double> %b to <vscale x 2 x float>
ret <vscale x 2 x float> %res
}

define <vscale x 8 x half> @scvtf_htoh_movprfx(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: scvtf_htoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%res = sitofp <vscale x 8 x i16> %b to <vscale x 8 x half>
ret <vscale x 8 x half> %res
}

define <vscale x 4 x float> @scvtf_stos_movprfx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: scvtf_stos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%res = sitofp <vscale x 4 x i32> %b to <vscale x 4 x float>
ret <vscale x 4 x float> %res
}

define <vscale x 2 x double> @scvtf_stod_movprfx(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) {
; CHECK-LABEL: scvtf_stod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i32> %b to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 2 x float> @scvtf_dtos_movprfx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: scvtf_dtos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.s, p0/m, z1.d
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i64> %b to <vscale x 2 x float>
ret <vscale x 2 x float> %res
}

define <vscale x 4 x half> @scvtf_stoh_movprfx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: scvtf_stoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.h, p0/m, z1.s
; CHECK-NEXT: ret
%res = sitofp <vscale x 4 x i32> %b to <vscale x 4 x half>
ret <vscale x 4 x half> %res
}

define <vscale x 2 x half> @scvtf_dtoh_movprfx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: scvtf_dtoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.h, p0/m, z1.d
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i64> %b to <vscale x 2 x half>
ret <vscale x 2 x half> %res
}

define <vscale x 2 x double> @scvtf_dtod_movprfx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: scvtf_dtod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i64> %b to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 4 x float> @ucvtf_stos_movprfx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: ucvtf_stos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%res = uitofp <vscale x 4 x i32> %b to <vscale x 4 x float>
ret <vscale x 4 x float> %res
}

define <vscale x 8 x half> @ucvtf_htoh_movprfx(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: ucvtf_htoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%res = uitofp <vscale x 8 x i16> %b to <vscale x 8 x half>
ret <vscale x 8 x half> %res
}

define <vscale x 2 x double> @ucvtf_stod_movprfx(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) {
; CHECK-LABEL: ucvtf_stod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i32> %b to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 4 x half> @ucvtf_stoh_movprfx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: ucvtf_stoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.h, p0/m, z1.s
; CHECK-NEXT: ret
%res = uitofp <vscale x 4 x i32> %b to <vscale x 4 x half>
ret <vscale x 4 x half> %res
}

define <vscale x 2 x float> @ucvtf_dtos_movprfx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: ucvtf_dtos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.s, p0/m, z1.d
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i64> %b to <vscale x 2 x float>
ret <vscale x 2 x float> %res
}

define <vscale x 2 x half> @ucvtf_dtoh_movprfx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: ucvtf_dtoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.h, p0/m, z1.d
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i64> %b to <vscale x 2 x half>
ret <vscale x 2 x half> %res
}

define <vscale x 2 x double> @ucvtf_dtod_movprfx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: ucvtf_dtod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i64> %b to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 8 x i16> @fcvtzs_htoh_movprfx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
; CHECK-LABEL: fcvtzs_htoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%res = fptosi <vscale x 8 x half> %b to <vscale x 8 x i16>
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @fcvtzs_stos_movprfx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: fcvtzs_stos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%res = fptosi <vscale x 4 x float> %b to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i32> @fcvtzs_dtos_movprfx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: fcvtzs_dtos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%res = fptosi <vscale x 2 x double> %b to <vscale x 2 x i32>
ret <vscale x 2 x i32> %res
}

define <vscale x 2 x i64> @fcvtzs_stod_movprfx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
; CHECK-LABEL: fcvtzs_stod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%res = fptosi <vscale x 2 x float> %b to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @fcvtzs_htos_movprfx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
; CHECK-LABEL: fcvtzs_htos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.h
; CHECK-NEXT: ret
%res = fptosi <vscale x 4 x half> %b to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @fcvtzs_htod_movprfx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
; CHECK-LABEL: fcvtzs_htod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.h
; CHECK-NEXT: ret
%res = fptosi <vscale x 2 x half> %b to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @fcvtzs_dtod_movprfx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: fcvtzs_dtod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%res = fptosi <vscale x 2 x double> %b to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 8 x i16> @fcvtzu_htoh_movprfx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
; CHECK-LABEL: fcvtzu_htoh_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%res = fptoui <vscale x 8 x half> %b to <vscale x 8 x i16>
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @fcvtzu_stos_movprfx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: fcvtzu_stos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%res = fptoui <vscale x 4 x float> %b to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i32> @fcvtzu_dtos_movprfx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: fcvtzu_dtos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%res = fptoui <vscale x 2 x double> %b to <vscale x 2 x i32>
ret <vscale x 2 x i32> %res
}

define <vscale x 2 x i64> @fcvtzu_stod_movprfx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
; CHECK-LABEL: fcvtzu_stod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%res = fptoui <vscale x 2 x float> %b to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @fcvtzu_htos_movprfx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
; CHECK-LABEL: fcvtzu_htos_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.h
; CHECK-NEXT: ret
%res = fptoui <vscale x 4 x half> %b to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @fcvtzu_htod_movprfx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
; CHECK-LABEL: fcvtzu_htod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.h
; CHECK-NEXT: ret
%res = fptoui <vscale x 2 x half> %b to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @fcvtzu_dtod_movprfx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: fcvtzu_dtod_movprfx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%res = fptoui <vscale x 2 x double> %b to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}
9 changes: 8 additions & 1 deletion llvm/test/CodeGen/AArch64/sve-fpext-load.ll
@@ -23,7 +23,9 @@ define <vscale x 4 x double> @ext4_f16_f64(<vscale x 4 x half> *%ptr, i64 %index
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.h
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvt z1.d, p0/m, z2.h
; CHECK-NEXT: ret
%load = load <vscale x 4 x half>, <vscale x 4 x half>* %ptr, align 4
@@ -43,10 +45,13 @@ define <vscale x 8 x double> @ext8_f16_f64(<vscale x 8 x half> *%ptr, i64 %index
; CHECK-NEXT: uunpklo z2.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
; CHECK-NEXT: uunpkhi z4.d, z0.s
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: fcvt z0.d, p0/m, z2.h
; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
; CHECK-NEXT: movprfx z2, z3
; CHECK-NEXT: fcvt z2.d, p0/m, z3.h
; CHECK-NEXT: movprfx z3, z4
; CHECK-NEXT: fcvt z3.d, p0/m, z4.h
; CHECK-NEXT: ret
%load = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 4
@@ -76,7 +81,9 @@ define <vscale x 4 x double> @ext4_f32_f64(<vscale x 4 x float> *%ptr, i64 %inde
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.s
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvt z1.d, p0/m, z2.s
; CHECK-NEXT: ret
%load = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 4
54 changes: 41 additions & 13 deletions llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -6,10 +6,12 @@
define <vscale x 8 x float> @fcvts_nxv8f16(<vscale x 8 x half> %a) {
; CHECK-LABEL: fcvts_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z1.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpkhi z2.s, z0.h
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.s, p0/m, z1.h
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvt z1.s, p0/m, z2.h
; CHECK-NEXT: ret
%res = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
@@ -19,10 +21,12 @@ define <vscale x 8 x float> @fcvts_nxv8f16(<vscale x 8 x half> %a) {
define <vscale x 4 x double> @fcvtd_nxv4f16(<vscale x 4 x half> %a) {
; CHECK-LABEL: fcvtd_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.h
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvt z1.d, p0/m, z2.h
; CHECK-NEXT: ret
%res = fpext <vscale x 4 x half> %a to <vscale x 4 x double>
@@ -33,15 +37,18 @@ define <vscale x 8 x double> @fcvtd_nxv8f16(<vscale x 8 x half> %a) {
; CHECK-LABEL: fcvtd_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpklo z1.s, z0.h
; CHECK-NEXT: uunpkhi z0.s, z0.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z0.s, z0.h
; CHECK-NEXT: uunpklo z2.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
; CHECK-NEXT: uunpkhi z4.d, z0.s
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: fcvt z0.d, p0/m, z2.h
; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
; CHECK-NEXT: movprfx z2, z3
; CHECK-NEXT: fcvt z2.d, p0/m, z3.h
; CHECK-NEXT: movprfx z3, z4
; CHECK-NEXT: fcvt z3.d, p0/m, z4.h
; CHECK-NEXT: ret
%res = fpext <vscale x 8 x half> %a to <vscale x 8 x double>
@@ -51,10 +58,12 @@ define <vscale x 8 x double> @fcvtd_nxv8f16(<vscale x 8 x half> %a) {
define <vscale x 4 x double> @fcvtd_nxv4f32(<vscale x 4 x float> %a) {
; CHECK-LABEL: fcvtd_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.s
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvt z1.d, p0/m, z2.s
; CHECK-NEXT: ret
%res = fpext <vscale x 4 x float> %a to <vscale x 4 x double>
@@ -64,14 +73,18 @@ define <vscale x 4 x double> @fcvtd_nxv4f32(<vscale x 4 x float> %a) {
define <vscale x 8 x double> @fcvtd_nxv8f32(<vscale x 8 x float> %a) {
; CHECK-LABEL: fcvtd_nxv8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z2.d, z0.s
; CHECK-NEXT: uunpkhi z3.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z4.d, z1.s
; CHECK-NEXT: uunpkhi z5.d, z1.s
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: fcvt z0.d, p0/m, z2.s
; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: fcvt z1.d, p0/m, z3.s
; CHECK-NEXT: movprfx z2, z4
; CHECK-NEXT: fcvt z2.d, p0/m, z4.s
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: fcvt z3.d, p0/m, z5.s
; CHECK-NEXT: ret
%res = fpext <vscale x 8 x float> %a to <vscale x 8 x double>
@@ -182,10 +195,12 @@ define <vscale x 8 x i16> @fcvtzs_h_nxv8f64(<vscale x 8 x double> %a) {
define <vscale x 4 x i64> @fcvtzs_d_nxv4f32(<vscale x 4 x float> %a) {
; CHECK-LABEL: fcvtzs_d_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.s
; CHECK-NEXT: ret
%res = fptosi <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -195,14 +210,18 @@ define <vscale x 4 x i64> @fcvtzs_d_nxv4f32(<vscale x 4 x float> %a) {
define <vscale x 16 x i32> @fcvtzs_s_nxv16f16(<vscale x 16 x half> %a) {
; CHECK-LABEL: fcvtzs_s_nxv16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: uunpkhi z3.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z4.s, z1.h
; CHECK-NEXT: uunpkhi z5.s, z1.h
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.h
; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: fcvtzs z1.s, p0/m, z3.h
; CHECK-NEXT: movprfx z2, z4
; CHECK-NEXT: fcvtzs z2.s, p0/m, z4.h
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: fcvtzs z3.s, p0/m, z5.h
; CHECK-NEXT: ret
%res = fptosi <vscale x 16 x half> %a to <vscale x 16 x i32>
@@ -228,10 +247,12 @@ define <vscale x 4 x i32> @fcvtzu_s_nxv4f64(<vscale x 4 x double> %a) {
define <vscale x 4 x i64> @fcvtzu_d_nxv4f32(<vscale x 4 x float> %a) {
; CHECK-LABEL: fcvtzu_d_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvtzu z1.d, p0/m, z2.s
; CHECK-NEXT: ret
%res = fptoui <vscale x 4 x float> %a to <vscale x 4 x i64>
@@ -274,15 +295,18 @@ define <vscale x 16 x float> @scvtf_s_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: scvtf_s_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sunpklo z1.h, z0.b
; CHECK-NEXT: sunpkhi z0.h, z0.b
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sunpkhi z0.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: scvtf z1.s, p0/m, z1.s
; CHECK-NEXT: sunpkhi z4.s, z0.h
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: scvtf z0.s, p0/m, z2.s
; CHECK-NEXT: scvtf z1.s, p0/m, z1.s
; CHECK-NEXT: movprfx z2, z3
; CHECK-NEXT: scvtf z2.s, p0/m, z3.s
; CHECK-NEXT: movprfx z3, z4
; CHECK-NEXT: scvtf z3.s, p0/m, z4.s
; CHECK-NEXT: ret
%res = sitofp <vscale x 16 x i8> %a to <vscale x 16 x float>
@@ -292,10 +316,12 @@ define <vscale x 16 x float> @scvtf_s_nxv16i8(<vscale x 16 x i8> %a) {
define <vscale x 4 x double> @scvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: scvtf_d_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpklo z1.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: scvtf z0.d, p0/m, z1.d
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: scvtf z1.d, p0/m, z2.d
; CHECK-NEXT: ret
%res = sitofp <vscale x 4 x i32> %a to <vscale x 4 x double>
@@ -352,10 +378,12 @@ define <vscale x 8 x half> @ucvtf_h_nxv8i64(<vscale x 8 x i64> %a) {
define <vscale x 4 x double> @ucvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: ucvtf_d_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z2.d, z0.s
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: ucvtf z1.d, p0/m, z2.d
; CHECK-NEXT: ret
%res = uitofp <vscale x 4 x i32> %a to <vscale x 4 x double>
36 changes: 19 additions & 17 deletions llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -2,7 +2,9 @@
; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem | FileCheck %s --check-prefixes=NHM
; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB
; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s --check-prefixes=FAST-SCALAR,BDW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,SKL
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR

define float @f32_no_daz(float %f) #0 {
; NHM-LABEL: f32_no_daz:
@@ -76,10 +78,10 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0
; BDW-NEXT: retq
;
; SKL-LABEL: v4f32_no_daz:
; SKL: # %bb.0:
; SKL-NEXT: vsqrtps %xmm0, %xmm0
; SKL-NEXT: retq
; FAST-VECTOR-LABEL: v4f32_no_daz:
; FAST-VECTOR: # %bb.0:
; FAST-VECTOR-NEXT: vsqrtps %xmm0, %xmm0
; FAST-VECTOR-NEXT: retq
%call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
ret <4 x float> %call
}
@@ -147,10 +149,10 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0
; BDW-NEXT: retq
;
; SKL-LABEL: v8f32_no_daz:
; SKL: # %bb.0:
; SKL-NEXT: vsqrtps %ymm0, %ymm0
; SKL-NEXT: retq
; FAST-VECTOR-LABEL: v8f32_no_daz:
; FAST-VECTOR: # %bb.0:
; FAST-VECTOR-NEXT: vsqrtps %ymm0, %ymm0
; FAST-VECTOR-NEXT: retq
%call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
ret <8 x float> %call
}
@@ -224,10 +226,10 @@ define <4 x float> @v4f32_daz(<4 x float> %f) #1 {
; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0
; BDW-NEXT: retq
;
; SKL-LABEL: v4f32_daz:
; SKL: # %bb.0:
; SKL-NEXT: vsqrtps %xmm0, %xmm0
; SKL-NEXT: retq
; FAST-VECTOR-LABEL: v4f32_daz:
; FAST-VECTOR: # %bb.0:
; FAST-VECTOR-NEXT: vsqrtps %xmm0, %xmm0
; FAST-VECTOR-NEXT: retq
%call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
ret <4 x float> %call
}
@@ -286,10 +288,10 @@ define <8 x float> @v8f32_daz(<8 x float> %f) #1 {
; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0
; BDW-NEXT: retq
;
; SKL-LABEL: v8f32_daz:
; SKL: # %bb.0:
; SKL-NEXT: vsqrtps %ymm0, %ymm0
; SKL-NEXT: retq
; FAST-VECTOR-LABEL: v8f32_daz:
; FAST-VECTOR: # %bb.0:
; FAST-VECTOR-NEXT: vsqrtps %ymm0, %ymm0
; FAST-VECTOR-NEXT: retq
%call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
ret <8 x float> %call
}
193 changes: 186 additions & 7 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -2,24 +2,22 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -force-target-instruction-cost=1 -S 2>%t | FileCheck %s --check-prefix=CHECK
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK-VF8
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED

target triple = "aarch64-linux-gnu"

; DEBUG: LV: Checking a loop in "f1"
; DEBUG: LV: Checking a loop in "main_vf_vscale_x_16"
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1

; DEBUG-FORCED: LV: Checking a loop in "f1"
; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_16"
; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1

define void @f1(i8* %A) #0 {
; CHECK-LABEL: @f1(
define void @main_vf_vscale_x_16(i8* %A) #0 {
; CHECK-LABEL: @main_vf_vscale_x_16(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
@@ -105,7 +103,7 @@ define void @f1(i8* %A) #0 {
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-VF8-LABEL: @f1(
; CHECK-VF8-LABEL: @main_vf_vscale_x_16(
; CHECK-VF8-NEXT: iter.check:
; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK-VF8: vector.main.loop.iter.check:
@@ -195,4 +193,185 @@ exit:
ret void
}


; DEBUG: LV: Checking a loop in "main_vf_vscale_x_2"
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1

; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_2"
; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1

; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2, because that is
; the minimum VF supported by SVE), we can still use a wide fixed-width VF=8
; for the epilogue if the vectors are known to be sufficiently wide. This
; information can be deduced from vscale_range or from VScaleForTuning
; (set by -mcpu/-mtune).
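;
; Worked example for this test, using the numbers above: the main loop uses
; VF=vscale x 2 and vscale_range(8,8) pins vscale to 8, so the main loop is
; expected to handle 2 x 8 = 16 lanes per iteration. The fixed-width epilogue
; candidate VF=8 is below that estimate, so it is now considered (and selected,
; per the DEBUG lines above), even though 8 is not known to be less than the
; symbolic vscale x 2.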
define void @main_vf_vscale_x_2(i64* %A) #0 vscale_range(8, 8) {
; CHECK-LABEL: @main_vf_vscale_x_2(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
; CHECK-NEXT: store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
; CHECK-NEXT: store i64 1, i64* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
; CHECK-VF8-NEXT: iter.check:
; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK-VF8: vector.main.loop.iter.check:
; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-VF8: vector.ph:
; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-VF8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
; CHECK-VF8-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
; CHECK-VF8-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF8-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
; CHECK-VF8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
; CHECK-VF8-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-VF8-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-VF8: middle.block:
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-VF8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
; CHECK-VF8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
; CHECK-VF8-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
; CHECK-VF8-NEXT: store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-VF8-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-VF8-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
; CHECK-VF8-NEXT: [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
; CHECK-VF8-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-VF8: vec.epilog.scalar.ph:
; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-VF8: for.body:
; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-VF8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
; CHECK-VF8-NEXT: store i64 1, i64* [[ARRAYIDX]], align 1
; CHECK-VF8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF8-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
; CHECK-VF8-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF8: exit.loopexit:
; CHECK-VF8-NEXT: br label [[EXIT]]
; CHECK-VF8: exit:
; CHECK-VF8-NEXT: ret void
;
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i64, i64* %A, i64 %iv
store i64 1, i64* %arrayidx, align 1
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp ne i64 %iv.next, 1024
br i1 %exitcond, label %for.body, label %exit

exit:
ret void
}

attributes #0 = { "target-features"="+sve" }