Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to double
// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP2]]
// CHECK-NEXT: [[REF_TMP_I_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double [[TMP1]], i32 0
// CHECK-NEXT: ret <1 x double> [[REF_TMP_I_0_VEC_INSERT]]
//
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
return vmul_n_f64(a, b);
Expand Down Expand Up @@ -552,12 +552,12 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
float64x1_t test_vmulx_lane_f64_0() {
Expand All @@ -574,13 +574,13 @@ float64x1_t test_vmulx_lane_f64_0() {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
float64x1_t test_vmulx_laneq_f64_2() {
Expand Down
42 changes: 20 additions & 22 deletions clang/test/CodeGen/arm-bf16-convert-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,35 +196,33 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP0]], <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP0]] to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> zeroinitializer to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP8]], <4 x bfloat> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP11]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP12]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]]
// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP7]], <4 x bfloat> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP12]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP13]]
//
bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
return vcvtq_low_bf16_f32(a);
Expand Down
137 changes: 85 additions & 52 deletions llvm/lib/Transforms/Scalar/SROA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1482,19 +1482,23 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }

/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
static std::pair<Type *, IntegerType *>
/// Returns: {CommonType, LargestIntegerType, OnlyIntrinsicUsers}
static std::tuple<Type *, IntegerType *, bool>
findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
uint64_t EndOffset) {
Type *Ty = nullptr;
bool TyIsCommon = true;
IntegerType *ITy = nullptr;
bool OnlyIntrinsicUsers = true;

// Note that we need to look at *every* alloca slice's Use to ensure we
// always get consistent results regardless of the order of slices.
for (AllocaSlices::const_iterator I = B; I != E; ++I) {
Use *U = I->getUse();
if (isa<IntrinsicInst>(*U->getUser()))
continue;
// We found a non-intrinsic user
OnlyIntrinsicUsers = false;
if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
continue;

Expand Down Expand Up @@ -1528,7 +1532,7 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
Ty = UserTy;
}

return {TyIsCommon ? Ty : nullptr, ITy};
return {TyIsCommon ? Ty : nullptr, ITy, OnlyIntrinsicUsers};
}

/// PHI instructions that use an alloca and are subsequently loaded can be
Expand Down Expand Up @@ -5209,63 +5213,92 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
/// promoted.
AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
Partition &P) {
const DataLayout &DL = AI.getDataLayout();
// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
Type *SliceTy = nullptr;
VectorType *SliceVecTy = nullptr;
const DataLayout &DL = AI.getDataLayout();
unsigned VScale = AI.getFunction()->getVScaleValue();

std::pair<Type *, IntegerType *> CommonUseTy =
findCommonType(P.begin(), P.end(), P.endOffset());
// Do all uses operate on the same type?
if (CommonUseTy.first) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
SliceTy = CommonUseTy.first;
SliceVecTy = dyn_cast<VectorType>(SliceTy);
}
}
// If not, can we find an appropriate subtype in the original allocated type?
if (!SliceTy)
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size()))
SliceTy = TypePartitionTy;

// If still not, can we use the largest bitwidth integer type used?
if (!SliceTy && CommonUseTy.second)
if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
SliceTy = CommonUseTy.second;
SliceVecTy = dyn_cast<VectorType>(SliceTy);
auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vector promotion.
//
// We prefer vector promotion over integer widening promotion when:
// - The vector element type is a floating-point type.
// - All the loads/stores to the alloca are vector loads/stores to the
// entire alloca.
//
// Otherwise when there is an integer vector with mixed
// loads/stores we prefer integer widening promotion because it's more
// likely the user is doing bitwise arithmetic and we generate better code.
VectorType *VecTy =
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
// If the vector element type is a floating-point type, we prefer vector
// promotion.
if (VecTy && VecTy->getElementType()->isFloatingPointTy())
return {VecTy, false, VecTy};

// Check if there is a common type that all slices of the partition use that
// spans the partition.
auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] =
findCommonType(P.begin(), P.end(), P.endOffset());
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
if (CommonUseSize.isFixed() &&
CommonUseSize.getFixedValue() >= P.size()) {
// We prefer vector promotion here because if vector promotion is viable
// and there is a common type used, then it implies the second listed
// condition for preferring vector promotion is true.
if (VecTy)
return {VecTy, false, VecTy};
return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
nullptr};
}
}
if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&
DL.isLegalInteger(P.size() * 8)) {
SliceTy = Type::getIntNTy(*C, P.size() * 8);
}

// If the common use types are not viable for promotion then attempt to find
// another type that is viable.
if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
// If there are only intrinsic users, try to represent as a legal integer
// type because we are probably just copying data around and the integer can
// be promoted.
if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};

// Can we find an appropriate subtype in the original allocated
// type?
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size())) {
VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
if (TypePartitionVecTy &&
checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
SliceTy = TypePartitionTy;
// If the partition is an integer array that can be spanned by a legal
// integer type, prefer to represent it as a legal integer type because
// it's more likely to be promotable.
if (TypePartitionTy->isArrayTy() &&
TypePartitionTy->getArrayElementType()->isIntegerTy() &&
DL.isLegalInteger(P.size() * 8))
TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
// There was no common type used, so we prefer integer widening promotion.
if (isIntegerWideningViable(P, TypePartitionTy, DL))
return {TypePartitionTy, true, nullptr};
if (VecTy)
return {VecTy, false, VecTy};
// If we couldn't promote with TypePartitionTy, try with the largest
// integer type used.
if (LargestIntTy &&
DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
isIntegerWideningViable(P, LargestIntTy, DL))
return {LargestIntTy, true, nullptr};
// Fallback to TypePartitionTy and we probably won't promote.
return {TypePartitionTy, false, nullptr};
}

if (!SliceTy)
SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
// Select the largest integer type used if it spans the partition.
if (LargestIntTy &&
DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
return {LargestIntTy, false, nullptr};

bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
// Select a legal integer type if it spans the partition.
if (DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};

// Fallback to an i8 array.
return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr};
};

VectorType *VecTy =
IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
if (VecTy)
SliceTy = VecTy;
auto [PartitionTy, IsIntegerPromotable, VecTy] = SelectPartitionTy();

// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
Expand All @@ -5274,7 +5307,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// P.beginOffset() can be non-zero even with the same type in a case with
// out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
AllocaInst *NewAI;
if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
Expand All @@ -5284,10 +5317,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
// If we will get at least this much alignment from the type alone, leave
// the alloca's alignment unconstrained.
const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
NewAI = new AllocaInst(
SliceTy, AI.getAddressSpace(), nullptr,
IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
PartitionTy, AI.getAddressSpace(), nullptr,
IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
AI.getIterator());
// Copy the old AI debug location over to the new one.
Expand Down
Loading