From b64130735b97f310ddc226c85fe718880fa6877a Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 12 Nov 2025 21:53:29 +0000 Subject: [PATCH 01/21] prefer integer partitions --- llvm/lib/Transforms/Scalar/SROA.cpp | 4 +++- .../Transforms/SROA/prefer-integer-partition.ll | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SROA/prefer-integer-partition.ll diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 5c60fad6f91aa..7905cfe95336d 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5234,7 +5234,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SliceTy = TypePartitionTy; // If still not, can we use the largest bitwidth integer type used? - if (!SliceTy && CommonUseTy.second) + // If SliceTy is a non-promotable aggregate, prefer to represent as an integer type + // because it's more likely to be promotable. + if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second) if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) { SliceTy = CommonUseTy.second; SliceVecTy = dyn_cast(SliceTy); diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll new file mode 100644 index 0000000000000..3606af8debd69 --- /dev/null +++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=sroa -S | FileCheck %s + +; Ensure that the [2 x half] alloca is spanned by an i32 partition. 
+ +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 42 to float +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [2 x half] + store i32 42, ptr %alloca + %val = load float, ptr %alloca + ret void +} From b4d756acbeabbdf58744d470fa96c5b418ca8637 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 12 Nov 2025 22:16:38 +0000 Subject: [PATCH 02/21] format --- llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 7905cfe95336d..e0eeb416092b1 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5234,8 +5234,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SliceTy = TypePartitionTy; // If still not, can we use the largest bitwidth integer type used? - // If SliceTy is a non-promotable aggregate, prefer to represent as an integer type - // because it's more likely to be promotable. + // If SliceTy is a non-promotable aggregate, prefer to represent as an integer + // type because it's more likely to be promotable. 
if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second) if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) { SliceTy = CommonUseTy.second; From 89e8fbce60922a98fac635c4be5418a12c9d7cc3 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 13 Nov 2025 01:06:26 +0000 Subject: [PATCH 03/21] julia fix --- llvm/lib/Transforms/Scalar/SROA.cpp | 4 +-- .../SROA/prefer-integer-partition.ll | 36 +++++++++++++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index e0eeb416092b1..883a9c0c4612c 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5241,8 +5241,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SliceTy = CommonUseTy.second; SliceVecTy = dyn_cast<VectorType>(SliceTy); } - if ((!SliceTy || (SliceTy->isArrayTy() && - SliceTy->getArrayElementType()->isIntegerTy())) && + // Try representing the partition as a legal integer type of the same size as the alloca. + if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) { SliceTy = Type::getIntNTy(*C, P.size() * 8); } diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll index 3606af8debd69..0ed400f18bc37 100644 --- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll +++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll @@ -1,10 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=sroa -S | FileCheck %s -; Ensure that the [2 x half] alloca is spanned by an i32 partition. +; Test that SROA converts array types to integer types for promotion. 
-define void @test() { -; CHECK-LABEL: @test( +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32-ni:10:11:12:13" + +define void @test_float_array_only_intrinsics() { +; CHECK-LABEL: @test_float_array_only_intrinsics( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + %src = alloca [2 x float], align 4 + %dst = alloca [2 x float], align 4 + + ; Initialize src + call void @llvm.lifetime.start.p0(i64 8, ptr %src) + call void @llvm.lifetime.start.p0(i64 8, ptr %dst) + + ; Only intrinsic uses - no scalar loads/stores to establish common type + call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false) + + call void @llvm.lifetime.end.p0(i64 8, ptr %dst) + call void @llvm.lifetime.end.p0(i64 8, ptr %src) + ret void +} + +define void @test_mixed_types() { +; CHECK-LABEL: @test_mixed_types( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 42 to float ; CHECK-NEXT: ret void @@ -15,3 +40,8 @@ entry: %val = load float, ptr %alloca ret void } + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) From 758018e1d4de6c02a207bcd7805c3094add22732 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 13 Nov 2025 01:14:32 +0000 Subject: [PATCH 04/21] format --- llvm/lib/Transforms/Scalar/SROA.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 883a9c0c4612c..f348bf2ca7353 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5241,9 +5241,9 @@ AllocaInst 
*SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SliceTy = CommonUseTy.second; SliceVecTy = dyn_cast<VectorType>(SliceTy); } - // Try representing the partition as a legal integer type of the same size as the alloca. - if ((!SliceTy || SliceTy->isArrayTy()) && - DL.isLegalInteger(P.size() * 8)) { + // Try representing the partition as a legal integer type of the same size as + // the alloca. + if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) { SliceTy = Type::getIntNTy(*C, P.size() * 8); } From 5a58db436fc9ea39677f454fa735af055037e6d2 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 13 Nov 2025 01:21:45 +0000 Subject: [PATCH 05/21] remove comment --- llvm/test/Transforms/SROA/prefer-integer-partition.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll index 0ed400f18bc37..78d3b22d3fdc0 100644 --- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll +++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll @@ -14,7 +14,6 @@ entry: %src = alloca [2 x float], align 4 %dst = alloca [2 x float], align 4 - ; Initialize src call void @llvm.lifetime.start.p0(i64 8, ptr %src) call void @llvm.lifetime.start.p0(i64 8, ptr %dst) From 74cd7b7021d0df53ef9940fe15403114a04606cc Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 19 Nov 2025 22:48:36 +0000 Subject: [PATCH 06/21] test --- llvm/test/Transforms/SROA/prefer-integer-partition.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll index 78d3b22d3fdc0..b9a7af6276565 100644 --- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll +++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll @@ -13,15 +13,15 @@ define void @test_float_array_only_intrinsics() { entry: %src = alloca [2 x float], align 4 %dst = alloca [2 x float], align 4 - + call 
void @llvm.lifetime.start.p0(i64 8, ptr %src) call void @llvm.lifetime.start.p0(i64 8, ptr %dst) - + ; Only intrinsic uses - no scalar loads/stores to establish common type call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false) - + call void @llvm.lifetime.end.p0(i64 8, ptr %dst) call void @llvm.lifetime.end.p0(i64 8, ptr %src) ret void From 7b49609c2de6461ee74f77d58e5df4a8906e6de6 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Tue, 25 Nov 2025 18:22:17 +0000 Subject: [PATCH 07/21] solve regression --- llvm/lib/Transforms/Scalar/SROA.cpp | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index f348bf2ca7353..e906358a9ba50 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5262,12 +5262,28 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size()); - bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); - - VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale); - if (VecTy) - SliceTy = VecTy; + // Prefer vector promotion over integer widening for floating-point vectors + // because it is more likely the user is just accessing whole vector elements + // and not doing bitsise arithmetic. 
+ bool PreferVectorPromotion = false; + if (auto *FixedVecSliceTy = dyn_cast(SliceTy)) + PreferVectorPromotion = FixedVecSliceTy->getElementType()->isFloatingPointTy(); + + bool IsIntegerPromotable = false; + VectorType *VecTy = nullptr; + + if (PreferVectorPromotion) { + // For float vectors, try vector promotion first + VecTy = isVectorPromotionViable(P, DL, VScale); + if (!VecTy) + IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); + } else { + // For integer vectors (especially small integers like i8), try integer + // widening first as InstCombine can optimize the resulting operations + IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); + if (!IsIntegerPromotable) + VecTy = isVectorPromotionViable(P, DL, VScale); + } // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that From 035337851338f723fa149a1bbaa5a73b9e5f56c3 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 26 Nov 2025 08:06:00 +0000 Subject: [PATCH 08/21] updated to fix regression --- llvm/lib/Transforms/Scalar/SROA.cpp | 124 +++++++++++----------------- 1 file changed, 50 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index e906358a9ba50..a017c2d3b49ff 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5209,81 +5209,57 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { /// promoted. AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) { - // Try to compute a friendly type for this partition of the alloca. This - // won't always succeed, in which case we fall back to a legal integer type - // or an i8 array of an appropriate size. 
- Type *SliceTy = nullptr; - VectorType *SliceVecTy = nullptr; const DataLayout &DL = AI.getDataLayout(); - unsigned VScale = AI.getFunction()->getVScaleValue(); - - std::pair<Type *, IntegerType *> CommonUseTy = - findCommonType(P.begin(), P.end(), P.endOffset()); - // Do all uses operate on the same type? - if (CommonUseTy.first) { - TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); - if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { - SliceTy = CommonUseTy.first; - SliceVecTy = dyn_cast<VectorType>(SliceTy); + auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> { + // First check if the partition is viable for vetor promotion. If it is + // via a floating-point vector, we are done because we would never prefer integer widening. + VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue()); + if (VecTy) { + if (VecTy->getElementType()->isFloatingPointTy()) { + return {VecTy, false, VecTy}; + } } - } - // If not, can we find an appropriate subtype in the original allocated type? - if (!SliceTy) - if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), - P.beginOffset(), P.size())) - SliceTy = TypePartitionTy; - - // If still not, can we use the largest bitwidth integer type used? - // If SliceTy is a non-promotable aggregate, prefer to represent as an integer - // type because it's more likely to be promotable. - if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second) - if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) { - SliceTy = CommonUseTy.second; - SliceVecTy = dyn_cast<VectorType>(SliceTy); + + // Otherwise, check if there is a common type that all slices of the + // partition use. Collect the largest integer type used as a backup. + auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); + // If there is a common type that spans the partition, use it. 
+ if (CommonUseTy.first) { + TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); + if (CommonUseSize.isFixed() && + CommonUseSize.getFixedValue() >= P.size()) { + + if (VecTy) + return {VecTy, false, VecTy}; + return {CommonUseTy.first, isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr}; + } } - // Try representing the partition as a legal integer type of the same size as - // the alloca. - if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) { - SliceTy = Type::getIntNTy(*C, P.size() * 8); - } - - // If the common use types are not viable for promotion then attempt to find - // another type that is viable. - if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale)) - if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), - P.beginOffset(), P.size())) { - VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy); - if (TypePartitionVecTy && - checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale)) - SliceTy = TypePartitionTy; + + // If not, can we find an appropriate subtype in the original allocated type? 
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { + if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8)) + TypePartitionTy = Type::getIntNTy(*C, P.size() * 8); + + if (isIntegerWideningViable(P, TypePartitionTy, DL)) + return {TypePartitionTy, true, nullptr}; + if (VecTy) + return {VecTy, false, VecTy}; + if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && isIntegerWideningViable(P, CommonUseTy.second, DL)) + return {CommonUseTy.second, true, nullptr}; + return {TypePartitionTy, false, nullptr}; } - if (!SliceTy) - SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); - assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size()); - - // Prefer vector promotion over integer widening for floating-point vectors - // because it is more likely the user is just accessing whole vector elements - // and not doing bitsise arithmetic. - bool PreferVectorPromotion = false; - if (auto *FixedVecSliceTy = dyn_cast(SliceTy)) - PreferVectorPromotion = FixedVecSliceTy->getElementType()->isFloatingPointTy(); - - bool IsIntegerPromotable = false; - VectorType *VecTy = nullptr; - - if (PreferVectorPromotion) { - // For float vectors, try vector promotion first - VecTy = isVectorPromotionViable(P, DL, VScale); - if (!VecTy) - IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); - } else { - // For integer vectors (especially small integers like i8), try integer - // widening first as InstCombine can optimize the resulting operations - IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); - if (!IsIntegerPromotable) - VecTy = isVectorPromotionViable(P, DL, VScale); - } + // If still not, can we use the largest bitwidth integer type used? 
+ if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) + return {CommonUseTy.second, false, nullptr}; + + if (DL.isLegalInteger(P.size() * 8)) + return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; + + return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr}; + }; + + auto [PartitionTy, IsIntegerPromotable, VecTy] = ComputePartitionTy(); // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that @@ -5292,7 +5268,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // P.beginOffset() can be non-zero even with the same type in a case with // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll). AllocaInst *NewAI; - if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) { + if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) { NewAI = &AI; // FIXME: We should be able to bail at this point with "nothing changed". // FIXME: We might want to defer PHI speculation until after here. @@ -5302,10 +5278,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy); + const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy); NewAI = new AllocaInst( - SliceTy, AI.getAddressSpace(), nullptr, - IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment, + PartitionTy, AI.getAddressSpace(), nullptr, + IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment, AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), AI.getIterator()); // Copy the old AI debug location over to the new one. 
From c55ee9f5592d1dbf63c2298accd80fc719af5b0d Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 26 Nov 2025 08:10:43 +0000 Subject: [PATCH 09/21] remove julia test --- .../SROA/prefer-integer-partition.ll | 82 +++++++++++++------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll index b9a7af6276565..5b639169cc207 100644 --- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll +++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll @@ -1,30 +1,65 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=sroa -S | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt %s -passes=sroa -S | FileCheck %s -; Test that SROA converts array types to integer types for promotion. +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" -target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32-ni:10:11:12:13" +%"struct.pbrt::RaySamples" = type { %struct.anon.45, %struct.anon.46, i8, %struct.anon.47 } +%struct.anon.45 = type { %"class.pbrt::Point2", float } +%"class.pbrt::Point2" = type { %"class.pbrt::Tuple2" } +%"class.pbrt::Tuple2" = type { float, float } +%struct.anon.46 = type { float, float, %"class.pbrt::Point2" } +%struct.anon.47 = type { float, %"class.pbrt::Point2" } -define void @test_float_array_only_intrinsics() { -; CHECK-LABEL: @test_float_array_only_intrinsics( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void +define <2 x float> @subsurface_test() local_unnamed_addr { +; CHECK-LABEL: define <2 x float> @subsurface_test() local_unnamed_addr { +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr inttoptr (i64 12 to ptr), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32 +; CHECK-NEXT: 
[[TMP3:%.*]] = trunc i32 [[TMP2]] to i1 +; CHECK-NEXT: br i1 [[TMP3]], label %[[BB4:.*]], label %[[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT:.*]] +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = load volatile { <2 x float>, <2 x float> }, ptr null, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP5]], 1 +; CHECK-NEXT: [[BC_I:%.*]] = bitcast <2 x float> [[TMP6]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[BC_I]], i64 1 +; CHECK-NEXT: [[BC2_I:%.*]] = bitcast <2 x float> [[TMP7]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[BC2_I]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP8]] to float +; CHECK-NEXT: [[DOTSROA_1_36_VEC_INSERT:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP9]] to float +; CHECK-NEXT: [[DOTSROA_1_40_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_1_36_VEC_INSERT]], float [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT]] +; CHECK: [[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT]]: +; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[DOTSROA_1_40_VEC_INSERT]], %[[BB4]] ], [ zeroinitializer, [[TMP0:%.*]] ] +; CHECK-NEXT: ret <2 x float> [[TMP10]] ; -entry: - %src = alloca [2 x float], align 4 - %dst = alloca [2 x float], align 4 + %1 = alloca %"struct.pbrt::RaySamples", align 4 + %2 = getelementptr i8, ptr %1, i64 36 + store i64 0, ptr %2, align 4 + %3 = load float, ptr inttoptr (i64 12 to ptr), align 4 + %4 = fptosi float %3 to i32 + %5 = trunc i32 %4 to i1 + br i1 %5, label %6, label %_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit - call void @llvm.lifetime.start.p0(i64 8, ptr %src) - call void @llvm.lifetime.start.p0(i64 8, ptr %dst) +6: ; preds = %0 + %7 = load volatile { <2 x float>, <2 x float> }, ptr null, align 8 + %8 = extractvalue { <2 x float>, <2 x float> } %7, 
0 + %9 = extractvalue { <2 x float>, <2 x float> } %7, 1 + store float 0.000000e+00, ptr %1, align 4 + %bc.i = bitcast <2 x float> %8 to <2 x i32> + %10 = extractelement <2 x i32> %bc.i, i64 1 + %bc2.i = bitcast <2 x float> %9 to <2 x i32> + %11 = extractelement <2 x i32> %bc2.i, i64 0 + store i32 %10, ptr %2, align 4 + %.sroa_idx1.i = getelementptr i8, ptr %1, i64 40 + store i32 %11, ptr %.sroa_idx1.i, align 4 + br label %_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit - ; Only intrinsic uses - no scalar loads/stores to establish common type - call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false) - call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false) - call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false) - - call void @llvm.lifetime.end.p0(i64 8, ptr %dst) - call void @llvm.lifetime.end.p0(i64 8, ptr %src) - ret void +_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit: ; preds = %0, %6 + %12 = getelementptr inbounds nuw i8, ptr %1, i64 36 + %.sroa.01.0.copyload = load <2 x float>, ptr %12, align 4 + ret <2 x float> %.sroa.01.0.copyload } define void @test_mixed_types() { @@ -39,8 +74,3 @@ entry: %val = load float, ptr %alloca ret void } - -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) From ee7fb53531008e1088d8115557ab547f842bcdb9 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 26 Nov 2025 08:10:51 +0000 Subject: [PATCH 10/21] format --- llvm/lib/Transforms/Scalar/SROA.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a017c2d3b49ff..f583dd67d60a8 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ 
-5212,8 +5212,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, const DataLayout &DL = AI.getDataLayout(); auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> { // First check if the partition is viable for vetor promotion. If it is - // via a floating-point vector, we are done because we would never prefer integer widening. - VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue()); + // via a floating-point vector, we are done because we would never prefer + // integer widening. + VectorType *VecTy = + isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue()); if (VecTy) { if (VecTy->getElementType()->isFloatingPointTy()) { return {VecTy, false, VecTy}; @@ -5231,26 +5233,34 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, if (VecTy) return {VecTy, false, VecTy}; - return {CommonUseTy.first, isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr}; + return {CommonUseTy.first, + isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr}; } } - // If not, can we find an appropriate subtype in the original allocated type? - if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { - if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8)) + // If not, can we find an appropriate subtype in the original allocated + // type? 
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), + P.beginOffset(), P.size())) { + if (TypePartitionTy->isArrayTy() && + TypePartitionTy->getArrayElementType()->isIntegerTy() && + DL.isLegalInteger(P.size() * 8)) TypePartitionTy = Type::getIntNTy(*C, P.size() * 8); - + if (isIntegerWideningViable(P, TypePartitionTy, DL)) return {TypePartitionTy, true, nullptr}; if (VecTy) return {VecTy, false, VecTy}; - if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && isIntegerWideningViable(P, CommonUseTy.second, DL)) + if (CommonUseTy.second && + DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && + isIntegerWideningViable(P, CommonUseTy.second, DL)) return {CommonUseTy.second, true, nullptr}; return {TypePartitionTy, false, nullptr}; } // If still not, can we use the largest bitwidth integer type used? - if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) + if (CommonUseTy.second && + DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) return {CommonUseTy.second, false, nullptr}; if (DL.isLegalInteger(P.size() * 8)) From 1aa16e3737600aa1daad2c2000c09fc939c4da33 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 26 Nov 2025 21:00:40 +0000 Subject: [PATCH 11/21] improvements on arm --- .../AArch64/neon-scalar-x-indexed-elem.c | 98 +++----- .../CodeGen/arm-bf16-convert-intrinsics.c | 237 ++++++++---------- 2 files changed, 143 insertions(+), 192 deletions(-) diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c index 9b98126500444..2b1af62789eac 100644 --- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c +++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c @@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) { // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { // 
CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0 -// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0 -// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]]) +// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0 +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]]) // CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0 // CHECK-NEXT: ret <1 x double> [[VSET_LANE]] // @@ -196,19 +196,13 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) { // CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 -// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 -// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64 -// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x 
double> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> // CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) // CHECK-NEXT: ret <1 x double> [[FMLA2]] // @@ -219,20 +213,14 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 -// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64 -// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64 -// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 
x i8> [[TMP5]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> // CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) // CHECK-NEXT: ret <1 x double> [[FMLA2]] // @@ -243,21 +231,16 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 -// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 -// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x 
i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]]) -// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP10]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) +// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP7]] // float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfma_laneq_f64(a, b, v, 0); @@ -266,22 +249,17 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 -// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]] -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64 -// CHECK-NEXT: 
[[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]]) -// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP10]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) +// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP7]] // float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfms_laneq_f64(a, b, v, 0); @@ -555,8 +533,8 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x 
double> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0 -// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 -// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]]) +// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]]) // CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0 // CHECK-NEXT: ret <1 x double> [[VSET_LANE]] // diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c index 65a23dc0325c8..ee1c1af53811d 100644 --- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c @@ -26,21 +26,19 @@ // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> -// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// 
CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: @@ -49,11 +47,10 @@ // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16> // CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP7]] +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP5]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP6]] // float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { return vcvt_f32_bf16(a); @@ -64,22 +61,20 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // 
CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: @@ -95,11 +90,10 @@ float32x4_t 
test_vcvt_f32_bf16(bfloat16x4_t a) { // CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16> // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]] +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]] // float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { return vcvtq_low_f32_bf16(a); @@ -110,22 +104,20 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> 
[[VSHLL_N_I_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: @@ -141,11 +133,10 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { // CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16> // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]] +// CHECK-A32-SOFTFP-NEXT: 
[[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]] // float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { return vcvtq_high_f32_bf16(a); @@ -153,33 +144,30 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { // CHECK-A64-LABEL: @test_vcvt_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat> -// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP3]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> +// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8> +// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> +// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: 
[[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) // CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP7]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]] // bfloat16x4_t 
test_vcvt_bf16_f32(float32x4_t a) { return vcvt_bf16_f32(a); @@ -187,44 +175,36 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> -// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> +// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat> -// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) -// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP0]], <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) +// CHECK-A32-HARDFP-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP0]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP8]], <4 x bfloat> [[TMP9]], <8 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32> -// 
CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP11]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP12]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]] +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> zeroinitializer to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP6]], <8 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP11]] // bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { return vcvtq_low_bf16_f32(a); @@ -232,23 +212,18 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16> -// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat> -// CHECK-A64-NEXT: [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> -// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> -// 
CHECK-A64-NEXT: [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> -// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP8]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> +// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <16 x i8> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]] @@ -258,29 +233,27 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat> // 
CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32> // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32> 
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP8]], <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP10]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[TMP11]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP20]] +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP13]], <4 x bfloat> [[TMP14]], <8 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP15]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP16]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> [[TMP17]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP18]] to <4 x i32> +// 
CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP19]] // bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { return vcvtq_high_bf16_f32(inactive, a); @@ -308,7 +281,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) { // CHECK-LABEL: @test_vcvtah_f32_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16 -// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32 +// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32 // CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16 // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float // CHECK-NEXT: ret float [[TMP1]] From 3c3a6773827811c9099032b98c20d76a0f17a412 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 26 Nov 2025 21:50:11 +0000 Subject: [PATCH 12/21] arm changes --- .../AArch64/neon-scalar-x-indexed-elem.c | 120 +++++---- .../CodeGen/arm-bf16-convert-intrinsics.c | 231 ++++++++++-------- 2 files changed, 199 insertions(+), 152 deletions(-) diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c index 2b1af62789eac..a86a80a939b16 100644 --- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c +++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c @@ -56,8 +56,8 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to double // CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]] -// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP2]] +// CHECK-NEXT: [[REF_TMP_I_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double [[TMP1]], i32 0 +// CHECK-NEXT: ret <1 x double> [[REF_TMP_I_0_VEC_INSERT]] // float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) { return vmul_n_f64(a, b); @@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) { // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x 
double> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0 -// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]]) +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0 +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]]) // CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0 // CHECK-NEXT: ret <1 x double> [[VSET_LANE]] // @@ -196,13 +196,19 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) { // CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64 +// CHECK-NEXT: 
[[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> // CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) // CHECK-NEXT: ret <1 x double> [[FMLA2]] // @@ -213,14 +219,20 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <1 x double> [[FNEG]] to i64 +// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64 +// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> // CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) // CHECK-NEXT: ret <1 x double> [[FMLA2]] // @@ -231,16 +243,21 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double 
[[EXTRACT]], double [[TMP3]]) -// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP7]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 +// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]]) +// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP10]] // float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfma_laneq_f64(a, b, v, 0); @@ -249,17 +266,22 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64( // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] 
= bitcast <1 x double> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) -// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP7]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64 +// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 +// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]]) +// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP10]] // float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfms_laneq_f64(a, b, v, 0); @@ -530,12 +552,12 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) { // CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0( // CHECK-SAME: ) 
#[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 -// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]]) -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0 +// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0 +// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0 +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], i32 0 +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]]) +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0 // CHECK-NEXT: ret <1 x double> [[VSET_LANE]] // float64x1_t test_vmulx_lane_f64_0() { @@ -552,13 +574,13 @@ float64x1_t test_vmulx_lane_f64_0() { // CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0 +// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
double> undef, double 0x3FD6304BC43AB5C2, i32 0 +// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0 +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], <2 x i32> +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1 // CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0 // CHECK-NEXT: ret <1 x double> [[VSET_LANE]] // float64x1_t test_vmulx_laneq_f64_2() { diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c index ee1c1af53811d..b7f961e4ce15c 100644 --- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c @@ -26,19 +26,21 @@ // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> -// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to 
<4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: @@ -47,10 +49,11 @@ // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16> // CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP5]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP6]] +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// 
CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { return vcvt_f32_bf16(a); @@ -61,20 +64,22 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> 
[[TMP3]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: @@ -90,10 +95,11 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { // CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16> // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]] +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]] // float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { return vcvtq_low_f32_bf16(a); @@ -104,20 +110,22 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] +// 
CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: @@ -133,10 +141,11 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { // CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16> // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], 
splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]] +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]] // float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { return vcvtq_high_f32_bf16(a); @@ -144,30 +153,33 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { // CHECK-A64-LABEL: @test_vcvt_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8> -// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> -// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat> +// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP3]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) +// CHECK-A32-HARDFP-NEXT: 
[[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) // CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x 
i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP7]] // bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { return vcvt_bf16_f32(a); @@ -175,36 +187,42 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8> -// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> 
[[VCVTFP2BF_I]]) // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> zeroinitializer to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP6]], <8 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = 
bitcast <2 x i32> [[TMP3]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> zeroinitializer to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP7]], <4 x bfloat> [[TMP8]], <8 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32> // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP11]] +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP12]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP13]] // bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { return vcvtq_low_bf16_f32(a); @@ -212,18 +230,23 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> -// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> -// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <16 x i8> -// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP1:%.*]] = 
bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat> +// CHECK-A64-NEXT: [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-A64-NEXT: [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP8]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]] @@ -233,27 +256,29 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat> // CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32> // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> 
[[TMP1]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP8]], <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP10]] to <2 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[TMP11]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x 
i32> [[TMP8]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32> // CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP13]], <4 x bfloat> [[TMP14]], <8 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP15]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP16]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> [[TMP17]] to <8 x bfloat> -// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP18]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP19]] +// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP20]] // bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { return 
vcvtq_high_bf16_f32(inactive, a); @@ -281,7 +306,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) { // CHECK-LABEL: @test_vcvtah_f32_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16 -// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32 // CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16 // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float // CHECK-NEXT: ret float [[TMP1]] From 2b45d9a2646e22b73f2464cfd27ceca3cde9f8e3 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 00:53:50 +0000 Subject: [PATCH 13/21] julia fix --- llvm/lib/Transforms/Scalar/SROA.cpp | 36 ++++++++++++++++---------- llvm/test/Transforms/SROA/basictest.ll | 12 ++++----- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index f583dd67d60a8..a0537b8ff19ef 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1482,12 +1482,14 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. -static std::pair +/// Returns: {CommonType, LargestIntegerType, OnlyIntrinsicUsers} +static std::tuple findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset) { Type *Ty = nullptr; bool TyIsCommon = true; IntegerType *ITy = nullptr; + bool OnlyIntrinsicUsers = true; // Note that we need to look at *every* alloca slice's Use to ensure we // always get consistent results regardless of the order of slices. 
@@ -1495,6 +1497,8 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, Use *U = I->getUse(); if (isa(*U->getUser())) continue; + // We found a non-intrinsic user + OnlyIntrinsicUsers = false; if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) continue; @@ -1528,7 +1532,7 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, Ty = UserTy; } - return {TyIsCommon ? Ty : nullptr, ITy}; + return {TyIsCommon ? Ty : nullptr, ITy, OnlyIntrinsicUsers}; } /// PHI instructions that use an alloca and are subsequently loaded can be @@ -5224,20 +5228,24 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // Otherwise, check if there is a common type that all slices of the // partition use. Collect the largest integer type used as a backup. - auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); + auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] = + findCommonType(P.begin(), P.end(), P.endOffset()); // If there is a common type that spans the partition, use it. - if (CommonUseTy.first) { - TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); + if (CommonUseTy) { + TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy); if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { if (VecTy) return {VecTy, false, VecTy}; - return {CommonUseTy.first, - isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr}; + return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL), + nullptr}; } } + if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8)) + return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; + // If not, can we find an appropriate subtype in the original allocated // type? 
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -5251,17 +5259,17 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, return {TypePartitionTy, true, nullptr}; if (VecTy) return {VecTy, false, VecTy}; - if (CommonUseTy.second && - DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && - isIntegerWideningViable(P, CommonUseTy.second, DL)) - return {CommonUseTy.second, true, nullptr}; + if (LargestIntTy && + DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() && + isIntegerWideningViable(P, LargestIntTy, DL)) + return {LargestIntTy, true, nullptr}; return {TypePartitionTy, false, nullptr}; } // If still not, can we use the largest bitwidth integer type used? - if (CommonUseTy.second && - DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) - return {CommonUseTy.second, false, nullptr}; + if (LargestIntTy && + DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size()) + return {LargestIntTy, false, nullptr}; if (DL.isLegalInteger(P.size() * 8)) return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll index 15803f7b5a25b..b530742bd66ac 100644 --- a/llvm/test/Transforms/SROA/basictest.ll +++ b/llvm/test/Transforms/SROA/basictest.ll @@ -785,7 +785,7 @@ define i64 @test19(ptr %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[A_SROA_2_0_X_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8 -; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr [[A_SROA_2_0_X_SROA_IDX]], align 1 +; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[A_SROA_2_0_X_SROA_IDX]], align 1 ; CHECK-NEXT: ret i64 [[A_SROA_0_0_COPYLOAD]] ; entry: @@ -809,7 +809,7 @@ define i64 @test19_addrspacecast(ptr %x) { ; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr [[X:%.*]] to ptr addrspace(1) ; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = 
load i64, ptr addrspace(1) [[CAST1]], align 1 ; CHECK-NEXT: [[A_SROA_2_0_CAST1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[CAST1]], i16 8 -; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(1) [[A_SROA_2_0_CAST1_SROA_IDX]], align 1 +; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(1) [[A_SROA_2_0_CAST1_SROA_IDX]], align 1 ; CHECK-NEXT: ret i64 [[A_SROA_0_0_COPYLOAD]] ; entry: @@ -1332,10 +1332,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4 ; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [ -; CHECK-NEXT: i32 4, label [[BB4:%.*]] -; CHECK-NEXT: i32 3, label [[BB3:%.*]] -; CHECK-NEXT: i32 2, label [[BB2:%.*]] -; CHECK-NEXT: i32 1, label [[BB1:%.*]] +; CHECK-NEXT: i32 4, label [[BB4:%.*]] +; CHECK-NEXT: i32 3, label [[BB3:%.*]] +; CHECK-NEXT: i32 2, label [[BB2:%.*]] +; CHECK-NEXT: i32 1, label [[BB1:%.*]] ; CHECK-NEXT: ] ; CHECK: bb4: ; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3 From 6d3b4c9df261d444937cba51e48116b146fa31db Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 00:56:01 +0000 Subject: [PATCH 14/21] julia test --- .../SROA/prefer-integer-partition.ll | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll index 5b639169cc207..bf0d2562a8745 100644 --- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll +++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll @@ -62,6 +62,28 @@ _ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit: ; preds = %0, %6 ret <2 x float> %.sroa.01.0.copyload } +define void @test_float_array_only_intrinsics() { +; CHECK-LABEL: @test_float_array_only_intrinsics( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + %src = alloca [2 x float], align 4 + %dst = alloca [2 x float], align 4 + + call void 
@llvm.lifetime.start.p0(i64 8, ptr %src) + call void @llvm.lifetime.start.p0(i64 8, ptr %dst) + + ; Only intrinsic uses - no scalar loads/stores to establish common type + call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false) + + call void @llvm.lifetime.end.p0(i64 8, ptr %dst) + call void @llvm.lifetime.end.p0(i64 8, ptr %src) + ret void +} + define void @test_mixed_types() { ; CHECK-LABEL: @test_mixed_types( ; CHECK-NEXT: entry: From 72599ba978dd2f01e1305f7cd8fafbb1067f193a Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 01:12:11 +0000 Subject: [PATCH 15/21] adding comments --- llvm/lib/Transforms/Scalar/SROA.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a0537b8ff19ef..ca2c20ff19ab7 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5214,7 +5214,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) { const DataLayout &DL = AI.getDataLayout(); - auto ComputePartitionTy = [&]() -> std::tuple { + // Try to compute a friendly type for this partition of the alloca. This + // won't always succeed, in which case we fall back to a legal integer type + // or an i8 array of an appropriate size. + auto SelectPartitionTy = [&]() -> std::tuple { // First check if the partition is viable for vetor promotion. If it is // via a floating-point vector, we are done because we would never prefer // integer widening. @@ -5225,36 +5228,35 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, return {VecTy, false, VecTy}; } } - // Otherwise, check if there is a common type that all slices of the - // partition use. 
Collect the largest integer type used as a backup. + // partition use that spans the partition. auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] = findCommonType(P.begin(), P.end(), P.endOffset()); - // If there is a common type that spans the partition, use it. if (CommonUseTy) { TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy); if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { - + // Prefer vector promotion here because we already calculated it. if (VecTy) return {VecTy, false, VecTy}; return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL), nullptr}; } } - + // If there are only intrinsic users, try to represent as a legal integer type + // because we are probably just copying data around and the integer can be promoted. if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8)) return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; - - // If not, can we find an appropriate subtype in the original allocated + // Can we find an appropriate subtype in the original allocated // type? if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { + // If the partition is an integer array that can be spanned by a legal integer type, + // prefer to represent it as a legal integer type because it's more likely to be promotable. 
if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8)) TypePartitionTy = Type::getIntNTy(*C, P.size() * 8); - if (isIntegerWideningViable(P, TypePartitionTy, DL)) return {TypePartitionTy, true, nullptr}; if (VecTy) @@ -5277,7 +5279,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr}; }; - auto [PartitionTy, IsIntegerPromotable, VecTy] = ComputePartitionTy(); + auto [PartitionTy, IsIntegerPromotable, VecTy] = SelectPartitionTy(); // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that From 54ad0c788b81a119ee11eb25c359f4cd8123d6d5 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 01:51:35 +0000 Subject: [PATCH 16/21] add comments --- llvm/lib/Transforms/Scalar/SROA.cpp | 35 +++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index ca2c20ff19ab7..52bdb4668fa24 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5218,35 +5218,41 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. auto SelectPartitionTy = [&]() -> std::tuple { - // First check if the partition is viable for vetor promotion. If it is - // via a floating-point vector, we are done because we would never prefer - // integer widening. + // First check if the partition is viable for vetor promotion. + // We prefer vector promotion over integer widening promotion when: + // - The vector element type is a floating-point type. + // - All the loads/stores to the alloca are vector loads/stores to the entire alloca. 
+ // Otherwise when there is a integer vector with mixed loads/stores we prefer integer widening + // promotion because it's more likely the user is doing bitwise arithmetic and we + // generate better code. VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue()); - if (VecTy) { - if (VecTy->getElementType()->isFloatingPointTy()) { - return {VecTy, false, VecTy}; - } - } - // Otherwise, check if there is a common type that all slices of the - // partition use that spans the partition. + // If the vector element type is a floating-point type, we prefer vector promotion. + if (VecTy && VecTy->getElementType()->isFloatingPointTy()) + return {VecTy, false, VecTy}; + + // Check if there is a common type that all slices of the partition use that spans the partition. auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] = findCommonType(P.begin(), P.end(), P.endOffset()); if (CommonUseTy) { TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy); if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { - // Prefer vector promotion here because we already calculated it. + // We prefer vector promotion here because if vector promotion is viable and + // there is a common type used, then it implies the second listed condition for prefering + // vector promotion is true. if (VecTy) return {VecTy, false, VecTy}; return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL), nullptr}; } } + // If there are only intrinsic users, try to represent as a legal integer type // because we are probably just copying data around and the integer can be promoted. if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8)) return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; + // Can we find an appropriate subtype in the original allocated // type? 
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -5257,25 +5263,30 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8)) TypePartitionTy = Type::getIntNTy(*C, P.size() * 8); + // There was no common type used, so we prefer integer widening promotion. if (isIntegerWideningViable(P, TypePartitionTy, DL)) return {TypePartitionTy, true, nullptr}; if (VecTy) return {VecTy, false, VecTy}; + // If we couldn't promotion with TypePartitionTy, try with the largest integer type used. if (LargestIntTy && DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() && isIntegerWideningViable(P, LargestIntTy, DL)) return {LargestIntTy, true, nullptr}; + // Fallback to TypePartitionTy and we probably won't promote. return {TypePartitionTy, false, nullptr}; } - // If still not, can we use the largest bitwidth integer type used? + // Select the largest integer type used if it spans the partition. if (LargestIntTy && DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size()) return {LargestIntTy, false, nullptr}; + // Select a legal integer type if it spans the partition. if (DL.isLegalInteger(P.size() * 8)) return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; + // Fallback to an i8 array. 
return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr}; }; From be50248fb99d7a4966390ca30d361b6e242a0fab Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 01:51:45 +0000 Subject: [PATCH 17/21] format' --- llvm/lib/Transforms/Scalar/SROA.cpp | 33 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 52bdb4668fa24..ce7c65f65845c 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5221,26 +5221,28 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // First check if the partition is viable for vetor promotion. // We prefer vector promotion over integer widening promotion when: // - The vector element type is a floating-point type. - // - All the loads/stores to the alloca are vector loads/stores to the entire alloca. - // Otherwise when there is a integer vector with mixed loads/stores we prefer integer widening - // promotion because it's more likely the user is doing bitwise arithmetic and we - // generate better code. + // - All the loads/stores to the alloca are vector loads/stores to the + // entire alloca. Otherwise when there is a integer vector with mixed + // loads/stores we prefer integer widening promotion because it's more + // likely the user is doing bitwise arithmetic and we generate better code. VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue()); - // If the vector element type is a floating-point type, we prefer vector promotion. + // If the vector element type is a floating-point type, we prefer vector + // promotion. if (VecTy && VecTy->getElementType()->isFloatingPointTy()) return {VecTy, false, VecTy}; - // Check if there is a common type that all slices of the partition use that spans the partition. 
+ // Check if there is a common type that all slices of the partition use that + // spans the partition. auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] = findCommonType(P.begin(), P.end(), P.endOffset()); if (CommonUseTy) { TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy); if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { - // We prefer vector promotion here because if vector promotion is viable and - // there is a common type used, then it implies the second listed condition for prefering - // vector promotion is true. + // We prefer vector promotion here because if vector promotion is viable + // and there is a common type used, then it implies the second listed + // condition for prefering vector promotion is true. if (VecTy) return {VecTy, false, VecTy}; return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL), @@ -5248,8 +5250,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } } - // If there are only intrinsic users, try to represent as a legal integer type - // because we are probably just copying data around and the integer can be promoted. + // If there are only intrinsic users, try to represent as a legal integer + // type because we are probably just copying data around and the integer can + // be promoted. if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8)) return {Type::getIntNTy(*C, P.size() * 8), false, nullptr}; @@ -5257,8 +5260,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // type? if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { - // If the partition is an integer array that can be spanned by a legal integer type, - // prefer to represent it as a legal integer type because it's more likely to be promotable. 
+ // If the partition is an integer array that can be spanned by a legal + // integer type, prefer to represent it as a legal integer type because + // it's more likely to be promotable. if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8)) @@ -5268,7 +5272,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, return {TypePartitionTy, true, nullptr}; if (VecTy) return {VecTy, false, VecTy}; - // If we couldn't promotion with TypePartitionTy, try with the largest integer type used. + // If we couldn't promotion with TypePartitionTy, try with the largest + // integer type used. if (LargestIntTy && DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() && isIntegerWideningViable(P, LargestIntTy, DL)) From 36c50cd84ebb636deed51a2c6fcbeabd7c9b0373 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 01:52:54 +0000 Subject: [PATCH 18/21] format --- llvm/lib/Transforms/Scalar/SROA.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index ce7c65f65845c..5ea27873ca734 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5219,10 +5219,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // or an i8 array of an appropriate size. auto SelectPartitionTy = [&]() -> std::tuple { // First check if the partition is viable for vetor promotion. + // // We prefer vector promotion over integer widening promotion when: // - The vector element type is a floating-point type. // - All the loads/stores to the alloca are vector loads/stores to the - // entire alloca. Otherwise when there is a integer vector with mixed + // entire alloca. 
+ // + // Otherwise when there is a integer vector with mixed // loads/stores we prefer integer widening promotion because it's more // likely the user is doing bitwise arithmetic and we generate better code. VectorType *VecTy = From 1a2f9d19ff8c151fb7fc67d3e06fb1c728b4cfeb Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 01:53:02 +0000 Subject: [PATCH 19/21] format --- llvm/lib/Transforms/Scalar/SROA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 5ea27873ca734..9d59000ec8b9b 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5224,7 +5224,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // - The vector element type is a floating-point type. // - All the loads/stores to the alloca are vector loads/stores to the // entire alloca. - // + // // Otherwise when there is a integer vector with mixed // loads/stores we prefer integer widening promotion because it's more // likely the user is doing bitwise arithmetic and we generate better code. 
From dd03b0438b05ae1bee5a426ee9e6de7c8344559a Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 03:37:45 +0000 Subject: [PATCH 20/21] ptx test update --- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 84 ++++++++------------- 1 file changed, 31 insertions(+), 53 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index ca2914a2e8043..16ca96e5fbe84 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -455,64 +455,42 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly ; PTX-NEXT: .local .align 8 .b8 __local_depot9[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; -; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<47>; +; PTX-NEXT: .reg .b64 %rd<30>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot9; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0]; -; PTX-NEXT: add.u64 %rd2, %SPL, 0; -; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd2+4], %r1; -; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1]; -; PTX-NEXT: st.local.b32 [%rd2], %r2; -; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1]; -; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1]; -; PTX-NEXT: shl.b64 %rd5, %rd4, 8; -; PTX-NEXT: or.b64 %rd6, %rd5, %rd3; -; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2]; -; PTX-NEXT: shl.b64 %rd8, %rd7, 16; -; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3]; -; PTX-NEXT: shl.b64 %rd10, %rd9, 24; -; PTX-NEXT: or.b64 %rd11, %rd10, %rd8; -; PTX-NEXT: or.b64 %rd12, %rd11, %rd6; -; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4]; -; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5]; -; PTX-NEXT: shl.b64 %rd15, %rd14, 8; -; PTX-NEXT: or.b64 %rd16, %rd15, %rd13; -; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6]; -; PTX-NEXT: shl.b64 %rd18, %rd17, 16; -; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7]; -; PTX-NEXT: shl.b64 %rd20, %rd19, 24; 
-; PTX-NEXT: or.b64 %rd21, %rd20, %rd18; -; PTX-NEXT: or.b64 %rd22, %rd21, %rd16; -; PTX-NEXT: shl.b64 %rd23, %rd22, 32; -; PTX-NEXT: or.b64 %rd24, %rd23, %rd12; -; PTX-NEXT: st.volatile.b64 [%SP], %rd24; -; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8]; -; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9]; -; PTX-NEXT: shl.b64 %rd27, %rd26, 8; -; PTX-NEXT: or.b64 %rd28, %rd27, %rd25; -; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10]; -; PTX-NEXT: shl.b64 %rd30, %rd29, 16; -; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11]; -; PTX-NEXT: shl.b64 %rd32, %rd31, 24; -; PTX-NEXT: or.b64 %rd33, %rd32, %rd30; -; PTX-NEXT: or.b64 %rd34, %rd33, %rd28; -; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12]; -; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13]; -; PTX-NEXT: shl.b64 %rd37, %rd36, 8; -; PTX-NEXT: or.b64 %rd38, %rd37, %rd35; -; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14]; -; PTX-NEXT: shl.b64 %rd40, %rd39, 16; -; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15]; -; PTX-NEXT: shl.b64 %rd42, %rd41, 24; -; PTX-NEXT: or.b64 %rd43, %rd42, %rd40; -; PTX-NEXT: or.b64 %rd44, %rd43, %rd38; -; PTX-NEXT: shl.b64 %rd45, %rd44, 32; -; PTX-NEXT: or.b64 %rd46, %rd45, %rd34; -; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.b32 %rd3, [memcpy_to_param_param_1+4]; +; PTX-NEXT: shl.b64 %rd4, %rd3, 32; +; PTX-NEXT: ld.param.b32 %rd5, [memcpy_to_param_param_1]; +; PTX-NEXT: or.b64 %rd6, %rd4, %rd5; +; PTX-NEXT: st.b64 [%SP], %rd6; +; PTX-NEXT: ld.volatile.global.b8 %rd7, [%rd2]; +; PTX-NEXT: ld.volatile.global.b8 %rd8, [%rd2+1]; +; PTX-NEXT: shl.b64 %rd9, %rd8, 8; +; PTX-NEXT: or.b64 %rd10, %rd9, %rd7; +; PTX-NEXT: ld.volatile.global.b8 %rd11, [%rd2+2]; +; PTX-NEXT: shl.b64 %rd12, %rd11, 16; +; PTX-NEXT: ld.volatile.global.b8 %rd13, [%rd2+3]; +; PTX-NEXT: shl.b64 %rd14, %rd13, 24; +; PTX-NEXT: or.b64 %rd15, %rd14, %rd12; +; PTX-NEXT: or.b64 %rd16, %rd15, %rd10; +; PTX-NEXT: ld.volatile.global.b8 %rd17, [%rd2+4]; +; PTX-NEXT: ld.volatile.global.b8 %rd18, [%rd2+5]; +; 
PTX-NEXT: shl.b64 %rd19, %rd18, 8; +; PTX-NEXT: or.b64 %rd20, %rd19, %rd17; +; PTX-NEXT: ld.volatile.global.b8 %rd21, [%rd2+6]; +; PTX-NEXT: shl.b64 %rd22, %rd21, 16; +; PTX-NEXT: ld.volatile.global.b8 %rd23, [%rd2+7]; +; PTX-NEXT: shl.b64 %rd24, %rd23, 24; +; PTX-NEXT: or.b64 %rd25, %rd24, %rd22; +; PTX-NEXT: or.b64 %rd26, %rd25, %rd20; +; PTX-NEXT: shl.b64 %rd27, %rd26, 32; +; PTX-NEXT: or.b64 %rd28, %rd27, %rd16; +; PTX-NEXT: add.u64 %rd29, %SPL, 0; +; PTX-NEXT: st.local.b64 [%rd29], %rd28; ; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) From 2f70a560b923a6c95d46ef57ce026d43744b24c1 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Thu, 27 Nov 2025 04:20:22 +0000 Subject: [PATCH 21/21] fix debug info test --- llvm/test/DebugInfo/X86/sroasplit-5.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll index 34aa30f55728e..7500bc97efbc8 100644 --- a/llvm/test/DebugInfo/X86/sroasplit-5.ll +++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll @@ -23,7 +23,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK-NOT: DW_OP_LLVM_fragment, 56 ; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32) ; CHECK-NOT: DW_OP_LLVM_fragment, 56 -; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24) +; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 32) ; CHECK-NOT: DW_OP_LLVM_fragment, 56 %struct.prog_src_register = type { i32, i24 }