[AArch64] Lower mathlib call ldexp into fscale when sve is enabled #67552
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64

Changes

There are more efficient implementations for llvm.ldexp on different targets. This patch transforms llvm.ldexp into target-supported intrinsics before lowering.

Full diff: https://github.com/llvm/llvm-project/pull/67552.diff

10 Files Affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 04c0325c7fd038b..da01c34731386e0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2719,6 +2719,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
*this, E, Intrinsic::llrint,
Intrinsic::experimental_constrained_llrint));
+ case Builtin::BIldexp:
+ case Builtin::BIldexpf:
+ case Builtin::BIldexpl:
case Builtin::BI__builtin_ldexp:
case Builtin::BI__builtin_ldexpf:
case Builtin::BI__builtin_ldexpl:
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 02df4fe5fea6018..a906bda4c88c958 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -71,15 +71,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
ldexp(f,f); ldexpf(f,f); ldexpl(f,f);
- // NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE]]
- // NO__ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[READNONE]]
- // NO__ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[READNONE]]
+ // NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
+ // NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
+ // NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
// HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
// HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
+ // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ldexp.f64.i32(
+ // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ldexp.f32.i32(
+ // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(
modf(f,d); modff(f,fp); modfl(f,l);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..c8805aadf146874 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1669,6 +1669,7 @@ class TargetTransformInfo {
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const;
/// @}
private:
@@ -2035,6 +2036,7 @@ class TargetTransformInfo::Concept {
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
+ virtual unsigned getTargetSupportedLdexpInst(Type *Ty) const = 0;
};
template <typename T>
@@ -2745,6 +2747,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}
+
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const override {
+ return Impl.getTargetSupportedLdexpInst(Ty);
+ }
};
template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 326c3130c6cff76..6d6a715f62b201c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -891,6 +891,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxNumArgs() const { return UINT_MAX; }
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const { return 0; }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..6a58a146d0431f9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1237,6 +1237,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+unsigned TargetTransformInfo::getTargetSupportedLdexpInst(Type *Ty) const {
+ return TTIImpl->getTargetSupportedLdexpInst(Ty);
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index faee623d7c62fba..ce0c6b653e1c6c5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -61,6 +61,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
@@ -486,6 +487,7 @@ class CodeGenPrepare : public FunctionPass {
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
void verifyBFIUpdates(Function &F);
+ void optimizeScalarLdexp(Instruction *Ldexp, Value *X, Value *Exp);
};
} // end anonymous namespace
@@ -2432,6 +2434,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
return optimizeGatherScatterInst(II, II->getArgOperand(0));
case Intrinsic::masked_scatter:
return optimizeGatherScatterInst(II, II->getArgOperand(1));
+ case Intrinsic::ldexp: {
+ // Vector versions of llvm.ldexp are not fully supported for all targets,
+ // only handle scalar version currently.
+ if (!II->getType()->isVectorTy())
+ optimizeScalarLdexp(II, II->getArgOperand(0), II->getArgOperand(1));
+ break;
+ }
}
SmallVector<Value *, 2> PtrOps;
@@ -8667,3 +8676,62 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
}
return MadeChange;
}
+
+// Transform llvm.ldexp.T.i32(T x, i32 exp) into target supported instructions.
+void CodeGenPrepare::optimizeScalarLdexp(Instruction *Ldexp, Value *X,
+ Value *Exp) {
+ auto IID = TTI->getTargetSupportedLdexpInst(X->getType());
+ if (IID == 0)
+ return;
+
+ unsigned XScalarSize = X->getType()->getScalarSizeInBits();
+ // Target related intrinsics for ldexp.f128 are not well supported, filter out
+ // the scenario currently.
+ if (XScalarSize > 64)
+ return;
+ unsigned VL = 128 / XScalarSize;
+
+ IRBuilder<> B(Ldexp);
+ LLVMContext &C = Ldexp->getModule()->getContext();
+ Type *VXTy = nullptr, *VExpTy = nullptr;
+ Value *VX = nullptr, *VExp = nullptr, *CvtExp = nullptr;
+ Value *Ret = nullptr, *Pg = nullptr;
+ ElementCount EC;
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_fscale: {
+ EC = ElementCount::get(VL, true);
+ CvtExp = Exp;
+ if (X->getType() == Type::getDoubleTy(C))
+ CvtExp = B.CreateSExt(Exp, Type::getInt64Ty(C));
+ VExpTy = VectorType::get(CvtExp->getType(), EC);
+ VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+ VXTy = VectorType::get(X->getType(), EC);
+ VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+ Type *PTy = VectorType::get(Type::getInt1Ty(C), EC);
+ Constant *True = ConstantInt::get(Type::getInt32Ty(C), 31);
+ Pg = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PTy}, {True});
+ Value *FScale = B.CreateIntrinsic(IID, {VXTy}, {Pg, VX, VExp});
+ Ret = B.CreateExtractElement(FScale, (uint64_t)0);
+ Ldexp->replaceAllUsesWith(Ret);
+ break;
+ }
+ case Intrinsic::x86_avx512_mask_scalef_ss:
+ case Intrinsic::x86_avx512_mask_scalef_sd: {
+ EC = ElementCount::get(VL, false);
+ CvtExp = B.CreateSIToFP(Exp, X->getType());
+ VExpTy = VectorType::get(CvtExp->getType(), EC);
+ VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+ VXTy = VectorType::get(X->getType(), EC);
+ VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+ Pg = ConstantInt::get(Type::getInt8Ty(C), -1);
+ Constant *Round = ConstantInt::get(Type::getInt32Ty(C), 4);
+ Value *Scalef =
+ B.CreateIntrinsic(IID, std::nullopt, {VX, VExp, VX, Pg, Round});
+ Ret = B.CreateExtractElement(Scalef, (uint64_t)0);
+ Ldexp->replaceAllUsesWith(Ret);
+ break;
+ }
+ }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..5190572b3d386da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include <cstdint>
#include <optional>
@@ -412,6 +413,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
}
+
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+ if (!ST->hasSVE())
+ return 0;
+ return Intrinsic::aarch64_sve_fscale;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548b96..4ceada4e756f6f5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -19,6 +19,7 @@
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include <optional>
namespace llvm {
@@ -285,6 +286,16 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
bool prefersVectorizedAddressing() const;
bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization();
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+ if (!ST->hasAVX512())
+ return 0;
+ if (Ty->isFloatTy())
+ return Intrinsic::x86_avx512_mask_scalef_ss;
+ else if (Ty->isDoubleTy())
+ return Intrinsic::x86_avx512_mask_scalef_sd;
+ else
+ return 0;
+ }
private:
bool supportsGather() const;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
new file mode 100644
index 000000000000000..77605844450d006
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: ptrue p0.d
+; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT: sxtw x8, w0
+; AARCH64-NEXT: // kill: def $d0 killed $d0 def $z0
+; AARCH64-NEXT: fmov d1, x8
+; AARCH64-NEXT: fscale z0.d, p0/m, z0.d, z1.d
+; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $z0
+; AARCH64-NEXT: ret
+entry:
+ %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+ ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: ptrue p0.s
+; AARCH64-NEXT: fmov s1, w0
+; AARCH64-NEXT: // kill: def $s0 killed $s0 def $z0
+; AARCH64-NEXT: fscale z0.s, p0/m, z0.s, z1.s
+; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $z0
+; AARCH64-NEXT: ret
+entry:
+ %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+ ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target related intrinsics for f128 are not well supported, use call ldexpl
+; currently.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: b ldexpl
+entry:
+ %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
new file mode 100644
index 000000000000000..97dd7bd80aa43b1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: vcvtsi2sd %edi, %xmm1, %xmm1
+; AARCH64-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT: retq
+entry:
+ %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+ ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1
+; AARCH64-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT: retq
+entry:
+ %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+ ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target related intrinsics for f128 are not well supported, use call ldexpl
+; currently.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: jmp ldexpl@PLT # TAILCALL
+entry:
+ %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
Why does this need special handling in CGP instead of being a normal custom lowering for FLDEXP?
I agree, and custom lowering also gives a straightforward way to support vector types as well.
You are right, I'll update the patch soon, thanks!
@nikic, thanks for your suggestion, I'll update the patch soon!
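For context, the "normal custom lowering" being suggested means marking ISD::FLDEXP as Custom in the AArch64 backend and expanding it from LowerOperation. A minimal sketch of that wiring, assuming SVE-gated registration and an f32/f64 type list (illustrative, not the committed change):

  // In the AArch64TargetLowering constructor: request custom lowering of
  // FLDEXP when SVE is available (the type list shown here is an assumption).
  if (Subtarget->hasSVE()) {
    setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
    setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
  }

  // In AArch64TargetLowering::LowerOperation, dispatch the node:
  case ISD::FLDEXP:
    return LowerFLDEXP(Op, DAG); // helper name as it appears later in this review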
Hi. Can you split this into two separate patches - one for AArch64 and another for X86? I think they should be logically separable.
llvm/test/CodeGen/AArch64/ldexp.ll
Outdated
@@ -0,0 +1,55 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck %s
llc usually does not need to run with -O3
llvm/test/CodeGen/AArch64/ldexp.ll
Outdated
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck %s

define dso_local nofpclass(nan inf) double @testExp(double noundef nofpclass(nan inf) %val, i32 noundef %a) {
The tests can usually be cleaned up a little by removing dso_local, noundef and probably the nofpclass.
OK, thanks for your suggestions!
  XVT = MVT::nxv4f32;
  ExpVT = MVT::nxv4i32;
  break;
case 64:
It might be worth adding an fp16 version for it too, as there should be an instruction available.
Hi, @davemgreen, you are right, but it seems f16 for ldexp is not fully supported? I got some compile failures, as shown here: https://godbolt.org/
SDLoc DL(Op);
EVT XVT, ExpVT;
SDValue IID;
switch (Op.getValueSizeInBits()) {
Probably should switch over the actual type, or the scalar type
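In other words, something along these lines (a sketch: the f32 mapping mirrors the snippet quoted above, and the f64 case is an assumption):

  // Key off the scalar value type rather than its size in bits.
  switch (Op.getSimpleValueType().SimpleTy) {
  case MVT::f32:
    XVT = MVT::nxv4f32;
    ExpVT = MVT::nxv4i32;
    break;
  case MVT::f64:
    XVT = MVT::nxv2f64;
    ExpVT = MVT::nxv2i64;
    break;
  default:
    return SDValue();
  }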
Thanks, I'll fix it.
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XScalarTy, Exp);
SDValue VX =
    DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
Is this lowering the scalar case as a vector operation? Should this move to the vector legalizer?
Hi, @arsenm, here are some of my considerations:
- "Is this lowering the scalar case as a vector operation?" Actually, ldexp@PLT (ISD::FLDEXP) is a scalar call, but llvm.aarch64.sve.fscale.* and llvm.x86.mask.scalef.* only have vector versions. I hope to replace the call with more efficient target-supported instructions, so there are some INSERT_VECTOR_ELT and EXTRACT_VECTOR_ELT operations.
- "Should this move to the vector legalizer?" FLDEXP occurs in many scalar cases, so I think it is not necessary to move this to the vector legalizer.
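Concretely, for the f64 case the DAG-level version of that wrapping looks roughly like the sketch below; the node construction, the nxv2 types, and the ptrue pattern value are illustrative assumptions, not the exact committed LowerFLDEXP:

  // Sketch: move both scalar operands into SVE vectors, apply FSCALE on
  // lane 0 under an all-true predicate, then extract the scalar result.
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);                   // f64 value
  SDValue Exp = Op.getOperand(1);                 // i32 exponent
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
  SDValue VX = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::nxv2f64,
                           DAG.getUNDEF(MVT::nxv2f64), X, Zero);
  SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::nxv2i64,
                             DAG.getUNDEF(MVT::nxv2i64), Exp, Zero);
  SDValue Pg = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1,
      DAG.getConstant(Intrinsic::aarch64_sve_ptrue, DL, MVT::i64),
      DAG.getConstant(31, DL, MVT::i32)); // pattern 31 = all, as in the CGP version
  SDValue FScale = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2f64,
      DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
      Pg, VX, VExp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, FScale, Zero);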
ping
Sorry, when I said patch I should have said pull request. Can you split the two patches into separate PRs, so they can be reviewed separately? I think what you have for AArch64 looks OK to me.
Hi, @davemgreen, thanks for your reply. Do you mean splitting these two commits into two pull requests? If so, I'll remove the commit about X86 from this patch and create another pull request for it.
Yeah that sounds good to me, thanks.
Force-pushed from 6593af5 to 69fc42c.
Thanks. I checked the edge cases and this looks good to me, if you can address the last round of comments.
default:
  return SDValue();
case MVT::f16:
  X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
Is this intended to fall through? I think the idea of extending f16 to f32 sounds good.
Can you add a [[fallthrough]] attribute to make it explicit?
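For illustration, the requested pattern would look roughly like this (assuming the f32 case follows immediately, as in the snippets quoted above):

  case MVT::f16:
    // No separate f16 path: widen to f32 and deliberately reuse the f32 case.
    X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
    [[fallthrough]];
  case MVT::f32:
    XVT = MVT::nxv4f32;
    ExpVT = MVT::nxv4i32;
    break;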
Done, thank you very much!
llvm/test/CodeGen/AArch64/ldexp.ll
Outdated

declare half @llvm.ldexp.f16.i32(half, i32) #1

attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) }
This line can probably be removed.
Done, I only kept the necessary attribute.
@@ -26414,3 +26422,46 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
  }
  return true;
}

SDValue AArch64TargetLowering::LowerFLDEXP(SDValue Op,
This can be made static and moved to above AArch64TargetLowering::LowerOperation, similar to LowerFunnelShift.
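That is, roughly the following shape (a sketch of the requested layout; the exact signature is an assumption):

  // File-local helper placed before AArch64TargetLowering::LowerOperation,
  // mirroring how LowerFunnelShift is structured.
  static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
    // Body elided: the FSCALE-based expansion sketched earlier in this thread.
    return SDValue(); // placeholder; the real code returns the lowered node
  }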
Done, thanks!
The functionality of 'fscale' is equivalent to the mathlib call ldexp, but it has better performance. This patch lowers ldexp into fscale when SVE is enabled.
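(Both compute x * 2^n: for example, ldexp(1.5, 3) = 1.5 * 2^3 = 12.0, and a single FSCALE of the value by the integer exponent yields the same result.)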
Force-pushed from 69fc42c to 9644a49.
Thanks, this LGTM.
Do you have commit access, or should I hit commit? If so, are you happy for this to go in?
Hi, @davemgreen, I have asked someone to hit commit, thanks for your reviews!