[AArch64] Lower mathlib call ldexp into fscale when sve is enabled #67552

Merged
1 commit merged on Oct 24, 2023

Conversation

huhu233
Contributor

@huhu233 huhu233 commented Sep 27, 2023

The SVE 'fscale' instruction is functionally equivalent to the mathlib call ldexp, but has better performance. This patch lowers ldexp into fscale when SVE is enabled.
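For context, ldexp(x, n) computes x * 2^n, and fscale applies the same scale-by-a-power-of-two per vector lane; a trivial C++ illustration of the libm semantics being replaced (not part of the patch):

#include <cmath>
// ldexp scales by an integer power of two: ldexp(x, n) == x * 2^n,
// e.g. std::ldexp(1.5, 3) == 12.0.
double scaleByPow2(double X, int N) { return std::ldexp(X, N); }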

@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AArch64 backend:X86 clang:codegen IR generation bugs: mangling, exceptions, etc. llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms labels Sep 27, 2023
@llvmbot
Member

llvmbot commented Sep 27, 2023

@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-llvm-analysis
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-aarch64

Changes

There are more efficient implementations of llvm.ldexp on different targets. This patch transforms llvm.ldexp into target-supported intrinsics before lowering.


Full diff: https://github.com/llvm/llvm-project/pull/67552.diff

10 Files Affected:

  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+3)
  • (modified) clang/test/CodeGen/math-libcalls.c (+6-6)
  • (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+6)
  • (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
  • (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
  • (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+68)
  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+7)
  • (modified) llvm/lib/Target/X86/X86TargetTransformInfo.h (+11)
  • (added) llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll (+46)
  • (added) llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll (+38)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 04c0325c7fd038b..da01c34731386e0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2719,6 +2719,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
           *this, E, Intrinsic::llrint,
           Intrinsic::experimental_constrained_llrint));
+    case Builtin::BIldexp:
+    case Builtin::BIldexpf:
+    case Builtin::BIldexpl:
     case Builtin::BI__builtin_ldexp:
     case Builtin::BI__builtin_ldexpf:
     case Builtin::BI__builtin_ldexpl:
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 02df4fe5fea6018..a906bda4c88c958 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -71,15 +71,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   ldexp(f,f);    ldexpf(f,f);   ldexpl(f,f);
 
-  // NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE]]
-  // NO__ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[READNONE]]
-  // NO__ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[READNONE]]
+  // NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
+  // NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
+  // NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
   // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
   // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
   // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
-  // HAS_MAYTRAP: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
-  // HAS_MAYTRAP: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
-  // HAS_MAYTRAP: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
+  // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ldexp.f64.i32(
+  // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ldexp.f32.i32(
+  // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(
 
   modf(f,d);       modff(f,fp);      modfl(f,l);
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..c8805aadf146874 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1669,6 +1669,7 @@ class TargetTransformInfo {
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const;
   /// @}
 
 private:
@@ -2035,6 +2036,7 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
+  virtual unsigned getTargetSupportedLdexpInst(Type *Ty) const = 0;
 };
 
 template <typename T>
@@ -2745,6 +2747,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
+
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const override {
+    return Impl.getTargetSupportedLdexpInst(Ty);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 326c3130c6cff76..6d6a715f62b201c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -891,6 +891,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const { return 0; }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..6a58a146d0431f9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1237,6 +1237,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+unsigned TargetTransformInfo::getTargetSupportedLdexpInst(Type *Ty) const {
+  return TTIImpl->getTargetSupportedLdexpInst(Ty);
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index faee623d7c62fba..ce0c6b653e1c6c5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -61,6 +61,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
@@ -486,6 +487,7 @@ class CodeGenPrepare : public FunctionPass {
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   void verifyBFIUpdates(Function &F);
+  void optimizeScalarLdexp(Instruction *Ldexp, Value *X, Value *Exp);
 };
 
 } // end anonymous namespace
@@ -2432,6 +2434,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       return optimizeGatherScatterInst(II, II->getArgOperand(0));
     case Intrinsic::masked_scatter:
       return optimizeGatherScatterInst(II, II->getArgOperand(1));
+    case Intrinsic::ldexp: {
+      // Vector versions of llvm.ldexp are not fully supported for all targets,
+      // only handle scalar version currently.
+      if (!II->getType()->isVectorTy())
+        optimizeScalarLdexp(II, II->getArgOperand(0), II->getArgOperand(1));
+      break;
+    }
     }
 
     SmallVector<Value *, 2> PtrOps;
@@ -8667,3 +8676,62 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
   }
   return MadeChange;
 }
+
+// Transform llvm.ldexp.T.i32(T x, i32 exp) into target supported instructions.
+void CodeGenPrepare::optimizeScalarLdexp(Instruction *Ldexp, Value *X,
+                                         Value *Exp) {
+  auto IID = TTI->getTargetSupportedLdexpInst(X->getType());
+  if (IID == 0)
+    return;
+
+  unsigned XScalarSize = X->getType()->getScalarSizeInBits();
+  // Target related intrinsics for ldexp.f128 are not well supported, filter out
+  // the scenario currently.
+  if (XScalarSize > 64)
+    return;
+  unsigned VL = 128 / XScalarSize;
+
+  IRBuilder<> B(Ldexp);
+  LLVMContext &C = Ldexp->getModule()->getContext();
+  Type *VXTy = nullptr, *VExpTy = nullptr;
+  Value *VX = nullptr, *VExp = nullptr, *CvtExp = nullptr;
+  Value *Ret = nullptr, *Pg = nullptr;
+  ElementCount EC;
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::aarch64_sve_fscale: {
+    EC = ElementCount::get(VL, true);
+    CvtExp = Exp;
+    if (X->getType() == Type::getDoubleTy(C))
+      CvtExp = B.CreateSExt(Exp, Type::getInt64Ty(C));
+    VExpTy = VectorType::get(CvtExp->getType(), EC);
+    VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+    VXTy = VectorType::get(X->getType(), EC);
+    VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+    Type *PTy = VectorType::get(Type::getInt1Ty(C), EC);
+    Constant *True = ConstantInt::get(Type::getInt32Ty(C), 31);
+    Pg = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PTy}, {True});
+    Value *FScale = B.CreateIntrinsic(IID, {VXTy}, {Pg, VX, VExp});
+    Ret = B.CreateExtractElement(FScale, (uint64_t)0);
+    Ldexp->replaceAllUsesWith(Ret);
+    break;
+  }
+  case Intrinsic::x86_avx512_mask_scalef_ss:
+  case Intrinsic::x86_avx512_mask_scalef_sd: {
+    EC = ElementCount::get(VL, false);
+    CvtExp = B.CreateSIToFP(Exp, X->getType());
+    VExpTy = VectorType::get(CvtExp->getType(), EC);
+    VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+    VXTy = VectorType::get(X->getType(), EC);
+    VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+    Pg = ConstantInt::get(Type::getInt8Ty(C), -1);
+    Constant *Round = ConstantInt::get(Type::getInt32Ty(C), 4);
+    Value *Scalef =
+        B.CreateIntrinsic(IID, std::nullopt, {VX, VExp, VX, Pg, Round});
+    Ret = B.CreateExtractElement(Scalef, (uint64_t)0);
+    Ldexp->replaceAllUsesWith(Ret);
+    break;
+  }
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..5190572b3d386da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include <cstdint>
 #include <optional>
 
@@ -412,6 +413,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
     return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
   }
+
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+    if (!ST->hasSVE())
+      return 0;
+    return Intrinsic::aarch64_sve_fscale;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548b96..4ceada4e756f6f5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -19,6 +19,7 @@
 #include "X86TargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicsX86.h"
 #include <optional>
 
 namespace llvm {
@@ -285,6 +286,16 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   bool prefersVectorizedAddressing() const;
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+    if (!ST->hasAVX512())
+      return 0;
+    if (Ty->isFloatTy())
+      return Intrinsic::x86_avx512_mask_scalef_ss;
+    else if (Ty->isDoubleTy())
+      return Intrinsic::x86_avx512_mask_scalef_sd;
+    else
+      return 0;
+  }
 
 private:
   bool supportsGather() const;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
new file mode 100644
index 000000000000000..77605844450d006
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64:       // %bb.0: // %entry
+; AARCH64-NEXT:    ptrue p0.d
+; AARCH64-NEXT:    // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT:    sxtw x8, w0
+; AARCH64-NEXT:    // kill: def $d0 killed $d0 def $z0
+; AARCH64-NEXT:    fmov d1, x8
+; AARCH64-NEXT:    fscale z0.d, p0/m, z0.d, z1.d
+; AARCH64-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; AARCH64-NEXT:    ret
+entry:
+  %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+  ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64:       // %bb.0: // %entry
+; AARCH64-NEXT:    ptrue p0.s
+; AARCH64-NEXT:    fmov s1, w0
+; AARCH64-NEXT:    // kill: def $s0 killed $s0 def $z0
+; AARCH64-NEXT:    fscale z0.s, p0/m, z0.s, z1.s
+; AARCH64-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; AARCH64-NEXT:    ret
+entry:
+  %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+  ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target related intrinsics for f128 are not well supported, use call ldexpl
+; currently.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64:       // %bb.0: // %entry
+; AARCH64-NEXT:    b ldexpl
+entry:
+  %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+  ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
new file mode 100644
index 000000000000000..97dd7bd80aa43b1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64:       # %bb.0: # %entry
+; AARCH64-NEXT:    vcvtsi2sd %edi, %xmm1, %xmm1
+; AARCH64-NEXT:    vscalefsd %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT:    retq
+entry:
+  %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+  ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64:       # %bb.0: # %entry
+; AARCH64-NEXT:    vcvtsi2ss %edi, %xmm1, %xmm1
+; AARCH64-NEXT:    vscalefss %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT:    retq
+entry:
+  %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+  ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target related intrinsics for f128 are not well supported, use call ldexpl
+; currently.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64:       # %bb.0: # %entry
+; AARCH64-NEXT:    jmp ldexpl@PLT # TAILCALL
+entry:
+  %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+  ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)

@nikic
Contributor

nikic commented Sep 27, 2023

Why does this need special handling in CGP instead of being a normal custom lowering for FLDEXP?

@paulwalker-arm
Collaborator

I agree, and custom lowering also gives a straightforward way to support vector types.
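For reference, the custom-lowering route being suggested would roughly take the following shape in AArch64ISelLowering.cpp (a minimal sketch; the placement, type list, and handler name are illustrative rather than the final diff):

// Mark FLDEXP as Custom when SVE is available so LowerOperation sees it.
if (Subtarget->hasSVE()) {
  setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
  setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
}

SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  // ... existing cases ...
  case ISD::FLDEXP:
    return LowerFLDEXP(Op, DAG);
  default:
    llvm_unreachable("unimplemented operand");
  }
}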

@huhu233
Contributor Author

huhu233 commented Sep 28, 2023

I agree, and custom lowering also gives a straightforward way to support vector types.

You are right, I'll update the patch soon, thanks!

@huhu233
Contributor Author

huhu233 commented Sep 28, 2023

Why does this need special handling in CGP instead of being a normal custom lowering for FLDEXP?

@nikic, thanks for your suggestion, I'll update the patch soon!

@huhu233 huhu233 changed the title [CodeGenPrepare] Transform ldexp into target supported intrinsics [TargetLowering] Lower ldexp into target supported instructions Oct 7, 2023
@huhu233
Contributor Author

huhu233 commented Oct 7, 2023

  • Rebase and update the patch

Collaborator

@davemgreen davemgreen left a comment


Hi. Can you split this into two separate patches - one for AArch64 and another for X86? I think they should be logically separable.

@@ -0,0 +1,55 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck %s
Collaborator


llc usually does not need to run with -O3

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck %s

define dso_local nofpclass(nan inf) double @testExp(double noundef nofpclass(nan inf) %val, i32 noundef %a) {
Collaborator


The tests can usually be cleaned up a little by removing dso_local, noundef and probably the nofpclass.

Contributor Author


OK, thanks for your suggestions!

XVT = MVT::nxv4f32;
ExpVT = MVT::nxv4i32;
break;
case 64:
Collaborator

@davemgreen davemgreen Oct 7, 2023


It might be worth adding an fp16 version too, as there should be an instruction available.

Contributor Author

@huhu233 huhu233 Oct 8, 2023


Hi, @davemgreen, you are right, but it seems f16 is not fully supported for ldexp? I got some compile failures, as shown:
https://godbolt.org/

SDLoc DL(Op);
EVT XVT, ExpVT;
SDValue IID;
switch (Op.getValueSizeInBits()) {
Contributor


Probably should switch over the actual type, or the scalar type
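That is, something along these lines (an illustrative sketch only, reusing the nxv4f32/nxv4i32 mapping quoted above; the f64 mapping is assumed):

EVT XScalarTy = Op.getValueType().getScalarType();
EVT XVT, ExpVT;
switch (XScalarTy.getSimpleVT().SimpleTy) {
default:
  return SDValue();
case MVT::f32:
  XVT = MVT::nxv4f32;
  ExpVT = MVT::nxv4i32;
  break;
case MVT::f64:
  XVT = MVT::nxv2f64;
  ExpVT = MVT::nxv2i64;
  break;
}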

Contributor Author


Thanks, I'll fix it.

Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XScalarTy, Exp);
SDValue VX =
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
Contributor


Is this lowering the scalar case as a vector operation? Should this move to the vector legalizer?

Contributor Author


Hi, @arsenm, here are some of my considerations:

  • Is this lowering the scalar case as a vector operation?
    Actually, ldexp@PLT (ISD::FLDEXP) is a scalar call, but llvm.aarch64.sve.fscale.* and llvm.x86.mask.scalef.* only have vector versions. I hope to replace the call with more efficient target-supported instructions, so there are some INSERT_VECTOR_ELT and EXTRACT_VECTOR_ELT nodes (see the sketch after this list).

  • "Should this move to the vector legalizer?"
    FLDEXP occurs in many scalar cases, so I don't think it is necessary to move this to the vector legalizer.
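A condensed C++ sketch of that scalar-wrapped-as-vector pattern for the f64/SVE case (illustrative; the actual LowerFLDEXP in this PR also handles f16/f32, and the predicate/intrinsic plumbing may differ in detail):

SDLoc DL(Op);
SDValue X = Op.getOperand(0);   // scalar f64 value
SDValue Exp = Op.getOperand(1); // scalar i32 exponent
EVT XVT = MVT::nxv2f64, ExpVT = MVT::nxv2i64;
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
// Widen the exponent to match the element type of the fscale operand.
SDValue Exp64 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
// Place the scalars in lane 0 of scalable vectors.
SDValue VX =
    DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
                           DAG.getUNDEF(ExpVT), Exp64, Zero);
// All-true predicate (pattern 31 = SVE_ALL), as in the CodeGenPrepare version.
SDValue Pg = DAG.getNode(
    ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1,
    DAG.getConstant(Intrinsic::aarch64_sve_ptrue, DL, MVT::i64),
    DAG.getConstant(31, DL, MVT::i32));
SDValue FScale = DAG.getNode(
    ISD::INTRINSIC_WO_CHAIN, DL, XVT,
    DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), Pg, VX, VExp);
// Pull the scalar result back out of lane 0.
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, FScale, Zero);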

@huhu233
Contributor Author

huhu233 commented Oct 8, 2023

  • Rebase the branch
  • Split the original patch into 2 separate commits
  • Clean up the related test cases
  • Support the f16 scenario.

@huhu233 huhu233 requested review from davemgreen and arsenm October 11, 2023 06:25
@huhu233
Contributor Author

huhu233 commented Oct 18, 2023

ping

@davemgreen
Collaborator

Sorry, when I said patch I should have said pull request. Can you split the two patches into separate PRs, so they can be reviewed separately? I think what you have for AArch64 looks OK to me.

@huhu233
Contributor Author

huhu233 commented Oct 20, 2023

Sorry, when I said patch I should have said pull request. Can you split the two patches into separate PRs, so they can be reviewed separately? I think what you have for AArch64 looks OK to me.

Hi, @davemgreen, thanks for your reply. Do you mean splitting these two commits into two pull requests? If so, I'll remove the X86 commit from this patch and create another pull request for it.

@davemgreen
Collaborator

Yeah that sounds good to me, thanks.

@huhu233
Contributor Author

huhu233 commented Oct 20, 2023

  • Rebase
  • Split the X86 code into another pull request.

@huhu233 huhu233 changed the title [TargetLowering] Lower ldexp into target supported instructions [AArch64] Lower mathlib call ldexp into fscale when sve is enabled Oct 20, 2023
Collaborator

@davemgreen davemgreen left a comment


Thanks. I checked the edge cases and this looks good to me, if you can address the last round of comments.

default:
return SDValue();
case MVT::f16:
X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
Collaborator


Is this intended to fall through? I think the idea of extending f16 to f32 sounds good.

Can you add a [[fallthrough]] attribute to make it explicit?
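That is, roughly (a sketch assembled from the snippets quoted above, not the exact final diff):

case MVT::f16:
  // Extend f16 to f32 and reuse the f32 lowering below.
  X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
  [[fallthrough]];
case MVT::f32:
  XVT = MVT::nxv4f32;
  ExpVT = MVT::nxv4i32;
  break;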

Contributor Author


Done, thank you very much!


declare half @llvm.ldexp.f16.i32(half, i32) #1

attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) }
Collaborator


This line can probably be removed.

Contributor Author


Done, I kept only the necessary attribute.

@@ -26414,3 +26422,46 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
}
return true;
}

SDValue AArch64TargetLowering::LowerFLDEXP(SDValue Op,
Collaborator


This can be made static and moved above AArch64TargetLowering::LowerOperation, similar to LowerFunnelShift.
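That is, something like (signature illustrative):

// Free function placed above AArch64TargetLowering::LowerOperation,
// mirroring how LowerFunnelShift is structured.
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG);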

Contributor Author


Done, thanks!

The SVE 'fscale' instruction is functionally equivalent to the mathlib call ldexp, but has better performance. This patch lowers ldexp into fscale when SVE is enabled.
Collaborator

@davemgreen davemgreen left a comment


Thanks, this LGTM.

Do you have commit access, or should I hit commit? If so, are you happy for this to go in?

@huhu233
Contributor Author

huhu233 commented Oct 24, 2023

Thanks, this LGTM.

Do you have commit access, or should I hit commit? If so, are you happy for this to go in?

Hi, @davemgreen, I have asked someone to hit commit, thanks for your reviews!
