From dd8ade383bfe4d2fa927aa9c4f00bc9bd32f77c0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 3 Nov 2025 14:02:17 -0800
Subject: [PATCH 1/2] RuntimeLibcalls: Add stub API for getting function
 signatures

Eventually this should be generated by tablegen for all functions.
For now, add a manual implementation for sincos_stret, which I have
an immediate use for. This will allow pulling repeated code across
targets into shared call sequence code.

Also add sqrt just to make sure we can handle adding return attributes
on the declaration.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h        |  7 ++
 llvm/lib/IR/RuntimeLibcalls.cpp               | 79 +++++++++++++++++++
 .../Utils/DeclareRuntimeLibcalls.cpp          | 48 ++++++++++-
 .../Util/DeclareRuntimeLibcalls/basic.ll      |  4 +
 .../merge_attributes.ll                       | 11 +++
 .../DeclareRuntimeLibcalls/sincos_stret.ll    | 23 ++++++
 .../wrong_declaration.ll                      | 21 +++++
 7 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 01359894b0421..ab14ed44fed52 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -186,6 +186,13 @@ struct RuntimeLibcallsInfo {
     return RTLIB::Unsupported;
   }
 
+  /// \returns the function type and attributes for the \p LibcallImpl,
+  /// depending on the target \p TT. If the function has incomplete type
+  /// information, return nullptr for the function type.
+  std::pair<FunctionType *, AttributeList>
+  getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL,
+                RTLIB::LibcallImpl LibcallImpl) const;
+
 private:
   LLVM_ABI static iota_range<RTLIB::LibcallImpl>
   lookupLibcallImplNameImpl(StringRef Name);
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 77af29b9d70f6..2ce5719228a0d 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -7,7 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/RuntimeLibcalls.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/StringTable.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/xxhash.h"
 #include "llvm/TargetParser/ARMTargetParser.h"
@@ -72,3 +74,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) {
     return false;
   }
 }
+
+std::pair<FunctionType *, AttributeList>
+RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT,
+                                   const DataLayout &DL,
+                                   RTLIB::LibcallImpl LibcallImpl) const {
+  static constexpr Attribute::AttrKind CommonFnAttrs[] = {
+      Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync,
+      Attribute::NoUnwind, Attribute::WillReturn};
+
+  switch (LibcallImpl) {
+  case RTLIB::impl___sincos_stret:
+  case RTLIB::impl___sincosf_stret: {
+    if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected
+      return {};
+
+    Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret
+                         ? Type::getFloatTy(Ctx)
+                         : Type::getDoubleTy(Ctx);
+
+    AttrBuilder FuncAttrBuilder(Ctx);
+    for (Attribute::AttrKind Attr : CommonFnAttrs)
+      FuncAttrBuilder.addAttribute(Attr);
+
+    const bool UseSret =
+        TT.isX86_32() || ((TT.isARM() || TT.isThumb()) &&
+                          ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS);
+
+    FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly(
+        UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod));
+
+    AttributeList Attrs;
+    Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder);
+
+    if (UseSret) {
+      AttrBuilder AttrBuilder(Ctx);
+      StructType *StructTy = StructType::get(ScalarTy, ScalarTy);
+      AttrBuilder.addStructRetAttr(StructTy);
+      AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy));
+      FunctionType *FuncTy = FunctionType::get(
+          Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false);
+
+      return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)};
+    }
+
+    Type *RetTy =
+        LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64()
+            ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2))
+            : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy));
+
+    return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs};
+  }
+  case RTLIB::impl_sqrtf:
+  case RTLIB::impl_sqrt: {
+    AttrBuilder FuncAttrBuilder(Ctx);
+
+    for (Attribute::AttrKind Attr : CommonFnAttrs)
+      FuncAttrBuilder.addAttribute(Attr);
+    FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod));
+
+    AttributeList Attrs;
+    Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder);
+
+    Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx)
+                                                      : Type::getDoubleTy(Ctx);
+    FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false);
+
+    Attrs = Attrs.addRetAttribute(
+        Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal |
+                                                  fcNegNormal));
+    return {FuncTy, Attrs};
+  }
+  default:
+    return {};
+  }
+
+  return {};
+}
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 0642d51cd2c21..6d4436b92c119 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -16,22 +16,62 @@
 
 using namespace llvm;
 
+static void mergeAttributes(LLVMContext &Ctx, const Module &M,
+                            const DataLayout &DL, const Triple &TT,
+                            Function *Func, FunctionType *FuncTy,
+                            AttributeList FuncAttrs) {
+  AttributeList OldAttrs = Func->getAttributes();
+  AttributeList NewAttrs = OldAttrs;
+
+  {
+    AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs());
+    AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs());
+    OldBuilder.merge(NewBuilder);
+    NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder);
+  }
+
+  {
+    AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs());
+    AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs());
+    OldBuilder.merge(NewBuilder);
+    NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder);
+  }
+
+  for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) {
+    AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I));
+    AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I));
+    OldBuilder.merge(NewBuilder);
+    NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder);
+  }
+
+  Func->setAttributes(NewAttrs);
+}
+
 PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
                                                   ModuleAnalysisManager &MAM) {
   RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple());
   LLVMContext &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+  const Triple &TT = M.getTargetTriple();
 
   for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) {
     if (Impl == RTLIB::Unsupported)
       continue;
 
-    // TODO: Declare with correct type, calling convention, and attributes.
+    auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl);
 
-    FunctionType *FuncTy =
-        FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
+    // TODO: Declare with correct type, calling convention, and attributes.
+    if (!FuncTy)
+      FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
 
     StringRef FuncName = RTLCI.getLibcallImplName(Impl);
-    M.getOrInsertFunction(FuncName, FuncTy);
+
+    Function *Func =
+        cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee());
+    if (Func->getFunctionType() == FuncTy) {
+      mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs);
+      Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl));
+    }
   }
 
   return PreservedAnalyses::none();
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
index ee3a0539bf300..c005316f07f06 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
@@ -11,5 +11,9 @@ define float @sinf(float %x) {
 }
 
 ; CHECK: declare void @acosf(...)
+
+; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]]
+; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]]
+
 ; CHECK: declare void @__umodti3(...)
 
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll
new file mode 100644
index 0000000000000..ffbf11d4106dc
--- /dev/null
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll
@@ -0,0 +1,11 @@
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define noundef nofpclass(nan) float @sqrtf(float %x) "foo" {
+  %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x)
+  ret float %ret
+}
+
+; FIXME: Individual fields of nofpclass not merged
+; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] {
+
+; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" }
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
new file mode 100644
index 0000000000000..0d0e3da25eea7
--- /dev/null
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s
+
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s
+; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s
+
+; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
+; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]
+
+; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
+; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]
+
+; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]]
+; SRET: declare void @__sincos_stret(ptr 
sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or declarations +; with the wrong type signature. + +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + From dd828b478b95163cd8c5e8acfc8694a10f0908f5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 3 Nov 2025 18:22:53 -0800 Subject: [PATCH 2/2] DAG: Merge all sincos_stret emission code into legalizer This avoids AArch64 legality rules depending on libcall availability. ARM, AArch64, and X86 all had custom lowering of fsincos which all were just to emit calls to sincos_stret / sincosf_stret. This messes with the cost heuristics around legality, because really it's an expand/libcall cost and not a favorable custom. This is a bit ugly, because we're emitting code trying to match the C ABI lowered IR type for the aggregate return type. This now also gives an easy way to lift the unhandled x86_32 darwin case, since ARM already handled the return as sret case. --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 109 ++++++++++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 43 +------ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 - llvm/lib/Target/ARM/ARMISelLowering.cpp | 78 +------------ llvm/lib/Target/ARM/ARMISelLowering.h | 1 - llvm/lib/Target/X86/X86ISelLowering.cpp | 60 +--------- 6 files changed, 118 insertions(+), 174 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..316aacdf6978e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. 
+ SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. 
+ SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..d08f9b94227a2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..92fae71121a81 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b97b5089cb0a3..aa4f0a7eb909b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. 
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -33004,61 +33004,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 2); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) - .setIsPostTypeLegalization(); - - std::pair CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33663,7 +33608,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
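
The sketch below is not part of either patch; it is a minimal, hypothetical condensation of how the RuntimeLibcallsInfo::getFunctionTy API from PATCH 1/2 is meant to be consumed for a single libcall, mirroring what DeclareRuntimeLibcallsPass::run does. The helper name declareOneLibcall is illustrative only, and unlike the in-tree pass it overwrites attributes rather than merging them with an existing declaration.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/RuntimeLibcalls.h"

using namespace llvm;

// Hypothetical helper: declare one runtime libcall with the best-known
// signature, falling back to a `void (...)` stub when getFunctionTy has no
// type information for it yet.
static Function *declareOneLibcall(Module &M,
                                   const RTLIB::RuntimeLibcallsInfo &RTLCI,
                                   RTLIB::LibcallImpl Impl) {
  LLVMContext &Ctx = M.getContext();
  auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(
      Ctx, M.getTargetTriple(), M.getDataLayout(), Impl);
  if (!FuncTy)
    FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);

  StringRef Name = RTLCI.getLibcallImplName(Impl);
  Function *Func =
      cast<Function>(M.getOrInsertFunction(Name, FuncTy).getCallee());
  // Only attach attributes and the calling convention when any pre-existing
  // declaration matches the expected signature (the in-tree pass additionally
  // merges with attributes already present on the declaration).
  if (Func->getFunctionType() == FuncTy) {
    Func->setAttributes(FuncAttrs);
    Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl));
  }
  return Func;
}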