diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ad56d2f12caf6..bb4bb1195f78b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -196,7 +196,8 @@ static bool IsPTXVectorType(MVT VT) { // - unsigned int NumElts - The number of elements in the final vector // - EVT EltVT - The type of the elements in the final vector static std::optional<std::pair<unsigned int, MVT>> -getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { +getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, + unsigned AddressSpace) { if (!VectorEVT.isSimple()) return std::nullopt; const MVT VectorVT = VectorEVT.getSimpleVT(); @@ -213,6 +214,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { // The size of the PTX virtual register that holds a packed type. unsigned PackRegSize; + bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace); + // We only handle "native" vector sizes for now, e.g. <4 x double> is not // legal. We can (and should) split that into 2 stores of <2 x double> here // but I'm leaving that as a TODO for now. @@ -263,6 +266,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { LLVM_FALLTHROUGH; case MVT::v2f32: // <1 x f32x2> case MVT::v4f32: // <2 x f32x2> + if (!STI.hasF32x2Instructions()) + return std::pair(NumElts, EltVT); PackRegSize = 64; break; } @@ -278,97 +283,44 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { } /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive -/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors -/// into their primitive components. +/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize +/// the types as required by the calling convention (with special handling for +/// i8s). /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the /// same number of types as the Ins/Outs arrays in LowerFormalArguments, /// LowerCall, and LowerReturn. static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, + LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<uint64_t> *Offsets = nullptr, + SmallVectorImpl<uint64_t> &Offsets, uint64_t StartingOffset = 0) { SmallVector<EVT, 16> TempVTs; SmallVector<uint64_t, 16> TempOffsets; - - // Special case for i128 - decompose to (i64, i64) - if (Ty->isIntegerTy(128) || Ty->isFP128Ty()) { - ValueVTs.append({MVT::i64, MVT::i64}); - - if (Offsets) - Offsets->append({StartingOffset + 0, StartingOffset + 8}); - - return; - } - - // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. - if (StructType *STy = dyn_cast<StructType>(Ty)) { - auto const *SL = DL.getStructLayout(STy); - auto ElementNum = 0; - for(auto *EI : STy->elements()) { - ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, - StartingOffset + SL->getElementOffset(ElementNum)); - ++ElementNum; - } - return; - } - - // Given an array type, recursively traverse the elements with custom ComputePTXValueVTs. - if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { - Type *EltTy = ATy->getElementType(); - uint64_t EltSize = DL.getTypeAllocSize(EltTy); - for (int I : llvm::seq(ATy->getNumElements())) - ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets, StartingOffset + I * EltSize); - return; - } - - // Will split structs and arrays into member types, but will not split vector - // types. We do that manually below.
ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); - for (auto [VT, Off] : zip(TempVTs, TempOffsets)) { - // Split vectors into individual elements that fit into registers. - if (VT.isVector()) { - unsigned NumElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - // Below we must maintain power-of-2 sized vectors because - // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in - // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized - // vectors. - - // If the element type belongs to one of the supported packed vector types - // then we can pack multiples of this element into a single register. - if (VT == MVT::v2i8) { - // We can pack 2 i8s into a single 16-bit register. We only do this for - // loads and stores, which is why we have a separate case for it. - EltVT = MVT::v2i8; - NumElts = 1; - } else if (VT == MVT::v3i8) { - // We can also pack 3 i8s into 32-bit register, leaving the 4th - // element undefined. - EltVT = MVT::v4i8; - NumElts = 1; - } else if (NumElts > 1 && isPowerOf2_32(NumElts)) { - // Handle default packed types. - for (MVT PackedVT : NVPTX::packed_types()) { - const auto NumEltsPerReg = PackedVT.getVectorNumElements(); - if (NumElts % NumEltsPerReg == 0 && - EltVT == PackedVT.getVectorElementType()) { - EltVT = PackedVT; - NumElts /= NumEltsPerReg; - break; - } - } - } + for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) { + MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); + unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + + // Since we actually can load/store b8, we need to ensure that we'll use + // the original sized type for any i8s or i8 vectors. + if (VT.getScalarType() == MVT::i8) { + if (RegisterVT == MVT::i16) + RegisterVT = MVT::i8; + else if (RegisterVT == MVT::v2i16) + RegisterVT = MVT::v2i8; + else + assert(RegisterVT == MVT::v4i8 && + "Expected v4i8, v2i16, or i16 for i8 RegisterVT"); + } - for (unsigned J : seq(NumElts)) { - ValueVTs.push_back(EltVT); - if (Offsets) - Offsets->push_back(Off + J * EltVT.getStoreSize()); - } - } else { - ValueVTs.push_back(VT); - if (Offsets) - Offsets->push_back(Off); + // TODO: This is horribly incorrect for cases where the vector elements are + // not a multiple of bytes (ex i1) and legal or i8. However, this problem + // has existed for as long as NVPTX has and no one has complained, so we'll + // leave it for now. + for (unsigned I : seq(NumRegs)) { + ValueVTs.push_back(RegisterVT); + Offsets.push_back(Off + I * RegisterVT.getStoreSize()); } } } @@ -631,7 +583,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass); addRegisterClass(MVT::bf16, &NVPTX::B16RegClass); addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass); - addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass); + + if (STI.hasF32x2Instructions()) + addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass); // Conversion to/from FP16/FP16x2 is always legal. setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -672,7 +626,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); // Need custom lowering in case the index is dynamic. 
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + if (STI.hasF32x2Instructions()) + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -1606,7 +1561,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, Arg.Ty, VTs, &Offsets, VAOffset); + ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets, + VAOffset); assert(VTs.size() == Offsets.size() && "Size mismatch"); assert(VTs.size() == ArgOuts.size() && "Size mismatch"); @@ -1756,7 +1712,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets); assert(VTs.size() == Ins.size() && "Bad value decomposition"); const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); @@ -3217,8 +3173,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { if (ValVT != MemVT) return SDValue(); - const auto NumEltsAndEltVT = getVectorLoweringShape( - ValVT, STI.has256BitVectorLoadStore(N->getAddressSpace())); + const auto NumEltsAndEltVT = + getVectorLoweringShape(ValVT, STI, N->getAddressSpace()); if (!NumEltsAndEltVT) return SDValue(); const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); @@ -3386,6 +3342,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &Ctx = *DAG.getContext(); auto PtrVT = getPointerTy(DAG.getDataLayout()); const Function &F = DAG.getMachineFunction().getFunction(); @@ -3457,7 +3414,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( } else { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); + ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets); assert(VTs.size() == ArgIns.size() && "Size mismatch"); assert(VTs.size() == Offsets.size() && "Size mismatch"); @@ -3469,7 +3426,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( for (const unsigned NumElts : VI) { // i1 is loaded/stored as i8 const EVT LoadVT = VTs[I] == MVT::i1 ?
MVT::i8 : VTs[I]; - const EVT VecVT = getVectorizedVT(LoadVT, NumElts, *DAG.getContext()); + const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx); SDValue VecAddr = DAG.getObjectPtrOffset( dl, ArgSymbol, TypeSize::getFixed(Offsets[I])); @@ -3514,6 +3471,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &Ctx = *DAG.getContext(); const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32); const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL); @@ -3526,7 +3484,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets); assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); const auto GetRetVal = [&](unsigned I) -> SDValue { @@ -5985,8 +5943,8 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, if (ResVT != MemVT) return; - const auto NumEltsAndEltVT = getVectorLoweringShape( - ResVT, STI.has256BitVectorLoadStore(LD->getAddressSpace())); + const auto NumEltsAndEltVT = + getVectorLoweringShape(ResVT, STI, LD->getAddressSpace()); if (!NumEltsAndEltVT) return; const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index e5d680c19d921..a84ceaba991c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -29,6 +29,12 @@ static cl::opt<bool> NoF16Math("nvptx-no-f16-math", cl::Hidden, cl::desc("NVPTX Specific: Disable generation of f16 math ops."), cl::init(false)); + +static cl::opt<bool> NoF32x2("nvptx-no-f32x2", cl::Hidden, cl::desc("NVPTX Specific: Disable generation of " "f32x2 instructions and registers."), cl::init(false)); + // Pin the vtable to this file.
void NVPTXSubtarget::anchor() {} @@ -70,6 +76,10 @@ bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } +bool NVPTXSubtarget::hasF32x2Instructions() const { + return SmVersion >= 100 && PTXVersion >= 86 && !NoF32x2; +} + bool NVPTXSubtarget::hasNativeBF16Support(int Opcode) const { if (!hasBF16Math()) return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 81af55edccadb..acf025b70ce34 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -117,9 +117,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return HasTcgen05 && PTXVersion >= 86; } // f32x2 instructions in Blackwell family - bool hasF32x2Instructions() const { - return SmVersion >= 100 && PTXVersion >= 86; - } + bool hasF32x2Instructions() const; // TMA G2S copy with cta_group::1/2 support bool hasCpAsyncBulkTensorCTAGroupSupport() const { diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index bf51973e88357..fab60bdb3f2d1 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -10,19 +10,20 @@ declare {float, float} @bars({float, float} %input) define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-LABEL: test_v2f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: call.uni (retval0), barv, (param0); -; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: ld.param.b64 %rd3, [test_v2f32_param_1]; -; CHECK-NEXT: st.b64 [%rd3], %rd2; +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r3, %r4}; ; CHECK-NEXT: ret; %call = tail call <2 x float> @barv(<2 x float> %input) store <2 x float> %call, ptr %output, align 8 @@ -32,24 +33,28 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { define void @test_v3f32(<3 x float> %input, ptr %output) { ; CHECK-LABEL: test_v3f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; -; CHECK-NEXT: st.param.b32 [param0+8], %r1; -; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: call.uni (retval0), barv3, (param0); -; CHECK-NEXT: ld.param.b32 %r2, [retval0+8]; -; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: ld.param.b32 %r4, [retval0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: ld.param.b64 %rd3, 
[test_v3f32_param_1]; -; CHECK-NEXT: st.b32 [%rd3+8], %r2; -; CHECK-NEXT: st.b64 [%rd3], %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r5; +; CHECK-NEXT: cvt.u64.u32 %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3; +; CHECK-NEXT: ld.param.b64 %rd5, [test_v3f32_param_1]; +; CHECK-NEXT: st.b32 [%rd5+8], %r4; +; CHECK-NEXT: st.b64 [%rd5], %rd4; ; CHECK-NEXT: ret; %call = tail call <3 x float> @barv3(<3 x float> %input) ; Make sure we don't load more values than than we need to. diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index aee58a044a986..a386e4292777b 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -688,25 +688,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM70-NEXT: cvt.u32.u16 %r5, %rs2; +; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM70-NEXT: cvt.u32.u16 %r5, %rs8; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: cvt.u32.u16 %r7, %rs1; +; SM70-NEXT: cvt.u32.u16 %r7, %rs7; ; SM70-NEXT: shl.b32 %r8, %r7, 16; -; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM70-NEXT: cvt.u32.u16 %r9, %rs4; +; SM70-NEXT: cvt.u32.u16 %r9, %rs6; ; SM70-NEXT: shl.b32 %r10, %r9, 16; -; SM70-NEXT: cvt.u32.u16 %r11, %rs3; +; SM70-NEXT: cvt.u32.u16 %r11, %rs5; ; SM70-NEXT: shl.b32 %r12, %r11, 16; -; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM70-NEXT: cvt.u32.u16 %r13, %rs6; +; SM70-NEXT: cvt.u32.u16 %r13, %rs4; ; SM70-NEXT: shl.b32 %r14, %r13, 16; -; SM70-NEXT: cvt.u32.u16 %r15, %rs5; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; -; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM70-NEXT: cvt.u32.u16 %r17, %rs8; +; SM70-NEXT: cvt.u32.u16 %r17, %rs2; ; SM70-NEXT: shl.b32 %r18, %r17, 16; -; SM70-NEXT: cvt.u32.u16 %r19, %rs7; +; SM70-NEXT: cvt.u32.u16 %r19, %rs1; ; SM70-NEXT: shl.b32 %r20, %r19, 16; ; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r18, %r16, %r14}; ; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r10, %r8, %r6}; @@ -721,18 +721,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %r5, %rs2; -; SM80-NEXT: cvt.f32.bf16 %r6, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %r7, %rs4; -; SM80-NEXT: cvt.f32.bf16 %r8, %rs3; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM80-NEXT: cvt.f32.bf16 %r9, %rs6; -; SM80-NEXT: cvt.f32.bf16 %r10, %rs5; -; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM80-NEXT: cvt.f32.bf16 %r11, %rs8; -; SM80-NEXT: cvt.f32.bf16 %r12, %rs7; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM80-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-NEXT: cvt.f32.bf16 %r12, 
%rs1; ; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-NEXT: ret; @@ -746,18 +746,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1; -; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3; -; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5; -; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7; +; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; @@ -771,18 +771,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM90-NEXT: cvt.f32.bf16 %r5, %rs2; -; SM90-NEXT: cvt.f32.bf16 %r6, %rs1; -; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM90-NEXT: cvt.f32.bf16 %r7, %rs4; -; SM90-NEXT: cvt.f32.bf16 %r8, %rs3; -; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM90-NEXT: cvt.f32.bf16 %r9, %rs6; -; SM90-NEXT: cvt.f32.bf16 %r10, %rs5; -; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM90-NEXT: cvt.f32.bf16 %r11, %rs8; -; SM90-NEXT: cvt.f32.bf16 %r12, %rs7; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM90-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM90-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM90-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM90-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM90-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM90-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM90-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 64c7792a61c8c..7b2126870e319 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -596,18 +596,15 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .pred %p<3>; ; CHECK-F16-NEXT: .reg .b32 %r<9>; -; CHECK-F16-NEXT: .reg .b64 %rd<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // 
%bb.0: -; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; -; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0]; -; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; -; CHECK-F16-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2; -; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; +; CHECK-F16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6; +; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2; +; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1; ; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-F16-NEXT: ret; ; @@ -616,25 +613,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .pred %p<3>; ; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; -; CHECK-NOF16-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; -; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs1; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; -; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs4; -; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; -; CHECK-NOF16-NEXT: mov.b64 {%r7, %r8}, %rd2; -; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1; -; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2; -; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9; +; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2; +; CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { @@ -649,17 +643,14 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .pred %p<3>; ; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2]; +; 
CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2]; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; ; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3; -; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; @@ -1501,11 +1492,9 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; ; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; ; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; @@ -1928,12 +1917,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; ; CHECK-F16-NEXT: .reg .b32 %r<8>; -; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; -; CHECK-F16-NEXT: mov.b64 {%r2, %r3}, %rd1; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; ; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; @@ -1947,21 +1934,19 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; ; CHECK-NOF16-NEXT: .reg .b32 %r<6>; -; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; -; CHECK-NOF16-NEXT: mov.b64 {%r2, %r3}, %rd1; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; } -; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; } +; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1; ; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; } -; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; } +; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5}; ; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x float> %b to <2 x half> diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index bcaefa1699d8b..7ca16f702d8f3 100644 --- 
a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -30,12 +30,10 @@ define <2 x float> @test_ret_const() #0 { define float @test_extract_0(<2 x float> %a) #0 { ; CHECK-NOF32X2-LABEL: test_extract_0( ; CHECK-NOF32X2: { -; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0]; -; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0]; ; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NOF32X2-NEXT: ret; ; @@ -56,13 +54,11 @@ define float @test_extract_0(<2 x float> %a) #0 { define float @test_extract_1(<2 x float> %a) #0 { ; CHECK-NOF32X2-LABEL: test_extract_1( ; CHECK-NOF32X2: { -; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0]; -; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; } -; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0]; +; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_extract_1( @@ -80,20 +76,42 @@ define float @test_extract_1(<2 x float> %a) #0 { } define float @test_extract_i(<2 x float> %a, i64 %idx) #0 { -; CHECK-LABEL: test_extract_i( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0]; -; CHECK-NEXT: setp.eq.b64 %p1, %rd2, 0; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_extract_i( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .local .align 8 .b8 __local_depot3[8]; +; CHECK-NOF32X2-NEXT: .reg .b64 %SP; +; CHECK-NOF32X2-NEXT: .reg .b64 %SPL; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<6>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: mov.b64 %SPL, __local_depot3; +; CHECK-NOF32X2-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_i_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; +; CHECK-NOF32X2-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: and.b64 %rd2, %rd1, 1; +; CHECK-NOF32X2-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NOF32X2-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NOF32X2-NEXT: or.b64 %rd5, %rd4, %rd3; +; CHECK-NOF32X2-NEXT: ld.b32 %r3, [%rd5]; +; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_extract_i( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<2>; +; CHECK-F32X2-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0]; +; CHECK-F32X2-NEXT: setp.eq.b64 %p1, %rd2, 0; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: selp.f32 
%r3, %r1, %r2, %p1; +; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F32X2-NEXT: ret; %e = extractelement <2 x float> %a, i64 %idx ret float %e } @@ -102,15 +120,12 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-LABEL: test_fadd( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -132,11 +147,9 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_0( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0]; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -163,11 +176,9 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_1( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0]; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -194,20 +205,15 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-NOF32X2-LABEL: test_fadd_v4( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1; -; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r10, %r8; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r9, %r7; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8; +; CHECK-NOF32X2-NEXT: add.rn.f32 
%r10, %r3, %r7; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r2, %r6; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_v4( @@ -229,17 +235,14 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4( @@ -267,17 +270,14 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4( @@ -305,15 +305,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-LABEL: test_fsub( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0]; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3; ; 
CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -332,18 +329,29 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { } define <2 x float> @test_fneg(<2 x float> %a) #0 { -; CHECK-LABEL: test_fneg( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: neg.f32 %r3, %r2; -; CHECK-NEXT: neg.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fneg( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-NOF32X2-NEXT: neg.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: neg.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fneg( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fneg_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: neg.f32 %r3, %r2; +; CHECK-F32X2-NEXT: neg.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = fneg <2 x float> %a ret <2 x float> %r } @@ -352,15 +360,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-LABEL: test_fmul( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0]; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -379,50 +384,85 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { } define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fdiv( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; -; CHECK-NEXT: div.rn.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fdiv( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; +; CHECK-NOF32X2-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: div.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 
[func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fdiv( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: div.rn.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: div.rn.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = fdiv <2 x float> %a, %b ret <2 x float> %r } define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_frem( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<15>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; -; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; -; CHECK-NEXT: neg.f32 %r7, %r6; -; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4; -; CHECK-NEXT: testp.infinite.f32 %p1, %r2; -; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; -; CHECK-NEXT: div.rn.f32 %r10, %r3, %r1; -; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10; -; CHECK-NEXT: neg.f32 %r12, %r11; -; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3; -; CHECK-NEXT: testp.infinite.f32 %p2, %r1; -; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_frem( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<15>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; +; CHECK-NOF32X2-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NOF32X2-NEXT: neg.f32 %r7, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; +; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-NOF32X2-NEXT: div.rn.f32 %r10, %r1, %r3; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r11, %r10; +; CHECK-NOF32X2-NEXT: neg.f32 %r12, %r11; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; +; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-NOF32X2-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_frem( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<15>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_frem_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_frem_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: div.rn.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-F32X2-NEXT: neg.f32 %r7, %r6; +; CHECK-F32X2-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4; +; CHECK-F32X2-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-F32X2-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-F32X2-NEXT: div.rn.f32 %r10, %r3, %r1; +; CHECK-F32X2-NEXT: 
cvt.rzi.f32.f32 %r11, %r10; +; CHECK-F32X2-NEXT: neg.f32 %r12, %r11; +; CHECK-F32X2-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3; +; CHECK-F32X2-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-F32X2-NEXT: ret; %r = frem <2 x float> %a, %b ret <2 x float> %r } @@ -431,15 +471,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-LABEL: test_fadd_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -461,11 +498,9 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_0_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0]; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -492,11 +527,9 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_1_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0]; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -523,20 +556,15 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { ; CHECK-NOF32X2-LABEL: test_fadd_v4_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1; -; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r10, %r8; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r9, %r7; -; 
CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r2, %r6; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_v4_ftz( @@ -558,17 +586,14 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz( @@ -596,17 +621,14 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { ; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz( @@ -634,15 +656,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-LABEL: test_fsub_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 
%rd1, [test_fsub_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0]; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -661,18 +680,29 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { } define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { -; CHECK-LABEL: test_fneg_ftz( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: neg.ftz.f32 %r3, %r2; -; CHECK-NEXT: neg.ftz.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fneg_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fneg_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: neg.ftz.f32 %r3, %r2; +; CHECK-F32X2-NEXT: neg.ftz.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = fneg <2 x float> %a ret <2 x float> %r } @@ -681,15 +711,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-LABEL: test_fmul_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r4, %r2; -; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -711,17 +738,13 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-LABEL: test_fma_ftz( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; -; 
CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r6, %r4, %r2; -; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0]; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -741,65 +764,112 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) } define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { -; CHECK-LABEL: test_fdiv_ftz( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; -; CHECK-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fdiv_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0]; +; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fdiv_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = fdiv <2 x float> %a, %b ret <2 x float> %r } define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { -; CHECK-LABEL: test_frem_ftz( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<15>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; -; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; -; CHECK-NEXT: neg.ftz.f32 %r7, %r6; -; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4; -; CHECK-NEXT: testp.infinite.f32 %p1, %r2; -; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; -; CHECK-NEXT: div.rn.ftz.f32 %r10, %r3, %r1; -; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; -; CHECK-NEXT: neg.ftz.f32 %r12, %r11; -; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3; -; CHECK-NEXT: testp.infinite.f32 %p2, %r1; -; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; -; 
CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_frem_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<15>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; +; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; +; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r7, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; +; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; +; CHECK-NOF32X2-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; +; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r12, %r11; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; +; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-NOF32X2-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_frem_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<15>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; +; CHECK-F32X2-NEXT: neg.ftz.f32 %r7, %r6; +; CHECK-F32X2-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4; +; CHECK-F32X2-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-F32X2-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r10, %r3, %r1; +; CHECK-F32X2-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; +; CHECK-F32X2-NEXT: neg.ftz.f32 %r12, %r11; +; CHECK-F32X2-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3; +; CHECK-F32X2-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-F32X2-NEXT: ret; %r = frem <2 x float> %a, %b ret <2 x float> %r } define void @test_ldst_v2f32(ptr %a, ptr %b) #0 { -; CHECK-LABEL: test_ldst_v2f32( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; -; CHECK-NEXT: ld.b64 %rd3, [%rd1]; -; CHECK-NEXT: st.b64 [%rd2], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_ldst_v2f32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; +; CHECK-NOF32X2-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; CHECK-NOF32X2-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_ldst_v2f32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; +; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1]; +; 
CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3; +; CHECK-F32X2-NEXT: ret; %t1 = load <2 x float>, ptr %a store <2 x float> %t1, ptr %b, align 32 ret void @@ -825,34 +895,60 @@ define void @test_ldst_v3f32(ptr %a, ptr %b) #0 { } define void @test_ldst_v4f32(ptr %a, ptr %b) #0 { -; CHECK-LABEL: test_ldst_v4f32( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; -; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; -; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_ldst_v4f32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; +; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_ldst_v4f32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; +; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; +; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; +; CHECK-F32X2-NEXT: ret; %t1 = load <4 x float>, ptr %a store <4 x float> %t1, ptr %b, align 32 ret void } define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { -; CHECK-LABEL: test_ldst_v8f32( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; -; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; -; CHECK-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16]; -; CHECK-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6}; -; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_ldst_v8f32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; +; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_ldst_v8f32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; +; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; +; CHECK-F32X2-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16]; +; CHECK-F32X2-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6}; +; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; +; CHECK-F32X2-NEXT: ret; %t1 = load <8 x float>, ptr %a store <8 x float> %t1, ptr %b, align 32 ret void @@ -861,571 +957,982 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0 define <2 x float> @test_call(<2 x float> %a, <2 x float> 
%b) #0 { -; CHECK-LABEL: test_call( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0]; -; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-NEXT: st.param.b64 [param1], %rd2; -; CHECK-NEXT: st.param.b64 [param0], %rd1; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; -; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_call( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_param_0]; +; CHECK-NOF32X2-NEXT: { // callseq 0, 0 +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r3, %r4}; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; +; CHECK-NOF32X2-NEXT: } // callseq 0 +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_call( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_call_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_call_param_0]; +; CHECK-F32X2-NEXT: { // callseq 0, 0 +; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd1; +; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-NEXT: } // callseq 0 +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_call_flipped( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; -; CHECK-NEXT: { // callseq 1, 0 -; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-NEXT: st.param.b64 [param1], %rd1; -; CHECK-NEXT: st.param.b64 [param0], %rd2; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; -; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_call_flipped( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_flipped_param_1]; +; CHECK-NOF32X2-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_call_flipped_param_0]; +; CHECK-NOF32X2-NEXT: { // callseq 1, 0 +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4}; +; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; +; CHECK-NOF32X2-NEXT: } // callseq 1 +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_call_flipped( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; +; CHECK-F32X2-NEXT: { // callseq 1, 0 +; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd1; +; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd2; +; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-NEXT: } // callseq 1 +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_tailcall_flipped( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; -; CHECK-NEXT: { // callseq 2, 0 -; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-NEXT: st.param.b64 [param1], %rd1; -; CHECK-NEXT: st.param.b64 [param0], %rd2; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; -; CHECK-NEXT: } // callseq 2 -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_tailcall_flipped( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_tailcall_flipped_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_tailcall_flipped_param_0]; +; CHECK-NOF32X2-NEXT: { // callseq 2, 0 +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4}; +; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; +; CHECK-NOF32X2-NEXT: } // callseq 2 +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_tailcall_flipped( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; +; 
CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; +; CHECK-F32X2-NEXT: { // callseq 2, 0 +; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd1; +; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd2; +; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-NEXT: } // callseq 2 +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 { -; CHECK-LABEL: test_select( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; -; CHECK-NEXT: and.b16 %rs2, %rs1, 1; -; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0]; -; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<2>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-NOF32X2-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NOF32X2-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_param_0]; +; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r2, %r4, %p1; +; CHECK-NOF32X2-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<2>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-F32X2-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-F32X2-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_param_0]; +; CHECK-F32X2-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = select i1 %c, <2 x float> %a, <2 x float> %b ret <2 x float> %r } define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 { -; CHECK-LABEL: test_select_cc( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<11>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3]; -; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, 
%r8}, %rd1; -; CHECK-NEXT: selp.f32 %r9, %r8, %r6, %p2; -; CHECK-NEXT: selp.f32 %r10, %r7, %r5, %p1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select_cc( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<11>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_3]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r5, %r7; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r6, %r8; +; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r4, %p2; +; CHECK-NOF32X2-NEXT: selp.f32 %r10, %r1, %r3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select_cc( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<11>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-F32X2-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-F32X2-NEXT: selp.f32 %r9, %r8, %r6, %p2; +; CHECK-F32X2-NEXT: selp.f32 %r10, %r7, %r5, %p1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-F32X2-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r } define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 { -; CHECK-LABEL: test_select_cc_f64_f32( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3]; -; CHECK-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd6; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd5; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; -; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; -; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select_cc_f64_f32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-NOF32X2-NEXT: 
ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r1, %r3; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-NOF32X2-NEXT: selp.f64 %rd5, %rd2, %rd4, %p2; +; CHECK-NOF32X2-NEXT: selp.f64 %rd6, %rd1, %rd3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select_cc_f64_f32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<9>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd6; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd5; +; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-F32X2-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; +; CHECK-F32X2-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; +; CHECK-F32X2-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b ret <2 x double> %r } define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 { -; CHECK-LABEL: test_select_cc_f32_f64( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0]; -; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; -; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: selp.f32 %r5, %r4, %r2, %p2; -; CHECK-NEXT: selp.f32 %r6, %r3, %r1, %p1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select_cc_f32_f64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_3]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f32_f64_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f64 %p1, %rd1, %rd3; +; CHECK-NOF32X2-NEXT: setp.neu.f64 %p2, %rd2, %rd4; +; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r2, %r4, %p2; +; CHECK-NOF32X2-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select_cc_f32_f64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd5, %rd6}, 
[test_select_cc_f32_f64_param_3]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0]; +; CHECK-F32X2-NEXT: setp.neu.f64 %p1, %rd3, %rd5; +; CHECK-F32X2-NEXT: setp.neu.f64 %p2, %rd4, %rd6; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: selp.f32 %r5, %r4, %r2, %p2; +; CHECK-F32X2-NEXT: selp.f32 %r6, %r3, %r1, %p1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %cc = fcmp une <2 x double> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r } define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_une( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.neu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.neu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_une( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_une( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp une <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ueq( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0]; -; 
CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.equ.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.equ.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ueq( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; +; CHECK-NOF32X2-NEXT: setp.equ.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.equ.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ueq( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.equ.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.equ.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ueq <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ugt( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.gtu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.gtu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ugt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; +; CHECK-NOF32X2-NEXT: setp.gtu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.gtu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ugt( +; CHECK-F32X2: { +; 
CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.gtu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.gtu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ugt <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_uge( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.geu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.geu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_uge( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; +; CHECK-NOF32X2-NEXT: setp.geu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.geu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_uge( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.geu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.geu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp uge <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ult( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1]; -; CHECK-NEXT: 
ld.param.b64 %rd1, [test_fcmp_ult_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.ltu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.ltu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ult( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; +; CHECK-NOF32X2-NEXT: setp.ltu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.ltu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ult( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.ltu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.ltu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ult <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ule( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.leu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.leu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ule( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; +; CHECK-NOF32X2-NEXT: setp.leu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.leu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; 
CHECK-F32X2-LABEL: test_fcmp_ule( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.leu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.leu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ule <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_uno( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.nan.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_uno( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; +; CHECK-NOF32X2-NEXT: setp.nan.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.nan.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_uno( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.nan.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.nan.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp uno <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_one( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.b64 %rd2, [test_fcmp_one_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.ne.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.ne.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_one( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; +; CHECK-NOF32X2-NEXT: setp.ne.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.ne.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_one( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.ne.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.ne.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp one <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_oeq( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.eq.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.eq.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_oeq( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; +; CHECK-NOF32X2-NEXT: setp.eq.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.eq.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; 
+; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_oeq( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.eq.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.eq.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp oeq <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ogt( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.gt.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.gt.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ogt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ogt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.gt.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.gt.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ogt <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_oge( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // 
%bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.ge.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.ge.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_oge( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; +; CHECK-NOF32X2-NEXT: setp.ge.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.ge.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_oge( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.ge.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.ge.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp oge <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_olt( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.lt.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.lt.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_olt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 
[func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_olt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.lt.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.lt.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp olt <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ole( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.le.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.le.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ole( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; +; CHECK-NOF32X2-NEXT: setp.le.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.le.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ole( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.le.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.le.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ole <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ord( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; 
CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.num.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.num.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ord( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; +; CHECK-NOF32X2-NEXT: setp.num.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.num.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ord( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.num.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.num.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ord <2 x float> %a, %b ret <2 x i1> %r } define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptosi_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptosi_i32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptosi_i32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = fptosi <2 x float> %a to 
<2 x i32> ret <2 x i32> %r } define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptosi_i64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; -; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptosi_i64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.s64.f32 %rd1, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.s64.f32 %rd2, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptosi_i64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd2, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-F32X2-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i64> ret <2 x i64> %r } define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptoui_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptoui_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptoui_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %r } define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptoui_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; -; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptoui_2xi64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: 
.reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.u64.f32 %rd1, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.u64.f32 %rd2, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptoui_2xi64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd2, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-F32X2-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i64> ret <2 x i64> %r } @@ -1496,16 +2003,14 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_fadd( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; -; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1; -; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r5, %r1; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r6, %r2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r4, %r6; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1529,48 +2034,81 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { } define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { -; CHECK-LABEL: test_fptrunc_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; -; CHECK-NEXT: cvt.rn.f32.f64 %r1, %rd2; -; CHECK-NEXT: cvt.rn.f32.f64 %r2, %rd1; -; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptrunc_2xdouble( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptrunc_2xdouble( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-F32X2-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-F32X2-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: 
st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = fptrunc <2 x double> %a to <2 x float> ret <2 x float> %r } define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { -; CHECK-LABEL: test_fpext_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; -; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fpext_2xdouble( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd1, %r2; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd2, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fpext_2xdouble( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd2, %r2; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-F32X2-NEXT: ret; %r = fpext <2 x float> %a to <2 x double> ret <2 x double> %r } define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { -; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-NEXT: ret; %r = bitcast <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -1602,31 +2140,51 @@ define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 { } define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { -; CHECK-LABEL: test_bitcast_2xfloat_to_double( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_bitcast_2xfloat_to_double( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; 
CHECK-F32X2-LABEL: test_bitcast_2xfloat_to_double( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-NEXT: ret; %r = bitcast <2 x float> %a to double ret double %r } define <2 x float> @test_sqrt(<2 x float> %a) #0 { -; CHECK-LABEL: test_sqrt( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; -; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sqrt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NOF32X2-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sqrt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-F32X2-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.sqrt(<2 x float> %a) ret <2 x float> %r } @@ -1639,35 +2197,57 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 { ;} define <2 x float> @test_sin(<2 x float> %a) #0 { -; CHECK-LABEL: test_sin( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_sin_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: sin.approx.f32 %r3, %r2; -; CHECK-NEXT: sin.approx.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sin( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-NOF32X2-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sin( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sin_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-F32X2-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call afn <2 x float> @llvm.sin(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_cos(<2 x float> %a) #0 { -; CHECK-LABEL: test_cos( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_cos_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; 
CHECK-NEXT: cos.approx.f32 %r3, %r2; -; CHECK-NEXT: cos.approx.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_cos( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NOF32X2-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_cos( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_cos_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call afn <2 x float> @llvm.cos(<2 x float> %a) ret <2 x float> %r } @@ -1719,17 +2299,13 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 ; CHECK-NOF32X2-LABEL: test_fma( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1749,266 +2325,448 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 } define <2 x float> @test_fabs(<2 x float> %a) #0 { -; CHECK-LABEL: test_fabs( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fabs_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: abs.f32 %r3, %r2; -; CHECK-NEXT: abs.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fabs( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NOF32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: abs.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fabs( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fabs_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: abs.f32 %r3, %r2; +; 
CHECK-F32X2-NEXT: abs.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.fabs(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_minnum( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_minnum_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_minnum_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: min.f32 %r5, %r4, %r2; -; CHECK-NEXT: min.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_minnum( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; +; CHECK-NOF32X2-NEXT: min.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_minnum( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_minnum_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_minnum_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: min.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_maxnum( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: max.f32 %r5, %r4, %r2; -; CHECK-NEXT: max.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_maxnum( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; +; CHECK-NOF32X2-NEXT: max.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_maxnum( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: max.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; 
CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_copysign( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: copysign.f32 %r5, %r4, %r2; -; CHECK-NEXT: copysign.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_copysign( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; +; CHECK-NOF32X2-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_copysign( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_copysign_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-F32X2-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { -; CHECK-LABEL: test_copysign_f64( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0]; -; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; -; CHECK-NEXT: and.b64 %rd5, %rd4, 1; -; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: abs.f32 %r3, %r2; -; CHECK-NEXT: neg.f32 %r4, %r3; -; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; -; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; -; CHECK-NEXT: and.b64 %rd7, %rd6, 1; -; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; -; CHECK-NEXT: abs.f32 %r6, %r1; -; CHECK-NEXT: neg.f32 %r7, %r6; -; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_copysign_f64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; +; CHECK-NOF32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: neg.f32 %r4, %r3; +; CHECK-NOF32X2-NEXT: shr.u64 %rd3, %rd2, 63; +; CHECK-NOF32X2-NEXT: and.b64 %rd4, %rd3, 1; +; CHECK-NOF32X2-NEXT: setp.ne.b64 %p1, %rd4, 0; +; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; 
CHECK-NOF32X2-NEXT: abs.f32 %r6, %r1; +; CHECK-NOF32X2-NEXT: neg.f32 %r7, %r6; +; CHECK-NOF32X2-NEXT: shr.u64 %rd5, %rd1, 63; +; CHECK-NOF32X2-NEXT: and.b64 %rd6, %rd5, 1; +; CHECK-NOF32X2-NEXT: setp.ne.b64 %p2, %rd6, 0; +; CHECK-NOF32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_copysign_f64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<9>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<8>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0]; +; CHECK-F32X2-NEXT: shr.u64 %rd4, %rd3, 63; +; CHECK-F32X2-NEXT: and.b64 %rd5, %rd4, 1; +; CHECK-F32X2-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-F32X2-NEXT: neg.f32 %r4, %r3; +; CHECK-F32X2-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-F32X2-NEXT: shr.u64 %rd6, %rd2, 63; +; CHECK-F32X2-NEXT: and.b64 %rd7, %rd6, 1; +; CHECK-F32X2-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-F32X2-NEXT: abs.f32 %r6, %r1; +; CHECK-F32X2-NEXT: neg.f32 %r7, %r6; +; CHECK-F32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-F32X2-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x float> %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) ret <2 x float> %r } define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_copysign_extended( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: copysign.f32 %r5, %r3, %r1; -; CHECK-NEXT: copysign.f32 %r6, %r4, %r2; -; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; -; CHECK-NEXT: cvt.f64.f32 %rd4, %r5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_copysign_extended( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; +; CHECK-NOF32X2-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd1, %r6; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd2, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_copysign_extended( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-F32X2-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-F32X2-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r6; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd4, %r5; +; 
CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) %xr = fpext <2 x float> %r to <2 x double> ret <2 x double> %xr } define <2 x float> @test_floor(<2 x float> %a) #0 { -; CHECK-LABEL: test_floor( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_floor_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_floor( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_floor( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_floor_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.floor(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_ceil(<2 x float> %a) #0 { -; CHECK-LABEL: test_ceil( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_ceil_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_ceil( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_ceil( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ceil_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.ceil(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_trunc(<2 x float> %a) #0 { -; CHECK-LABEL: test_trunc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: 
test_trunc( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_trunc( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.trunc(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_rint(<2 x float> %a) #0 { -; CHECK-LABEL: test_rint( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_rint_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_rint( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_rint( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_rint_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.rint(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_nearbyint(<2 x float> %a) #0 { -; CHECK-LABEL: test_nearbyint( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_nearbyint( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_nearbyint( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, 
%rd1; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_roundeven(<2 x float> %a) #0 { -; CHECK-LABEL: test_roundeven( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_roundeven( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_roundeven( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.roundeven(<2 x float> %a) ret <2 x float> %r } ; check the use of sign mask and 0.5 to implement round define <2 x float> @test_round(<2 x float> %a) #0 { -; CHECK-LABEL: test_round( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_round_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; -; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; -; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; -; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; -; CHECK-NEXT: abs.f32 %r7, %r2; -; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; -; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1; -; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2; -; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; -; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2; -; CHECK-NEXT: and.b32 %r11, %r1, -2147483648; -; CHECK-NEXT: or.b32 %r12, %r11, 1056964608; -; CHECK-NEXT: add.rn.f32 %r13, %r1, %r12; -; CHECK-NEXT: cvt.rzi.f32.f32 %r14, %r13; -; CHECK-NEXT: abs.f32 %r15, %r1; -; CHECK-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; -; CHECK-NEXT: selp.f32 %r16, %r1, %r14, %p3; -; CHECK-NEXT: cvt.rzi.f32.f32 %r17, %r1; -; CHECK-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; -; CHECK-NEXT: selp.f32 %r18, %r17, %r16, %p4; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_round( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<5>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<19>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NOF32X2-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NOF32X2-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; 
CHECK-NOF32X2-NEXT: abs.f32 %r7, %r2; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-NOF32X2-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-NOF32X2-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r13, %r1, %r12; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-NOF32X2-NEXT: abs.f32 %r15, %r1; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_round( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<5>; +; CHECK-F32X2-NEXT: .reg .b32 %r<19>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_round_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-F32X2-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-F32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-F32X2-NEXT: abs.f32 %r7, %r2; +; CHECK-F32X2-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-F32X2-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-F32X2-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-F32X2-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-F32X2-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-F32X2-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-F32X2-NEXT: add.rn.f32 %r13, %r1, %r12; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-F32X2-NEXT: abs.f32 %r15, %r1; +; CHECK-F32X2-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-F32X2-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-F32X2-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-F32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.round(<2 x float> %a) ret <2 x float> %r } @@ -2017,17 +2775,13 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-LABEL: test_fmuladd( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 
[func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -2047,16 +2801,25 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) } define <2 x float> @test_shufflevector(<2 x float> %a) #0 { -; CHECK-LABEL: test_shufflevector( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_shufflevector( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_shufflevector( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-F32X2-NEXT: ret; %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> ret <2 x float> %s } @@ -2064,14 +2827,12 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { ; CHECK-NOF32X2-LABEL: test_insertelement( ; CHECK-NOF32X2: { -; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0]; -; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r2, tmp}, %rd1; } -; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_insertelement_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b32 %r3, [test_insertelement_param_1]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_insertelement( @@ -2120,36 +2881,60 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { } define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { -; CHECK-LABEL: test_trunc_to_v2bf16( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; -; CHECK-NEXT: st.b32 [%rd2], %r3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_trunc_to_v2bf16( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_1]; +; CHECK-NOF32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-NOF32X2-NEXT: st.b32 [%rd1], %r3; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_trunc_to_v2bf16( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-NEXT: .reg .b64 
%rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; +; CHECK-F32X2-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x bfloat> store <2 x bfloat> %trunc, ptr %p ret void } define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { -; CHECK-LABEL: test_trunc_to_v2f16( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; -; CHECK-NEXT: st.b32 [%rd2], %r3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_trunc_to_v2f16( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_1]; +; CHECK-NOF32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-NOF32X2-NEXT: st.b32 [%rd1], %r3; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_trunc_to_v2f16( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; +; CHECK-F32X2-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x half> store <2 x half> %trunc, ptr %p ret void diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index 3ac8f65ff858b..cb1d12661ed64 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -7,16 +7,17 @@ declare <4 x float> @bar() define void @foo(ptr %ptr) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; ; CHECK-NEXT: call.uni (retval0), bar, (); -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %val = tail call <4 x float> @bar() store <4 x float> %val, ptr %ptr diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index 68c53cde7f9ac..a846607d816c5 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -137,18 +137,32 @@ define void @generic_4xi64(ptr %a, ptr %b) { } define void @generic_8xfloat(ptr %a, ptr %b) { -; CHECK-LABEL: generic_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, 
[generic_8xfloat_param_0]; -; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1]; -; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; +; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1]; +; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; +; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1]; +; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x float>, ptr %a store <8 x float> %a.load, ptr %b ret void @@ -288,18 +302,32 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) { } define void @generic_volatile_8xfloat(ptr %a, ptr %b) { -; CHECK-LABEL: generic_volatile_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_volatile_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_volatile_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr %a store volatile <8 x float> %a.load, ptr %b ret void @@ -514,15 +542,16 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, 
[global_8xfloat_param_0]; -; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; -; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1]; -; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; -; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_8xfloat( @@ -758,15 +787,16 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_volatile_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; -; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1]; -; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5}; -; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_8xfloat( @@ -931,18 +961,32 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { } define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; -; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1]; -; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; +; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1]; +; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; +; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1]; +; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(3) %a store <8 x float> 
%a.load, ptr addrspace(3) %b ret void @@ -1082,18 +1126,32 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { } define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_volatile_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_volatile_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_volatile_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(3) %a store volatile <8 x float> %a.load, ptr addrspace(3) %b ret void @@ -1235,18 +1293,32 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { } define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1]; -; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: local_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; +; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1]; +; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: local_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; +; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1]; +; SM100-NEXT: 
st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(5) %a store <8 x float> %a.load, ptr addrspace(5) %b ret void @@ -1386,18 +1458,32 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { } define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_volatile_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: local_volatile_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1]; +; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: local_volatile_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1]; +; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(5) %a store volatile <8 x float> %a.load, ptr addrspace(5) %b ret void diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll index d542fa58684a1..7553c727b09c5 100644 --- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -333,28 +333,30 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) { define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) { ; SM20-LABEL: foo11( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<6>; +; SM20-NEXT: .reg .b32 %r<3>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0]; ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1]; ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM20-NEXT: ld.global.b64 %rd5, [%rd2]; -; SM20-NEXT: st.global.b64 [%rd4], %rd5; +; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2]; +; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo11( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<6>; +; SM35-NEXT: .reg .b32 %r<3>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.b64 %rd1, [foo11_param_0]; ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1]; ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2]; -; SM35-NEXT: st.global.b64 [%rd4], %rd5; +; SM35-NEXT: 
ld.global.nc.v2.b32 {%r1, %r2}, [%rd2]; +; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; ; SM35-NEXT: ret; %1 = load <2 x float>, ptr %from store <2 x float> %1, ptr %to @@ -494,28 +496,30 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) { define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) { ; SM20-LABEL: foo16( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<7>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0]; ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1]; ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; -; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; +; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; +; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo16( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<7>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0]; ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1]; ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; -; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; +; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; +; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; ; SM35-NEXT: ret; %1 = load <4 x float>, ptr %from store <4 x float> %1, ptr %to diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index dfdb33852305b..0039370e6dcf5 100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -8,55 +8,52 @@ target triple = "nvptx64-nvidia-cuda" define <4 x float> @t1(ptr %p1) { ; CHECK-LABEL: t1( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<46>; +; CHECK-NEXT: .reg .b32 %r<41>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0]; -; CHECK-NEXT: ld.b8 %rd2, [%rd1+8]; -; CHECK-NEXT: ld.b8 %rd3, [%rd1+9]; -; CHECK-NEXT: shl.b64 %rd4, %rd3, 8; -; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; -; CHECK-NEXT: ld.b8 %rd6, [%rd1+10]; -; CHECK-NEXT: shl.b64 %rd7, %rd6, 16; -; CHECK-NEXT: ld.b8 %rd8, [%rd1+11]; -; CHECK-NEXT: shl.b64 %rd9, %rd8, 24; -; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7; -; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5; -; CHECK-NEXT: ld.b8 %rd12, [%rd1+12]; -; CHECK-NEXT: ld.b8 %rd13, [%rd1+13]; -; CHECK-NEXT: shl.b64 %rd14, %rd13, 8; -; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12; -; CHECK-NEXT: ld.b8 %rd16, [%rd1+14]; -; CHECK-NEXT: shl.b64 %rd17, %rd16, 16; -; CHECK-NEXT: ld.b8 %rd18, [%rd1+15]; -; CHECK-NEXT: shl.b64 %rd19, %rd18, 24; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17; -; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15; -; CHECK-NEXT: shl.b64 %rd22, %rd21, 32; -; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11; -; CHECK-NEXT: ld.b8 %rd24, [%rd1]; -; CHECK-NEXT: ld.b8 %rd25, [%rd1+1]; -; CHECK-NEXT: shl.b64 %rd26, %rd25, 8; -; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24; -; CHECK-NEXT: ld.b8 %rd28, [%rd1+2]; -; CHECK-NEXT: shl.b64 %rd29, %rd28, 16; -; CHECK-NEXT: ld.b8 %rd30, [%rd1+3]; -; CHECK-NEXT: shl.b64 %rd31, %rd30, 24; -; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29; -; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27; -; CHECK-NEXT: ld.b8 %rd34, [%rd1+4]; -; CHECK-NEXT: ld.b8 %rd35, [%rd1+5]; -; CHECK-NEXT: shl.b64 %rd36, 
%rd35, 8; -; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34; -; CHECK-NEXT: ld.b8 %rd38, [%rd1+6]; -; CHECK-NEXT: shl.b64 %rd39, %rd38, 16; -; CHECK-NEXT: ld.b8 %rd40, [%rd1+7]; -; CHECK-NEXT: shl.b64 %rd41, %rd40, 24; -; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39; -; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37; -; CHECK-NEXT: shl.b64 %rd44, %rd43, 32; -; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23}; +; CHECK-NEXT: ld.b8 %r1, [%rd1+12]; +; CHECK-NEXT: ld.b8 %r2, [%rd1+13]; +; CHECK-NEXT: shl.b32 %r3, %r2, 8; +; CHECK-NEXT: or.b32 %r4, %r3, %r1; +; CHECK-NEXT: ld.b8 %r5, [%rd1+14]; +; CHECK-NEXT: shl.b32 %r6, %r5, 16; +; CHECK-NEXT: ld.b8 %r7, [%rd1+15]; +; CHECK-NEXT: shl.b32 %r8, %r7, 24; +; CHECK-NEXT: or.b32 %r9, %r8, %r6; +; CHECK-NEXT: or.b32 %r10, %r9, %r4; +; CHECK-NEXT: ld.b8 %r11, [%rd1+8]; +; CHECK-NEXT: ld.b8 %r12, [%rd1+9]; +; CHECK-NEXT: shl.b32 %r13, %r12, 8; +; CHECK-NEXT: or.b32 %r14, %r13, %r11; +; CHECK-NEXT: ld.b8 %r15, [%rd1+10]; +; CHECK-NEXT: shl.b32 %r16, %r15, 16; +; CHECK-NEXT: ld.b8 %r17, [%rd1+11]; +; CHECK-NEXT: shl.b32 %r18, %r17, 24; +; CHECK-NEXT: or.b32 %r19, %r18, %r16; +; CHECK-NEXT: or.b32 %r20, %r19, %r14; +; CHECK-NEXT: ld.b8 %r21, [%rd1+4]; +; CHECK-NEXT: ld.b8 %r22, [%rd1+5]; +; CHECK-NEXT: shl.b32 %r23, %r22, 8; +; CHECK-NEXT: or.b32 %r24, %r23, %r21; +; CHECK-NEXT: ld.b8 %r25, [%rd1+6]; +; CHECK-NEXT: shl.b32 %r26, %r25, 16; +; CHECK-NEXT: ld.b8 %r27, [%rd1+7]; +; CHECK-NEXT: shl.b32 %r28, %r27, 24; +; CHECK-NEXT: or.b32 %r29, %r28, %r26; +; CHECK-NEXT: or.b32 %r30, %r29, %r24; +; CHECK-NEXT: ld.b8 %r31, [%rd1]; +; CHECK-NEXT: ld.b8 %r32, [%rd1+1]; +; CHECK-NEXT: shl.b32 %r33, %r32, 8; +; CHECK-NEXT: or.b32 %r34, %r33, %r31; +; CHECK-NEXT: ld.b8 %r35, [%rd1+2]; +; CHECK-NEXT: shl.b32 %r36, %r35, 16; +; CHECK-NEXT: ld.b8 %r37, [%rd1+3]; +; CHECK-NEXT: shl.b32 %r38, %r37, 24; +; CHECK-NEXT: or.b32 %r39, %r38, %r36; +; CHECK-NEXT: or.b32 %r40, %r39, %r34; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r30, %r20, %r10}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 1 ret <4 x float> %r @@ -65,19 +62,16 @@ define <4 x float> @t1(ptr %p1) { define <4 x float> @t2(ptr %p1) { ; CHECK-LABEL: t2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; -; CHECK-NEXT: ld.b32 %rd2, [%rd1+8]; -; CHECK-NEXT: ld.b32 %rd3, [%rd1+12]; -; CHECK-NEXT: shl.b64 %rd4, %rd3, 32; -; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; -; CHECK-NEXT: ld.b32 %rd6, [%rd1]; -; CHECK-NEXT: ld.b32 %rd7, [%rd1+4]; -; CHECK-NEXT: shl.b64 %rd8, %rd7, 32; -; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5}; +; CHECK-NEXT: ld.b32 %r1, [%rd1+12]; +; CHECK-NEXT: ld.b32 %r2, [%rd1+8]; +; CHECK-NEXT: ld.b32 %r3, [%rd1+4]; +; CHECK-NEXT: ld.b32 %r4, [%rd1]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 4 ret <4 x float> %r @@ -86,13 +80,14 @@ define <4 x float> @t2(ptr %p1) { define <4 x float> @t3(ptr %p1) { ; CHECK-LABEL: t3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0]; -; CHECK-NEXT: ld.b64 %rd2, [%rd1+8]; -; CHECK-NEXT: ld.b64 %rd3, [%rd1]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ld.v2.b32 {%r1, 
%r2}, [%rd1+8]; +; CHECK-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r4, %r1, %r2}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 8 ret <4 x float> %r @@ -101,12 +96,13 @@ define <4 x float> @t3(ptr %p1) { define <4 x float> @t4(ptr %p1) { ; CHECK-LABEL: t4( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0]; -; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3}; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 16 ret <4 x float> %r @@ -189,40 +185,43 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { define void @s1(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s1( ; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .b64 %rd<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1]; +; CHECK-NEXT: cvt.u64.u32 %rd2, %r4; +; CHECK-NEXT: st.b8 [%rd1+12], %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd3, %r3; ; CHECK-NEXT: st.b8 [%rd1+8], %rd3; -; CHECK-NEXT: st.b8 [%rd1], %rd2; -; CHECK-NEXT: shr.u64 %rd4, %rd3, 56; -; CHECK-NEXT: st.b8 [%rd1+15], %rd4; -; CHECK-NEXT: shr.u64 %rd5, %rd3, 48; -; CHECK-NEXT: st.b8 [%rd1+14], %rd5; -; CHECK-NEXT: shr.u64 %rd6, %rd3, 40; -; CHECK-NEXT: st.b8 [%rd1+13], %rd6; -; CHECK-NEXT: shr.u64 %rd7, %rd3, 32; -; CHECK-NEXT: st.b8 [%rd1+12], %rd7; -; CHECK-NEXT: shr.u64 %rd8, %rd3, 24; -; CHECK-NEXT: st.b8 [%rd1+11], %rd8; -; CHECK-NEXT: shr.u64 %rd9, %rd3, 16; -; CHECK-NEXT: st.b8 [%rd1+10], %rd9; -; CHECK-NEXT: shr.u64 %rd10, %rd3, 8; -; CHECK-NEXT: st.b8 [%rd1+9], %rd10; -; CHECK-NEXT: shr.u64 %rd11, %rd2, 56; -; CHECK-NEXT: st.b8 [%rd1+7], %rd11; -; CHECK-NEXT: shr.u64 %rd12, %rd2, 48; -; CHECK-NEXT: st.b8 [%rd1+6], %rd12; -; CHECK-NEXT: shr.u64 %rd13, %rd2, 40; -; CHECK-NEXT: st.b8 [%rd1+5], %rd13; -; CHECK-NEXT: shr.u64 %rd14, %rd2, 32; -; CHECK-NEXT: st.b8 [%rd1+4], %rd14; -; CHECK-NEXT: shr.u64 %rd15, %rd2, 24; +; CHECK-NEXT: cvt.u64.u32 %rd4, %r2; +; CHECK-NEXT: st.b8 [%rd1+4], %rd4; +; CHECK-NEXT: cvt.u64.u32 %rd5, %r1; +; CHECK-NEXT: st.b8 [%rd1], %rd5; +; CHECK-NEXT: shr.u64 %rd6, %rd2, 24; +; CHECK-NEXT: st.b8 [%rd1+15], %rd6; +; CHECK-NEXT: shr.u64 %rd7, %rd2, 16; +; CHECK-NEXT: st.b8 [%rd1+14], %rd7; +; CHECK-NEXT: shr.u64 %rd8, %rd2, 8; +; CHECK-NEXT: st.b8 [%rd1+13], %rd8; +; CHECK-NEXT: shr.u64 %rd9, %rd3, 24; +; CHECK-NEXT: st.b8 [%rd1+11], %rd9; +; CHECK-NEXT: shr.u64 %rd10, %rd3, 16; +; CHECK-NEXT: st.b8 [%rd1+10], %rd10; +; CHECK-NEXT: shr.u64 %rd11, %rd3, 8; +; CHECK-NEXT: st.b8 [%rd1+9], %rd11; +; CHECK-NEXT: shr.u64 %rd12, %rd4, 24; +; CHECK-NEXT: st.b8 [%rd1+7], %rd12; +; CHECK-NEXT: shr.u64 %rd13, %rd4, 16; +; CHECK-NEXT: st.b8 [%rd1+6], %rd13; +; CHECK-NEXT: shr.u64 %rd14, %rd4, 8; +; CHECK-NEXT: st.b8 [%rd1+5], %rd14; +; CHECK-NEXT: shr.u64 %rd15, %rd5, 24; ; CHECK-NEXT: st.b8 [%rd1+3], %rd15; -; CHECK-NEXT: shr.u64 %rd16, %rd2, 16; +; CHECK-NEXT: shr.u64 %rd16, %rd5, 16; ; CHECK-NEXT: st.b8 [%rd1+2], %rd16; -; CHECK-NEXT: shr.u64 %rd17, %rd2, 8; +; CHECK-NEXT: shr.u64 %rd17, %rd5, 8; ; CHECK-NEXT: st.b8 [%rd1+1], %rd17; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 1 @@ 
-232,17 +231,16 @@ define void @s1(ptr %p1, <4 x float> %v) { define void @s2(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1]; -; CHECK-NEXT: st.b32 [%rd1+8], %rd3; -; CHECK-NEXT: st.b32 [%rd1], %rd2; -; CHECK-NEXT: shr.u64 %rd4, %rd3, 32; -; CHECK-NEXT: st.b32 [%rd1+12], %rd4; -; CHECK-NEXT: shr.u64 %rd5, %rd2, 32; -; CHECK-NEXT: st.b32 [%rd1+4], %rd5; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s2_param_1]; +; CHECK-NEXT: st.b32 [%rd1+12], %r4; +; CHECK-NEXT: st.b32 [%rd1+8], %r3; +; CHECK-NEXT: st.b32 [%rd1+4], %r2; +; CHECK-NEXT: st.b32 [%rd1], %r1; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 4 ret void @@ -251,13 +249,14 @@ define void @s2(ptr %p1, <4 x float> %v) { define void @s3(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1]; -; CHECK-NEXT: st.b64 [%rd1+8], %rd3; -; CHECK-NEXT: st.b64 [%rd1], %rd2; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s3_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1+8], {%r3, %r4}; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2}; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 8 ret void @@ -266,12 +265,13 @@ define void @s3(ptr %p1, <4 x float> %v) { define void @s4(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s4( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1]; -; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s4_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 16 ret void diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll index 666c7a160e1f0..17220340d4b07 100644 --- a/llvm/test/CodeGen/NVPTX/mulwide.ll +++ b/llvm/test/CodeGen/NVPTX/mulwide.ll @@ -203,27 +203,35 @@ define i64 @mulwideu32(i32 %a, i32 %b) { define i64 @mulwideu7(i7 %a, i7 %b) { ; OPT-LABEL: mulwideu7( ; OPT: { -; OPT-NEXT: .reg .b32 %r<3>; +; OPT-NEXT: .reg .b32 %r<5>; ; OPT-NEXT: .reg .b64 %rd<2>; ; OPT-EMPTY: ; OPT-NEXT: // %bb.0: -; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_0]; -; OPT-NEXT: ld.param.b8 %r2, [mulwideu7_param_1]; -; OPT-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_1]; +; OPT-NEXT: and.b32 %r2, %r1, 127; +; OPT-NEXT: ld.param.b8 %r3, [mulwideu7_param_0]; +; OPT-NEXT: and.b32 %r4, %r3, 127; +; OPT-NEXT: mul.wide.u32 %rd1, %r4, %r2; ; OPT-NEXT: st.param.b64 [func_retval0], %rd1; ; OPT-NEXT: ret; ; ; NOOPT-LABEL: mulwideu7( ; NOOPT: { -; NOOPT-NEXT: .reg .b16 %rs<3>; +; NOOPT-NEXT: .reg .b16 %rs<9>; ; NOOPT-NEXT: .reg .b64 %rd<6>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: -; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu7_param_1]; -; NOOPT-NEXT: ld.param.b8 %rs1, [mulwideu7_param_0]; -; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1; +; NOOPT-NEXT: ld.param.b8 %rs3, [mulwideu7_param_0+1]; +; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8; +; NOOPT-NEXT: ld.param.b8 %rs5, [mulwideu7_param_0]; +; 
NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5; +; NOOPT-NEXT: ld.param.b8 %rs6, [mulwideu7_param_1+1]; +; NOOPT-NEXT: shl.b16 %rs7, %rs6, 8; +; NOOPT-NEXT: ld.param.b8 %rs8, [mulwideu7_param_1]; +; NOOPT-NEXT: or.b16 %rs2, %rs7, %rs8; +; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs5; ; NOOPT-NEXT: and.b64 %rd2, %rd1, 127; -; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2; +; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs8; ; NOOPT-NEXT: and.b64 %rd4, %rd3, 127; ; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4; ; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5; @@ -242,26 +250,32 @@ define i64 @mulwides7(i7 %a, i7 %b) { ; OPT-EMPTY: ; OPT-NEXT: // %bb.0: ; OPT-NEXT: ld.param.b8 %r1, [mulwides7_param_0]; -; OPT-NEXT: bfe.s32 %r2, %r1, 0, 7; -; OPT-NEXT: ld.param.b8 %r3, [mulwides7_param_1]; -; OPT-NEXT: bfe.s32 %r4, %r3, 0, 7; -; OPT-NEXT: mul.wide.s32 %rd1, %r2, %r4; +; OPT-NEXT: ld.param.b8 %r2, [mulwides7_param_1]; +; OPT-NEXT: bfe.s32 %r3, %r2, 0, 7; +; OPT-NEXT: bfe.s32 %r4, %r1, 0, 7; +; OPT-NEXT: mul.wide.s32 %rd1, %r4, %r3; ; OPT-NEXT: st.param.b64 [func_retval0], %rd1; ; OPT-NEXT: ret; ; ; NOOPT-LABEL: mulwides7( ; NOOPT: { -; NOOPT-NEXT: .reg .b16 %rs<3>; +; NOOPT-NEXT: .reg .b16 %rs<9>; ; NOOPT-NEXT: .reg .b64 %rd<6>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: -; NOOPT-NEXT: ld.param.b8 %rs2, [mulwides7_param_1]; -; NOOPT-NEXT: ld.param.b8 %rs1, [mulwides7_param_0]; -; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1; -; NOOPT-NEXT: bfe.s64 %rd2, %rd1, 0, 7; -; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2; -; NOOPT-NEXT: bfe.s64 %rd4, %rd3, 0, 7; -; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4; +; NOOPT-NEXT: ld.param.b8 %rs3, [mulwides7_param_0+1]; +; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8; +; NOOPT-NEXT: ld.param.b8 %rs5, [mulwides7_param_0]; +; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5; +; NOOPT-NEXT: ld.param.b8 %rs6, [mulwides7_param_1]; +; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs6; +; NOOPT-NEXT: cvt.u64.u16 %rd2, %rs5; +; NOOPT-NEXT: ld.param.b8 %rs7, [mulwides7_param_1+1]; +; NOOPT-NEXT: shl.b16 %rs8, %rs7, 8; +; NOOPT-NEXT: or.b16 %rs2, %rs8, %rs6; +; NOOPT-NEXT: bfe.s64 %rd3, %rd2, 0, 7; +; NOOPT-NEXT: bfe.s64 %rd4, %rd1, 0, 7; +; NOOPT-NEXT: mul.lo.s64 %rd5, %rd3, %rd4; ; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5; ; NOOPT-NEXT: ret; %val0 = sext i7 %a to i64 diff --git a/llvm/test/CodeGen/NVPTX/no-f32x2.ll b/llvm/test/CodeGen/NVPTX/no-f32x2.ll new file mode 100644 index 0000000000000..b2b909166a0c6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/no-f32x2.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_100 | FileCheck %s --check-prefix=F32X2 +; RUN: llc < %s -mcpu=sm_90 | FileCheck %s --check-prefix=NOF32X2 +; RUN: llc < %s -mcpu=sm_100 -nvptx-no-f32x2 | FileCheck %s --check-prefix=NOF32X2 + +target triple = "nvptx64-nvidia-cuda" + +define <2 x float> @test(<2 x float> %a, <2 x float> %b) { +; F32X2-LABEL: test( +; F32X2: { +; F32X2-NEXT: .reg .b64 %rd<4>; +; F32X2-EMPTY: +; F32X2-NEXT: // %bb.0: +; F32X2-NEXT: ld.param.b64 %rd1, [test_param_0]; +; F32X2-NEXT: ld.param.b64 %rd2, [test_param_1]; +; F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; F32X2-NEXT: ret; +; +; NOF32X2-LABEL: test( +; NOF32X2: { +; NOF32X2-NEXT: .reg .b32 %r<7>; +; NOF32X2-EMPTY: +; NOF32X2-NEXT: // %bb.0: +; NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_param_0]; +; NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_param_1]; +; NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; +; NOF32X2-NEXT: st.param.v2.b32 [func_retval0], 
{%r6, %r5}; +; NOF32X2-NEXT: ret; + %c = fadd <2 x float> %a, %b + ret <2 x float> %c +} diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index db3fbbc1d2c0f..90c8b921009b6 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -523,8 +523,7 @@ define <9 x half> @test_v9f16(<9 x half> %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i19( ; CHECK-NEXT: .param .b32 test_i19_param_0 -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0]; -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i19_param_0]; ; CHECK: .param .b32 param0; ; CHECK: .param .b32 retval0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; @@ -540,8 +539,7 @@ define i19 @test_i19(i19 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i23( ; CHECK-NEXT: .param .b32 test_i23_param_0 -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0]; -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i23_param_0]; ; CHECK: .param .b32 param0; ; CHECK: .param .b32 retval0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; @@ -557,8 +555,7 @@ define i23 @test_i23(i23 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i24( ; CHECK-NEXT: .param .b32 test_i24_param_0 -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2]; -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i24_param_0]; ; CHECK: .param .b32 param0; ; CHECK: .param .b32 retval0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; @@ -678,8 +675,7 @@ define float @test_f32(float %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i40( ; CHECK-NEXT: .param .b64 test_i40_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i40_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -695,8 +691,7 @@ define i40 @test_i40(i40 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i47( ; CHECK-NEXT: .param .b64 test_i47_param_0 -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i47_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -712,8 +707,7 @@ define i47 @test_i47(i47 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i48( ; CHECK-NEXT: .param .b64 test_i48_param_0 -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i48_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -729,9 +723,7 @@ define i48 @test_i48(i48 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i51( ; CHECK-NEXT: .param .b64 test_i51_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i51_param_0+6]; -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i51_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], 
{{%rd[0-9]+}}; @@ -747,9 +739,7 @@ define i51 @test_i51(i51 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i56( ; CHECK-NEXT: .param .b64 test_i56_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i56_param_0+6]; -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i56_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll index 95258f7a3f360..f56b8eb98077c 100644 --- a/llvm/test/CodeGen/NVPTX/pr126337.ll +++ b/llvm/test/CodeGen/NVPTX/pr126337.ll @@ -17,17 +17,16 @@ define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %.preheader15 -; CHECK-NEXT: ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0]; -; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0]; ; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000; ; CHECK-NEXT: selp.b16 %rs1, 1, 0, %p1; ; CHECK-NEXT: $L__BB0_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov.b64 %rd2, 0; -; CHECK-NEXT: st.b8 [%rd2], %rs1; +; CHECK-NEXT: mov.b64 %rd1, 0; +; CHECK-NEXT: st.b8 [%rd1], %rs1; ; CHECK-NEXT: bra.uni $L__BB0_1; .preheader15: br label %1 diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll index c78fcddb7ed0f..153d677058d9f 100644 --- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll +++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll @@ -25,11 +25,11 @@ define float @test_gv_float() { define <2 x float> @test_gv_float2() { ; CHECK-LABEL: test_gv_float2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.nc.b64 %rd1, [gv_float2]; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [gv_float2]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; ; CHECK-NEXT: ret; %v = load <2 x float>, ptr @gv_float2 ret <2 x float> %v @@ -38,11 +38,11 @@ define <2 x float> @test_gv_float2() { define <4 x float> @test_gv_float4() { ; CHECK-LABEL: test_gv_float4( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.nc.v2.b64 {%rd1, %rd2}, [gv_float4]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [gv_float4]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %v = load <4 x float>, ptr @gv_float4 ret <4 x float> %v diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index 94c2637ea7509..f286928da4481 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -86,28 +86,46 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) { } define float @reduce_fadd_float(<8 x float> %in) { -; CHECK-LABEL: reduce_fadd_float( -; CHECK: { -; CHECK-NEXT: 
.reg .b32 %r<17>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; -; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r16; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<17>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, 0f00000000; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r3; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<17>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-SM100-NEXT: add.rn.f32 %r9, %r7, 0f00000000; +; CHECK-SM100-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-SM100-NEXT: add.rn.f32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, %r6; +; CHECK-SM100-NEXT: add.rn.f32 %r13, %r12, %r3; +; CHECK-SM100-NEXT: add.rn.f32 %r14, %r13, %r4; +; CHECK-SM100-NEXT: add.rn.f32 %r15, %r14, %r1; +; CHECK-SM100-NEXT: add.rn.f32 %r16, %r15, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res } @@ -116,20 +134,15 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fadd_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<17>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: add.rn.f32 %r5, %r4, %r2; -; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r7; -; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r5; -; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r1; -; 
CHECK-SM80-NEXT: add.rn.f32 %r13, %r8, %r6; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r1, %r5; ; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r12; ; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r11; ; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; @@ -272,27 +285,44 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) { } define float @reduce_fmul_float(<8 x float> %in) { -; CHECK-LABEL: reduce_fmul_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; -; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r3; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-SM100-NEXT: mul.rn.f32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mul.rn.f32 %r10, %r9, %r5; +; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r10, %r6; +; CHECK-SM100-NEXT: mul.rn.f32 %r12, %r11, %r3; +; CHECK-SM100-NEXT: mul.rn.f32 %r13, %r12, %r4; +; CHECK-SM100-NEXT: mul.rn.f32 %r14, %r13, %r1; +; CHECK-SM100-NEXT: mul.rn.f32 %r15, %r14, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res } @@ -301,20 +331,15 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmul_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: 
.reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r4, %r2; -; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r7; -; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r5; -; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r1; -; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r8, %r6; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r1, %r5; ; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r12; ; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r11; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; @@ -495,15 +520,10 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmax_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; ; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; @@ -540,15 +560,10 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmax_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; ; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; @@ -620,15 +635,10 @@ define float @reduce_fmax_float_nnan(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmax_float_nnan( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_nnan_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 
{%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_nnan_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_nnan_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_param_0]; ; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; @@ -809,15 +819,10 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmin_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; ; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; @@ -854,15 +859,10 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmin_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; ; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; @@ -934,15 +934,10 @@ define float @reduce_fmin_float_nnan(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmin_float_nnan( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_nnan_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_nnan_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_nnan_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_param_0]; ; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; @@ -1078,15 +1073,10 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmaximum_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // 
%bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; ; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; @@ -1123,15 +1113,10 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; ; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; @@ -1267,15 +1252,10 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fminimum_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; ; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; @@ -1312,15 +1292,10 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; ; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, 
%r6; ; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index 29939e323b4b1..3c424c9318375 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -7,17 +7,17 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define <16 x float> @test_v16f32(<16 x float> %a) { ; CHECK-LABEL: test_v16f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-NEXT: .reg .b32 %r<17>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32]; -; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16f32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16f32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [test_v16f32_param_0+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [test_v16f32_param_0+48]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+48], {%r13, %r14, %r15, %r16}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+32], {%r9, %r10, %r11, %r12}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; ret <16 x float> %a } @@ -25,13 +25,13 @@ define <16 x float> @test_v16f32(<16 x float> %a) { define <8 x float> @test_v8f32(<8 x float> %a) { ; CHECK-LABEL: test_v8f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v8f32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v8f32_param_0+16]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; ret <8 x float> %a } @@ -39,11 +39,11 @@ define <8 x float> @test_v8f32(<8 x float> %a) { define <4 x float> @test_v4f32(<4 x float> %a) { ; CHECK-LABEL: test_v4f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v4f32_param_0]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; ret <4 x float> %a } @@ -51,11 +51,11 @@ define <4 x float> @test_v4f32(<4 x float> %a) { define <2 x float> @test_v2f32(<2 x float> %a) { ; CHECK-LABEL: test_v2f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; -; CHECK-NEXT: st.param.b64 
[func_retval0], %rd1; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; ; CHECK-NEXT: ret; ret <2 x float> %a } @@ -64,14 +64,13 @@ define <2 x float> @test_v2f32(<2 x float> %a) { define <3 x float> @test_v3f32(<3 x float> %a) { ; CHECK-LABEL: test_v3f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8]; -; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8]; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; ; CHECK-NEXT: ret; ret <3 x float> %a } diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index 6f0dff78d5569..ccac7ff8e6472 100644 --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -206,18 +206,18 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_global_a16_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_global_a16_param_1]; ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; CHECK-NEXT: cvt.f32.f16 %r9, %rs6; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs5; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs8; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs7; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r5, %rs8; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs7; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs6; +; CHECK-NEXT: cvt.f32.f16 %r8, %rs5; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; ; CHECK-NEXT: st.global.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9}; ; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; @@ -270,18 +270,18 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_generic_a16_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_generic_a16_param_1]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; CHECK-NEXT: cvt.f32.f16 %r9, %rs6; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs5; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs8; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs7; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r5, %rs8; +; 
CHECK-NEXT: cvt.f32.f16 %r6, %rs7; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs6; +; CHECK-NEXT: cvt.f32.f16 %r8, %rs5; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; ; CHECK-NEXT: st.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9}; ; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll index d07c740d32a72..b9bb417aa2c37 100644 --- a/llvm/test/CodeGen/NVPTX/vector-stores.ll +++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll @@ -5,12 +5,13 @@ define void @foo1(<2 x float> %val, ptr %ptr) { ; CHECK-LABEL: foo1( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0]; -; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1]; -; CHECK-NEXT: st.b64 [%rd2], %rd1; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2}; ; CHECK-NEXT: ret; store <2 x float> %val, ptr %ptr ret void @@ -19,12 +20,13 @@ define void @foo1(<2 x float> %val, ptr %ptr) { define void @foo2(<4 x float> %val, ptr %ptr) { ; CHECK-LABEL: foo2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0]; -; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1]; -; CHECK-NEXT: st.v2.b64 [%rd3], {%rd1, %rd2}; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo2_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; store <4 x float> %val, ptr %ptr ret void