diff --git a/clang/test/CodeGenCUDA/device-fun-linkage.cu b/clang/test/CodeGenCUDA/device-fun-linkage.cu index d8ad6d438be9c9..d1b9db261151b8 100644 --- a/clang/test/CodeGenCUDA/device-fun-linkage.cu +++ b/clang/test/CodeGenCUDA/device-fun-linkage.cu @@ -1,32 +1,19 @@ -// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -emit-llvm -o - %s \ // RUN: | FileCheck -check-prefix=NORDC %s -// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=NORDC-NEG %s -// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -fgpu-rdc -emit-llvm -o - %s \ // RUN: | FileCheck -check-prefix=RDC %s -// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=RDC-NEG %s #include "Inputs/cuda.h" +// NORDC: define internal void @_Z4funcIiEvv() +// NORDC: define{{.*}} void @_Z6kernelIiEvv() +// RDC: define weak_odr void @_Z4funcIiEvv() +// RDC: define weak_odr void @_Z6kernelIiEvv() + template __device__ void func() {} template __global__ void kernel() {} template __device__ void func(); -// NORDC: define internal void @_Z4funcIiEvv() -// RDC: define weak_odr void @_Z4funcIiEvv() - template __global__ void kernel(); -// NORDC: define void @_Z6kernelIiEvv() -// RDC: define weak_odr void @_Z6kernelIiEvv() - -// Ensure that unused static device function is eliminated -static __device__ void static_func() {} -// NORDC-NEG-NOT: define{{.*}} void @_ZL13static_funcv() -// RDC-NEG-NOT: define{{.*}} void @_ZL13static_funcv() - -// Ensure that kernel function has external or weak_odr -// linkage regardless static specifier -static __global__ void static_kernel() {} -// NORDC: define void @_ZL13static_kernelv() -// RDC: define weak_odr void @_ZL13static_kernelv() diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index e8322a0a8425ba..2516dff52efdfa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -329,7 +329,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const auto *TLI = cast(STI.getTargetLowering()); + const TargetLowering *TLI = STI.getTargetLowering(); Type *Ty = F->getReturnType(); @@ -363,7 +363,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!getAlign(*F, 0, retAlignment)) - retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value(); + retAlignment = DL.getABITypeAlignment(Ty); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; } else @@ -1348,8 +1348,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const AttributeList &PAL = F->getAttributes(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const auto *TLI = cast(STI.getTargetLowering()); - + const TargetLowering *TLI = STI.getTargetLowering(); Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; @@ -1406,24 +1405,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } } - auto getOptimalAlignForParam = [TLI, &DL, &PAL, F, - paramIndex](Type *Ty) -> Align { - Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL); - MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex); - return max(TypeAlign, ParamAlign); - }; - if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // Just print .param .align .b8 .param[size]; - // = optimal alignment for the element type; always multiple of - // PAL.getParamAlignment + // = PAL.getparamalignment // size = typeallocsize of element type - Align OptimalAlign = getOptimalAlignForParam(Ty); + const Align align = DL.getValueOrABITypeAlignment( + PAL.getParamAlignment(paramIndex), Ty); - O << "\t.param .align " << OptimalAlign.value() << " .b8 "; + unsigned sz = DL.getTypeAllocSize(Ty); + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); - O << "[" << DL.getTypeAllocSize(Ty) << "]"; + O << "[" << sz << "]"; continue; } @@ -1499,11 +1492,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (isABI || isKernelFunc) { // Just print .param .align .b8 .param[size]; - // = optimal alignment for the element type; always multiple of - // PAL.getParamAlignment + // = PAL.getparamalignment // size = typeallocsize of element type - Align OptimalAlign = getOptimalAlignForParam(ETy); - + Align align = + DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. Alas on sm_50+ ptxas generates @@ -1515,10 +1507,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && OptimalAlign < Align(4)) - OptimalAlign = Align(4); + if (!isKernelFunc && align < Align(4)) + align = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << OptimalAlign.value() << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 382e83dbb4cb91..2cda034f047c15 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1302,8 +1302,8 @@ std::string NVPTXTargetLowering::getPrototype( bool first = true; - const Function *F = CB.getFunction(); - for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) { + unsigned OIdx = 0; + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { Type *Ty = Args[i].Ty; if (!first) { O << ", "; @@ -1312,14 +1312,15 @@ std::string NVPTXTargetLowering::getPrototype( if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - unsigned ParamAlign = 0; + unsigned align = 0; const CallInst *CallI = cast(&CB); // +1 because index 0 is reserved for return type alignment - if (!getAlign(*CallI, i + 1, ParamAlign)) - ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); - O << ".param .align " << ParamAlign << " .b8 "; + if (!getAlign(*CallI, i + 1, align)) + align = DL.getABITypeAlignment(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); + O << ".param .align " << align << " .b8 "; O << "_"; - O << "[" << DL.getTypeAllocSize(Ty) << "]"; + O << "[" << sz << "]"; // update the index for Outs SmallVector vtparts; ComputeValueVTs(*this, DL, Ty, vtparts); @@ -1351,17 +1352,11 @@ std::string NVPTXTargetLowering::getPrototype( continue; } - Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); - - // Try to increase alignment. This code matches logic in LowerCall when - // alignment increase is performed to increase vectorization options. - Type *ETy = Args[i].IndirectType; - Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); - ParamByValAlign = std::max(ParamByValAlign, AlignCandidate); - - O << ".param .align " << ParamByValAlign.value() << " .b8 "; + Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); + unsigned sz = Outs[OIdx].Flags.getByValSize(); + O << ".param .align " << align.value() << " .b8 "; O << "_"; - O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; + O << "[" << sz << "]"; } O << ");"; return O.str(); @@ -1408,15 +1403,12 @@ Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function - if (DirectCallee) { + if (DirectCallee) if (getAlign(*DirectCallee, Idx, Alignment)) return Align(Alignment); - // If alignment information is not available, fall back to the - // default function param optimized type alignment - return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); - } - // Call is indirect, fall back to the ABI type alignment + // Call is indirect or alignment information is not available, fall back to + // the ABI type alignment return DL.getABITypeAlign(Ty); } @@ -1577,26 +1569,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // ByVal arguments - // TODO: remove code duplication when handling byval and non-byval cases. SmallVector VTs; SmallVector Offsets; - Type *ETy = Args[i].IndirectType; - assert(ETy && "byval arg must have indirect type"); - ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, 0); + assert(Args[i].IndirectType && "byval arg must have indirect type"); + ComputePTXValueVTs(*this, DL, Args[i].IndirectType, VTs, &Offsets, 0); // declare .param .align .b8 .param[]; unsigned sz = Outs[OIdx].Flags.getByValSize(); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - + Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, // so we don't need to worry about natural alignment or not. // See TargetLowering::LowerCallTo(). - Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); - - // Try to increase alignment to enhance vectorization options. - const Function *F = CB->getCalledFunction(); - Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); - ArgAlign = std::max(ArgAlign, AlignCandidate); // Enforce minumum alignment of 4 to work around ptxas miscompile // for sm_50+. See corresponding alignment adjustment in @@ -1610,67 +1594,29 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - - auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); - SmallVector StoreOperands; for (unsigned j = 0, je = VTs.size(); j != je; ++j) { EVT elemtype = VTs[j]; int curOffset = Offsets[j]; - Align PartAlign = commonAlignment(ArgAlign, curOffset); - - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant(curOffset, dl, MVT::i32)); - } - + unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset); auto PtrVT = getPointerTy(DL); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(), PartAlign); - if (elemtype.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(curOffset, dl, MVT::i32), + theVal, InFlag }; + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, + MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); - // Record the value to store. - StoreOperands.push_back(theVal); - - if (VectorInfo[j] & PVF_LAST) { - unsigned NumElts = StoreOperands.size() - 3; - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } - - StoreOperands.push_back(InFlag); - - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - elemtype, MachinePointerInfo(), PartAlign, - MachineMemOperand::MOStore); - InFlag = Chain.getValue(1); - - // Cleanup. - StoreOperands.clear(); - } + InFlag = Chain.getValue(1); } - assert(StoreOperands.empty() && "Unfinished parameter store."); ++paramCount; } @@ -2671,8 +2617,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - const MachineFunction &MF = DAG.getMachineFunction(); - const Function &F = MF.getFunction(); + MachineFunction &MF = DAG.getMachineFunction(); Type *RetTy = MF.getFunction().getReturnType(); bool isABI = (STI.getSmVersion() >= 20); @@ -2687,9 +2632,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, - RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) - : Align(1)); + VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -4309,55 +4252,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return false; } -/// getFunctionParamOptimizedAlign - since function arguments are passed via -/// .param space, we may want to increase their alignment in a way that -/// ensures that we can effectively vectorize their loads & stores. We can -/// increase alignment only if the function has internal or has private -/// linkage as for other linkage types callers may already rely on default -/// alignment. To allow using 128-bit vectorized loads/stores, this function -/// ensures that alignment is 16 or greater. -Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( - const Function *F, Type *ArgTy, const DataLayout &DL) const { - const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); - - // If a function has linkage different from internal or private, we - // must use default ABI alignment as external users rely on it. - switch (F->getLinkage()) { - case GlobalValue::InternalLinkage: - case GlobalValue::PrivateLinkage: { - // Check that if a function has internal or private linkage - // it is not a kernel. -#ifndef NDEBUG - const NamedMDNode *NMDN = - F->getParent()->getNamedMetadata("nvvm.annotations"); - if (NMDN) { - for (const MDNode *MDN : NMDN->operands()) { - assert(MDN->getNumOperands() == 3); - - const Metadata *MD0 = MDN->getOperand(0).get(); - const auto *MDV0 = cast(MD0)->getValue(); - const auto *MDFn = cast(MDV0); - if (MDFn != F) - continue; - - const Metadata *MD1 = MDN->getOperand(1).get(); - const MDString *MDStr = cast(MD1); - if (MDStr->getString() != "kernel") - continue; - - const Metadata *MD2 = MDN->getOperand(2).get(); - const auto *MDV2 = cast(MD2)->getValue(); - assert(!cast(MDV2)->isZero()); - } - } -#endif - return Align(std::max(uint64_t(16), ABITypeAlign)); - } - default: - return Align(ABITypeAlign); - } -} - /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. /// Used to guide target specific optimizations, like loop strength reduction diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 18a697deacb44d..13829b924d4b43 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -451,16 +451,6 @@ class NVPTXTargetLowering : public TargetLowering { MachineFunction &MF, unsigned Intrinsic) const override; - /// getFunctionParamOptimizedAlign - since function arguments are passed via - /// .param space, we may want to increase their alignment in a way that - /// ensures that we can effectively vectorize their loads & stores. We can - /// increase alignment only if the function has internal or has private - /// linkage as for other linkage types callers may already rely on default - /// alignment. To allow using 128-bit vectorized loads/stores, this function - /// ensures that alignment is 16 or greater. - Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, - const DataLayout &DL) const; - /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type /// Used to guide target specific optimizations, like loop strength diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 19b04f49d76cac..6183019de43df6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -88,17 +88,16 @@ // cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// -#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include #define DEBUG_TYPE "nvptx-lower-args" @@ -227,90 +226,6 @@ static void convertToParamAS(Value *OldUser, Value *Param) { [](Instruction *I) { I->eraseFromParent(); }); } -// Adjust alignment of arguments passed byval in .param address space. We can -// increase alignment of such arguments in a way that ensures that we can -// effectively vectorize their loads. We should also traverse all loads from -// byval pointer and adjust their alignment, if those were using known offset. -// Such alignment changes must be conformed with parameter store and load in -// NVPTXTargetLowering::LowerCall. -static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, - const NVPTXTargetLowering *TLI) { - Function *Func = Arg->getParent(); - Type *StructType = Arg->getParamByValType(); - const DataLayout DL(Func->getParent()); - - uint64_t NewArgAlign = - TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value(); - uint64_t CurArgAlign = - Arg->getAttribute(Attribute::Alignment).getValueAsInt(); - - if (CurArgAlign >= NewArgAlign) - return; - - LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of " - << CurArgAlign << " for " << *Arg << '\n'); - - auto NewAlignAttr = - Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign); - Arg->removeAttr(Attribute::Alignment); - Arg->addAttr(NewAlignAttr); - - struct Load { - LoadInst *Inst; - uint64_t Offset; - }; - - struct LoadContext { - Value *InitialVal; - uint64_t Offset; - }; - - SmallVector Loads; - std::queue Worklist; - Worklist.push({ArgInParamAS, 0}); - - while (!Worklist.empty()) { - LoadContext Ctx = Worklist.front(); - Worklist.pop(); - - for (User *CurUser : Ctx.InitialVal->users()) { - if (auto *I = dyn_cast(CurUser)) { - Loads.push_back({I, Ctx.Offset}); - continue; - } - - if (auto *I = dyn_cast(CurUser)) { - Worklist.push({I, Ctx.Offset}); - continue; - } - - if (auto *I = dyn_cast(CurUser)) { - APInt OffsetAccumulated = - APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM)); - - if (!I->accumulateConstantOffset(DL, OffsetAccumulated)) - continue; - - uint64_t OffsetLimit = -1; - uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit); - assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX"); - - Worklist.push({I, Ctx.Offset + Offset}); - continue; - } - - llvm_unreachable("All users must be one of: load, " - "bitcast, getelementptr."); - } - } - - for (Load &CurLoad : Loads) { - Align NewLoadAlign(greatestCommonDivisor(NewArgAlign, CurLoad.Offset)); - Align CurLoadAlign(CurLoad.Inst->getAlign()); - CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign)); - } -} - void NVPTXLowerArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); Instruction *FirstInst = &(Func->getEntryBlock().front()); @@ -355,16 +270,6 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { convertToParamAS(V, ArgInParamAS); }); LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n"); - - // Further optimizations require target lowering info. - if (!TM) - return; - - const auto *TLI = - cast(TM->getSubtargetImpl()->getTargetLowering()); - - adjustByValArgAlignment(Arg, ArgInParamAS, TLI); - return; } diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll deleted file mode 100644 index 681faf0bfa5343..00000000000000 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ /dev/null @@ -1,801 +0,0 @@ -; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s -; -; Check that parameters of a __device__ function with private or internal -; linkage called from a __global__ (kernel) function get increased alignment, -; and additional vectorization is performed on loads/stores with that -; parameters. -; -; Test IR is a minimized version of IR generated with the following command -; from the source code below: -; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu -; -; ---------------------------------------------------------------------------- -; #include -; -; struct St4x1 { uint32_t field[1]; }; -; struct St4x2 { uint32_t field[2]; }; -; struct St4x3 { uint32_t field[3]; }; -; struct St4x4 { uint32_t field[4]; }; -; struct St4x5 { uint32_t field[5]; }; -; struct St4x6 { uint32_t field[6]; }; -; struct St4x7 { uint32_t field[7]; }; -; struct St4x8 { uint32_t field[8]; }; -; struct St8x1 { uint64_t field[1]; }; -; struct St8x2 { uint64_t field[2]; }; -; struct St8x3 { uint64_t field[3]; }; -; struct St8x4 { uint64_t field[4]; }; -; -; #define DECLARE_CALLEE(StName) \ -; static __device__ __attribute__((noinline)) \ -; struct StName callee_##StName(struct StName in) { \ -; struct StName ret; \ -; const unsigned size = sizeof(ret.field) / sizeof(*ret.field); \ -; for (unsigned i = 0; i != size; ++i) \ -; ret.field[i] = in.field[i]; \ -; return ret; \ -; } \ - -; #define DECLARE_CALLER(StName) \ -; __global__ \ -; void caller_##StName(struct StName in, struct StName* ret) \ -; { \ -; *ret = callee_##StName(in); \ -; } \ -; -; #define DECLARE_CALL(StName) \ -; DECLARE_CALLEE(StName) \ -; DECLARE_CALLER(StName) \ -; -; DECLARE_CALL(St4x1) -; DECLARE_CALL(St4x2) -; DECLARE_CALL(St4x3) -; DECLARE_CALL(St4x4) -; DECLARE_CALL(St4x5) -; DECLARE_CALL(St4x6) -; DECLARE_CALL(St4x7) -; DECLARE_CALL(St4x8) -; DECLARE_CALL(St8x1) -; DECLARE_CALL(St8x2) -; DECLARE_CALL(St8x3) -; DECLARE_CALL(St8x4) -; ---------------------------------------------------------------------------- - -%struct.St4x1 = type { [1 x i32] } -%struct.St4x2 = type { [2 x i32] } -%struct.St4x3 = type { [3 x i32] } -%struct.St4x4 = type { [4 x i32] } -%struct.St4x5 = type { [5 x i32] } -%struct.St4x6 = type { [6 x i32] } -%struct.St4x7 = type { [7 x i32] } -%struct.St4x8 = type { [8 x i32] } -%struct.St8x1 = type { [1 x i64] } -%struct.St8x2 = type { [2 x i64] } -%struct.St8x3 = type { [3 x i64] } -%struct.St8x4 = type { [4 x i64] } - -; Section 1 - checking that: -; - function argument (including retval) vectorization is done with internal linkage; -; - caller and callee specify correct alignment for callee's params. - -define dso_local void @caller_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x1( - ; CHECK: .param .align 4 .b8 caller_St4x1_param_0[4], - ; CHECK: .param .b32 caller_St4x1_param_1 - ; CHECK: ) - ; CHECK: .param .b32 param0; - ; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[4]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x1, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; - %1 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0 - %2 = load i32, i32* %1, align 4 - %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %2) - %.fca.0.extract = extractvalue [1 x i32] %call, 0 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - ret void -} - -define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[4]) - ; CHECK-LABEL: callee_St4x1( - ; CHECK-NEXT: .param .b32 callee_St4x1_param_0 - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0]; - ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; - ; CHECK-NEXT: ret; - %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0 - ret [1 x i32] %oldret -} - -define dso_local void @caller_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x2( - ; CHECK: .param .align 4 .b8 caller_St4x2_param_0[8], - ; CHECK: .param .b32 caller_St4x2_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[8]; - ; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[8]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x2, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - %agg.tmp = alloca i64, align 8 - %tmpcast = bitcast i64* %agg.tmp to %struct.St4x2* - %1 = bitcast %struct.St4x2* %in to i64* - %2 = load i64, i64* %1, align 4 - store i64 %2, i64* %agg.tmp, align 8 - %call = tail call fastcc [2 x i32] @callee_St4x2(%struct.St4x2* noundef nonnull byval(%struct.St4x2) align 4 %tmpcast) - %.fca.0.extract = extractvalue [2 x i32] %call, 0 - %.fca.1.extract = extractvalue [2 x i32] %call, 1 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4 - ret void -} - -define internal fastcc [2 x i32] @callee_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) - ; CHECK-LABEL: callee_St4x2( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x2_param_0[8] - ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0]; - ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %3 = insertvalue [2 x i32] poison, i32 %1, 0 - %oldret = insertvalue [2 x i32] %3, i32 %2, 1 - ret [2 x i32] %oldret -} - -define dso_local void @caller_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x3( - ; CHECK: .param .align 4 .b8 caller_St4x3_param_0[12], - ; CHECK: .param .b32 caller_St4x3_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[12]; - ; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[12]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x3, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8]; - %call = tail call fastcc [3 x i32] @callee_St4x3(%struct.St4x3* noundef nonnull byval(%struct.St4x3) align 4 %in) - %.fca.0.extract = extractvalue [3 x i32] %call, 0 - %.fca.1.extract = extractvalue [3 x i32] %call, 1 - %.fca.2.extract = extractvalue [3 x i32] %call, 2 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 - %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2 - store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 - ret void -} - - -define internal fastcc [3 x i32] @callee_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) - ; CHECK-LABEL: callee_St4x3( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; - ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; - ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %4 = insertvalue [3 x i32] poison, i32 %1, 0 - %5 = insertvalue [3 x i32] %4, i32 %2, 1 - %oldret = insertvalue [3 x i32] %5, i32 %3, 2 - ret [3 x i32] %oldret -} - - -define dso_local void @caller_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x4( - ; CHECK: .param .align 4 .b8 caller_St4x4_param_0[16], - ; CHECK: .param .b32 caller_St4x4_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[16]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x4, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - %call = tail call fastcc [4 x i32] @callee_St4x4(%struct.St4x4* noundef nonnull byval(%struct.St4x4) align 4 %in) - %.fca.0.extract = extractvalue [4 x i32] %call, 0 - %.fca.1.extract = extractvalue [4 x i32] %call, 1 - %.fca.2.extract = extractvalue [4 x i32] %call, 2 - %.fca.3.extract = extractvalue [4 x i32] %call, 3 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4 - %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2 - store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4 - %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3 - store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4 - ret void -} - - -define internal fastcc [4 x i32] @callee_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) - ; CHECK-LABEL: callee_St4x4( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_param_0[16] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %5 = insertvalue [4 x i32] poison, i32 %1, 0 - %6 = insertvalue [4 x i32] %5, i32 %2, 1 - %7 = insertvalue [4 x i32] %6, i32 %3, 2 - %oldret = insertvalue [4 x i32] %7, i32 %4, 3 - ret [4 x i32] %oldret -} - - -define dso_local void @caller_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x5( - ; CHECK: .param .align 4 .b8 caller_St4x5_param_0[20], - ; CHECK: .param .b32 caller_St4x5_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[20]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[20]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x5, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16]; - %call = tail call fastcc [5 x i32] @callee_St4x5(%struct.St4x5* noundef nonnull byval(%struct.St4x5) align 4 %in) - %.fca.0.extract = extractvalue [5 x i32] %call, 0 - %.fca.1.extract = extractvalue [5 x i32] %call, 1 - %.fca.2.extract = extractvalue [5 x i32] %call, 2 - %.fca.3.extract = extractvalue [5 x i32] %call, 3 - %.fca.4.extract = extractvalue [5 x i32] %call, 4 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4 - %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2 - store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4 - %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3 - store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4 - %ref.tmp.sroa.7.0..sroa_idx9 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4 - store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx9, align 4 - ret void -} - - -define internal fastcc [5 x i32] @callee_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[20]) - ; CHECK-LABEL: callee_St4x5( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x5_param_0[20] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; - ; CHECK: st.param.b32 [func_retval0+16], [[R5]]; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %6 = insertvalue [5 x i32] poison, i32 %1, 0 - %7 = insertvalue [5 x i32] %6, i32 %2, 1 - %8 = insertvalue [5 x i32] %7, i32 %3, 2 - %9 = insertvalue [5 x i32] %8, i32 %4, 3 - %oldret = insertvalue [5 x i32] %9, i32 %5, 4 - ret [5 x i32] %oldret -} - - -define dso_local void @caller_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x6( - ; CHECK: .param .align 4 .b8 caller_St4x6_param_0[24], - ; CHECK: .param .b32 caller_St4x6_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[24]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[24]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x6, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; - %call = tail call fastcc [6 x i32] @callee_St4x6(%struct.St4x6* noundef nonnull byval(%struct.St4x6) align 4 %in) - %.fca.0.extract = extractvalue [6 x i32] %call, 0 - %.fca.1.extract = extractvalue [6 x i32] %call, 1 - %.fca.2.extract = extractvalue [6 x i32] %call, 2 - %.fca.3.extract = extractvalue [6 x i32] %call, 3 - %.fca.4.extract = extractvalue [6 x i32] %call, 4 - %.fca.5.extract = extractvalue [6 x i32] %call, 5 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 - %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2 - store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 - %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3 - store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4 - %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4 - store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4 - %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5 - store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4 - ret void -} - - -define internal fastcc [6 x i32] @callee_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) - ; CHECK-LABEL: callee_St4x6( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x6_param_0[24] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0]; - ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; - ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5 - %6 = load i32, i32* %arrayidx.5, align 4 - %7 = insertvalue [6 x i32] poison, i32 %1, 0 - %8 = insertvalue [6 x i32] %7, i32 %2, 1 - %9 = insertvalue [6 x i32] %8, i32 %3, 2 - %10 = insertvalue [6 x i32] %9, i32 %4, 3 - %11 = insertvalue [6 x i32] %10, i32 %5, 4 - %oldret = insertvalue [6 x i32] %11, i32 %6, 5 - ret [6 x i32] %oldret -} - - -define dso_local void @caller_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x7( - ; CHECK: .param .align 4 .b8 caller_St4x7_param_0[28], - ; CHECK: .param .b32 caller_St4x7_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[28]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[28]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x7, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; - ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+24]; - %call = tail call fastcc [7 x i32] @callee_St4x7(%struct.St4x7* noundef nonnull byval(%struct.St4x7) align 4 %in) - %.fca.0.extract = extractvalue [7 x i32] %call, 0 - %.fca.1.extract = extractvalue [7 x i32] %call, 1 - %.fca.2.extract = extractvalue [7 x i32] %call, 2 - %.fca.3.extract = extractvalue [7 x i32] %call, 3 - %.fca.4.extract = extractvalue [7 x i32] %call, 4 - %.fca.5.extract = extractvalue [7 x i32] %call, 5 - %.fca.6.extract = extractvalue [7 x i32] %call, 6 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 - %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2 - store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 - %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3 - store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4 - %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4 - store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4 - %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5 - store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4 - %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6 - store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4 - ret void -} - - -define internal fastcc [7 x i32] @callee_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[28]) - ; CHECK-LABEL: callee_St4x7( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; - ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; - ; CHECK: st.param.b32 [func_retval0+24], [[R7]]; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5 - %6 = load i32, i32* %arrayidx.5, align 4 - %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6 - %7 = load i32, i32* %arrayidx.6, align 4 - %8 = insertvalue [7 x i32] poison, i32 %1, 0 - %9 = insertvalue [7 x i32] %8, i32 %2, 1 - %10 = insertvalue [7 x i32] %9, i32 %3, 2 - %11 = insertvalue [7 x i32] %10, i32 %4, 3 - %12 = insertvalue [7 x i32] %11, i32 %5, 4 - %13 = insertvalue [7 x i32] %12, i32 %6, 5 - %oldret = insertvalue [7 x i32] %13, i32 %7, 6 - ret [7 x i32] %oldret -} - - -define dso_local void @caller_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St4x8( - ; CHECK: .param .align 4 .b8 caller_St4x8_param_0[32], - ; CHECK: .param .b32 caller_St4x8_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[32]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[32]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x8, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; - %call = tail call fastcc [8 x i32] @callee_St4x8(%struct.St4x8* noundef nonnull byval(%struct.St4x8) align 4 %in) - %.fca.0.extract = extractvalue [8 x i32] %call, 0 - %.fca.1.extract = extractvalue [8 x i32] %call, 1 - %.fca.2.extract = extractvalue [8 x i32] %call, 2 - %.fca.3.extract = extractvalue [8 x i32] %call, 3 - %.fca.4.extract = extractvalue [8 x i32] %call, 4 - %.fca.5.extract = extractvalue [8 x i32] %call, 5 - %.fca.6.extract = extractvalue [8 x i32] %call, 6 - %.fca.7.extract = extractvalue [8 x i32] %call, 7 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0 - store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 - %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1 - store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 - %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2 - store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 - %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3 - store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4 - %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4 - store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4 - %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5 - store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4 - %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6 - store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4 - %ref.tmp.sroa.10.0..sroa_idx14 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7 - store i32 %.fca.7.extract, i32* %ref.tmp.sroa.10.0..sroa_idx14, align 4 - ret void -} - - -define internal fastcc [8 x i32] @callee_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) - ; CHECK-LABEL: callee_St4x8( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x8_param_0[32] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0]; - ; CHECK: ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; - ; CHECK: st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5 - %6 = load i32, i32* %arrayidx.5, align 4 - %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6 - %7 = load i32, i32* %arrayidx.6, align 4 - %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7 - %8 = load i32, i32* %arrayidx.7, align 4 - %9 = insertvalue [8 x i32] poison, i32 %1, 0 - %10 = insertvalue [8 x i32] %9, i32 %2, 1 - %11 = insertvalue [8 x i32] %10, i32 %3, 2 - %12 = insertvalue [8 x i32] %11, i32 %4, 3 - %13 = insertvalue [8 x i32] %12, i32 %5, 4 - %14 = insertvalue [8 x i32] %13, i32 %6, 5 - %15 = insertvalue [8 x i32] %14, i32 %7, 6 - %oldret = insertvalue [8 x i32] %15, i32 %8, 7 - ret [8 x i32] %oldret -} - - -define dso_local void @caller_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St8x1( - ; CHECK: .param .align 8 .b8 caller_St8x1_param_0[8], - ; CHECK: .param .b32 caller_St8x1_param_1 - ; CHECK: ) - ; CHECK: .param .b64 param0; - ; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[8]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x1, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; - %1 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0 - %2 = load i64, i64* %1, align 8 - %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %2) - %.fca.0.extract = extractvalue [1 x i64] %call, 0 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0 - store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 - ret void -} - - -define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) - ; CHECK-LABEL: callee_St8x1( - ; CHECK-NEXT: .param .b64 callee_St8x1_param_0 - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0]; - ; CHECK: st.param.b64 [func_retval0+0], [[RD1]]; - ; CHECK-NEXT: ret; - %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0 - ret [1 x i64] %oldret -} - - -define dso_local void @caller_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St8x2( - ; CHECK: .param .align 8 .b8 caller_St8x2_param_0[16], - ; CHECK: .param .b32 caller_St8x2_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[16]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x2, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; - %call = tail call fastcc [2 x i64] @callee_St8x2(%struct.St8x2* noundef nonnull byval(%struct.St8x2) align 8 %in) - %.fca.0.extract = extractvalue [2 x i64] %call, 0 - %.fca.1.extract = extractvalue [2 x i64] %call, 1 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0 - store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 - %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1 - store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8 - ret void -} - - -define internal fastcc [2 x i64] @callee_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) - ; CHECK-LABEL: callee_St8x2( - ; CHECK-NEXT: .param .align 16 .b8 callee_St8x2_param_0[16] - ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1 - %2 = load i64, i64* %arrayidx.1, align 8 - %3 = insertvalue [2 x i64] poison, i64 %1, 0 - %oldret = insertvalue [2 x i64] %3, i64 %2, 1 - ret [2 x i64] %oldret -} - - -define dso_local void @caller_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St8x3( - ; CHECK: .param .align 8 .b8 caller_St8x3_param_0[24], - ; CHECK: .param .b32 caller_St8x3_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[24]; - ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[24]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x3, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16]; - %call = tail call fastcc [3 x i64] @callee_St8x3(%struct.St8x3* noundef nonnull byval(%struct.St8x3) align 8 %in) - %.fca.0.extract = extractvalue [3 x i64] %call, 0 - %.fca.1.extract = extractvalue [3 x i64] %call, 1 - %.fca.2.extract = extractvalue [3 x i64] %call, 2 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0 - store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 - %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1 - store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx2, align 8 - %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2 - store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx4, align 8 - ret void -} - - -define internal fastcc [3 x i64] @callee_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) - ; CHECK-LABEL: callee_St8x3( - ; CHECK-NEXT: .param .align 16 .b8 callee_St8x3_param_0[24] - ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0]; - ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; - ; CHECK: st.param.b64 [func_retval0+16], [[RD3]]; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1 - %2 = load i64, i64* %arrayidx.1, align 8 - %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2 - %3 = load i64, i64* %arrayidx.2, align 8 - %4 = insertvalue [3 x i64] poison, i64 %1, 0 - %5 = insertvalue [3 x i64] %4, i64 %2, 1 - %oldret = insertvalue [3 x i64] %5, i64 %3, 2 - ret [3 x i64] %oldret -} - - -define dso_local void @caller_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func caller_St8x4( - ; CHECK: .param .align 8 .b8 caller_St8x4_param_0[32], - ; CHECK: .param .b32 caller_St8x4_param_1 - ; CHECK: ) - ; CHECK: .param .align 16 .b8 param0[32]; - ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[32]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x4, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16]; - %call = tail call fastcc [4 x i64] @callee_St8x4(%struct.St8x4* noundef nonnull byval(%struct.St8x4) align 8 %in) - %.fca.0.extract = extractvalue [4 x i64] %call, 0 - %.fca.1.extract = extractvalue [4 x i64] %call, 1 - %.fca.2.extract = extractvalue [4 x i64] %call, 2 - %.fca.3.extract = extractvalue [4 x i64] %call, 3 - %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0 - store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 - %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1 - store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8 - %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2 - store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx5, align 8 - %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3 - store i64 %.fca.3.extract, i64* %ref.tmp.sroa.6.0..sroa_idx7, align 8 - ret void -} - - -define internal fastcc [4 x i64] @callee_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) - ; CHECK-LABEL: callee_St8x4( - ; CHECK-NEXT: .param .align 16 .b8 callee_St8x4_param_0[32] - ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0]; - ; CHECK: ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; - ; CHECK: st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1 - %2 = load i64, i64* %arrayidx.1, align 8 - %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2 - %3 = load i64, i64* %arrayidx.2, align 8 - %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3 - %4 = load i64, i64* %arrayidx.3, align 8 - %5 = insertvalue [4 x i64] poison, i64 %1, 0 - %6 = insertvalue [4 x i64] %5, i64 %2, 1 - %7 = insertvalue [4 x i64] %6, i64 %3, 2 - %oldret = insertvalue [4 x i64] %7, i64 %4, 3 - ret [4 x i64] %oldret -} - -; Section 2 - checking that function argument (including retval) vectorization is done with private linkage. - -define private fastcc [4 x i32] @callee_St4x4_private(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { - ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) - ; CHECK-LABEL: callee_St4x4_private( - ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_private_param_0[16] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %5 = insertvalue [4 x i32] poison, i32 %1, 0 - %6 = insertvalue [4 x i32] %5, i32 %2, 1 - %7 = insertvalue [4 x i32] %6, i32 %3, 2 - %oldret = insertvalue [4 x i32] %7, i32 %4, 3 - ret [4 x i32] %oldret -} - -; Section 3 - checking that function argument (including retval) vectorization -; is NOT done with linkage types other than internal and private. - -define external fastcc [4 x i32] @callee_St4x4_external(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { - ; CHECK: .func (.param .align 4 .b8 func_retval0[16]) - ; CHECK-LABEL: callee_St4x4_external( - ; CHECK-NEXT: .param .align 4 .b8 callee_St4x4_external_param_0[16] - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x4_external_param_0]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12]; - ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; - ; CHECK: st.param.b32 [func_retval0+4], [[R2]]; - ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; - ; CHECK: st.param.b32 [func_retval0+12], [[R4]]; - ; CHECK-NEXT: ret; - %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %5 = insertvalue [4 x i32] poison, i32 %1, 0 - %6 = insertvalue [4 x i32] %5, i32 %2, 1 - %7 = insertvalue [4 x i32] %6, i32 %3, 2 - %oldret = insertvalue [4 x i32] %7, i32 %4, 3 - ret [4 x i32] %oldret -} diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll deleted file mode 100644 index 4a1ed8f4dcda77..00000000000000 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll +++ /dev/null @@ -1,456 +0,0 @@ -; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s -; -; Check that parameters of a __global__ (kernel) function do not get increased -; alignment, and no additional vectorization is performed on loads/stores with -; that parameters. -; -; Test IR is a minimized version of IR generated with the following command -; from the source code below: -; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu -; -; ---------------------------------------------------------------------------- -; #include -; -; struct St4x1 { uint32_t field[1]; }; -; struct St4x2 { uint32_t field[2]; }; -; struct St4x3 { uint32_t field[3]; }; -; struct St4x4 { uint32_t field[4]; }; -; struct St4x5 { uint32_t field[5]; }; -; struct St4x6 { uint32_t field[6]; }; -; struct St4x7 { uint32_t field[7]; }; -; struct St4x8 { uint32_t field[8]; }; -; struct St8x1 { uint64_t field[1]; }; -; struct St8x2 { uint64_t field[2]; }; -; struct St8x3 { uint64_t field[3]; }; -; struct St8x4 { uint64_t field[4]; }; -; -; #define DECLARE_FUNCTION(StName) \ -; static __global__ __attribute__((noinline)) \ -; void foo_##StName(struct StName in, struct StName* ret) { \ -; const unsigned size = sizeof(ret->field) / sizeof(*ret->field); \ -; for (unsigned i = 0; i != size; ++i) \ -; ret->field[i] = in.field[i]; \ -; } \ -; -; DECLARE_FUNCTION(St4x1) -; DECLARE_FUNCTION(St4x2) -; DECLARE_FUNCTION(St4x3) -; DECLARE_FUNCTION(St4x4) -; DECLARE_FUNCTION(St4x5) -; DECLARE_FUNCTION(St4x6) -; DECLARE_FUNCTION(St4x7) -; DECLARE_FUNCTION(St4x8) -; DECLARE_FUNCTION(St8x1) -; DECLARE_FUNCTION(St8x2) -; DECLARE_FUNCTION(St8x3) -; DECLARE_FUNCTION(St8x4) -; ---------------------------------------------------------------------------- - -%struct.St4x1 = type { [1 x i32] } -%struct.St4x2 = type { [2 x i32] } -%struct.St4x3 = type { [3 x i32] } -%struct.St4x4 = type { [4 x i32] } -%struct.St4x5 = type { [5 x i32] } -%struct.St4x6 = type { [6 x i32] } -%struct.St4x7 = type { [7 x i32] } -%struct.St4x8 = type { [8 x i32] } -%struct.St8x1 = type { [1 x i64] } -%struct.St8x2 = type { [2 x i64] } -%struct.St8x3 = type { [3 x i64] } -%struct.St8x4 = type { [4 x i64] } - -define dso_local void @foo_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x1( - ; CHECK: .param .align 4 .b8 foo_St4x1_param_0[4], - ; CHECK: .param .b32 foo_St4x1_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x1_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - ret void -} - -define dso_local void @foo_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x2( - ; CHECK: .param .align 4 .b8 foo_St4x2_param_0[8], - ; CHECK: .param .b32 foo_St4x2_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x2_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - ret void -} - -define dso_local void @foo_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x3( - ; CHECK: .param .align 4 .b8 foo_St4x3_param_0[12], - ; CHECK: .param .b32 foo_St4x3_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x3_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx3.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2 - store i32 %3, i32* %arrayidx3.2, align 4 - ret void -} - -define dso_local void @foo_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x4( - ; CHECK: .param .align 4 .b8 foo_St4x4_param_0[16], - ; CHECK: .param .b32 foo_St4x4_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x4_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx3.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2 - store i32 %3, i32* %arrayidx3.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx3.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3 - store i32 %4, i32* %arrayidx3.3, align 4 - ret void -} - -define dso_local void @foo_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x5( - ; CHECK: .param .align 4 .b8 foo_St4x5_param_0[20], - ; CHECK: .param .b32 foo_St4x5_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x5_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx3.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2 - store i32 %3, i32* %arrayidx3.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx3.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3 - store i32 %4, i32* %arrayidx3.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx3.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4 - store i32 %5, i32* %arrayidx3.4, align 4 - ret void -} - -define dso_local void @foo_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x6( - ; CHECK: .param .align 4 .b8 foo_St4x6_param_0[24], - ; CHECK: .param .b32 foo_St4x6_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x6_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20]; - ; CHECK: st.u32 [[[R1]]+20], [[R7]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx3.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2 - store i32 %3, i32* %arrayidx3.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx3.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3 - store i32 %4, i32* %arrayidx3.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx3.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4 - store i32 %5, i32* %arrayidx3.4, align 4 - %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5 - %6 = load i32, i32* %arrayidx.5, align 4 - %arrayidx3.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5 - store i32 %6, i32* %arrayidx3.5, align 4 - ret void -} - -define dso_local void @foo_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x7( - ; CHECK: .param .align 4 .b8 foo_St4x7_param_0[28], - ; CHECK: .param .b32 foo_St4x7_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x7_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20]; - ; CHECK: st.u32 [[[R1]]+20], [[R7]]; - ; CHECK: ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24]; - ; CHECK: st.u32 [[[R1]]+24], [[R8]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx3.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2 - store i32 %3, i32* %arrayidx3.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx3.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3 - store i32 %4, i32* %arrayidx3.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx3.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4 - store i32 %5, i32* %arrayidx3.4, align 4 - %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5 - %6 = load i32, i32* %arrayidx.5, align 4 - %arrayidx3.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5 - store i32 %6, i32* %arrayidx3.5, align 4 - %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6 - %7 = load i32, i32* %arrayidx.6, align 4 - %arrayidx3.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6 - store i32 %7, i32* %arrayidx3.6, align 4 - ret void -} - -define dso_local void @foo_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St4x8( - ; CHECK: .param .align 4 .b8 foo_St4x8_param_0[32], - ; CHECK: .param .b32 foo_St4x8_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x8_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20]; - ; CHECK: st.u32 [[[R1]]+20], [[R7]]; - ; CHECK: ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24]; - ; CHECK: st.u32 [[[R1]]+24], [[R8]]; - ; CHECK: ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28]; - ; CHECK: st.u32 [[[R1]]+28], [[R9]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0 - %1 = load i32, i32* %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0 - store i32 %1, i32* %arrayidx3, align 4 - %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1 - %2 = load i32, i32* %arrayidx.1, align 4 - %arrayidx3.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1 - store i32 %2, i32* %arrayidx3.1, align 4 - %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2 - %3 = load i32, i32* %arrayidx.2, align 4 - %arrayidx3.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2 - store i32 %3, i32* %arrayidx3.2, align 4 - %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3 - %4 = load i32, i32* %arrayidx.3, align 4 - %arrayidx3.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3 - store i32 %4, i32* %arrayidx3.3, align 4 - %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4 - %5 = load i32, i32* %arrayidx.4, align 4 - %arrayidx3.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4 - store i32 %5, i32* %arrayidx3.4, align 4 - %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5 - %6 = load i32, i32* %arrayidx.5, align 4 - %arrayidx3.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5 - store i32 %6, i32* %arrayidx3.5, align 4 - %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6 - %7 = load i32, i32* %arrayidx.6, align 4 - %arrayidx3.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6 - store i32 %7, i32* %arrayidx3.6, align 4 - %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7 - %8 = load i32, i32* %arrayidx.7, align 4 - %arrayidx3.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7 - store i32 %8, i32* %arrayidx3.7, align 4 - ret void -} - -define dso_local void @foo_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St8x1( - ; CHECK: .param .align 8 .b8 foo_St8x1_param_0[8], - ; CHECK: .param .b32 foo_St8x1_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x1_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx3 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0 - store i64 %1, i64* %arrayidx3, align 8 - ret void -} - -define dso_local void @foo_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St8x2( - ; CHECK: .param .align 8 .b8 foo_St8x2_param_0[16], - ; CHECK: .param .b32 foo_St8x2_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x2_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8]; - ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0 - store i64 %1, i64* %arrayidx3, align 8 - %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1 - %2 = load i64, i64* %arrayidx.1, align 8 - %arrayidx3.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1 - store i64 %2, i64* %arrayidx3.1, align 8 - ret void -} - -define dso_local void @foo_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St8x3( - ; CHECK: .param .align 8 .b8 foo_St8x3_param_0[24], - ; CHECK: .param .b32 foo_St8x3_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x3_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8]; - ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; - ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16]; - ; CHECK: st.u64 [[[R1]]+16], [[RD3]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx3 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0 - store i64 %1, i64* %arrayidx3, align 8 - %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1 - %2 = load i64, i64* %arrayidx.1, align 8 - %arrayidx3.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1 - store i64 %2, i64* %arrayidx3.1, align 8 - %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2 - %3 = load i64, i64* %arrayidx.2, align 8 - %arrayidx3.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2 - store i64 %3, i64* %arrayidx3.2, align 8 - ret void -} - -define dso_local void @foo_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) { - ; CHECK-LABEL: .visible .func foo_St8x4( - ; CHECK: .param .align 8 .b8 foo_St8x4_param_0[32], - ; CHECK: .param .b32 foo_St8x4_param_1 - ; CHECK: ) - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x4_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8]; - ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; - ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16]; - ; CHECK: st.u64 [[[R1]]+16], [[RD3]]; - ; CHECK: ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24]; - ; CHECK: st.u64 [[[R1]]+24], [[RD4]]; - ; CHECK: ret; - %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0 - %1 = load i64, i64* %arrayidx, align 8 - %arrayidx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0 - store i64 %1, i64* %arrayidx3, align 8 - %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1 - %2 = load i64, i64* %arrayidx.1, align 8 - %arrayidx3.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1 - store i64 %2, i64* %arrayidx3.1, align 8 - %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2 - %3 = load i64, i64* %arrayidx.2, align 8 - %arrayidx3.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2 - store i64 %3, i64* %arrayidx3.2, align 8 - %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3 - %4 = load i64, i64* %arrayidx.3, align 8 - %arrayidx3.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3 - store i64 %4, i64* %arrayidx3.3, align 8 - ret void -}