diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index d4e72a60fc1ea..8924b8b1e6e54 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -59,7 +59,8 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove,
                                   const TargetTransformInfo &TTI);

 /// Expand \p MemSet as a loop. \p MemSet is not deleted.
-LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet);
+LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
+                                 const TargetTransformInfo &TTI);

 /// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
 LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index d738dc4eea36d..88e2bb81f9e3b 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -369,7 +369,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
           canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET))
         break;

-      expandMemSetAsLoop(Memset);
+      expandMemSetAsLoop(Memset, TTI);
       Changed = true;
       Memset->eraseFromParent();
     }
@@ -384,7 +384,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
       if (isa<ConstantInt>(Memset->getLength()))
         break;

-      expandMemSetAsLoop(Memset);
+      Function *ParentFunc = Memset->getFunction();
+      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+      expandMemSetAsLoop(Memset, TTI);
       Changed = true;
       Memset->eraseFromParent();
       break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index fdff21b6ef8df..76f1e006bbf74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
     MemSetInst &MSI) {
   if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
     return false;
-  llvm::expandMemSetAsLoop(&MSI);
+  llvm::expandMemSetAsLoop(&MSI,
+                           TM->getTargetTransformInfo(*MSI.getFunction()));
   MSI.eraseFromParent();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..5a68dca1b10b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB(
 static cl::opt<unsigned> MemcpyLoopUnroll(
     "amdgpu-memcpy-loop-unroll",
     cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
-             "operations when lowering memcpy as a loop"),
+             "operations when lowering statically-sized memcpy, memmove, or "
+             "memset as a loop"),
     cl::init(16), cl::Hidden);

 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index ac6f4061b9f1f..d0b50d2610bd5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
     } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
       expandMemMoveAsLoop(Memmove, TTI);
     } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
-      expandMemSetAsLoop(Memset);
+      expandMemSetAsLoop(Memset, TTI);
     }
     MemCall->eraseFromParent();
   }
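Note: expandMemSetAsLoop() now requires a TargetTransformInfo, so every caller has to supply one, as the hunks above do through LookupTTI or TargetMachine::getTargetTransformInfo. A minimal migration sketch for an out-of-tree caller follows; the helper name and the availability of a TargetMachine pointer are assumptions for the example, not part of this patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

// Hypothetical helper: expand one memset intrinsic using the TTI of its
// enclosing function, then drop the original call.
static void lowerOneMemSet(MemSetInst *MSI, const TargetMachine *TM) {
  const TargetTransformInfo TTI =
      TM->getTargetTransformInfo(*MSI->getFunction());
  expandMemSetAsLoop(MSI, TTI);
  MSI->eraseFromParent();
}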
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index be88f334d2171..8bee6da75cc75 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -23,6 +23,7 @@
 #include "SPIRVTargetMachine.h"
 #include "SPIRVUtils.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/IR/IRBuilder.h"
@@ -93,7 +94,8 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy,
   return NewF;
 }

-static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
+static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic,
+                                     const TargetTransformInfo &TTI) {
   // For @llvm.memset.* intrinsic cases with constant value and length arguments
   // are emulated via "storing" a constant array to the destination. For other
   // cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the
@@ -137,7 +139,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
     auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(),
                                     MSI->isVolatile());
     IRB.CreateRetVoid();
-    expandMemSetAsLoop(cast<MemSetInst>(MemSet));
+    expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI);
     MemSet->eraseFromParent();
     break;
   }
@@ -399,6 +401,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
   bool Changed = false;
   const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
   SmallVector<Instruction *> EraseFromParent;
+  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(*F);
   for (BasicBlock &BB : *F) {
     for (Instruction &I : make_early_inc_range(BB)) {
       auto Call = dyn_cast<CallInst>(&I);
@@ -411,7 +414,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
       switch (II->getIntrinsicID()) {
       case Intrinsic::memset:
       case Intrinsic::bswap:
-        Changed |= lowerIntrinsicToFunction(II);
+        Changed |= lowerIntrinsicToFunction(II, TTI);
         break;
       case Intrinsic::fshl:
       case Intrinsic::fshr:
@@ -459,7 +462,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
               return false;
             return II->getCalledFunction()->getName().starts_with(Prefix);
           }))
-        Changed |= lowerIntrinsicToFunction(II);
+        Changed |= lowerIntrinsicToFunction(II, TTI);
         break;
     }
   }
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 88e5d038bff82..07bfceb99d206 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -930,9 +930,187 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
   }
 }

+/// Create a Value of \p DstType that consists of a sequence of copies of
+/// \p SetValue, using bitcasts and a vector splat.
+static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
+                                Value *SetValue, Type *DstType) {
+  unsigned DstSize = DL.getTypeStoreSize(DstType);
+  Type *SetValueType = SetValue->getType();
+  unsigned SetValueSize = DL.getTypeStoreSize(SetValueType);
+  assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
+         "Store size and alloc size of SetValue's type must match");
+  assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
+         "DstType size must be a multiple of SetValue size");
+
+  Value *Result = SetValue;
+  if (DstSize != SetValueSize) {
+    if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
+      // If the type cannot be put into a vector, bitcast to iN first.
+      LLVMContext &Ctx = SetValue->getContext();
+      Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8),
+                               "setvalue.toint");
+    }
+    // Form a sufficiently large vector consisting of SetValue, repeated.
+    Result =
+        B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat");
+  }
+
+  // The value has the right size, but we might have to bitcast it to the right
+  // type.
+  if (Result->getType() != DstType) {
+    Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast");
+  }
+  return Result;
+}
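Illustration (not part of the patch): for a configuration where the loop access type is <4 x i32> and the set value is an i8, createMemSetSplat() amounts to the IRBuilder sequence below. The lane count and types are assumptions chosen for the example.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Splat an i8 into a <4 x i32>-typed value: 16 bytes / 1 byte = 16 lanes,
// then reinterpret the <16 x i8> splat as the wider loop access type.
static Value *splatByteToI32x4(IRBuilderBase &B, Value *ByteVal) {
  Value *Splat = B.CreateVectorSplat(16, ByteVal, "setvalue.splat");
  Type *I32x4 = FixedVectorType::get(Type::getInt32Ty(B.getContext()), 4);
  return B.CreateBitCast(Splat, I32x4, "setvalue.splat.cast");
}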
+
+static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
+                                      ConstantInt *Len, Value *SetValue,
+                                      Align DstAlign, bool IsVolatile,
+                                      const TargetTransformInfo &TTI) {
+  // No need to expand zero length memsets.
+  if (Len->isZero())
+    return;
+
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  Function *ParentFunc = PreLoopBB->getParent();
+  const DataLayout &DL = ParentFunc->getDataLayout();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *TypeOfLen = Len->getType();
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+  // Use the same memory access type as for a memcpy with the same Dst and Src
+  // alignment and address space.
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+      Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+  uint64_t LoopEndCount = alignDown(Len->getZExtValue(), LoopOpSize);
+
+  if (LoopEndCount != 0) {
+    Value *SplatSetValue = nullptr;
+    {
+      IRBuilder<> PreLoopBuilder(InsertBefore);
+      SplatSetValue =
+          createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+    }
+
+    // Don't generate a residual loop, the remaining bytes are set with
+    // straight-line code.
+    LoopExpansionInfo LEI =
+        insertLoopExpansion(InsertBefore, Len, LoopOpSize, 0, "static-memset");
+
+    // Fill MainLoopBB
+    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+    Value *DstGEP =
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+
+    MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+                                       IsVolatile);
+
+    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
+           "No residual loop was requested");
+  }
+
+  uint64_t BytesSet = LoopEndCount;
+  uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
+  if (RemainingBytes == 0)
+    return;
+
+  IRBuilder<> RBuilder(InsertBefore);
+
+  SmallVector<Type *, 5> RemainingOps;
+  TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                        DstAS, DstAS, DstAlign, DstAlign,
+                                        std::nullopt);
+
+  Type *PreviousOpTy = nullptr;
+  Value *SplatSetValue = nullptr;
+  for (auto *OpTy : RemainingOps) {
+    unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+    Align PartDstAlign(commonAlignment(DstAlign, BytesSet));
+
+    // Avoid recomputing the splat SetValue if it's the same as for the last
+    // iteration.
+    if (OpTy != PreviousOpTy)
+      SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);
+
+    Value *DstGEP = RBuilder.CreateInBoundsGEP(
+        Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
+    RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+                                IsVolatile);
+    BytesSet += OperandSize;
+    PreviousOpTy = OpTy;
+  }
+  assert(BytesSet == Len->getZExtValue() &&
+         "Bytes set should match size in the call!");
+}
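The known-size path splits the byte count once, at compile time: alignDown(Len, LoopOpSize) bytes go through the main loop and the rest is finished with straight-line residual stores. A standalone sketch of that arithmetic; the numbers in the comments match the 36-byte GlobalISel test further down, where the unrolled access covers 32 bytes per iteration.

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Compile-time split used by createMemSetLoopKnownSize().
static void splitKnownSize(uint64_t Len, uint64_t LoopOpSize,
                           uint64_t &LoopBytes, uint64_t &ResidualBytes) {
  LoopBytes = llvm::alignDown(Len, LoopOpSize); // e.g. alignDown(36, 32) == 32
  ResidualBytes = Len - LoopBytes;              // e.g. 36 - 32 == 4
}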
+
+static void createMemSetLoopUnknownSize(Instruction *InsertBefore,
+                                        Value *DstAddr, Value *Len,
+                                        Value *SetValue, Align DstAlign,
+                                        bool IsVolatile,
+                                        const TargetTransformInfo &TTI) {
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  Function *ParentFunc = PreLoopBB->getParent();
+  const DataLayout &DL = ParentFunc->getDataLayout();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+      Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+  Type *ResidualLoopOpType = Int8Type;
+  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+  Value *SplatSetValue = SetValue;
+  {
+    IRBuilder<> PreLoopBuilder(InsertBefore);
+    SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+  }
+
+  LoopExpansionInfo LEI = insertLoopExpansion(
+      InsertBefore, Len, LoopOpSize, ResidualLoopOpSize, "dynamic-memset");
+
+  // Fill MainLoopBB
+  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+  Value *DstGEP =
+      MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+  MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+                                     IsVolatile);
+
+  // Fill ResidualLoopBB
+  if (!LEI.ResidualLoopIP)
+    return;
+
+  Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
+
+  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
+
+  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+                                                      LEI.ResidualLoopIndex);
+  ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
+                                    IsVolatile);
+}
+
 static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                              Value *CopyLen, Value *SetValue, Align DstAlign,
                              bool IsVolatile) {
+  // Currently no longer used for memset, only for memset.pattern.
+  // TODO: Update the memset.pattern lowering to also use the loop expansion
+  // framework and remove this function.
   Type *TypeOfCopyLen = CopyLen->getType();
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
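For a dynamic length the same split happens at run time through the control flow emitted by insertLoopExpansion(), which is not part of this diff. The helper below is illustrative only, showing the trip counts the expansion is built around; the real code works on IR values rather than plain integers.

#include <cstdint>

// The main loop issues Len / LoopOpSize wide stores of the splatted value;
// the residual loop issues Len % LoopOpSize single-byte stores of SetValue.
static void splitUnknownSize(uint64_t Len, uint64_t LoopOpSize,
                             uint64_t &MainIterations,
                             uint64_t &ResidualIterations) {
  MainIterations = Len / LoopOpSize;
  ResidualIterations = Len % LoopOpSize;
}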
@@ -1067,13 +1245,25 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
   return true;
 }

-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
-  createMemSetLoop(/* InsertBefore */ Memset,
-                   /* DstAddr */ Memset->getRawDest(),
-                   /* CopyLen */ Memset->getLength(),
-                   /* SetValue */ Memset->getValue(),
-                   /* Alignment */ Memset->getDestAlign().valueOrOne(),
-                   Memset->isVolatile());
+void llvm::expandMemSetAsLoop(MemSetInst *Memset,
+                              const TargetTransformInfo &TTI) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) {
+    createMemSetLoopKnownSize(
+        /* InsertBefore */ Memset,
+        /* DstAddr */ Memset->getRawDest(),
+        /* Len */ CI,
+        /* SetValue */ Memset->getValue(),
+        /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+        Memset->isVolatile(), TTI);
+  } else {
+    createMemSetLoopUnknownSize(
+        /* InsertBefore */ Memset,
+        /* DstAddr */ Memset->getRawDest(),
+        /* Len */ Memset->getLength(),
+        /* SetValue */ Memset->getValue(),
+        /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+        Memset->isVolatile(), TTI);
+  }
 }

 void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
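Usage sketch mirroring the SPIR-V wrapper earlier in this patch (the helper name is invented): build a memset with IRBuilder, expand it through the new entry point, then erase the intrinsic call.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

static void emitExpandedMemSet(IRBuilderBase &IRB, Value *Dst, Value *Val,
                               Value *Len, MaybeAlign DstAlign,
                               const TargetTransformInfo &TTI) {
  // CreateMemSet returns the intrinsic call; the expansion inserts the loop
  // (or straight-line stores) next to it, after which the call is dead.
  CallInst *MemSet = IRB.CreateMemSet(Dst, Val, Len, DstAlign);
  expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI);
  MemSet->eraseFromParent();
}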
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 04652af147f9b..4d35f3198bc0a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,27 +1,87 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s

 declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)

 define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
 ; LOOP-LABEL: memset_p1i8:
-; LOOP:       ; %bb.0: ; %loadstoreloop.preheader
+; LOOP:       ; %bb.0:
+; LOOP-NEXT:    v_and_b32_e32 v3, 0xff, v2
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
+; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 8, v3
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 24, v3
+; LOOP-NEXT:    v_or_b32_e32 v3, v3, v4
+; LOOP-NEXT:    v_or_b32_e32 v3, v3, v5
+; LOOP-NEXT:    v_or_b32_e32 v5, v3, v6
+; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; LOOP-NEXT:    v_bfe_u32 v7, v5, 8, 8
+; LOOP-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
 ; LOOP-NEXT:    v_mov_b32_e32 v4, s1
 ; LOOP-NEXT:    v_mov_b32_e32 v3, s0
-; LOOP-NEXT:  .LBB0_1: ; %loadstoreloop
+; LOOP-NEXT:  .LBB0_1: ; %static-memset-expansion-main-body
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    v_add_i32_e32 v5, vcc, v0, v3
-; LOOP-NEXT:    v_addc_u32_e32 v6, vcc, v1, v4, vcc
-; LOOP-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
+; LOOP-NEXT:    v_add_i32_e32 v9, vcc, v0, v3
+; LOOP-NEXT:    v_addc_u32_e32 v10, vcc, v1, v4, vcc
+; LOOP-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
 ; LOOP-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v3
-; LOOP-NEXT:    buffer_store_byte v2, v[5:6], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v3
+; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:31
 ; LOOP-NEXT:    s_cbranch_vccnz .LBB0_1
-; LOOP-NEXT:  ; %bb.2: ; %split
+; LOOP-NEXT:  ; %bb.2: ; %static-memset-post-expansion
+; LOOP-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; LOOP-NEXT:    s_mov_b32 s2, 0
+; LOOP-NEXT:    s_mov_b32 s3, 0xf000
+; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
+; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; LOOP-NEXT:    s_waitcnt expcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 24, v2
+; LOOP-NEXT:    v_or_b32_e32 v2, v2, v3
+; LOOP-NEXT:    v_or_b32_e32 v2, v2, v4
+; LOOP-NEXT:    v_or_b32_e32 v2, v2, v5
+; LOOP-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:34
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
 ; LOOP-NEXT:    s_endpgm
 ;
 ; UNROLL-LABEL: memset_p1i8:
@@ -33,7 +93,39 @@ define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
 ; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:1
 ; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:2
 ; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:4
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:5
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:6
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:7
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:8
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:9
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:10
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:11
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:12
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:13
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:14
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:15
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:16
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:17
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:18
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:20
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:21
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:22
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:23
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:24
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:25
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:26
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:27
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:28
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:29
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:30
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:31
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:33
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:34
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
 ; UNROLL-NEXT:    s_endpgm
-  call void @llvm.memset.p1.i32(ptr addrspace(1) %dst, i8 %val, i32 4, i1 false)
+  call void @llvm.memset.p1.i32(ptr addrspace(1) %dst, i8 %val, i32 36, i1 false)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 5f0ca7bc42ae0..097d36b190299 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -28,31 +28,179 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; MUBUF-NEXT:    s_mov_b32 s4, 0
 ; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:  .LBB0_1: ; %loadstoreloop
+; MUBUF-NEXT:  .LBB0_1: ; %static-memset-expansion-main-body
 ; MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; MUBUF-NEXT:    v_mov_b32_e32 v3, 0x3000
 ; MUBUF-NEXT:    v_add_u32_e32 v2, s4, v3
-; MUBUF-NEXT:    s_add_i32 s4, s4, 1
-; MUBUF-NEXT:    s_cmpk_lt_u32 s4, 0x2120
-; MUBUF-NEXT:    buffer_store_byte v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT:    s_addk_i32 s4, 0x100
+; MUBUF-NEXT:    s_cmpk_lt_u32 s4, 0x2100
+;
MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:252 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:248 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:244 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:240 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:236 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:232 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:228 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:224 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:220 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:216 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:212 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:208 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:204 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:200 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:196 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:192 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:188 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:184 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:180 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:176 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:172 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:168 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:164 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:160 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:156 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:152 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:148 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:144 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:140 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:136 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:132 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:128 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:124 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:120 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen offset:116 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:112 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:108 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:104 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:100 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:96 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:92 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:88 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:84 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:80 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:76 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:72 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:68 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:64 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:60 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:56 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:52 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:48 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:44 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:40 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:36 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:28 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:24 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:20 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1 -; MUBUF-NEXT: ; %bb.2: ; %split -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x50d0 -; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: ; %bb.2: ; %static-memset-post-expansion +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x5100 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: s_movk_i32 s4, 0x2110 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000 +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:8 +; 
MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v1, s4, v3 +; MUBUF-NEXT: s_movk_i32 s4, 0x20d0 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000 +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v1, s4, v3 +; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; MUBUF-NEXT: v_mov_b32_e32 v6, 0 -; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 -; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v3, v5 +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v6, vcc ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; @@ -65,20 +213,69 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop +; FLATSCR-NEXT: s_mov_b32 s1, s0 +; FLATSCR-NEXT: s_mov_b32 s2, s0 +; FLATSCR-NEXT: s_mov_b32 s3, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s3 +; FLATSCR-NEXT: .LBB0_1: ; %static-memset-expansion-main-body ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_addk_i32 s0, 0x100 +; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2100 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:240 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:224 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:208 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:192 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:176 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:160 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:144 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:128 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) 
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:112 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:96 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:80 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:64 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:48 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 -; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: ; %bb.2: ; %static-memset-post-expansion ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc +; FLATSCR-NEXT: s_add_i32 s6, s0, 0x3000 +; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: s_mov_b32 s1, s0 +; FLATSCR-NEXT: s_mov_b32 s2, s0 +; FLATSCR-NEXT: s_mov_b32 s3, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s3 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:256 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:272 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s6 offset:208 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -121,20 +318,173 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 ; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop +; MUBUF-NEXT: .LBB1_1: ; %static-memset-expansion-main-body ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_lshrrev_b32_e64 v5, 6, s33 ; MUBUF-NEXT: v_add_u32_e32 v4, s4, v5 ; MUBUF-NEXT: v_mov_b32_e32 v5, 0x3000 -; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: s_addk_i32 s4, 0x100 ; MUBUF-NEXT: v_add_u32_e32 v4, v5, v4 -; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; MUBUF-NEXT: buffer_store_byte v3, v4, s[0:3], 0 offen +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2100 +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:252 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:248 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:244 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:240 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:236 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:232 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:228 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:224 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:220 +; 
MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:216 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:212 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:208 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:204 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:200 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:196 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:192 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:188 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:184 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:180 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:176 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:172 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:168 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:164 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:160 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:156 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:152 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:148 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:144 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:140 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:136 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:132 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:128 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:124 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:120 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:116 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:112 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:108 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:104 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:100 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:96 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:92 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:88 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:84 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: 
buffer_store_dword v3, v4, s[0:3], 0 offen offset:80 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:76 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:72 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:68 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:64 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:60 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:56 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:52 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:48 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:44 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:40 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:36 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:28 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:24 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:20 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB1_1 -; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: ; %bb.2: ; %static-memset-post-expansion ; MUBUF-NEXT: v_lshrrev_b32_e64 v4, 6, s33 -; MUBUF-NEXT: v_add_u32_e32 v3, 0x50d0, v4 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x5100, v4 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0 +; MUBUF-NEXT: s_movk_i32 s4, 0x2110 +; MUBUF-NEXT: v_lshrrev_b32_e64 v5, 6, s33 +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v3, s4, v5 +; MUBUF-NEXT: v_mov_b32_e32 v5, 0x3000 +; MUBUF-NEXT: v_add_u32_e32 v3, v5, v3 +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_movk_i32 s4, 0x20d0 +; MUBUF-NEXT: v_lshrrev_b32_e64 v4, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, s4, v4 +; MUBUF-NEXT: v_mov_b32_e32 v4, 
0x3000 +; MUBUF-NEXT: v_add_u32_e32 v3, v4, v3 ; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc @@ -155,10 +505,10 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-LABEL: func_local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s2, s33 +; FLATSCR-NEXT: s_mov_b32 s5, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 -; FLATSCR-NEXT: s_mov_b32 s3, s34 +; FLATSCR-NEXT: s_mov_b32 s6, s34 ; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -166,27 +516,76 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: scratch_store_dword off, v2, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop +; FLATSCR-NEXT: s_mov_b32 s3, s0 +; FLATSCR-NEXT: s_mov_b32 s1, s0 +; FLATSCR-NEXT: s_mov_b32 s2, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v5, s3 +; FLATSCR-NEXT: v_mov_b32_e32 v4, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s0 +; FLATSCR-NEXT: .LBB1_1: ; %static-memset-expansion-main-body ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_i32 s1, s33, s0 ; FLATSCR-NEXT: s_addk_i32 s1, 0x3000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v2, s1 +; FLATSCR-NEXT: s_addk_i32 s0, 0x100 +; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2100 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:240 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:224 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:208 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:192 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:176 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:160 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:144 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:128 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:112 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:96 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:80 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:64 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:48 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 -; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: ; %bb.2: ; %static-memset-post-expansion ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s1, s33, s0 -; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000 -; 
FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc +; FLATSCR-NEXT: s_add_i32 s4, s33, s0 +; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: s_mov_b32 s3, s0 +; FLATSCR-NEXT: s_mov_b32 s1, s0 +; FLATSCR-NEXT: s_mov_b32 s2, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v5, s3 +; FLATSCR-NEXT: s_addk_i32 s4, 0x3000 +; FLATSCR-NEXT: v_mov_b32_e32 v4, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s0 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s4 offset:256 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s4 offset:272 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s4 offset:208 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s32, s34 -; FLATSCR-NEXT: s_mov_b32 s34, s3 -; FLATSCR-NEXT: s_mov_b32 s33, s2 +; FLATSCR-NEXT: s_mov_b32 s34, s6 +; FLATSCR-NEXT: s_mov_b32 s33, s5 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -216,76 +615,222 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop +; MUBUF-NEXT: .LBB2_1: ; %static-memset-expansion-main-body ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_add_u32_e32 v1, s4, v2 -; MUBUF-NEXT: s_add_i32 s4, s4, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; MUBUF-NEXT: buffer_store_byte v0, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_addk_i32 s4, 0x100 +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2100 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:252 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:248 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:244 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:240 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:236 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:232 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:228 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:224 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:220 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:216 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:212 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:208 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:204 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:200 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:196 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:192 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen offset:188 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:184 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:180 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:176 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:172 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:168 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:164 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:160 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:156 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:152 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:148 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:144 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:140 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:136 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:132 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:128 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:124 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:120 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:116 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:112 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:108 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:104 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:100 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:96 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:92 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:88 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:84 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:80 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:76 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:72 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:68 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:64 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:56 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:52 +; 
MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:48 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:44 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:40 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 -; MUBUF-NEXT: ; %bb.2: ; %split -; MUBUF-NEXT: s_movk_i32 s5, 0x12d4 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 -; MUBUF-NEXT: s_movk_i32 s5, 0x12d0 +; MUBUF-NEXT: ; %bb.2: ; %static-memset-post-expansion +; MUBUF-NEXT: v_mov_b32_e32 v0, 0x6100 +; MUBUF-NEXT: v_mov_b32_e32 v6, 0 +; MUBUF-NEXT: s_movk_i32 s4, 0x2110 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: s_movk_i32 s4, 0x4000 -; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 -; MUBUF-NEXT: s_movk_i32 s5, 0x12c4 +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v0, s4, v1 +; MUBUF-NEXT: s_movk_i32 s4, 0x12c0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0 -; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 -; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_or_b32_e32 v0, s4, v1 +; MUBUF-NEXT: s_movk_i32 s4, 0x12d4 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: s_movk_i32 s4, 0x12d0 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: s_movk_i32 s4, 0x12c4 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen 
glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 ; MUBUF-NEXT: s_movk_i32 s4, 0x12cc -; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v0, s4, v1 ; MUBUF-NEXT: s_movk_i32 s4, 0x12c8 -; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000 -; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6 -; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000 -; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000 -; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000 -; MUBUF-NEXT: buffer_load_dword v6, v7, s[0:3], 0 offen glc +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v11, 0x4000 -; MUBUF-NEXT: buffer_load_dword v7, v8, s[0:3], 0 offen offset:4 glc +; MUBUF-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000 -; MUBUF-NEXT: buffer_load_dword v8, v9, s[0:3], 0 offen offset:8 glc +; MUBUF-NEXT: buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; MUBUF-NEXT: buffer_load_dword v9, v10, s[0:3], 0 offen offset:12 glc +; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000 +; MUBUF-NEXT: buffer_load_dword v3, v11, s[0:3], 0 offen offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 -; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:16 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc -; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2 +; MUBUF-NEXT: buffer_load_dword v12, v13, s[0:3], 0 offen offset:20 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 -; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc -; MUBUF-NEXT: v_mov_b32_e32 v12, 0 -; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 -; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v8, v9 +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v10, vcc +; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v11 +; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v12, vcc ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: global_store_dwordx2 v12, v[4:5], s[4:5] offset:16 +; MUBUF-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] offset:16 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5] +; MUBUF-NEXT: global_store_dwordx4 v6, 
v[0:3], s[4:5] ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_endpgm ; @@ -297,16 +842,67 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop +; FLATSCR-NEXT: s_mov_b32 s1, s0 +; FLATSCR-NEXT: s_mov_b32 s2, s0 +; FLATSCR-NEXT: s_mov_b32 s3, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s3 +; FLATSCR-NEXT: .LBB2_1: ; %static-memset-expansion-main-body ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_addk_i32 s0, 0x100 +; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2100 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:240 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:224 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:208 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:192 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:176 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:160 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:144 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:128 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:112 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:96 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:80 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:64 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:48 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 -; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: ; %bb.2: ; %static-memset-post-expansion +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_add_i32 s6, s0, 0x2000 +; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: s_mov_b32 s1, s0 +; FLATSCR-NEXT: s_mov_b32 s2, s0 +; FLATSCR-NEXT: s_mov_b32 s3, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s3 ; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:256 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:272 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index 83d6f4f5882b4..a57a4a38b1cbd 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -1328,15 +1328,45 @@ define void @memset_known(ptr addrspace(7) inreg %ptr) {
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]]
+; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 256
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]:
 ; CHECK-NEXT: ret void
 ;
 call void @llvm.memset.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false)
@@ -1348,15 +1378,9 @@ define void @memset_known_small(ptr addrspace(7) inreg %ptr) {
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]],
label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false) @@ -1368,15 +1392,7 @@ define void @memset_known_byte(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false) @@ -1388,15 +1404,13 @@ define void @memset_known_tail(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 bitcast (<8 x i8> splat (i8 1) to i64), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 8 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 bitcast (<4 x i8> splat (i8 1) to i32), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i32 [[PTR_OFF]], 12 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 bitcast (<2 x i8> splat (i8 1) to i16), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[PTR_OFF]], 14 ; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false) @@ -1408,16 +1422,46 @@ define void @memset_known_i64(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: 
[[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x 
i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 256 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false) @@ -1429,15 +1473,9 @@ define void @memset_known_i32_volatile(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { 
ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 -2147483648) ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true) @@ -1449,16 +1487,29 @@ define void @memset_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %length) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] -; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[LENGTH]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND:.*]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) -; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]]: +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]]: +; 
CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], [[RESIDUAL_LOOP_INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[PTR_OFF]], [[TMP8]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP9]], i32 0, i32 0) +; CHECK-NEXT: [[TMP10]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false) @@ -1475,15 +1526,45 @@ define void @memset.inline_known(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> 
splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 16 
[[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 256 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], 
label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false) @@ -1495,15 +1576,9 @@ define void @memset.inline_known_small(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false) @@ -1515,15 +1590,7 @@ define void @memset.inline_known_byte(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false) @@ -1535,15 +1602,13 @@ define void @memset.inline_known_tail(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 bitcast (<8 x i8> 
splat (i8 1) to i64), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 8 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 bitcast (<4 x i8> splat (i8 1) to i32), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i32 [[PTR_OFF]], 12 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 bitcast (<2 x i8> splat (i8 1) to i16), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[PTR_OFF]], 14 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false) @@ -1555,16 +1620,46 @@ define void @memset.inline_known_i64(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) -; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to 
<64 x i32>), i32 11)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> 
splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 256 
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false) @@ -1576,15 +1671,9 @@ define void @memset.inline_known_i32_volatile(ptr addrspace(7) inreg %ptr) { ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 -; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 -2147483648) ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true) @@ -1596,16 +1685,29 @@ define void @memset.inline_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %lengt ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] -; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[LENGTH]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND:.*]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) -; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 
x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]]: +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], [[RESIDUAL_LOOP_INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[PTR_OFF]], [[TMP8]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP9]], i32 0, i32 0) +; CHECK-NEXT: [[TMP10]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll index cf3443ff33b72..3d73b55831cd0 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll @@ -21,15 +21,6 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) { ; OPT4-NEXT: ret void ; ; OPT0-LABEL: @memset_size_0( -; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT0: loadstoreloop: -; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0 -; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT0: split: ; OPT0-NEXT: ret void ; ; OPT_NEG-LABEL: @memset_size_0( @@ -50,15 +41,11 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) { ; OPT4-NEXT: ret void ; ; OPT0-LABEL: @memset_size_4( -; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT0: loadstoreloop: -; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4 -; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT0: split: +; OPT0-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[VAL:%.*]], i64 0 +; OPT0-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <4 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; OPT0-NEXT: 
[[SETVALUE_SPLAT_CAST:%.*]] = bitcast <4 x i8> [[SETVALUE_SPLAT_SPLAT]] to i32 +; OPT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 +; OPT0-NEXT: store i32 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1 ; OPT0-NEXT: ret void ; ; OPT_NEG-LABEL: @memset_size_4( @@ -75,27 +62,19 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) { ; OPT8-NEXT: ret void ; ; OPT4-LABEL: @memset_size_8( -; OPT4-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT4: loadstoreloop: -; OPT4-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT4-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT4-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT4-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8 -; OPT4-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT4: split: +; OPT4-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[VAL:%.*]], i64 0 +; OPT4-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <8 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; OPT4-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <8 x i8> [[SETVALUE_SPLAT_SPLAT]] to i64 +; OPT4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 +; OPT4-NEXT: store i64 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1 ; OPT4-NEXT: ret void ; ; OPT0-LABEL: @memset_size_8( -; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT0: loadstoreloop: -; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8 -; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT0: split: +; OPT0-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[VAL:%.*]], i64 0 +; OPT0-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <8 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; OPT0-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <8 x i8> [[SETVALUE_SPLAT_SPLAT]] to i64 +; OPT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 +; OPT0-NEXT: store i64 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1 ; OPT0-NEXT: ret void ; ; OPT_NEG-LABEL: @memset_size_8( diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index 20a34dc997bbc..929ef2fc3c06c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -163,15 +163,18 @@ define amdgpu_kernel void @max_size_small_static_memset_caller0(ptr addrspace(1) ; MAX1024-NEXT: ret void ; ; ALL-LABEL: @max_size_small_static_memset_caller0( -; ALL-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; ALL: loadstoreloop: +; ALL-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <256 x i8> poison, i8 [[VAL:%.*]], i64 0 +; ALL-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <256 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <256 x i8> poison, <256 x i32> zeroinitializer +; ALL-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast 
<256 x i8> [[SETVALUE_SPLAT_SPLAT]] to <64 x i32> +; ALL-NEXT: br label [[LOADSTORELOOP:%.*]] +; ALL: static-memset-expansion-main-body: ; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; ALL-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; ALL-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; ALL-NEXT: store <64 x i32> [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP2]], align 1 +; ALL-NEXT: [[TMP3]] = add i64 [[TMP1]], 256 ; ALL-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024 -; ALL-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; ALL: split: +; ALL-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT:%.*]] +; ALL: static-memset-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1024, i1 false) @@ -180,21 +183,60 @@ define amdgpu_kernel void @max_size_small_static_memset_caller0(ptr addrspace(1) define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1) %dst, i8 %val) #0 { ; OPT-LABEL: @min_size_large_static_memset_caller0( -; OPT-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT: loadstoreloop: +; OPT-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <256 x i8> poison, i8 [[VAL:%.*]], i64 0 +; OPT-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <256 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <256 x i8> poison, <256 x i32> zeroinitializer +; OPT-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <256 x i8> [[SETVALUE_SPLAT_SPLAT]] to <64 x i32> +; OPT-NEXT: br label [[LOADSTORELOOP:%.*]] +; OPT: static-memset-expansion-main-body: ; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025 -; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT: split: +; OPT-NEXT: store <64 x i32> [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP2]], align 1 +; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 256 +; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024 +; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT:%.*]] +; OPT: static-memset-post-expansion: +; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 +; OPT-NEXT: store i8 [[VAL]], ptr addrspace(1) [[TMP5]], align 1 ; OPT-NEXT: ret void ; call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1025, i1 false) ret void } +define amdgpu_kernel void @variable_size_memset_caller0(ptr addrspace(1) %dst, i8 %val, i64 %n) #0 { +; OPT-LABEL: @variable_size_memset_caller0( +; OPT-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL:%.*]], i64 0 +; OPT-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <16 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; OPT-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <16 x i8> [[SETVALUE_SPLAT_SPLAT]] to <4 x i32> +; OPT-NEXT: [[TMP1:%.*]] = and i64 [[N:%.*]], 15 +; OPT-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[TMP1]] +; OPT-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP3]], label [[DYNAMIC_MEMSET_EXPANSION_MAIN_BODY:%.*]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_COND:%.*]] +; OPT: 
dynamic-memset-expansion-main-body: +; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[DYNAMIC_MEMSET_EXPANSION_MAIN_BODY]] ] +; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] +; OPT-NEXT: store <4 x i32> [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP4]], align 1 +; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP2]] +; OPT-NEXT: br i1 [[TMP6]], label [[DYNAMIC_MEMSET_EXPANSION_MAIN_BODY]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_COND]] +; OPT: dynamic-memset-expansion-residual-cond: +; OPT-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP1]], 0 +; OPT-NEXT: br i1 [[TMP7]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_BODY:%.*]], label [[DYNAMIC_MEMSET_POST_EXPANSION:%.*]] +; OPT: dynamic-memset-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_BODY]] ] +; OPT-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP8]] +; OPT-NEXT: store i8 [[VAL]], ptr addrspace(1) [[TMP9]], align 1 +; OPT-NEXT: [[TMP10]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 +; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP1]] +; OPT-NEXT: br i1 [[TMP11]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_BODY]], label [[DYNAMIC_MEMSET_POST_EXPANSION]] +; OPT: dynamic-memset-post-expansion: +; OPT-NEXT: ret void +; + call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 %n, i1 false) + ret void +} + define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 { ; OPT-LABEL: @variable_memcpy_caller0( ; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 4f2816538b1ff..c60642e2cc4d8 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -15974,6 +15974,1616 @@ entry: ret void } +define void @memset_p0_sz2048(ptr addrspace(0) %dst) { +; CHECK-LABEL: memset_p0_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x41414141 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_inst_prefetch 0x1 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB10_1: ; %static-memset-expansion-main-body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v6, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v7, vcc_lo +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], 
v[2:5] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB10_1 +; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p0_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v4, 0x41414141 +; ALIGNED-NEXT: v_mov_b32_e32 v5, 0x41 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB10_1: ; %static-memset-expansion-main-body +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:128 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:64 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:32 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:8 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:2 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:1 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:247 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:248 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:246 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:252 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:251 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:250 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:245 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:244 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:243 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:242 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:241 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:240 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:239 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:238 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:237 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:231 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:232 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:230 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:236 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:235 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:234 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:233 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:229 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:228 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:227 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:226 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:225 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:224 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:223 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:222 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:221 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:215 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:216 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:214 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:220 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:219 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:218 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:217 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:213 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:212 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:211 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:210 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:209 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:208 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:207 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:206 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:205 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 +; 
ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:199 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:200 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:198 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:204 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:203 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:202 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:201 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:197 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:196 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:195 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:194 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:193 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:192 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:191 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:190 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:189 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:183 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:184 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:182 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:188 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:187 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:186 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:185 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:181 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:180 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:179 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:178 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:177 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:176 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:175 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:174 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:173 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:167 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:168 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:166 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:172 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:171 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:170 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:169 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:165 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:164 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:163 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:162 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:161 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:160 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:159 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:158 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:157 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:151 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 
offset:152 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:150 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:156 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:149 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:148 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:147 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:146 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:145 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:144 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:143 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:142 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:141 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:135 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:136 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:134 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:140 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:139 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:138 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:137 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:133 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:132 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:131 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:130 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:129 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:128 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:127 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:126 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:119 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:120 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:118 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:124 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:123 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:122 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:121 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:117 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:116 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:115 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:114 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:113 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:112 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:111 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:110 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:109 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:103 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:104 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:102 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:108 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:107 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:106 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:105 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:101 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:100 +; ALIGNED-NEXT: flat_store_byte 
v[2:3], v5 offset:99 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:98 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:97 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:96 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:95 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:94 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:93 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:78 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:77 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:80 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:79 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:84 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:83 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:82 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:81 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:86 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:85 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:88 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:87 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:92 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:91 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:90 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:89 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:71 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:72 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:70 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:76 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:75 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:74 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:73 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:69 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:68 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:67 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:66 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:65 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:64 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:63 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:62 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:55 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:56 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:54 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:60 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:59 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:58 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:57 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:53 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:52 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:51 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:49 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:47 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:46 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:45 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:39 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:40 +; ALIGNED-NEXT: 
flat_store_byte v[2:3], v5 offset:38 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:44 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:43 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:42 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:41 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:37 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:36 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:35 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:34 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:33 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:32 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:31 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:30 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:23 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:24 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:22 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:28 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:27 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:26 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:25 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:21 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:20 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:19 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:18 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:16 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:15 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:14 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:7 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:8 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:12 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:9 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:3 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:2 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB10_1 +; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p0_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: s_mov_b32 s6, s4 +; UNROLL3-NEXT: s_mov_b32 s7, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v2, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s6 +; UNROLL3-NEXT: v_mov_b32_e32 v5, s7 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB10_1: ; %static-memset-expansion-main-body +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:32 +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB10_1 +; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: s_mov_b32 s6, s4 +; UNROLL3-NEXT: s_mov_b32 s7, s4 
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s6 +; UNROLL3-NEXT: v_mov_b32_e32 v5, s7 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull %dst, i8 65, i64 2048, i1 false) + ret void +} + +define void @memset_p1_sz2048(ptr addrspace(1) %dst) { +; CHECK-LABEL: memset_p1_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x41414141 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_inst_prefetch 0x1 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB11_1: ; %static-memset-expansion-main-body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:240 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:224 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:208 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:192 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:176 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:160 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:144 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:128 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:112 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:96 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:80 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:64 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:48 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:32 +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:16 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB11_1 +; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p1_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v4, 0x41414141 +; ALIGNED-NEXT: v_mov_b32_e32 v5, 0x41 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB11_1: ; %static-memset-expansion-main-body +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: 
global_store_byte v[2:3], v5, off offset:250 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:251 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:249 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:255 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:254 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:253 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:252 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:248 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:247 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:246 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:245 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:244 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:243 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:242 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:241 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:234 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:235 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:233 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:239 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:238 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:237 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:236 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:232 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:231 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:230 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:229 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:228 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:227 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:226 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:225 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:218 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:219 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:217 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:223 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:222 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:221 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:220 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:216 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:215 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:214 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:213 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:212 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:211 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:210 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:209 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:208 +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; 
ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:202 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:203 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:201 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:207 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:206 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:205 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:204 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:200 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:199 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:198 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:197 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:196 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:195 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:194 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:193 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:186 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:187 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:185 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:191 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:190 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:189 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:188 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:184 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:183 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:182 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:181 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:180 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:179 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:178 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:177 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:170 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:171 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:169 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:175 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:174 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:173 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:172 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:168 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:167 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:166 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:165 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:164 +; 
ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:163 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:162 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:161 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:154 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:155 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:153 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:159 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:158 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:157 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:156 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:152 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:151 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:150 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:149 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:148 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:147 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:146 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:145 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:138 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:139 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:137 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:143 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:142 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:141 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:140 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:136 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:135 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:134 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:133 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:132 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:131 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:130 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:129 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:122 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:123 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:121 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:127 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:126 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:125 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:124 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:120 +; ALIGNED-NEXT: 
global_store_byte v[2:3], v5, off offset:119 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:118 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:117 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:116 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:115 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:114 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:113 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:106 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:107 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:105 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:111 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:110 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:109 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:108 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:104 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:103 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:102 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:101 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:100 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:99 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:98 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:97 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:81 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:80 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:83 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:82 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:87 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:86 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:85 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:84 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:89 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:88 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:91 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:90 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:95 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:94 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:93 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:92 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:74 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:75 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:73 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:79 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:78 +; 
ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:77 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:76 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:72 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:71 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:70 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:69 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:68 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:67 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:66 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:65 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:58 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:59 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:57 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:63 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:62 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:61 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:60 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:56 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:55 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:54 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:53 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:52 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:51 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:50 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:49 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:42 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:43 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:41 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:47 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:46 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:45 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:44 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:40 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:39 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:38 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:37 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:36 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:35 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:34 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:33 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:32 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:26 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:27 +; 
ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:25 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:31 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:30 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:29 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:28 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:24 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:23 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:22 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:21 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:20 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:19 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:18 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:17 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:10 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:11 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:9 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:15 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:14 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:13 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:12 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:8 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:7 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:6 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:5 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:4 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:3 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:2 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:1 +; ALIGNED-NEXT: global_store_byte v[2:3], v5, off +; ALIGNED-NEXT: s_cbranch_vccnz .LBB11_1 +; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p1_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: s_mov_b32 s6, s4 +; UNROLL3-NEXT: s_mov_b32 s7, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v2, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s6 +; UNROLL3-NEXT: v_mov_b32_e32 v5, s7 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB11_1: ; %static-memset-expansion-main-body +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:16 +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[2:5], off +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:32 +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB11_1 +; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: s_mov_b32 s6, s4 +; UNROLL3-NEXT: s_mov_b32 s7, s4 +; 
UNROLL3-NEXT: v_mov_b32_e32 v2, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s6 +; UNROLL3-NEXT: v_mov_b32_e32 v5, s7 +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2016 +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull %dst, i8 65, i64 2048, i1 false) + ret void +} + +define void @memset_p3_sz2048(ptr addrspace(3) %dst) { +; CHECK-LABEL: memset_p3_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x41414141 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-NEXT: v_mov_b32_e32 v3, s6 +; CHECK-NEXT: v_mov_b32_e32 v4, s7 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_inst_prefetch 0x1 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB12_1: ; %static-memset-expansion-main-body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:240 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:224 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:208 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:192 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:176 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:160 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:144 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:128 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:112 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:96 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:80 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:64 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:48 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:32 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:16 +; CHECK-NEXT: ds_write_b128 v0, v[1:4] +; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB12_1 +; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p3_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB12_1: ; %static-memset-expansion-main-body +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:255 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:254 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:253 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:252 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:251 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:250 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:249 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:248 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:247 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:246 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:245 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:244 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:243 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:242 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:241 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:240 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:239 +; ALIGNED-NEXT: 
ds_write_b8 v0, v1 offset:238 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:237 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:236 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:235 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:234 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:233 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:232 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:231 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:230 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:229 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:228 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:227 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:226 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:225 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:224 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:223 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:222 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:221 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:220 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:219 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:218 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:217 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:216 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:215 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:214 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:213 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:212 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:211 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:210 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:209 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:208 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:207 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:206 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:205 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:204 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:203 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:202 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:201 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:200 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:199 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:198 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:197 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:196 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:195 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:194 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:193 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:192 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:191 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:190 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:189 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:188 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:187 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:186 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:185 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:184 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:183 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:182 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:181 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:180 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:179 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:178 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:177 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:176 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:175 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:174 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:173 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:172 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:171 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:170 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:169 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:168 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:167 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:166 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:165 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:164 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:163 
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:162 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:161 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:160 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:159 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:158 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:157 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:156 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:155 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:154 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:153 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:152 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:151 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:150 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:149 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:148 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:147 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:146 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:145 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:144 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:143 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:142 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:141 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:140 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:139 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:138 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:137 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:136 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:135 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:134 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:133 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:132 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:131 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:130 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:129 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:128 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:127 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:126 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:125 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:124 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:123 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:122 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:121 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:120 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:119 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:118 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:117 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:116 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:115 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:114 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:113 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:112 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:111 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:110 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:109 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:108 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:107 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:106 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:105 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:104 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:103 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:102 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:101 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:100 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:99 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:98 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:97 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:96 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:87 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:86 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:85 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:84 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:81 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:80 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:83 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:82 +; ALIGNED-NEXT: ds_write_b8 v0, v1 
offset:95 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:94 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:93 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:92 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:89 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:88 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:91 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:90 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:79 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:78 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:77 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:76 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:75 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:74 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:73 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:72 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:71 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:70 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:69 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:68 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:67 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:66 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:65 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:64 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:63 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:62 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:61 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:60 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:59 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:58 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:57 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:56 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:55 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:54 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:53 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:52 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:51 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:50 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:49 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:48 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:47 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:46 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:45 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:44 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:43 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:42 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:41 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:40 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:39 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:38 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:37 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:36 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:35 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:34 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:33 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:32 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:31 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:30 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:29 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:28 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:27 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:26 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:25 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:24 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:23 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:22 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:21 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:20 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:19 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:18 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:17 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:16 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:15 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:14 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:13 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:12 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:11 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:10 +; 
ALIGNED-NEXT: ds_write_b8 v0, v1 offset:9 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:8 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:7 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:6 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:5 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:4 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:3 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:2 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:1 +; ALIGNED-NEXT: ds_write_b8 v0, v1 +; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB12_1 +; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p3_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v0 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: s_mov_b32 s6, s4 +; UNROLL3-NEXT: s_mov_b32 s7, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v1, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v2, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s6 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s7 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .LBB12_1: ; %static-memset-expansion-main-body +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: ds_write_b128 v5, v[1:4] offset:16 +; UNROLL3-NEXT: ds_write_b128 v5, v[1:4] +; UNROLL3-NEXT: ds_write_b128 v5, v[1:4] offset:32 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: v_add_nc_u32_e32 v5, 48, v5 +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB12_1 +; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: s_mov_b32 s6, s4 +; UNROLL3-NEXT: s_mov_b32 s7, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v1, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v2, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s6 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s7 +; UNROLL3-NEXT: ds_write_b128 v0, v[1:4] offset:2016 +; UNROLL3-NEXT: ds_write_b128 v0, v[1:4] offset:2032 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull %dst, i8 65, i64 2048, i1 false) + ret void +} + +define void @memset_p5_sz2048(ptr addrspace(5) %dst) { +; CHECK-LABEL: memset_p5_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41414141 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB13_1: ; %static-memset-expansion-main-body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB13_1 +; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p5_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB13_1: ; %static-memset-expansion-main-body +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 
offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte 
v1, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: 
buffer_store_byte v1, v0, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_store_byte 
v1, v0, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:13 
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB13_1 +; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p5_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141 +; UNROLL3-NEXT: v_mov_b32_e32 v2, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB13_1: ; %static-memset-expansion-main-body +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB13_1 +; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion +; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull %dst, i8 65, i64 2048, i1 false) + ret void +} + declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias 
nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2 declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2 @@ -15989,4 +17599,10 @@ declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr a declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memset.p0.i64(ptr addrspace(0) nocapture writeonly, i8, i64, i1 immarg) #3 +declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64, i1 immarg) #3 +declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly, i8, i64, i1 immarg) #3 +declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) #3 + attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll new file mode 100644 index 0000000000000..512c0477803fb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll @@ -0,0 +1,1900 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s + + +define void @memset_p0_varsize_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval, i64 %size) { +; GFX942-SDAG-LABEL: memset_p0_varsize_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-SDAG-NEXT: v_and_b32_e32 v12, -16, v10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-SDAG-NEXT: v_and_b32_e32 v8, 15, v10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB0_3 +; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB0_2: ; %dynamic-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[12:13] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB0_2 +; GFX942-SDAG-NEXT: .LBB0_3: ; %Flow4 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB0_6 +; GFX942-SDAG-NEXT: ; %bb.4: ; 
%dynamic-memset-expansion-residual-body.preheader +; GFX942-SDAG-NEXT: v_and_b32_e32 v10, -16, v10 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[10:11] +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB0_5: ; %dynamic-memset-expansion-residual-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: flat_store_byte v[4:5], v2 +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB0_5 +; GFX942-SDAG-NEXT: .LBB0_6: ; %Flow2 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p0_varsize_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v10 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v12, vcc, v10, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v13, vcc, 0, v11, vcc +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB0_3 +; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GFX942-GISEL-NEXT: .LBB0_2: ; %dynamic-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v0, v14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, v1, v15, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 16, v14 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[12:13] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB0_2 +; GFX942-GISEL-NEXT: .LBB0_3: ; %Flow4 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-GISEL-NEXT: v_lshrrev_b64 v[4:5], 4, v[10:11] +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 4, v[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: .LBB0_5: ; %dynamic-memset-expansion-residual-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v4 +; GFX942-GISEL-NEXT: 
s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v5, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v4, vcc, 1, v4 +; GFX942-GISEL-NEXT: flat_store_byte v[6:7], v2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[8:9] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB0_5 +; GFX942-GISEL-NEXT: .LBB0_6: ; %Flow2 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false) + ret void +} + +define void @memset_p1_varsize_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval, i64 %size) { +; GFX942-SDAG-LABEL: memset_p1_varsize_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-SDAG-NEXT: v_and_b32_e32 v12, -16, v10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-SDAG-NEXT: v_and_b32_e32 v8, 15, v10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB1_3 +; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB1_2: ; %dynamic-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[12:13] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: global_store_dwordx4 v[14:15], v[4:7], off +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB1_2 +; GFX942-SDAG-NEXT: .LBB1_3: ; %Flow4 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB1_6 +; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-SDAG-NEXT: v_and_b32_e32 v10, -16, v10 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[10:11] +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB1_5: ; %dynamic-memset-expansion-residual-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: global_store_byte v[4:5], v2, off +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB1_5 +; GFX942-SDAG-NEXT: .LBB1_6: ; %Flow2 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p1_varsize_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v10 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v12, vcc, v10, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v13, vcc, 0, v11, vcc +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB1_3 +; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GFX942-GISEL-NEXT: .LBB1_2: ; %dynamic-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v0, v14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, v1, v15, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 16, v14 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[12:13] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB1_2 +; GFX942-GISEL-NEXT: .LBB1_3: ; %Flow4 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-GISEL-NEXT: v_lshrrev_b64 v[4:5], 4, v[10:11] +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 4, v[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: .LBB1_5: ; %dynamic-memset-expansion-residual-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v4 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v5, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v4, vcc, 1, v4 +; GFX942-GISEL-NEXT: global_store_byte v[6:7], v2, off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[8:9] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB1_5 +; GFX942-GISEL-NEXT: .LBB1_6: ; %Flow2 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false) + ret void +} + +define void 
@memset_p3_varsize_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval, i64 %size) { +; GFX942-SDAG-LABEL: memset_p3_varsize_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-SDAG-NEXT: v_and_b32_e32 v4, -16, v2 +; GFX942-SDAG-NEXT: v_and_b32_e32 v10, 15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB2_3 +; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v6, v1, v1, s4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB2_2: ; %dynamic-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[4:5] +; GFX942-SDAG-NEXT: ds_write2_b32 v9, v8, v7 offset0:2 offset1:3 +; GFX942-SDAG-NEXT: ds_write2_b32 v9, v6, v3 offset1:1 +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: v_add_u32_e32 v9, 16, v9 +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB2_2 +; GFX942-SDAG-NEXT: .LBB2_3: ; %Flow7 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB2_6 +; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB2_5: ; %dynamic-memset-expansion-residual-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11] +; GFX942-SDAG-NEXT: ds_write_b8 v0, v1 +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB2_5 +; GFX942-SDAG-NEXT: .LBB2_6: ; %Flow5 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p3_varsize_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v7, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v9, v8, 8, v8 +; GFX942-GISEL-NEXT: 
v_lshlrev_b32_e32 v10, 16, v8 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX942-GISEL-NEXT: v_or3_b32 v8, v9, v10, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v11, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GFX942-GISEL-NEXT: .LBB2_2: ; %dynamic-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12 +; GFX942-GISEL-NEXT: ds_write2_b64 v14, v[8:9], v[10:11] offset1:1 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: v_add_u32_e32 v14, 16, v14 +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB2_2 +; GFX942-GISEL-NEXT: .LBB2_3: ; %Flow7 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: .LBB2_5: ; %dynamic-memset-expansion-residual-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2 +; GFX942-GISEL-NEXT: ds_write_b8 v0, v1 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB2_5 +; GFX942-GISEL-NEXT: .LBB2_6: ; %Flow5 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false) + ret void +} + +define void @memset_p5_varsize_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval, i64 %size) { +; GFX942-SDAG-LABEL: memset_p5_varsize_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-SDAG-NEXT: v_and_b32_e32 v4, -16, v2 +; GFX942-SDAG-NEXT: v_and_b32_e32 v10, 15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB3_3 +; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v6, v1, v1, s4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB3_2: ; %dynamic-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[4:5] +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v3, v[6:9], off +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 16, v3 +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB3_2 +; GFX942-SDAG-NEXT: .LBB3_3: ; %Flow7 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB3_6 +; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB3_5: ; %dynamic-memset-expansion-residual-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11] +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB3_5 +; GFX942-SDAG-NEXT: .LBB3_6: ; %Flow5 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p5_varsize_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v2 +; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB3_3 +; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v5, v4, 8, v4 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v5, v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GFX942-GISEL-NEXT: .LBB3_2: ; %dynamic-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v14, v[4:7], off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: v_add_u32_e32 v14, 16, v14 +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB3_2 +; GFX942-GISEL-NEXT: .LBB3_3: ; %Flow7 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 
s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: .LBB3_5: ; %dynamic-memset-expansion-residual-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB3_5 +; GFX942-GISEL-NEXT: .LBB3_6: ; %Flow5 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false) + ret void +} + +define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p0_sz1055_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v4 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v34, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v35, v4 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x60 +; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x50 +; GFX942-SDAG-NEXT: s_mov_b64 s[8:9], 0xf0 +; GFX942-SDAG-NEXT: s_mov_b64 s[10:11], 0xe0 +; GFX942-SDAG-NEXT: s_mov_b64 s[12:13], 0xd0 +; GFX942-SDAG-NEXT: s_mov_b64 s[14:15], 0xc0 +; GFX942-SDAG-NEXT: s_mov_b64 s[16:17], 0xb0 +; GFX942-SDAG-NEXT: s_mov_b64 s[18:19], 0xa0 +; GFX942-SDAG-NEXT: s_mov_b64 s[20:21], 0x90 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x400 +; GFX942-SDAG-NEXT: .LBB4_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[36:37] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[48:49], v[38:39], 0, s[2:3] +; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[38:39], 0, s[4:5] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[38:39], 0, s[6:7] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[20:23] offset:64 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[54:55], v[38:39], 0, 48 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[12:15] offset:32 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[8:11] offset:16 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[4:7] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[40:41], v[38:39], 0, s[8:9] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[42:43], v[38:39], 0, s[10:11] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[44:45], v[38:39], 0, s[12:13] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[46:47], v[38:39], 0, s[14:15] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[56:57], v[38:39], 0, s[16:17] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[58:59], v[38:39], 0, s[18:19] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[60:61], v[38:39], 0, s[20:21] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[4:7] offset:128 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[48:49], v[32:35] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[50:51], v[28:31] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[52:53], v[24:27] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[54:55], v[16:19] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[40:41], v[32:35] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[42:43], v[28:31] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[44:45], v[24:27] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[46:47], v[20:23] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[56:57], v[16:19] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[58:59], v[12:15] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[60:61], v[8:11] +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB4_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0 +; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:1024 +; GFX942-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[4:6] offset:1040 +; GFX942-SDAG-NEXT: flat_store_short 
v[0:1], v3 offset:1052 +; GFX942-SDAG-NEXT: flat_store_byte v[0:1], v2 offset:1054 +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p0_sz1055_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0x400 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: .LBB4_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v10 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v11, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 0x100, v10 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:16 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:32 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:48 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:64 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:80 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:96 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:112 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:128 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:144 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:160 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:176 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:192 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:208 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:224 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:240 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB4_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; 
GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8 +; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:1024 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v6, v5, 16, v5 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-GISEL-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:1040 +; GFX942-GISEL-NEXT: flat_store_short v[0:1], v3 offset:1052 +; GFX942-GISEL-NEXT: flat_store_byte v[0:1], v2 offset:1054 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false) + ret void +} + +define void @memset_p0_sz2048_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p0_sz2048_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 
v31, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x60 +; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x50 +; GFX942-SDAG-NEXT: s_mov_b64 s[8:9], 0xf0 +; GFX942-SDAG-NEXT: s_mov_b64 s[10:11], 0xe0 +; GFX942-SDAG-NEXT: s_mov_b64 s[12:13], 0xd0 +; GFX942-SDAG-NEXT: s_mov_b64 s[14:15], 0xc0 +; GFX942-SDAG-NEXT: s_mov_b64 s[16:17], 0xb0 +; GFX942-SDAG-NEXT: s_mov_b64 s[18:19], 0xa0 +; GFX942-SDAG-NEXT: s_mov_b64 s[20:21], 0x90 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800 +; GFX942-SDAG-NEXT: .LBB5_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[36:37], 0, s[2:3] +; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[48:49], v[36:37], 0, s[4:5] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[36:37], 0, s[6:7] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[18:21] offset:64 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[36:37], 0, 48 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[10:13] offset:32 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[6:9] offset:16 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[2:5] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[54:55], v[36:37], 0, s[8:9] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[40:41], v[36:37], 0, s[10:11] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[42:43], v[36:37], 0, s[12:13] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[44:45], v[36:37], 0, s[14:15] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[46:47], v[36:37], 0, s[16:17] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[56:57], v[36:37], 0, s[18:19] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[58:59], v[36:37], 0, s[20:21] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[2:5] offset:128 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[30:33] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[48:49], v[26:29] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[50:51], v[22:25] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[52:53], v[14:17] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[54:55], v[30:33] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[40:41], v[26:29] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[42:43], v[22:25] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[44:45], v[18:21] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[46:47], v[14:17] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[56:57], v[10:13] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[58:59], v[6:9] +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX942-SDAG-NEXT: 
v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p0_sz2048_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: .LBB5_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v9, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:16 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:32 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:48 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:64 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:80 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:96 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:112 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:128 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:144 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:160 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:176 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:192 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:208 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:224 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:240 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false) + ret void +} + +define void @memset_p1_sz1055_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p1_sz1055_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v4 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v34, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v35, v4 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x400 +; GFX942-SDAG-NEXT: .LBB6_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[36:37] +; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off offset:112 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[28:31], off offset:96 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[24:27], off offset:80 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[20:23], off offset:64 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[16:19], off offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[12:15], off offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[4:7], off +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off offset:240 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[28:31], off offset:224 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[24:27], off offset:208 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[20:23], off offset:192 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[16:19], off offset:176 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[12:15], off offset:160 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off offset:144 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[4:7], off offset:128 +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0 +; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-SDAG-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:1024 +; GFX942-SDAG-NEXT: global_store_dwordx3 v[0:1], v[4:6], off offset:1040 +; GFX942-SDAG-NEXT: global_store_short v[0:1], v3, off offset:1052 +; GFX942-SDAG-NEXT: global_store_byte v[0:1], v2, off offset:1054 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p1_sz1055_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, 
v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0x400 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1] +; GFX942-GISEL-NEXT: .LBB6_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v10 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v11, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 0x100, v10 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:64 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:80 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:96 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:112 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:128 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:144 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:160 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:176 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:192 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:208 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:224 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:240 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8 +; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-GISEL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:1024 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v6, v5, 16, v5 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:1040 +; GFX942-GISEL-NEXT: global_store_short v[0:1], v3, off offset:1052 +; GFX942-GISEL-NEXT: global_store_byte v[0:1], v2, off offset:1054 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef 
nonnull align 4 %dst, i8 %setval, i64 1055, i1 false) + ret void +} + +define void @memset_p1_sz2048_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p1_sz2048_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800 +; GFX942-SDAG-NEXT: .LBB7_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35] +; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[30:33], off offset:112 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[26:29], off offset:96 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[22:25], off offset:80 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[18:21], off offset:64 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[14:17], off offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[10:13], off offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[6:9], off offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[2:5], off +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[30:33], off offset:240 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[26:29], off offset:224 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[22:25], off offset:208 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[18:21], off offset:192 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[14:17], off offset:176 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[10:13], off offset:160 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[6:9], off offset:144 +; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[2:5], off offset:128 +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p1_sz2048_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: .LBB7_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v9, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:48 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:64 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:80 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:96 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:112 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:128 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:144 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:160 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:176 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:192 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:208 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:224 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:240 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false) + ret void +} + +define void @memset_p3_sz1055_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p3_sz1055_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x400 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v36, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-SDAG-NEXT: 
v_mov_b32_e32 v11, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2 +; GFX942-SDAG-NEXT: .LBB8_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35] +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v32, v33 offset0:30 offset1:31 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v30, v31 offset0:28 offset1:29 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v28, v29 offset0:26 offset1:27 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v26, v27 offset0:24 offset1:25 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v24, v25 offset0:22 offset1:23 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v22, v23 offset0:20 offset1:21 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v20, v21 offset0:18 offset1:19 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v18, v19 offset0:16 offset1:17 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v16, v17 offset0:14 offset1:15 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v14, v15 offset0:12 offset1:13 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v12, v13 offset0:10 offset1:11 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v10, v11 offset0:8 offset1:9 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v8, v9 offset0:6 offset1:7 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v6, v7 offset0:4 offset1:5 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v4, v5 offset0:2 offset1:3 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v2, v3 offset1:1 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v32, v33 offset0:62 offset1:63 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v30, v31 offset0:60 offset1:61 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v28, v29 offset0:58 offset1:59 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v26, v27 offset0:56 offset1:57 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v24, v25 offset0:54 offset1:55 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v22, v23 offset0:52 offset1:53 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v20, v21 offset0:50 offset1:51 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v18, v19 offset0:48 offset1:49 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v16, v17 offset0:46 offset1:47 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v14, v15 offset0:44 offset1:45 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v12, v13 offset0:42 offset1:43 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v10, v11 offset0:40 offset1:41 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v8, v9 offset0:38 offset1:39 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v6, v7 offset0:36 offset1:37 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v4, v5 offset0:34 offset1:35 +; GFX942-SDAG-NEXT: ds_write2_b32 v36, v2, v3 offset0:32 offset1:33 +; GFX942-SDAG-NEXT: v_add_u32_e32 v36, 0x100, v36 +; GFX942-SDAG-NEXT: 
s_cbranch_vccnz .LBB8_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 0x400, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 0x408, v0 +; GFX942-SDAG-NEXT: v_perm_b32 v4, v1, v1, s0 +; GFX942-SDAG-NEXT: ds_write2_b32 v3, v4, v4 offset1:1 +; GFX942-SDAG-NEXT: ds_write2_b32 v2, v4, v4 offset1:1 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 0x410, v0 +; GFX942-SDAG-NEXT: ds_write2_b32 v2, v4, v4 offset1:1 +; GFX942-SDAG-NEXT: ds_write_b32 v0, v4 offset:1048 +; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v1 +; GFX942-SDAG-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-SDAG-NEXT: ds_write_b16 v0, v2 offset:1052 +; GFX942-SDAG-NEXT: ds_write_b8 v0, v1 offset:1054 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p3_sz1055_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x400 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: .LBB8_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset1:1 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:2 offset1:3 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:4 offset1:5 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:6 offset1:7 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:8 offset1:9 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:10 offset1:11 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:12 offset1:13 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:14 offset1:15 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:16 offset1:17 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:18 offset1:19 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:20 offset1:21 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:22 offset1:23 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:24 offset1:25 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:26 offset1:27 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:28 offset1:29 +; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:30 offset1:31 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX942-GISEL-NEXT: v_add_u32_e32 v10, 0x100, v10 +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, 
v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:128 offset1:129 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8 +; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v4 +; GFX942-GISEL-NEXT: v_add_u32_e32 v5, 0x410, v0 +; GFX942-GISEL-NEXT: ds_write2_b32 v5, v4, v4 offset1:1 +; GFX942-GISEL-NEXT: ds_write_b32 v0, v2 offset:1048 +; GFX942-GISEL-NEXT: ds_write_b16 v0, v3 offset:1052 +; GFX942-GISEL-NEXT: ds_write_b8 v0, v1 offset:1054 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false) + ret void +} + +define void @memset_p3_sz2048_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p3_sz2048_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2 +; GFX942-SDAG-NEXT: .LBB9_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35] +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v32, v31 offset0:30 offset1:31 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v30, v29 offset0:28 offset1:29 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v28, v27 offset0:26 offset1:27 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v26, v25 offset0:24 offset1:25 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v24, v23 offset0:22 offset1:23 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v22, v21 offset0:20 offset1:21 +; GFX942-SDAG-NEXT: 
ds_write2_b32 v0, v20, v19 offset0:18 offset1:19 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v18, v17 offset0:16 offset1:17 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v16, v15 offset0:14 offset1:15 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v14, v13 offset0:12 offset1:13 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v12, v11 offset0:10 offset1:11 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v10, v9 offset0:8 offset1:9 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v8, v7 offset0:6 offset1:7 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v6, v5 offset0:4 offset1:5 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v4, v3 offset0:2 offset1:3 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v2, v1 offset1:1 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v32, v31 offset0:62 offset1:63 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v30, v29 offset0:60 offset1:61 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v28, v27 offset0:58 offset1:59 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v26, v25 offset0:56 offset1:57 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v24, v23 offset0:54 offset1:55 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v22, v21 offset0:52 offset1:53 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v20, v19 offset0:50 offset1:51 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v18, v17 offset0:48 offset1:49 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v16, v15 offset0:46 offset1:47 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v14, v13 offset0:44 offset1:45 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v12, v11 offset0:42 offset1:43 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v10, v9 offset0:40 offset1:41 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v8, v7 offset0:38 offset1:39 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v6, v5 offset0:36 offset1:37 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v4, v3 offset0:34 offset1:35 +; GFX942-SDAG-NEXT: ds_write2_b32 v0, v2, v1 offset0:32 offset1:33 +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB9_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p3_sz2048_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v2, v1, 8, v1 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v2, v3, v1 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: .LBB9_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:4 offset1:5 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:6 offset1:7 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:8 offset1:9 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:11 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:12 offset1:13 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:14 offset1:15 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:16 offset1:17 +; 
GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:18 offset1:19 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:20 offset1:21 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:22 offset1:23 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:24 offset1:25 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:26 offset1:27 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:28 offset1:29 +; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:30 offset1:31 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB9_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false) + ret void +} + +define void @memset_p5_sz1055_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p5_sz1055_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x400 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v36, v0 +; GFX942-SDAG-NEXT: .LBB10_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35] +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[30:33], off offset:112 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[26:29], off offset:96 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[22:25], off offset:80 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[18:21], off offset:64 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[14:17], off offset:48 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[10:13], off offset:32 +; 
GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[6:9], off offset:16 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[2:5], off +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[30:33], off offset:240 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[26:29], off offset:224 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[22:25], off offset:208 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[18:21], off offset:192 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[14:17], off offset:176 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[10:13], off offset:160 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[6:9], off offset:144 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[2:5], off offset:128 +; GFX942-SDAG-NEXT: v_add_u32_e32 v36, 0x100, v36 +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:1024 +; GFX942-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off offset:1040 +; GFX942-SDAG-NEXT: scratch_store_dword v0, v2, off offset:1048 +; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v1 +; GFX942-SDAG-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-SDAG-NEXT: scratch_store_short v0, v2, off offset:1052 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1054 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p5_sz1055_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x400 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: .LBB10_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:16 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:32 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:48 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:64 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:80 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:96 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:112 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:128 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:144 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:160 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:176 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:192 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off 
offset:208 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:224 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:240 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX942-GISEL-NEXT: v_add_u32_e32 v10, 0x100, v10 +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:1024 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8 +; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v4 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off offset:1040 +; GFX942-GISEL-NEXT: scratch_store_dword v0, v2, off offset:1048 +; GFX942-GISEL-NEXT: scratch_store_short v0, v3, off offset:1052 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off offset:1054 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false) + ret void +} + +define void @memset_p5_sz2048_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) { +; GFX942-SDAG-LABEL: memset_p5_sz2048_align_4_varsetval: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404 +; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: 
v_mov_b64_e32 v[34:35], 0x800 +; GFX942-SDAG-NEXT: .LBB11_1: ; %static-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100 +; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35] +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:240 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:224 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:208 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:192 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:176 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:160 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:144 +; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:128 +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p5_sz2048_align_4_varsetval: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX942-GISEL-NEXT: v_lshl_or_b32 v2, v1, 8, v1 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX942-GISEL-NEXT: v_or3_b32 v2, v2, v3, v1 +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: .LBB11_1: ; %static-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:16 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:32 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:48 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:64 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:80 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:96 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:112 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:128 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:144 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:160 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:176 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:192 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:208 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 
v0, v[2:5], off offset:224 +; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:240 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7] +; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false) + ret void +} + +define void @memset_p1_varsz_align_4_set40(ptr addrspace(1) align 4 %dst, i64 %size) { +; GFX942-SDAG-LABEL: memset_p1_varsz_align_4_set40: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_and_b32_e32 v10, -16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-SDAG-NEXT: v_and_b32_e32 v8, 15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB12_3 +; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0x28282828 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB12_2: ; %dynamic-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: global_store_dwordx4 v[12:13], v[4:7], off +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB12_2 +; GFX942-SDAG-NEXT: .LBB12_3: ; %Flow4 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB12_6 +; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 40 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB12_5: ; %dynamic-memset-expansion-residual-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: global_store_byte v[4:5], v2, off +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB12_5 +; GFX942-SDAG-NEXT: .LBB12_6: ; %Flow2 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p1_varsz_align_4_set40: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v2 +; GFX942-GISEL-NEXT: v_sub_co_u32_e32 
v10, vcc, v2, v8 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB12_3 +; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x28282828 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s4 +; GFX942-GISEL-NEXT: s_mov_b32 s6, s4 +; GFX942-GISEL-NEXT: s_mov_b32 s7, s4 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GFX942-GISEL-NEXT: .LBB12_2: ; %dynamic-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v0, v12 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v13, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12 +; GFX942-GISEL-NEXT: global_store_dwordx4 v[14:15], v[4:7], off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB12_2 +; GFX942-GISEL-NEXT: .LBB12_3: ; %Flow4 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB12_6 +; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 4, v[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 40 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: .LBB12_5: ; %dynamic-memset-expansion-residual-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v3, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2 +; GFX942-GISEL-NEXT: global_store_byte v[6:7], v4, off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB12_5 +; GFX942-GISEL-NEXT: .LBB12_6: ; %Flow2 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 40, i64 %size, i1 false) + ret void +} + +define void @memset_p1_varsz_align_4_set0(ptr addrspace(1) align 4 %dst, i64 %size) { +; GFX942-SDAG-LABEL: memset_p1_varsz_align_4_set0: +; GFX942-SDAG: ; %bb.0: ; %entry +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_and_b32_e32 v6, -16, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-SDAG-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; 
GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB13_3 +; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v5 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v5 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB13_2: ; %dynamic-memset-expansion-main-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB13_2 +; GFX942-SDAG-NEXT: .LBB13_3: ; %Flow4 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-SDAG-NEXT: s_cbranch_execz .LBB13_6 +; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-SDAG-NEXT: .LBB13_5: ; %dynamic-memset-expansion-residual-body +; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[4:5] +; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1 +; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[4:5] +; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-SDAG-NEXT: global_store_byte v[6:7], v2, off +; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB13_5 +; GFX942-SDAG-NEXT: .LBB13_6: ; %Flow2 +; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: memset_p1_varsz_align_4_set0: +; GFX942-GISEL: ; %bb.0: ; %entry +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v2 +; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v8 +; GFX942-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB13_3 +; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader +; GFX942-GISEL-NEXT: s_mov_b32 s2, s0 +; GFX942-GISEL-NEXT: s_mov_b32 s3, s0 +; GFX942-GISEL-NEXT: s_mov_b32 s1, s0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5] +; GFX942-GISEL-NEXT: .LBB13_2: ; %dynamic-memset-expansion-main-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v0, v12 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v13, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12 +; GFX942-GISEL-NEXT: global_store_dwordx4 
v[14:15], v[4:7], off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11] +; GFX942-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB13_2 +; GFX942-GISEL-NEXT: .LBB13_3: ; %Flow4 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-GISEL-NEXT: s_cbranch_execz .LBB13_6 +; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader +; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 4, v[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: .LBB13_5: ; %dynamic-memset-expansion-residual-body +; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v3, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2 +; GFX942-GISEL-NEXT: global_store_byte v[6:7], v4, off +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9] +; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB13_5 +; GFX942-GISEL-NEXT: .LBB13_6: ; %Flow2 +; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 0, i64 %size, i1 false) + ret void +} + +declare void @llvm.memset.p0.i64(ptr addrspace(0) noalias nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memset.p1.i64(ptr addrspace(1) noalias nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memset.p3.i64(ptr addrspace(3) noalias nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memset.p5.i64(ptr addrspace(5) noalias nocapture writeonly, i8, i64, i1 immarg) + + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX942: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll index ad78e0fe7438b..6c96eab1439fb 100644 --- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -117,9 +117,9 @@ entry: ; IR-LABEL: @memset_caller ; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8 -; IR: [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n -; IR: br i1 [[CMPREG]], label %split, label %loadstoreloop -; IR: loadstoreloop: +; IR: [[CMPREG:%[0-9]+]] = icmp ne i64 %n, 0 +; IR: br i1 [[CMPREG]], label %dynamic-memset-expansion-main-body, label %dynamic-memset-post-expansion +; IR: dynamic-memset-expansion-main-body: ; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 ; IR-NEXT: store i8 [[VAL]], ptr [[STOREPTR]] @@ -141,7 +141,7 @@ entry: ; IR-LABEL: @volatile_memset_caller ; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8 -; IR: loadstoreloop: +; IR: dynamic-memset-expansion-main-body: ; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 ; IR-NEXT: store volatile i8 [[VAL]], ptr [[STOREPTR]] } diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll index d5e70ae9e7aa8..eb184bae7fa5c 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll @@ -39,8 +39,8 @@ ; CHECK: %[[#Volatile:]] = OpFunctionParameter %[[#]] ; CHECK: %[[#Entry:]] = OpLabel -; CHECK: %[[#IsZeroLen:]] = OpIEqual %[[#]] %[[#Zero:]] %[[#Len]] -; CHECK: OpBranchConditional %[[#IsZeroLen]] %[[#End:]] %[[#WhileBody:]] +; CHECK: %[[#IsNonZeroLen:]] = OpINotEqual %[[#]] %[[#Len]] %[[#Zero:]] +; CHECK: OpBranchConditional %[[#IsNonZeroLen]] %[[#WhileBody:]] %[[#End:]] ; CHECK: %[[#WhileBody]] = OpLabel ; CHECK: %[[#Offset:]] = OpPhi %[[#]] %[[#Zero]] %[[#Entry]] %[[#OffsetInc:]] %[[#WhileBody]] diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll index 0843b1532f843..8a371083548eb 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll @@ -17,26 +17,26 @@ define void @memset_32(ptr %a, i8 %value) nounwind { define void @memset_x(ptr %a, i8 %value, i64 %x) nounwind { ; CHECK-LABEL: define void @memset_x( ; CHECK-SAME: ptr [[A:%.*]], i8 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]] -; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] -; CHECK: [[LOADSTORELOOP]]: -; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: store i8 [[VALUE]], ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP4]] = add i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] -; CHECK: [[SPLIT]]: -; CHECK-NEXT: 
[[TMP6:%.*]] = icmp eq i64 0, [[X]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[SPLIT1:.*]], label %[[LOADSTORELOOP2:.*]] -; CHECK: [[LOADSTORELOOP2]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]: +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[X]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION1:.*]] +; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]] ], [ [[TMP9:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: store volatile i8 [[VALUE]], ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP9]] = add i64 [[TMP7]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP9]], [[X]] -; CHECK-NEXT: br i1 [[TMP10]], label %[[LOADSTORELOOP2]], label %[[SPLIT1]] -; CHECK: [[SPLIT1]]: +; CHECK-NEXT: br i1 [[TMP10]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION1]] +; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION1]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 %x, i1 0)