2 changes: 1 addition & 1 deletion lldb/source/Core/ValueObject.cpp
@@ -814,7 +814,7 @@ static bool CopyStringDataToBufferSP(const StreamString &source,
std::pair<size_t, bool>
ValueObject::ReadPointedString(lldb::WritableDataBufferSP &buffer_sp,
Status &error, uint32_t max_length,
bool honor_array, Format item_format) {
bool honor_array) {
bool was_capped = false;
StreamString s;
ExecutionContext exe_ctx(GetExecutionContextRef());
2 changes: 1 addition & 1 deletion llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -647,7 +647,7 @@ if ( LLVM_COMPILER_IS_GCC_COMPATIBLE )
# crash if LLVM is built with GCC and LTO enabled (#57740). Until
# these bugs are fixed, we need to disable dead store eliminations
# based on object lifetime.
# add_flag_if_supported("-fno-lifetime-dse" CMAKE_CXX_FLAGS)
add_flag_if_supported("-fno-lifetime-dse" CMAKE_CXX_FLAGS)
endif ( LLVM_COMPILER_IS_GCC_COMPATIBLE )

# Modules enablement for GCC-compatible compilers:
10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IRBuilder.h
@@ -1974,6 +1974,16 @@ class IRBuilderBase {
return CreateConstInBoundsGEP2_32(Ty, Ptr, 0, Idx, Name);
}

Value *CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name = "",
bool IsInBounds = false) {
return CreateGEP(getInt8Ty(), Ptr, Offset, Name, IsInBounds);
}

Value *CreateInBoundsPtrAdd(Value *Ptr, Value *Offset,
const Twine &Name = "") {
return CreateGEP(getInt8Ty(), Ptr, Offset, Name, /*IsInBounds*/ true);
}

/// Same as CreateGlobalString, but return a pointer with "i8*" type
/// instead of a pointer to array of i8.
///
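Note: CreatePtrAdd and CreateInBoundsPtrAdd are thin sugar for a byte-wise (i8) GEP; the rest of this patch uses them to replace open-coded CreateGEP(Int8Ty, ...) calls. A minimal usage sketch, assuming in-tree LLVM headers (the helper name offsetByBytes is illustrative, not part of the patch):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Advance Ptr by a constant number of bytes; equivalent to
// B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt64(Bytes), "byteoff").
static Value *offsetByBytes(IRBuilderBase &B, Value *Ptr, int64_t Bytes) {
  return B.CreatePtrAdd(Ptr, B.getInt64(Bytes), "byteoff");
}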
24 changes: 12 additions & 12 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3095,23 +3095,23 @@ let TargetPrefix = "aarch64" in {
[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;

class SME2_CVT_FtoI_VG2_Intrinsic
class SME2_CVT_ItoF_X2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;

class SME2_CVT_ItoF_VG2_Intrinsic
class SME2_CVT_FtoI_X2_Intrinsic
: DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;

class SME2_CVT_FtoI_VG4_Intrinsic
class SME2_CVT_ItoF_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>,
LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;

class SME2_CVT_ItoF_VG4_Intrinsic
class SME2_CVT_FtoI_X4_Intrinsic
: DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>,
LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@@ -3403,14 +3403,14 @@ let TargetPrefix = "aarch64" in {
//
def int_aarch64_sve_fcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
def int_aarch64_sve_fcvts_x2 : SME2_CVT_FtoI_VG2_Intrinsic;
def int_aarch64_sve_fcvtu_x2 : SME2_CVT_FtoI_VG2_Intrinsic;
def int_aarch64_sve_scvtf_x2 : SME2_CVT_ItoF_VG2_Intrinsic;
def int_aarch64_sve_ucvtf_x2 : SME2_CVT_ItoF_VG2_Intrinsic;
def int_aarch64_sve_fcvts_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
def int_aarch64_sve_fcvts_x2 : SME2_CVT_FtoI_X2_Intrinsic;
def int_aarch64_sve_fcvtu_x2 : SME2_CVT_FtoI_X2_Intrinsic;
def int_aarch64_sve_scvtf_x2 : SME2_CVT_ItoF_X2_Intrinsic;
def int_aarch64_sve_ucvtf_x2 : SME2_CVT_ItoF_X2_Intrinsic;
def int_aarch64_sve_fcvts_x4 : SME2_CVT_FtoI_X4_Intrinsic;
def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_X4_Intrinsic;
def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_X4_Intrinsic;
def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_X4_Intrinsic;

//
// Multi-vector saturating extract narrow
79 changes: 52 additions & 27 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10721,6 +10721,10 @@ bool ScalarEvolution::isKnownNonPositive(const SCEV *S) {
}

bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
// Push the query down through a sign extension: the conservative unsigned
// range of a sign-extended value may include zero even when the narrow
// operand is provably non-zero.
if (const auto *SExt = dyn_cast<SCEVSignExtendExpr>(S))
return isKnownNonZero(SExt->getOperand(0));
return getUnsignedRangeMin(S) != 0;
}

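Note: the recursion is needed because the conservative unsigned range of a sign-extended value can wrap through zero even when the narrow operand is provably non-zero (and sign extension preserves non-zeroness). A standalone illustration with ConstantRange, assuming upstream range semantics; this is an editor's sketch, not code from the patch:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include <cassert>
using namespace llvm;

int main() {
  // An i32 value known only to be non-zero: the wrapped range [1, 0)
  // is the set {1 .. 0xFFFFFFFF}.
  ConstantRange NonZero32(APInt(32, 1), APInt(32, 0));
  assert(NonZero32.getUnsignedMin() != 0); // the operand proves non-zero
  // After sign extension, the best single 64-bit range wraps through zero,
  // so the unsigned-min test alone can no longer prove non-zero.
  ConstantRange Ext = NonZero32.signExtend(64);
  assert(Ext.getUnsignedMin() == 0);
  return 0;
}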
12 changes: 10 additions & 2 deletions llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3455,7 +3455,6 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
StringRef Identifier, DIE &RefDie,
const DICompositeType *CTy) {
setCurrentDWARF5AccelTable(DWARF5AccelTableKind::TU);
// Fast path: if we're building some type units and one has already used the
// address pool, we know we're going to throw away all this work anyway, so
// don't bother building dependent types.
@@ -3468,6 +3467,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
return;
}

setCurrentDWARF5AccelTable(DWARF5AccelTableKind::TU);
bool TopLevelType = TypeUnitsUnderConstruction.empty();
AddrPool.resetUsedFlag();

@@ -3556,9 +3556,9 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
AccelTypeUnitsDebugNames.convertDieToOffset();
AccelDebugNames.addTypeEntries(AccelTypeUnitsDebugNames);
AccelTypeUnitsDebugNames.clear();
setCurrentDWARF5AccelTable(DWARF5AccelTableKind::CU);
}
CU.addDIETypeSignature(RefDie, Signature);
setCurrentDWARF5AccelTable(DWARF5AccelTableKind::CU);
}

// Add the Name along with its companion DIE to the appropriate accelerator
Expand Down Expand Up @@ -3587,6 +3587,14 @@ void DwarfDebug::addAccelNameImpl(
break;
case AccelTableKind::Dwarf: {
DWARF5AccelTable &Current = getCurrentDWARF5AccelTable();
assert((&Current == &AccelTypeUnitsDebugNames) ||
((&Current == &AccelDebugNames) &&
(Unit.getUnitDie().getTag() != dwarf::DW_TAG_type_unit)) &&
"Kind is CU but TU is being processed.");
assert((&Current == &AccelDebugNames) ||
((&Current == &AccelTypeUnitsDebugNames) &&
(Unit.getUnitDie().getTag() == dwarf::DW_TAG_type_unit)) &&
"Kind is TU but CU is being processed.");
// The type unit can be discarded, so we need to add references to the
// final acceleration table only once we know the unit is complete and
// will be emitted.
Current.addName(Ref, Die, Unit.getUniqueID());
16 changes: 6 additions & 10 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5553,7 +5553,6 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
} else {
Type *I8PtrTy =
Builder.getPtrTy(Addr->getType()->getPointerAddressSpace());
Type *I8Ty = Builder.getInt8Ty();

// Start with the base register. Do this first so that subsequent address
// matching finds it last, which will prevent it from trying to match it
@@ -5597,8 +5596,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// SDAG consecutive load/store merging.
if (ResultPtr->getType() != I8PtrTy)
ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex,
"sunkaddr", AddrMode.InBounds);
ResultPtr = Builder.CreatePtrAdd(ResultPtr, ResultIndex, "sunkaddr",
AddrMode.InBounds);
}

ResultIndex = V;
@@ -5609,8 +5608,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
} else {
if (ResultPtr->getType() != I8PtrTy)
ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr",
AddrMode.InBounds);
SunkAddr = Builder.CreatePtrAdd(ResultPtr, ResultIndex, "sunkaddr",
AddrMode.InBounds);
}

if (SunkAddr->getType() != Addr->getType()) {
@@ -6169,7 +6168,6 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
Type *PtrIdxTy = DL->getIndexType(GEP->getType());
Type *I8PtrTy =
PointerType::get(Ctx, GEP->getType()->getPointerAddressSpace());
Type *I8Ty = Type::getInt8Ty(Ctx);

BasicBlock::iterator NewBaseInsertPt;
BasicBlock *NewBaseInsertBB;
@@ -6198,7 +6196,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
if (NewBaseGEP->getType() != I8PtrTy)
NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
NewBaseGEP =
NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
NewBaseBuilder.CreatePtrAdd(NewBaseGEP, BaseIndex, "splitgep");
NewGEPBases.insert(NewBaseGEP);
return;
};
@@ -6235,9 +6233,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
}

// Generate a new GEP to replace the current one.
LLVMContext &Ctx = GEP->getContext();
Type *PtrIdxTy = DL->getIndexType(GEP->getType());
Type *I8Ty = Type::getInt8Ty(Ctx);

if (!NewBaseGEP) {
// Create a new base if we don't have one yet. Find the insertion
@@ -6250,7 +6246,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
if (Offset != BaseOffset) {
// Calculate the new offset for the new GEP.
Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);
NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
NewGEP = Builder.CreatePtrAdd(NewBaseGEP, Index);
}
replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
LargeOffsetGEPID.erase(GEP);
5 changes: 2 additions & 3 deletions llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -72,7 +72,6 @@ static bool lowerLoadRelative(Function &F) {

bool Changed = false;
Type *Int32Ty = Type::getInt32Ty(F.getContext());
Type *Int8Ty = Type::getInt8Ty(F.getContext());

for (Use &U : llvm::make_early_inc_range(F.uses())) {
auto CI = dyn_cast<CallInst>(U.getUser());
@@ -81,10 +80,10 @@

IRBuilder<> B(CI);
Value *OffsetPtr =
B.CreateGEP(Int8Ty, CI->getArgOperand(0), CI->getArgOperand(1));
B.CreatePtrAdd(CI->getArgOperand(0), CI->getArgOperand(1));
Value *OffsetI32 = B.CreateAlignedLoad(Int32Ty, OffsetPtr, Align(4));

Value *ResultPtr = B.CreateGEP(Int8Ty, CI->getArgOperand(0), OffsetI32);
Value *ResultPtr = B.CreatePtrAdd(CI->getArgOperand(0), OffsetI32);

CI->replaceAllUsesWith(ResultPtr);
CI->eraseFromParent();
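Note: llvm.load.relative(%p, %off) returns %p + *(i32 *)(%p + %off); the loaded 32-bit delta is applied to the base pointer, not to the loaded slot. The same computation as plain C++, an editor's sketch rather than the pass itself:

#include <cstdint>

static inline void *loadRelative(void *base, int64_t off) {
  int32_t delta =
      *reinterpret_cast<int32_t *>(static_cast<char *>(base) + off);
  return static_cast<char *>(base) + delta; // delta is relative to base
}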
22 changes: 10 additions & 12 deletions llvm/lib/CodeGen/SafeStack.cpp
@@ -119,7 +119,6 @@ class SafeStack {
Type *StackPtrTy;
Type *IntPtrTy;
Type *Int32Ty;
Type *Int8Ty;

Value *UnsafeStackPtr = nullptr;

@@ -195,8 +194,7 @@ class SafeStack {
: F(F), TL(TL), DL(DL), DTU(DTU), SE(SE),
StackPtrTy(PointerType::getUnqual(F.getContext())),
IntPtrTy(DL.getIntPtrType(F.getContext())),
Int32Ty(Type::getInt32Ty(F.getContext())),
Int8Ty(Type::getInt8Ty(F.getContext())) {}
Int32Ty(Type::getInt32Ty(F.getContext())) {}

// Run the transformation on the associated function.
// Returns whether the function was changed.
@@ -562,8 +560,8 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(

if (StackGuardSlot) {
unsigned Offset = SSL.getObjectOffset(StackGuardSlot);
Value *Off = IRB.CreateGEP(Int8Ty, BasePointer, // BasePointer is i8*
ConstantInt::get(Int32Ty, -Offset));
Value *Off =
IRB.CreatePtrAdd(BasePointer, ConstantInt::get(Int32Ty, -Offset));
Value *NewAI =
IRB.CreateBitCast(Off, StackGuardSlot->getType(), "StackGuardSlot");

@@ -581,10 +579,10 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.

Value *Off = IRB.CreateGEP(Int8Ty, BasePointer, // BasePointer is i8*
ConstantInt::get(Int32Ty, -Offset));
Value *Off =
IRB.CreatePtrAdd(BasePointer, ConstantInt::get(Int32Ty, -Offset));
Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(),
Arg->getName() + ".unsafe-byval");
Arg->getName() + ".unsafe-byval");

// Replace alloc with the new location.
replaceDbgDeclare(Arg, BasePointer, DIB, DIExpression::ApplyOffset,
@@ -616,8 +614,8 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
InsertBefore = User;

IRBuilder<> IRBUser(InsertBefore);
Value *Off = IRBUser.CreateGEP(Int8Ty, BasePointer, // BasePointer is i8*
ConstantInt::get(Int32Ty, -Offset));
Value *Off =
IRBUser.CreatePtrAdd(BasePointer, ConstantInt::get(Int32Ty, -Offset));
Value *Replacement = IRBUser.CreateBitCast(Off, AI->getType(), Name);

if (auto *PHI = dyn_cast<PHINode>(User))
@@ -647,8 +645,8 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
IRB.SetInsertPoint(BasePointer->getNextNode());

Value *StaticTop =
IRB.CreateGEP(Int8Ty, BasePointer, ConstantInt::get(Int32Ty, -FrameSize),
"unsafe_stack_static_top");
IRB.CreatePtrAdd(BasePointer, ConstantInt::get(Int32Ty, -FrameSize),
"unsafe_stack_static_top");
IRB.CreateStore(StaticTop, UnsafeStackPtr);
return StaticTop;
}
2 changes: 1 addition & 1 deletion llvm/lib/DWARFLinker/CMakeLists.txt
@@ -1,4 +1,4 @@
add_llvm_component_library(LLVMDWARFLinkerBase
add_llvm_component_library(LLVMDWARFLinker
Utils.cpp

ADDITIONAL_HEADER_DIRS
4 changes: 2 additions & 2 deletions llvm/lib/DWARFLinker/Classic/CMakeLists.txt
@@ -1,4 +1,4 @@
add_llvm_component_library(LLVMDWARFLinker
add_llvm_component_library(LLVMDWARFLinkerClassic
DWARFLinkerCompileUnit.cpp
DWARFLinkerDeclContext.cpp
DWARFLinker.cpp
@@ -16,7 +16,7 @@ add_llvm_component_library(LLVMDWARFLinker
CodeGen
CodeGenTypes
DebugInfoDWARF
DWARFLinkerBase
DWARFLinker
MC
Object
Support
2 changes: 1 addition & 1 deletion llvm/lib/DWARFLinker/Parallel/CMakeLists.txt
@@ -22,7 +22,7 @@ add_llvm_component_library(LLVMDWARFLinkerParallel
BinaryFormat
CodeGen
DebugInfoDWARF
DWARFLinkerBase
DWARFLinker
MC
Object
Support
42 changes: 41 additions & 1 deletion llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2925,37 +2925,45 @@ def UDF : UDFType<0, "udf">;
// Pair (indexed, offset)
defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;
defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">;
let Predicates = [HasFPARMv8] in {
defm LDPS : LoadPairOffset<0b00, 1, FPR32Op, simm7s4, "ldp">;
defm LDPD : LoadPairOffset<0b01, 1, FPR64Op, simm7s8, "ldp">;
defm LDPQ : LoadPairOffset<0b10, 1, FPR128Op, simm7s16, "ldp">;
}

defm LDPSW : LoadPairOffset<0b01, 0, GPR64z, simm7s4, "ldpsw">;

// Pair (pre-indexed)
def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
let Predicates = [HasFPARMv8] in {
def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
}

def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;

// Pair (post-indexed)
def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
let Predicates = [HasFPARMv8] in {
def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
}

def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;


// Pair (no allocate)
defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32z, simm7s4, "ldnp">;
defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64z, simm7s8, "ldnp">;
let Predicates = [HasFPARMv8] in {
defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
}

def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(LDPXi GPR64sp:$Rn, simm7s8:$offset)>;
@@ -2973,11 +2981,13 @@ defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;

// Floating-point
let Predicates = [HasFPARMv8] in {
defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", i8, load>;
defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128Op, "ldr", f128, load>;
}

// Load sign-extended half-word
defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
@@ -3147,6 +3157,7 @@ defm LDRX : LoadUI<0b11, 0, 0b01, GPR64z, uimm12s8, "ldr",
defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
[(set GPR32z:$Rt,
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
[(set FPR8Op:$Rt,
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
Expand All @@ -3162,6 +3173,7 @@ defm LDRD : LoadUI<0b11, 1, 0b01, FPR64Op, uimm12s8, "ldr",
defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
[(set (f128 FPR128Op:$Rt),
(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
}

// bf16 load pattern
def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
@@ -3339,12 +3351,14 @@ def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr",
[(set GPR32z:$Rt, (load (AArch64adr alignedglobal:$label)))]>;
def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr",
[(set GPR64z:$Rt, (load (AArch64adr alignedglobal:$label)))]>;
let Predicates = [HasFPARMv8] in {
def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr",
[(set (f32 FPR32Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr",
[(set (f64 FPR64Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr",
[(set (f128 FPR128Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
}

// load sign-extended word
def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw",
@@ -3367,6 +3381,7 @@ defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64z, "ldur",
defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
[(set GPR32z:$Rt,
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
[(set FPR8Op:$Rt,
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
Expand All @@ -3382,6 +3397,7 @@ defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64Op, "ldur",
defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128Op, "ldur",
[(set (f128 FPR128Op:$Rt),
(load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
}

defm LDURHH
: LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
@@ -3641,11 +3657,13 @@ defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
// (immediate pre-indexed)
def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32z, "ldr">;
def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64z, "ldr">;
let Predicates = [HasFPARMv8] in {
def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
}

// load sign-extended half-word
def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
@@ -3666,11 +3684,13 @@ def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
// (immediate post-indexed)
def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32z, "ldr">;
def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64z, "ldr">;
let Predicates = [HasFPARMv8] in {
def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
}

// load sign-extended half-word
def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
@@ -3695,30 +3715,38 @@ def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
// FIXME: Use dedicated range-checked addressing mode operand here.
defm STPW : StorePairOffset<0b00, 0, GPR32z, simm7s4, "stp">;
defm STPX : StorePairOffset<0b10, 0, GPR64z, simm7s8, "stp">;
let Predicates = [HasFPARMv8] in {
defm STPS : StorePairOffset<0b00, 1, FPR32Op, simm7s4, "stp">;
defm STPD : StorePairOffset<0b01, 1, FPR64Op, simm7s8, "stp">;
defm STPQ : StorePairOffset<0b10, 1, FPR128Op, simm7s16, "stp">;
}

// Pair (pre-indexed)
def STPWpre : StorePairPreIdx<0b00, 0, GPR32z, simm7s4, "stp">;
def STPXpre : StorePairPreIdx<0b10, 0, GPR64z, simm7s8, "stp">;
let Predicates = [HasFPARMv8] in {
def STPSpre : StorePairPreIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
}

// Pair (post-indexed)
def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
let Predicates = [HasFPARMv8] in {
def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
def STPDpost : StorePairPostIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
def STPQpost : StorePairPostIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
}

// Pair (no allocate)
defm STNPW : StorePairNoAlloc<0b00, 0, GPR32z, simm7s4, "stnp">;
defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;
let Predicates = [HasFPARMv8] in {
defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
}

def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
@@ -3738,11 +3766,13 @@ defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;


// Floating-point
let Predicates = [HasFPARMv8] in {
defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", i8, store>;
defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str">;
}

let Predicates = [UseSTRQro], AddedComplexity = 10 in {
def : Pat<(store (f128 FPR128:$Rt),
@@ -3851,6 +3881,7 @@ defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str",
defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
[(store GPR32z:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
[(store FPR8Op:$Rt,
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
Expand All @@ -3864,6 +3895,7 @@ defm STRD : StoreUI<0b11, 1, 0b00, FPR64Op, uimm12s8, "str",
[(store (f64 FPR64Op:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
defm STRQ : StoreUI<0b00, 1, 0b10, FPR128Op, uimm12s16, "str", []>;
}

defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh",
[(truncstorei16 GPR32z:$Rt,
@@ -3985,6 +4017,7 @@ defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64z, "stur",
defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
[(store GPR32z:$Rt,
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
[(store FPR8Op:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
Expand All @@ -4000,6 +4033,7 @@ defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64Op, "stur",
defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128Op, "stur",
[(store (f128 FPR128Op:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
}
defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32z, "sturh",
[(truncstorei16 GPR32z:$Rt,
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
@@ -4156,11 +4190,13 @@ defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
// (immediate pre-indexed)
def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
let Predicates = [HasFPARMv8] in {
def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, i8>;
def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>;
}

def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>;
def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>;
@@ -4210,11 +4246,13 @@ def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
// (immediate post-indexed)
def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
let Predicates = [HasFPARMv8] in {
def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, i8>;
def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128Op, "str", post_store, f128>;
}

def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32z, "strb", post_truncsti8, i32>;
def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32z, "strh", post_truncsti16, i32>;
@@ -4531,7 +4569,8 @@ def : Pat<(f64 (fdiv (f64 (any_uint_to_fp (i32 GPR32:$Rn))), fixedpoint_f64_i32:
defm FMOV : UnscaledConversion<"fmov">;

// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1,
Predicates = [HasFPARMv8] in {
def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>,
Sched<[WriteF]>;
def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
@@ -4758,6 +4797,7 @@ def : Pat<(bf16 (AArch64csel (bf16 FPR16:$Rn), (bf16 FPR16:$Rm), (i32 imm:$cond)
// CSEL instructions providing f128 types need to be handled by a
// pseudo-instruction since the eventual code will need to introduce basic
// blocks and control flow.
let Predicates = [HasFPARMv8] in
def F128CSEL : Pseudo<(outs FPR128:$Rd),
(ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
[(set (f128 FPR128:$Rd),
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -986,8 +986,10 @@ def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>;
def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>;
def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>;
def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>;
let Requires = [{ {AArch64::FeatureFPARMv8} }] in {
def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>;
def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>;
}
def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>;
def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>;
def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>;
10 changes: 5 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,9 +51,9 @@ def gi_vop3pmodsdot :
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
GIComplexPatternEquiv<VOP3PModsDOT>;

def gi_dotiuvop3pmods :
GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
GIComplexPatternEquiv<DotIUVOP3PMods>;
def gi_vop3pmodsneg :
GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">,
GIComplexPatternEquiv<VOP3PModsNeg>;

def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
@@ -379,8 +379,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
GISDNodeXFormEquiv<extract_swz>;

def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
GISDNodeXFormEquiv<set_glc>;
def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">,
GISDNodeXFormEquiv<extract_cpol_set_glc>;

def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3009,7 +3009,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}

bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
// 1 promotes packed values to signed, 0 treats them as unsigned.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -237,7 +237,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;

bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;

bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
15 changes: 9 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return false;

int NumVAddrRegs = 0;
@@ -3927,7 +3928,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
// Value is in Imm operand as i1 sign extended to int64_t.
// 1(-1) promotes packed values to signed, 0 treats them as unsigned.
@@ -5496,11 +5497,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
MIB.addImm(Swizzle);
}

void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
(AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12);
MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -194,7 +194,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
selectVOP3PModsDOT(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns
selectDotIUVOP3PMods(MachineOperand &Root) const;
selectVOP3PModsNeg(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
@@ -331,8 +331,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
int OpIdx) const;
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;

void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
39 changes: 20 additions & 19 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1628,36 +1628,36 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,

defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
(set_glc $cachepolicy), (timm:$cachepolicy));
(extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary));

let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, 0)),
timm:$offset, timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, CachePolicy)
>;

def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, timm)),
timm:$offset, timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;

def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
(BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)),
(BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;

def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
(BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)),
(BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1726,35 +1726,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0),
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
timm:$offset, (extract_cpol $auxiliary))
>;

def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm),
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
timm:$offset, (extract_cpol $auxiliary))
>;

def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0),
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
timm:$offset, (extract_cpol $auxiliary))
>;

def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm),
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy)
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary))
>;
}

@@ -1791,8 +1791,9 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
# !if(!eq(RtnMode, "ret"), "", "_noret"));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
(timm:$cachepolicy));
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
(extract_cpol_set_glc $auxiliary),
(extract_cpol $auxiliary));
defvar SrcRC = getVregSrcForVT<vt>.ret;
defvar DataRC = getVregSrcForVT<data_vt>.ret;
defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
@@ -1804,7 +1805,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
def : GCNPat<
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, 0)),
timm:$offset, timm:$auxiliary, 0)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG OffsetResDag, SubLo),
OffsetResDag)
@@ -1818,7 +1819,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm)),
timm:$auxiliary, timm)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG IdxenResDag, SubLo),
IdxenResDag)
@@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, 0,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0)),
timm:$auxiliary, 0)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG OffenResDag, SubLo),
OffenResDag)
@@ -1846,7 +1847,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm)),
timm:$auxiliary, timm)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG BothenResDag, SubLo),
BothenResDag)
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -400,6 +400,10 @@ enum CPol {
TH_TYPE_STORE = 1 << 8, // TH_STORE policy
TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy
TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not

// Volatile (used to preserve/signal operation volatility for buffer
// operations; not a real instruction bit).
VOLATILE = 1 << 31,
};

} // namespace CPol
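Note: bit 31 sits above every hardware cache-policy bit, so the flag can ride along on the intrinsic's aux operand but must be masked off before the cpol operand is encoded, which is what the extract_cpol-based patterns elsewhere in this patch do. A sketch of the intended split (names illustrative; SIDefines.h is a backend-internal header):

#include "SIDefines.h"
#include <cstdint>

static bool splitAux(uint32_t Aux, uint32_t &HwCPol) {
  HwCPol = Aux & llvm::AMDGPU::CPol::ALL_pregfx12; // bits the instruction encodes
  return (Aux & llvm::AMDGPU::CPol::VOLATILE) != 0; // becomes MOVolatile
}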
8 changes: 7 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1183,6 +1183,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = RsrcArg;
}

auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
Info.flags |= MachineMemOperand::MOVolatile;
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
unsigned MaxNumLanes = 4;
@@ -7639,7 +7642,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
AMDGPU::CPol::VOLATILE))
return Op;

SmallVector<SDValue, 26> Ops;
@@ -8005,6 +8009,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
unsigned CPol = Op.getConstantOperandVal(3);
// s_buffer_load, because of how it's optimized, can't be volatile,
// so reject ones with the volatile bit set.
if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12))
22 changes: 11 additions & 11 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -293,7 +293,7 @@ class WaitcntBrackets {
}

void setStateOnFunctionEntryOrReturn() {
setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
setScoreUB(VS_CNT, getScoreLB(VS_CNT) + getWaitCountMax(VS_CNT));
PendingEvents |= WaitEventMaskForInst[VS_CNT];
}

@@ -874,11 +874,11 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
}

bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
unsigned Opcode = Waitcnt->getOpcode();
if (!SIInstrInfo::isSoftWaitcnt(Opcode))
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
if (Opcode == Waitcnt->getOpcode())
return false;

Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
Waitcnt->setDesc(TII->get(Opcode));
return true;
}

@@ -898,10 +898,10 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (II.isMetaInstruction())
continue;

unsigned Opcode = II.getOpcode();
bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode);
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
bool IsSoft = Opcode != II.getOpcode();

if (SIInstrInfo::isWaitcnt(Opcode)) {
if (Opcode == AMDGPU::S_WAITCNT) {
// Update required wait count. If this is a soft waitcnt (= it was added
// by an earlier pass), it may be entirely removed.
unsigned IEnc = II.getOperand(0).getImm();
@@ -918,7 +918,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
WaitcntInstr = &II;

} else {
assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

unsigned OldVSCnt =
@@ -1590,9 +1590,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}

static bool isWaitInstr(MachineInstr &Inst) {
auto Opcode = Inst.getOpcode();
return SIInstrInfo::isWaitcnt(Opcode) ||
(SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
return Opcode == AMDGPU::S_WAITCNT ||
(Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
}

3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9076,8 +9076,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}

int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
if (SIInstrInfo::isSoftWaitcnt(Opcode))
Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);

unsigned Gen = subtargetEncodingFamily(ST);

27 changes: 6 additions & 21 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -905,29 +905,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
}

static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
if (isWaitcnt(Opcode))
switch (Opcode) {
case AMDGPU::S_WAITCNT_soft:
return AMDGPU::S_WAITCNT;

if (isWaitcntVsCnt(Opcode))
case AMDGPU::S_WAITCNT_VSCNT_soft:
return AMDGPU::S_WAITCNT_VSCNT;

llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
}

static bool isWaitcnt(unsigned Opcode) {
return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
}

static bool isWaitcntVsCnt(unsigned Opcode) {
return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
}

// "Soft" waitcnt instructions can be relaxed/optimized out by
// SIInsertWaitcnts.
static bool isSoftWaitcnt(unsigned Opcode) {
return Opcode == AMDGPU::S_WAITCNT_soft ||
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
default:
return Opcode;
}
}

bool isVGPRCopy(const MachineInstr &MI) const {
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -892,8 +892,11 @@ def extract_swz : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8);
}]>;

def set_glc : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
def extract_cpol_set_glc : SDNodeXForm<timm, [{
const uint32_t cpol = N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12);
return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
}]>;

//===----------------------------------------------------------------------===//
@@ -1360,7 +1363,7 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;

def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;

def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -415,8 +415,8 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
null_frag, 1>;
// Dot-iu instructions treat their inputs as signed if the imod neg bits are
// set. Thus dot-iu intrinsics have extra operands and require a separate
// codegen pattern.
def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0,
(DotIUVOP3PMods i32:$src1_mods), i32:$src1,
def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0,
(VOP3PModsNeg i32:$src1_mods), i32:$src1,
i32:$src2, (i1 timm:$clamp)),
(!cast<Instruction>(NAME) $src0_mods, i32:$src0,
$src1_mods, i32:$src1,
@@ -828,8 +828,8 @@ class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :

class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
GCNPat < (P.DstVT (node
(DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
(DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
(VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
(VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
(P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp)
)),
(P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
3 changes: 1 addition & 2 deletions llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -687,8 +687,7 @@ auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
if (auto *I = dyn_cast<Instruction>(Ptr))
if (Instruction *New = CloneMap.lookup(I))
Ptr = New;
return Builder.CreateGEP(Type::getInt8Ty(HVC.F.getContext()), Ptr,
HVC.getConstInt(Adjust), "gep");
return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");
}

auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
11 changes: 4 additions & 7 deletions llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -711,16 +711,13 @@ def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">;

def FeatureStdExtSmaia
: SubtargetFeature<"smaia", "HasStdExtSmaia", "true",
"'Smaia' (Smaia encompasses all added CSRs and all "
"modifications to interrupt response behavior that the "
"AIA specifies for a hart, over all privilege levels.)",
[]>;
"'Smaia' (Advanced Interrupt Architecture Machine "
"Level)", []>;

def FeatureStdExtSsaia
: SubtargetFeature<"ssaia", "HasStdExtSsaia", "true",
"'Ssaia' (Ssaia is essentially the same as Smaia except "
"excluding the machine-level CSRs and behavior not "
"directly visible to supervisor level.)", []>;
"'Ssaia' (Advanced Interrupt Architecture Supervisor "
"Level)", []>;

def HasHalfFPLoadStoreMove
: Predicate<"Subtarget->hasHalfFPLoadStoreMove()">,
10 changes: 7 additions & 3 deletions llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -597,8 +597,9 @@ void RequirementHandler::initAvailableCapabilitiesForVulkan(
const SPIRVSubtarget &ST) {
addAvailableCaps({Capability::Shader, Capability::Linkage});

// Provided by Vulkan version 1.0.
addAvailableCaps({Capability::Int16, Capability::Int64, Capability::Float64});
// Provided by all supported Vulkan versions.
addAvailableCaps({Capability::Int16, Capability::Int64, Capability::Float16,
Capability::Float64});
}

} // namespace SPIRV
@@ -733,7 +734,10 @@ void addInstrRequirements(const MachineInstr &MI,
auto SC = MI.getOperand(1).getImm();
Reqs.getAndAddRequirements(SPIRV::OperandCategory::StorageClassOperand, SC,
ST);
// If it's a type of pointer to float16, add Float16Buffer capability.
// If it's a type of pointer to float16 targeting OpenCL, add Float16Buffer
// capability.
if (!ST.isOpenCLEnv())
break;
assert(MI.getOperand(2).isReg());
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(2).getReg());
5 changes: 2 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56967,13 +56967,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
case 'W': {
assert(Constraint[1] == 's');
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
GA->getValueType(0)));
} else if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
else if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op))
Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
BA->getValueType(0)));
}
return;
}
case 'Z': {
Expand Down
@@ -808,8 +808,8 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
DL, Offset1, /* AllowNonInbounds */ true);
Load1Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Load1Ptr,
Builder.getInt32(Offset1.getZExtValue()));
Load1Ptr = Builder.CreatePtrAdd(Load1Ptr,
Builder.getInt32(Offset1.getZExtValue()));
}
// Generate wider load.
NewLoad = Builder.CreateAlignedLoad(WiderType, Load1Ptr, LI1->getAlign(),
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -2022,8 +2022,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
auto *FramePtr = GetFramePointer(Alloca);
auto &Value = *Alias.second;
auto ITy = IntegerType::get(C, Value.getBitWidth());
auto *AliasPtr = Builder.CreateGEP(Type::getInt8Ty(C), FramePtr,
ConstantInt::get(ITy, Value));
auto *AliasPtr =
Builder.CreatePtrAdd(FramePtr, ConstantInt::get(ITy, Value));
Alias.first->replaceUsesWithIf(
AliasPtr, [&](Use &U) { return DT.dominates(CB, U); });
}
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -100,7 +100,7 @@ static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL,
Value *Ptr, Type *ResElemTy, int64_t Offset) {
if (Offset != 0) {
APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset);
Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(APOffset));
Ptr = IRB.CreatePtrAdd(Ptr, IRB.getInt(APOffset));
}
return Ptr;
}
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -298,8 +298,8 @@ static Value *constructPointer(Value *Ptr, int64_t Offset,
<< "-bytes\n");

if (Offset)
Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt64(Offset),
Ptr->getName() + ".b" + Twine(Offset));
Ptr = IRB.CreatePtrAdd(Ptr, IRB.getInt64(Offset),
Ptr->getName() + ".b" + Twine(Offset));
return Ptr;
}

6 changes: 3 additions & 3 deletions llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1769,7 +1769,7 @@ void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
continue;
auto *RetType = cast<IntegerType>(Call.CB.getType());
IRBuilder<> B(&Call.CB);
Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte);
Value *Addr = B.CreatePtrAdd(Call.VTable, Byte);
if (RetType->getBitWidth() == 1) {
Value *Bits = B.CreateLoad(Int8Ty, Addr);
Value *BitsAndBit = B.CreateAnd(Bits, Bit);
@@ -2066,14 +2066,14 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
Value *LoadedValue = nullptr;
if (TypeCheckedLoadFunc->getIntrinsicID() ==
Intrinsic::type_checked_load_relative) {
Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
Value *GEP = LoadB.CreatePtrAdd(Ptr, Offset);
LoadedValue = LoadB.CreateLoad(Int32Ty, GEP);
LoadedValue = LoadB.CreateSExt(LoadedValue, IntPtrTy);
GEP = LoadB.CreatePtrToInt(GEP, IntPtrTy);
LoadedValue = LoadB.CreateAdd(GEP, LoadedValue);
LoadedValue = LoadB.CreateIntToPtr(LoadedValue, Int8PtrTy);
} else {
Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
Value *GEP = LoadB.CreatePtrAdd(Ptr, Offset);
LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEP);
}

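Note: in the type_checked_load_relative sequence above, the 32-bit vtable entry is relative to the entry's own address, unlike llvm.load.relative, where the delta is relative to the base pointer. What the emitted IR computes, as an editor's C++ sketch:

#include <cstdint>

static inline void *loadRelativeVTableEntry(void *vtable, int64_t offset) {
  char *slot = static_cast<char *>(vtable) + offset; // CreatePtrAdd
  int32_t rel = *reinterpret_cast<int32_t *>(slot);  // load i32, then sext
  return slot + rel;                                 // add back to the slot address
}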
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5703,6 +5703,15 @@ Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) {
return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1);
}

// Turn icmp pred (inttoptr x), (inttoptr y) into icmp pred x, y
if (CastOp0->getOpcode() == Instruction::IntToPtr &&
CompatibleSizes(DestTy, SrcTy)) {
Value *Op1Src;
if (match(ICmp.getOperand(1), m_IntToPtr(m_Value(Op1Src))) &&
Op1Src->getType() == SrcTy)
return new ICmpInst(ICmp.getPredicate(), Op0Src, Op1Src);
}

if (Instruction *R = foldICmpWithTrunc(ICmp))
return R;

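Note: the new fold mirrors the ptrtoint case directly above it: when both icmp operands are inttoptr casts from the same integer type, the comparison can be done on the integers. C++-level shape of code that benefits, as a sketch (the fold fires only when pointer and integer sizes are compatible):

#include <cstdint>

bool sameAddress(uintptr_t a, uintptr_t b) {
  // This compiles to icmp eq (inttoptr a), (inttoptr b); with the fold,
  // InstCombine compares a and b directly.
  return reinterpret_cast<void *>(a) == reinterpret_cast<void *>(b);
}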
3 changes: 1 addition & 2 deletions llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1590,8 +1590,7 @@ void AddressSanitizer::instrumentMaskedLoadOrStore(
InstrumentedAddress = IRB.CreateExtractElement(Addr, Index);
} else if (Stride) {
Index = IRB.CreateMul(Index, Stride);
Addr = IRB.CreateBitCast(Addr, PointerType::getUnqual(*C));
InstrumentedAddress = IRB.CreateGEP(Type::getInt8Ty(*C), Addr, {Index});
InstrumentedAddress = IRB.CreatePtrAdd(Addr, Index);
} else {
InstrumentedAddress = IRB.CreateGEP(VTy, Addr, {Zero, Index});
}
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -862,7 +862,7 @@ Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
if (Mapping.Offset == 0)
return IRB.CreateIntToPtr(Shadow, PtrTy);
// (Mem >> Scale) + Offset
return IRB.CreateGEP(Int8Ty, ShadowBase, Shadow);
return IRB.CreatePtrAdd(ShadowBase, Shadow);
}

int64_t HWAddressSanitizer::getAccessInfo(bool IsWrite,
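Note: with a non-zero offset the shadow address is now formed as a byte GEP off the shadow base rather than an integer add followed by inttoptr. The mapping in scalar form, an editor's sketch with assumed parameter names:

#include <cstdint>

static inline uint8_t *memToShadow(uintptr_t Mem, unsigned Scale,
                                   uint8_t *ShadowBase) {
  return ShadowBase + (Mem >> Scale); // (Mem >> Scale) + Offset
}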
15 changes: 7 additions & 8 deletions llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5253,8 +5253,8 @@ struct VarArgAArch64Helper : public VarArgHelperBase {
Align(8), /*isStore*/ true)
.first;

Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
GrRegSaveAreaShadowPtrOff);
Value *GrSrcPtr =
IRB.CreateInBoundsPtrAdd(VAArgTLSCopy, GrRegSaveAreaShadowPtrOff);
Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);

IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8),
@@ -5269,10 +5269,9 @@ struct VarArgAArch64Helper : public VarArgHelperBase {
Align(8), /*isStore*/ true)
.first;

Value *VrSrcPtr = IRB.CreateInBoundsGEP(
IRB.getInt8Ty(),
IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
IRB.getInt32(AArch64VrBegOffset)),
Value *VrSrcPtr = IRB.CreateInBoundsPtrAdd(
IRB.CreateInBoundsPtrAdd(VAArgTLSCopy,
IRB.getInt32(AArch64VrBegOffset)),
VrRegSaveAreaShadowPtrOff);
Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);

@@ -5285,8 +5284,8 @@ struct VarArgAArch64Helper : public VarArgHelperBase {
Align(16), /*isStore*/ true)
.first;

Value *StackSrcPtr = IRB.CreateInBoundsGEP(
IRB.getInt8Ty(), VAArgTLSCopy, IRB.getInt32(AArch64VAEndOffset));
Value *StackSrcPtr = IRB.CreateInBoundsPtrAdd(
VAArgTLSCopy, IRB.getInt32(AArch64VAEndOffset));

IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr,
Align(16), VAArgOverflowSize);
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -329,8 +329,8 @@ ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,

// Account for the fact that on windows-msvc __start_* symbols actually
// point to a uint64_t before the start of the array.
auto GEP = IRB.CreateGEP(Int8Ty, SecStart,
ConstantInt::get(IntptrTy, sizeof(uint64_t)));
auto GEP =
IRB.CreatePtrAdd(SecStart, ConstantInt::get(IntptrTy, sizeof(uint64_t)));
return std::make_pair(GEP, SecEnd);
}
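
Note, for readers unfamiliar with the quirk the comment describes: on windows-msvc the __start_<section> symbol points at a uint64_t header, so the usable array begins sizeof(uint64_t) bytes later. Sketch (illustrative):

#include <cstdint>

static const uint8_t *sectionArrayBegin(const uint8_t *SecStart) {
  return SecStart + sizeof(uint64_t); // CreatePtrAdd(SecStart, sizeof(uint64_t))
}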

6 changes: 3 additions & 3 deletions llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1297,9 +1297,9 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
Value *MemsetLen = Builder.CreateSelect(
Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
Instruction *NewMemSet = Builder.CreateMemSet(
Builder.CreateGEP(Builder.getInt8Ty(), Dest, SrcSize),
MemSet->getOperand(1), MemsetLen, Alignment);
Instruction *NewMemSet =
Builder.CreateMemSet(Builder.CreatePtrAdd(Dest, SrcSize),
MemSet->getOperand(1), MemsetLen, Alignment);
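
Note on the arithmetic here: when a memcpy of SrcSize bytes lands on a destination previously memset over DestSize bytes, only the tail past the copy still needs the memset, and the length clamps to zero once the copy covers everything. Plain-integer sketch (names illustrative):

#include <algorithm>
#include <cstdint>

static void tailMemsetSketch(unsigned char *Dest, unsigned char Val,
                             uint64_t DestSize, uint64_t SrcSize) {
  uint64_t MemsetLen = DestSize <= SrcSize ? 0 : DestSize - SrcSize;
  if (MemsetLen) // only the bytes the memcpy does not overwrite
    std::fill_n(Dest + SrcSize, MemsetLen, Val); // CreatePtrAdd(Dest, SrcSize)
}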

assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
"MemCpy must be a MemoryDef");
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1903,8 +1903,8 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
APInt Offset, Type *PointerTy,
const Twine &NamePrefix) {
if (Offset != 0)
Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset),
NamePrefix + "sroa_idx");
Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
NamePrefix + "sroa_idx");
return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
NamePrefix + "sroa_cast");
}
11 changes: 4 additions & 7 deletions llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -896,8 +896,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
}
}
// Create an ugly GEP with a single index for each index.
ResultPtr =
Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
ResultPtr = Builder.CreatePtrAdd(ResultPtr, Idx, "uglygep");
if (FirstResult == nullptr)
FirstResult = ResultPtr;
}
@@ -906,8 +905,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
// Create a GEP with the constant offset index.
if (AccumulativeByteOffset != 0) {
Value *Offset = ConstantInt::get(PtrIndexTy, AccumulativeByteOffset);
ResultPtr =
Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
ResultPtr = Builder.CreatePtrAdd(ResultPtr, Offset, "uglygep");
} else
isSwapCandidate = false;
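
Note: the loop above emits one byte-offset hop per variable index, then a single hop for the accumulated constant. Compressed sketch of that structure, assuming the patched IRBuilder (helper name illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *buildUglyGEPChain(IRBuilder<> &B, Value *Ptr,
                                ArrayRef<Value *> ByteIdxs, int64_t ConstOff) {
  for (Value *Idx : ByteIdxs)
    Ptr = B.CreatePtrAdd(Ptr, Idx, "uglygep"); // one hop per variable index
  if (ConstOff)                                // one hop for the constant tail
    Ptr = B.CreatePtrAdd(Ptr, B.getInt64(ConstOff), "uglygep");
  return Ptr;
}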

@@ -1107,9 +1105,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {

Type *PtrIdxTy = DL->getIndexType(GEP->getType());
IRBuilder<> Builder(GEP);
NewGEP = cast<Instruction>(Builder.CreateGEP(
Builder.getInt8Ty(), NewGEP,
{ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true)},
NewGEP = cast<Instruction>(Builder.CreatePtrAdd(
NewGEP, ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true),
GEP->getName(), GEPWasInBounds));
NewGEP->copyMetadata(*GEP);

3 changes: 1 addition & 2 deletions llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -656,8 +656,7 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
case Candidate::GEP: {
bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
// C = (char *)Basis + Bump
Reduced =
Builder.CreateGEP(Builder.getInt8Ty(), Basis.Ins, Bump, "", InBounds);
Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds);
break;
}
default:
16 changes: 6 additions & 10 deletions llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -169,12 +169,8 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
// during expansion.
if (Op == Instruction::IntToPtr) {
auto *PtrTy = cast<PointerType>(Ty);
if (DL.isNonIntegralPointerType(PtrTy)) {
assert(DL.getTypeAllocSize(Builder.getInt8Ty()) == 1 &&
"alloc size of i8 must by 1 byte for the GEP to be correct");
return Builder.CreateGEP(
Builder.getInt8Ty(), Constant::getNullValue(PtrTy), V, "scevgep");
}
if (DL.isNonIntegralPointerType(PtrTy))
return Builder.CreatePtrAdd(Constant::getNullValue(PtrTy), V, "scevgep");
}
// Short-circuit unnecessary bitcasts.
if (Op == Instruction::BitCast) {
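
Note: non-integral address spaces forbid inttoptr, so this path has always spelled the cast as a byte GEP off the null pointer of that space; CreatePtrAdd merely hides the i8 element type (and with it the alloc-size-of-i8-is-1 assumption the deleted assert used to restate). Sketch (illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emits `getelementptr i8, ptr addrspace(AS) null, <ty> %V`, an
// inttoptr-free way to materialize a non-integral pointer from an integer.
static Value *intToNonIntegralPtr(IRBuilder<> &B, PointerType *PtrTy, Value *V) {
  return B.CreatePtrAdd(Constant::getNullValue(PtrTy), V, "scevgep");
}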
@@ -321,7 +317,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *Offset, Value *V) {
// Fold a GEP with constant operands.
if (Constant *CLHS = dyn_cast<Constant>(V))
if (Constant *CRHS = dyn_cast<Constant>(Idx))
return Builder.CreateGEP(Builder.getInt8Ty(), CLHS, CRHS);
return Builder.CreatePtrAdd(CLHS, CRHS);

// Do a quick scan to see if we have this GEP nearby. If so, reuse it.
unsigned ScanLimit = 6;
@@ -358,7 +354,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *Offset, Value *V) {
}

// Emit a GEP.
return Builder.CreateGEP(Builder.getInt8Ty(), V, Idx, "scevgep");
return Builder.CreatePtrAdd(V, Idx, "scevgep");
}

/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
@@ -2123,9 +2119,9 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
if (isa<PointerType>(ARTy)) {
Value *NegMulV = Builder.CreateNeg(MulV);
if (NeedPosCheck)
Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
Add = Builder.CreatePtrAdd(StartValue, MulV);
if (NeedNegCheck)
Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
Sub = Builder.CreatePtrAdd(StartValue, NegMulV);
} else {
if (NeedPosCheck)
Add = Builder.CreateAdd(StartValue, MulV);
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5623,6 +5623,11 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
// optimization, such as lookup tables.
if (SI->getNumCases() == AllNumCases - 1) {
assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
IntegerType *CondTy = cast<IntegerType>(Cond->getType());
if (CondTy->getIntegerBitWidth() > 64 ||
!DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
return false;

uint64_t MissingCaseVal = 0;
for (const auto &Case : SI->cases())
MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
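
Note on why the XOR trick works: the 2^k possible condition values XOR to zero (for k > 1, hence the assert), so XOR-ing the values that do appear as cases leaves exactly the one that does not. The new bit-width guard above keeps that computation within getLimitedValue's 64 bits and a target-legal integer. Tiny demonstration (illustrative):

#include <cstdint>
#include <initializer_list>

static uint64_t missingCase(std::initializer_list<uint64_t> PresentCases) {
  uint64_t Missing = 0;
  for (uint64_t C : PresentCases)
    Missing ^= C; // x ^ x == 0, so only the absent value survives
  return Missing; // e.g. {0, 1, 3} over a 2-bit condition -> 2
}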
5 changes: 2 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2346,9 +2346,8 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
auto *Offset = CreateMul(Index, Step);
return CreateAdd(StartValue, Offset);
}
case InductionDescriptor::IK_PtrInduction: {
return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step));
}
case InductionDescriptor::IK_PtrInduction:
return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
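
Note: the pointer-induction case computes StartValue plus Index * Step bytes; only the spelling changed. Scalar analogue (illustrative):

#include <cstdint>

static char *transformedPtrIndex(char *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step; // CreatePtrAdd(StartValue, CreateMul(Index, Step))
}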
case InductionDescriptor::IK_FpInduction: {
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for FP inductions yet");
15 changes: 15 additions & 0 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7379,6 +7379,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
continue;
if (Idx >= static_cast<int>(CommonVF))
Idx = E1Mask[Idx - CommonVF] + VF;
else
Idx = E1Mask[Idx];
}
CommonVF = VF;
}
@@ -15214,6 +15216,19 @@ class HorizontalReduction {
assert(IsSupportedHorRdxIdentityOp &&
"The optimization of matched scalar identity horizontal reductions "
"must be supported.");
auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
if (VTy->getElementType() != VL.front()->getType()) {
VectorizedValue = Builder.CreateIntCast(
VectorizedValue,
FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
any_of(VL, [&](Value *R) {
KnownBits Known = computeKnownBits(
R, cast<Instruction>(ReductionOps.front().front())
->getModule()
->getDataLayout());
return !Known.isNonNegative();
}));
}
switch (RdxKind) {
case RecurKind::Add: {
// root = mul prev_root, <1, 1, n, 1>
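
Note on the cast block added above: CreateIntCast's final argument selects sign extension, and the any_of over KnownBits requests sext as soon as any reduced scalar might be negative; zext is only value-preserving when every input is provably non-negative. Scalar analogue (illustrative):

static long long widenLane(int V, bool AnyLaneMaybeNegative) {
  if (AnyLaneMaybeNegative)
    return static_cast<long long>(V); // sext preserves negative values
  return static_cast<long long>(static_cast<unsigned>(V)); // zext
}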
33 changes: 21 additions & 12 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -895,7 +895,10 @@ void VPlanTransforms::truncateToMinimalBitwidths(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
VPWidenSelectRecipe>(&R))
VPWidenSelectRecipe, VPWidenMemoryInstructionRecipe>(&R))
continue;
if (isa<VPWidenMemoryInstructionRecipe>(&R) &&
cast<VPWidenMemoryInstructionRecipe>(&R)->isStore())
continue;

VPValue *ResultVPV = R.getVPSingleValue();
@@ -948,6 +951,23 @@ void VPlanTransforms::truncateToMinimalBitwidths(

auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

// Any wrapping introduced by shrinking this operation shouldn't be
// considered undefined behavior. So, we can't unconditionally copy
// arithmetic wrapping flags to VPW.
if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
VPW->dropPoisonGeneratingFlags();

// Extend result to original width.
auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
Ext->insertAfter(&R);
ResultVPV->replaceAllUsesWith(Ext);
Ext->setOperand(0, ResultVPV);

if (isa<VPWidenMemoryInstructionRecipe>(&R)) {
assert(!cast<VPWidenMemoryInstructionRecipe>(&R)->isStore() &&
       "stores cannot be narrowed");
continue;
}

// Shrink operands by introducing truncates as needed.
unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
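
Note: the reorder above creates the ZExt and drops the wrap flags before the operand-shrinking loop runs, so a narrowed load can take the new continue without touching operands it does not have. The net pattern the transform produces, as a scalar analogue (illustrative):

#include <cstdint>

// Compute in the minimal width, then zero-extend the result. The narrow op
// may wrap where the wide one could not, hence dropPoisonGeneratingFlags().
static uint32_t narrowAddThenExtend(uint8_t A, uint8_t B) {
  uint8_t Narrow = static_cast<uint8_t>(A + B); // no nuw/nsw survives
  return static_cast<uint32_t>(Narrow);         // the VPWidenCastRecipe ZExt
}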
@@ -979,17 +999,6 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
}

// Any wrapping introduced by shrinking this operation shouldn't be
// considered undefined behavior. So, we can't unconditionally copy
// arithmetic wrapping flags to VPW.
if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
VPW->dropPoisonGeneratingFlags();

// Extend result to original width.
auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
Ext->insertAfter(&R);
ResultVPV->replaceAllUsesWith(Ext);
Ext->setOperand(0, ResultVPV);
}
}

253 changes: 127 additions & 126 deletions llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll

Large diffs are not rendered by default.

34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
@@ -4,7 +4,7 @@
target triple = "aarch64-unknown-linux-gnu"

;.
; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxvv_fmod, ptr @_ZGVsMxvv_fmodf], section "llvm.metadata"
; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxvv_fmod, ptr @_ZGVsMxvv_fmodf], section "llvm.metadata"
;.
define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_ceil_vscale_f64(
@@ -78,39 +78,39 @@ define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
ret <vscale x 4 x float> %1
}

define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp2_vscale_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp10_vscale_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
%1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}

define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp2_vscale_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp10_vscale_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
%1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}

define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp10_vscale_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp2_vscale_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
%1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}

define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp10_vscale_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp2_vscale_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
%1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}

34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
@@ -4,7 +4,7 @@
target triple = "aarch64-unknown-linux-gnu"

;.
; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2vv_fmod, ptr @_ZGVnN4vv_fmodf], section "llvm.metadata"
; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2vv_fmod, ptr @_ZGVnN4vv_fmodf], section "llvm.metadata"
;.
define <2 x double> @llvm_ceil_f64(<2 x double> %in) {
; CHECK-LABEL: @llvm_ceil_f64(
@@ -78,39 +78,39 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) {
ret <4 x float> %1
}

define <2 x double> @llvm_exp2_f64(<2 x double> %in) {
; CHECK-LABEL: @llvm_exp2_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp2(<2 x double> [[IN:%.*]])
define <2 x double> @llvm_exp10_f64(<2 x double> %in) {
; CHECK-LABEL: @llvm_exp10_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp10(<2 x double> [[IN:%.*]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.exp2.v2f64(<2 x double> %in)
%1 = call fast <2 x double> @llvm.exp10.v2f64(<2 x double> %in)
ret <2 x double> %1
}

define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
; CHECK-LABEL: @llvm_exp2_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[IN:%.*]])
define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
; CHECK-LABEL: @llvm_exp10_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[IN:%.*]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
%1 = call fast <4 x float> @llvm.exp10.v4f32(<4 x float> %in)
ret <4 x float> %1
}

define <2 x double> @llvm_exp10_f64(<2 x double> %in) {
; CHECK-LABEL: @llvm_exp10_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp10(<2 x double> [[IN:%.*]])
define <2 x double> @llvm_exp2_f64(<2 x double> %in) {
; CHECK-LABEL: @llvm_exp2_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp2(<2 x double> [[IN:%.*]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.exp10.v2f64(<2 x double> %in)
%1 = call fast <2 x double> @llvm.exp2.v2f64(<2 x double> %in)
ret <2 x double> %1
}

define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
; CHECK-LABEL: @llvm_exp10_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[IN:%.*]])
define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
; CHECK-LABEL: @llvm_exp2_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[IN:%.*]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.exp10.v4f32(<4 x float> %in)
%1 = call fast <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
ret <4 x float> %1
}

80 changes: 40 additions & 40 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
@@ -34,118 +34,118 @@ define <vscale x 8 x bfloat> @multi_vector_cvt_x2_bf16(<vscale x 4 x float> %unu
;
; FCVTZS
;
define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_s32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z2.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: fcvtzs { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
}

define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_f32_s32:
define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_s32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_s32_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: fcvtzs { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1, <vscale x 4 x i32>%zn2, <vscale x 4 x i32>%zn3)
ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res
}

;
; FCVTZU
;
define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_f32_u32:
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_u32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_u32_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z2.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: fcvtzu { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
}

define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_f32_u32:
define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_u32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_u32_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: fcvtzu { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1, <vscale x 4 x i32>%zn2, <vscale x 4 x i32>%zn3)
ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res
}

;
; SCVTF
;
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_s32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z2.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: scvtf { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
}

define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_s32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_s32_f32:
define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_f32_s32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: scvtf { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1, <vscale x 4 x float>%zn2, <vscale x 4 x float>%zn3)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res
%res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res
}

;
; UCVTF
;
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_u32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_u32_f32:
define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_cvt_x2_f32_u32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z2.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: ucvtf { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
}

define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_u32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,<vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_u32_f32:
define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,<vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_cvt_x4_f32_u32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: ucvtf { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1, <vscale x 4 x float>%zn2, <vscale x 4 x float>%zn3)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res
%res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res
}

declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>)
declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
@@ -270,6 +270,25 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s
ret float %val
}

define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
ret float %val
}
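
Note: the magic cachepolicy immediates in these new tests are easiest to read in hex. Judging from the operands and the glc/slc/dlc bits in the generated code, bit 31 carries the new volatile flag while the low bits remain the usual cache hints; this decoding is inferred from the tests themselves, so treat it as an assumption:

#include <cstdint>

// Inferred encodings (assumption, reconstructed from the tests):
static constexpr int32_t kVolatile    = static_cast<int32_t>(0x80000000u); // -2147483648
static constexpr int32_t kVolatileGLC = static_cast<int32_t>(0x80000001u); // -2147483647
static constexpr int32_t kVolatileSLC = static_cast<int32_t>(0x80000002u); // -2147483646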

; Natural mapping
define amdgpu_ps <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -327,6 +327,25 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__
ret void
}

define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (volatile dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
ret void
}

define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
; CHECK: bb.1 (%ir-block.0):
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -0,0 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -stop-after=si-insert-waitcnts -verify-machineinstrs < %s | FileCheck %s

declare fastcc void @bar()

define fastcc i32 @foo() {
; CHECK-LABEL: name: foo
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $vgpr40, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAITCNT 0
; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33
; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32
; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17
; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
; CHECK-NEXT: BUNDLE implicit-def $sgpr16_sgpr17, implicit-def $sgpr16, implicit-def $sgpr16_lo16, implicit-def $sgpr16_hi16, implicit-def $sgpr17, implicit-def $sgpr17_lo16, implicit-def $sgpr17_hi16, implicit-def $scc {
; CHECK-NEXT: $sgpr16_sgpr17 = S_GETPC_B64
; CHECK-NEXT: $sgpr16 = S_ADD_U32 internal $sgpr16, target-flags(amdgpu-gotprel32-lo) @bar + 4, implicit-def $scc
; CHECK-NEXT: $sgpr17 = S_ADDC_U32 internal $sgpr17, target-flags(amdgpu-gotprel32-hi) @bar + 12, implicit-def $scc, implicit internal $scc
; CHECK-NEXT: }
; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: BUFFER_GL1_INV implicit $exec
; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
; CHECK-NEXT: S_WAITCNT 49279
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1 (%ir-block.1):
; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; CHECK-NEXT: liveins: $vcc_lo, $vgpr40
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.DummyReturnBlock:
; CHECK-NEXT: liveins: $vgpr40
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1
; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0
; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2
; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5
; CHECK-NEXT: $sgpr32 = frame-destroy S_ADDK_I32 $sgpr32, -512, implicit-def dead $scc
; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4
; CHECK-NEXT: S_WAITCNT 16240
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit undef $vgpr0
fence acquire
call fastcc void @bar()
br label %1

1:
br label %1
}
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
@@ -129,6 +129,26 @@ main_body:
ret float %out
}

;CHECK-LABEL: {{^}}test_volatile:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}}
;CHECK-DAG: s_waitcnt vmcnt(0)
define amdgpu_ps float @test_volatile(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
%t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
%out = bitcast i32 %t1 to float
ret float %out
}

;CHECK-LABEL: {{^}}test_volatile_noret:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen{{$}}
define amdgpu_ps void @test_volatile_noret(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
%t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
ret void
}

declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -76,6 +76,42 @@ main_body:
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_volatile:
; PREGFX10: ; %bb.0: ; %main_body
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
; PREGFX10-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_volatile:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_volatile:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483648)
%data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483647)
%data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483646)
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_immoffs:
; PREGFX10: ; %bb.0: ; %main_body