146 changes: 98 additions & 48 deletions llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -51,6 +51,27 @@ using namespace llvm;
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
: CallLowering(&TLI) {}

static void applyStackPassedSmallTypeDAGHack(EVT OrigVT, MVT &ValVT,
MVT &LocVT) {
// If the original type is i1/i8/i16, force both ValVT and LocVT to
// i8/i8/i16. This is a legacy hack: the DAG calls the assignment function
// with pre-legalized register-typed values rather than the raw type.
//
// This hack is not applied to return values, which are never passed on the
// stack.
if (OrigVT == MVT::i1 || OrigVT == MVT::i8)
ValVT = LocVT = MVT::i8;
else if (OrigVT == MVT::i16)
ValVT = LocVT = MVT::i16;
}
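For illustration (not part of the patch), a minimal sketch of the rewrite, assuming an i1 argument whose pre-legalized register types arrive as i32; the concrete values here are hypothetical:

    // Hypothetical inputs; EVT/MVT come from llvm/CodeGen/ValueTypes.h.
    EVT OrigVT = EVT(MVT::i1);
    MVT ValVT = MVT::i32, LocVT = MVT::i32; // DAG-style pre-legalized types
    applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
    // Now ValVT == MVT::i8 && LocVT == MVT::i8, so the calling-convention
    // code allocates a 1-byte stack slot instead of a 4-byte one.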

// Account for the i1/i8/i16 stack-passed value hack.
static uint64_t getStackValueStoreSizeHack(const CCValAssign &VA) {
const MVT ValVT = VA.getValVT();
return (ValVT == MVT::i8 || ValVT == MVT::i16) ? ValVT.getStoreSize()
: VA.getLocVT().getStoreSize();
}
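Again as a hedged sketch: the CCValAssign below is built with made-up values (an i8 value promoted to an i32 location) rather than taken from a real assignment in this patch, but it shows why the override exists:

    // The generic handler would report LocVT's 4-byte store size; the hack
    // reports ValVT's 1 byte, matching the DAG, which stores only the
    // value-sized portion of the allocated slot.
    CCValAssign VA = CCValAssign::getMem(/*ValNo=*/0, MVT::i8, /*Offset=*/0,
                                         MVT::i32, CCValAssign::AExt);
    uint64_t StoreSize = getStackValueStoreSizeHack(VA); // == 1, not 4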

namespace {
struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
@@ -73,6 +94,10 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
return AddrReg.getReg(0);
}

uint64_t getStackValueStoreSize(const CCValAssign &VA) const override {
return getStackValueStoreSizeHack(VA);
}

void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
@@ -84,40 +109,36 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
MachineFunction &MF = MIRBuilder.getMF();

// The reported memory location may be wider than the value.
const LLT RegTy = MRI.getType(ValVReg);
MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
const LLT RealRegTy = MRI.getType(ValVReg);
LLT ValTy(VA.getValVT());
LLT LocTy(VA.getLocVT());

// Fix up the types for the DAG compatibility hack.
if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16)
std::swap(ValTy, LocTy);

MemSize = LocTy.getSizeInBytes();

auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
MemSize, inferAlignFromPtrInfo(MF, MPO));
const LLT LocVT = LLT{VA.getLocVT()};

if (RegTy.getScalarSizeInBits() < LocVT.getScalarSizeInBits()) {
auto LocInfo = VA.getLocInfo();
if (LocInfo == CCValAssign::LocInfo::ZExt) {
// We know the parameter is zero-extended. Perform a load into LocVT,
// and use G_ASSERT_ZEXT to communicate that this was zero-extended from
// the parameter type. Move down to the parameter type using G_TRUNC.
MIRBuilder.buildTrunc(
ValVReg, MIRBuilder.buildAssertZExt(
LocVT, MIRBuilder.buildLoad(LocVT, Addr, *MMO),
RegTy.getScalarSizeInBits()));
return;
}

if (LocInfo == CCValAssign::LocInfo::SExt) {
// Same as the ZExt case, but use G_ASSERT_SEXT instead.
MIRBuilder.buildTrunc(
ValVReg, MIRBuilder.buildAssertSExt(
LocVT, MIRBuilder.buildLoad(LocVT, Addr, *MMO),
RegTy.getScalarSizeInBits()));
return;
}
if (RealRegTy.getSizeInBits() == ValTy.getSizeInBits()) {
// No extension information, or no extension necessary. Load into the
// incoming parameter type directly.
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
} else {
auto Tmp = MIRBuilder.buildLoad(LocTy, Addr, *MMO);
MIRBuilder.buildTrunc(ValVReg, Tmp);
}
}

// No extension information, or no extension necessary. Load into the
// incoming parameter type directly.
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
}

/// How the physical register gets marked varies between formal
@@ -164,11 +185,11 @@ struct ReturnedArgCallReturnHandler : public CallReturnHandler {
struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder MIB, CCAssignFn *AssignFn,
CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
int FPDiff = 0)
CCAssignFn *AssignFnVarArg, bool IsReturn,
bool IsTailCall = false, int FPDiff = 0)
: OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
StackSize(0), SPReg(0),
AssignFnVarArg(AssignFnVarArg), IsReturn(IsReturn),
IsTailCall(IsTailCall), FPDiff(FPDiff), StackSize(0), SPReg(0),
Subtarget(MIRBuilder.getMF().getSubtarget<AArch64Subtarget>()) {}

Register getStackAddress(uint64_t Size, int64_t Offset,
@@ -199,6 +220,14 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
return AddrReg.getReg(0);
}

/// We need to fix up the reported store size for certain value types because
/// we invert the interpretation of ValVT and LocVT in certain cases. This is
/// for compatibility with the DAG call lowering implementation, which we're
/// currently building on top of.
uint64_t getStackValueStoreSize(const CCValAssign &VA) const override {
return getStackValueStoreSizeHack(VA);
}

void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
@@ -215,34 +244,48 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
}

void assignValueToAddress(const CallLowering::ArgInfo &Arg, unsigned RegIndex,
Register Addr, uint64_t Size,
Register Addr, uint64_t MemSize,
MachinePointerInfo &MPO, CCValAssign &VA) override {
unsigned MaxSize = Size * 8;
unsigned MaxSize = MemSize * 8;
// For varargs, we always want to extend them to 8 bytes, in which case
// we disable setting a max.
if (!Arg.IsFixed)
MaxSize = 0;

Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
? extendRegister(Arg.Regs[RegIndex], VA, MaxSize)
: Arg.Regs[0];
Register ValVReg = Arg.Regs[RegIndex];
if (VA.getLocInfo() != CCValAssign::LocInfo::FPExt) {
MVT LocVT = VA.getLocVT();
MVT ValVT = VA.getValVT();

if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) {
std::swap(ValVT, LocVT);
MemSize = VA.getValVT().getStoreSize();
}

ValVReg = extendRegister(ValVReg, VA, MaxSize);
const LLT RegTy = MRI.getType(ValVReg);

// If we extended we might need to adjust the MMO's Size.
const LLT RegTy = MRI.getType(ValVReg);
if (RegTy.getSizeInBytes() > Size)
Size = RegTy.getSizeInBytes();
if (RegTy.getSizeInBits() < LocVT.getSizeInBits())
ValVReg = MIRBuilder.buildTrunc(RegTy, ValVReg).getReg(0);
} else {
// The store does not cover the full allocated stack slot.
MemSize = VA.getValVT().getStoreSize();
}

assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
}

bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
ISD::ArgFlagsTy Flags,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
bool Res;
bool IsCalleeWin = Subtarget.isCallingConvWin64(State.getCallingConv());
bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();

if (!State.isVarArg() && !UseVarArgsCCForFixed && !IsReturn)
applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);

bool Res;
if (Info.IsFixed && !UseVarArgsCCForFixed)
Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
else
@@ -254,6 +297,11 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {

MachineInstrBuilder MIB;
CCAssignFn *AssignFnVarArg;

/// Track whether this handler is used for a return rather than for function
/// argument passing. We apply the i1/i8/i16 stack-passed value hack to
/// arguments, but returns are never passed on the stack, so the type
/// adjustment must not be applied to them.
bool IsTailCall;

/// For tail calls, the byte offset of the call's argument area from the
@@ -381,7 +429,8 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
splitToValueTypes(CurArgInfo, SplitArgs, DL, CC);
}

OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn,
/*IsReturn*/ true);
Success =
handleAssignments(MIRBuilder, SplitArgs, Handler, CC, F.isVarArg());
}
@@ -885,7 +934,8 @@ bool AArch64CallLowering::lowerTailCall(

// Do the actual argument marshalling.
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, true, FPDiff);
AssignFnVarArg, /*IsReturn*/ false,
/*IsTailCall*/ true, FPDiff);
if (!handleAssignments(MIRBuilder, OutArgs, Handler, CalleeCC, Info.IsVarArg))
return false;

@@ -997,7 +1047,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

// Do the actual argument marshalling.
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, false);
AssignFnVarArg, /*IsReturn*/ false);
if (!handleAssignments(MIRBuilder, OutArgs, Handler, Info.CallConv,
Info.IsVarArg))
return false;
8 changes: 0 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -78,14 +78,6 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}

bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
ISD::ArgFlagsTy Flags,
CCState &State) override {
return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
}
};

struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -169,7 +169,7 @@ struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
return 1;
}

bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
90 changes: 11 additions & 79 deletions llvm/lib/Target/X86/X86CallLowering.cpp
@@ -50,50 +50,6 @@ using namespace llvm;
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}

// FIXME: This should be removed and the generic version used
bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL,
MachineRegisterInfo &MRI,
SplitArgTy PerformArgSplit) const {
const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
LLVMContext &Context = OrigArg.Ty->getContext();

SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
assert(OrigArg.Regs.size() == 1 && "Can't handle multple regs yet");

if (OrigArg.Ty->isVoidTy())
return true;

EVT VT = SplitVTs[0];
unsigned NumParts = TLI.getNumRegisters(Context, VT);

if (NumParts == 1) {
// replace the original type ( pointer -> GPR ).
SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context),
OrigArg.Flags, OrigArg.IsFixed);
return true;
}

SmallVector<Register, 8> SplitRegs;

EVT PartVT = TLI.getRegisterType(Context, VT);
Type *PartTy = PartVT.getTypeForEVT(Context);

for (unsigned i = 0; i < NumParts; ++i) {
ArgInfo Info =
ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
PartTy, OrigArg.Flags};
SplitArgs.push_back(Info);
SplitRegs.push_back(Info.Regs[0]);
}

PerformArgSplit(SplitRegs);
return true;
}

namespace {

struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
@@ -138,7 +94,7 @@ struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}

bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
@@ -179,27 +135,15 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
const DataLayout &DL = MF.getDataLayout();
LLVMContext &Ctx = Val->getType()->getContext();
const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();

SmallVector<EVT, 4> SplitEVTs;
ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
assert(VRegs.size() == SplitEVTs.size() &&
"For each split Type there should be exactly one VReg.");

SmallVector<ArgInfo, 8> SplitArgs;
for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI,
[&](ArrayRef<Register> Regs) {
MIRBuilder.buildUnmerge(Regs, VRegs[i]);
}))
return false;
}

ArgInfo OrigRetInfo(VRegs, Val->getType());
setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);

SmallVector<ArgInfo, 4> SplitRetInfos;
splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, F.getCallingConv());

X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler, F.getCallingConv(),
if (!handleAssignments(MIRBuilder, SplitRetInfos, Handler, F.getCallingConv(),
F.isVarArg()))
return false;
}
@@ -312,11 +256,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,

ArgInfo OrigArg(VRegs[Idx], Arg.getType());
setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
[&](ArrayRef<Register> Regs) {
MIRBuilder.buildMerge(VRegs[Idx][0], Regs);
}))
return false;
splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
Idx++;
}

@@ -374,11 +314,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (OrigArg.Regs.size() > 1)
return false;

if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
[&](ArrayRef<Register> Regs) {
MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]);
}))
return false;
splitToValueTypes(OrigArg, SplitArgs, DL, Info.CallConv);
}
// Do the actual argument marshalling.
X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
@@ -425,11 +361,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SplitArgs.clear();
SmallVector<Register, 8> NewRegs;

if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI,
[&](ArrayRef<Register> Regs) {
NewRegs.assign(Regs.begin(), Regs.end());
}))
return false;
splitToValueTypes(Info.OrigRet, SplitArgs, DL, Info.CallConv);

CallReturnHandler Handler(MIRBuilder, MRI, RetCC_X86, MIB);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler, Info.CallConv,
9 changes: 0 additions & 9 deletions llvm/lib/Target/X86/X86CallLowering.h
@@ -38,15 +38,6 @@ class X86CallLowering : public CallLowering {

bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;

private:
/// A function of this type is used to perform value split action.
using SplitArgTy = std::function<void(ArrayRef<Register>)>;

bool splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
SplitArgTy SplitArg) const;
};

} // end namespace llvm
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll
@@ -20,19 +20,21 @@ define void @test_varargs() {
; CHECK: $w0 = COPY [[C]](s32)
; CHECK: $d0 = COPY [[C1]](s64)
; CHECK: $x1 = COPY [[C2]](s64)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C3]](s8)
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s8)
; CHECK: G_STORE [[ANYEXT]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT]](s32)
; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1)
; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[C4]](s16)
; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s16)
; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1)
; CHECK: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT2]](s32)
; CHECK: G_STORE [[ANYEXT3]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1)
; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64)
; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
; CHECK: G_STORE [[ANYEXT2]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1)
; CHECK: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
; CHECK: G_STORE [[ANYEXT4]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1)
; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64)
; CHECK: G_STORE [[C6]](s32), [[PTR_ADD3]](p0) :: (store 4 into stack + 24, align 1)
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
@@ -224,3 +224,63 @@ entry:
ret i32 %conv
}

define void @arg_v2i64(<2 x i64> %arg) {
; CHECK-LABEL: name: arg_v2i64
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
; CHECK: G_STORE [[COPY]](<2 x s64>), [[DEF]](p0) :: (store 16 into `<2 x i64>* undef`)
; CHECK: RET_ReallyLR
store <2 x i64> %arg, <2 x i64>* undef
ret void
}

define void @arg_v8i64(<8 x i64> %arg) {
; CHECK-LABEL: name: arg_v8i64
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0, $q1, $q2, $q3
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s64>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>), [[COPY3]](<2 x s64>)
; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
; CHECK: G_STORE [[CONCAT_VECTORS]](<8 x s64>), [[DEF]](p0) :: (store 64 into `<8 x i64>* undef`)
; CHECK: RET_ReallyLR
store <8 x i64> %arg, <8 x i64>* undef
ret void
}

define void @arg_v4f32(<4 x float> %arg) {
; CHECK-LABEL: name: arg_v4f32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
; CHECK: G_STORE [[BITCAST]](<4 x s32>), [[DEF]](p0) :: (store 16 into `<4 x float>* undef`)
; CHECK: RET_ReallyLR
store <4 x float> %arg, <4 x float>* undef
ret void
}

define void @ret_arg_v16f32(<16 x float> %arg) {
; CHECK-LABEL: name: ret_arg_v16f32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0, $q1, $q2, $q3
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
; CHECK: [[BITCAST2:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY2]](<2 x s64>)
; CHECK: [[BITCAST3:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY3]](<2 x s64>)
; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[BITCAST]](<4 x s32>), [[BITCAST1]](<4 x s32>), [[BITCAST2]](<4 x s32>), [[BITCAST3]](<4 x s32>)
; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
; CHECK: G_STORE [[CONCAT_VECTORS]](<16 x s32>), [[DEF]](p0) :: (store 64 into `<16 x float>* undef`)
; CHECK: RET_ReallyLR
store <16 x float> %arg, <16 x float>* undef
ret void
}
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -2353,8 +2353,9 @@ declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i
define void @test_i1_arg_zext(void (i1)* %f) {
; CHECK-LABEL: name: test_i1_arg_zext
; CHECK: [[I1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[I1]](s1)
; CHECK: $w0 = COPY [[ZEXT]](s32)
; CHECK: [[ZEXT0:%[0-9]+]]:_(s8) = G_ZEXT [[I1]](s1)
; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT0]](s8)
; CHECK: $w0 = COPY [[ZEXT1]](s32)
call void %f(i1 true)
ret void
}
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll
@@ -253,7 +253,8 @@ define void @test_call_stack() {
; CHECK-NEXT: - { id: [[SLOT:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16, stack-id: default,
; CHECK-NEXT: isImmutable: true,
; CHECK: [[ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[SLOT]]
; CHECK: {{%[0-9]+}}:_(s1) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 16)
; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 16)
; CHECK-NEXT: {{%[0-9]+}}:_(s1) = G_TRUNC [[LOAD]]
define void @test_mem_i1([8 x i64], i1 %in) {
ret void
}
25 changes: 15 additions & 10 deletions llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll
@@ -9,8 +9,9 @@ define float @fadd_seq(float %start, <4 x float> %vec) {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q1, $s0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[COPY1]](<4 x s32>)
; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec)
@@ -22,8 +23,9 @@ define float @fadd_fast(float %start, <4 x float> %vec) {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q1, $s0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[COPY1]](<4 x s32>)
; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[BITCAST]](<4 x s32>)
; CHECK: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]]
; CHECK: $s0 = COPY [[FADD]](s32)
; CHECK: RET_ReallyLR implicit $s0
@@ -69,8 +71,9 @@ define float @fmax(<4 x float> %vec) {
; CHECK-LABEL: name: fmax
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[COPY]](<4 x s32>)
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_FMAX]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %vec)
@@ -81,8 +84,9 @@ define float @fmin(<4 x float> %vec) {
; CHECK-LABEL: name: fmin
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
@@ -93,8 +97,9 @@ define float @fmin_nnan(<4 x float> %vec) {
; CHECK-LABEL: name: fmin_nnan
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
76 changes: 61 additions & 15 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -429,8 +429,15 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_and_b32 s0, s2, s0
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16:
@@ -451,8 +458,15 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1
define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_commute:
@@ -473,8 +487,15 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s3, -1
; GFX6-NEXT: s_and_b32 s0, s2, s1
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_use:
@@ -501,9 +522,19 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s4, -1
; GFX6-NEXT: s_and_b32 s0, s2, s1
; GFX6-NEXT: s_and_b32 s1, s3, s1
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s3, s4, s1
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
@@ -529,12 +560,27 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg
}

define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GCN-LABEL: v_andn2_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: v_and_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX6-LABEL: v_andn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_andn2_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_andn2_v2i16:
; GFX10: ; %bb.0:
46 changes: 16 additions & 30 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -723,17 +723,12 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0
; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, v3, v1
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16:
@@ -764,16 +759,10 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_ashr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16_15:
@@ -805,13 +794,12 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
; GFX6-LABEL: s_ashr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s2, s2, s4
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: s_ashr_i32 s0, s0, s1
; GFX6-NEXT: s_sext_i32_i16 s1, s2
; GFX6-NEXT: s_ashr_i32 s1, s1, s3
; GFX6-NEXT: s_ashr_i32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s2, s3, s4
; GFX6-NEXT: s_sext_i32_i16 s1, s1
; GFX6-NEXT: s_ashr_i32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
@@ -863,11 +851,10 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
; GFX6-LABEL: ashr_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: s_sext_i32_i16 s0, s1
; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
@@ -902,14 +889,13 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: ashr_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40 changes: 19 additions & 21 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -459,18 +459,19 @@ define i16 @v_bswap_i16(i16 %src) {
define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
; GFX7-LABEL: s_bswap_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_and_b32 s3, s0, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xffff
; GFX7-NEXT: s_lshl_b32 s2, s0, 8
; GFX7-NEXT: s_lshl_b32 s1, s1, 8
; GFX7-NEXT: s_lshr_b32 s0, s0, 24
; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: s_lshr_b32 s3, s3, 8
; GFX7-NEXT: s_and_b32 s0, s0, s3
; GFX7-NEXT: s_lshr_b32 s0, s0, 8
; GFX7-NEXT: s_or_b32 s0, s0, s2
; GFX7-NEXT: s_lshl_b32 s2, s1, 8
; GFX7-NEXT: s_and_b32 s1, s1, s3
; GFX7-NEXT: s_lshr_b32 s1, s1, 8
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX7-NEXT: s_or_b32 s2, s3, s2
; GFX7-NEXT: s_bfe_u32 s1, s2, 0x100000
; GFX7-NEXT: s_lshl_b32 s0, s0, 16
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_v2i16:
@@ -578,18 +579,15 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; GFX7-LABEL: v_bswap_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_bfe_u32 v1, v2, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i16:
41 changes: 16 additions & 25 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -67,31 +67,22 @@ define i16 @halfinsts_add_i16(i16 %arg0) #1 {
define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
; CHECK-LABEL: name: halfinsts_add_v2i16
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[COPY5]]
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
; CHECK: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0
; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY4]]
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[COPY6]]
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
; CHECK: $vgpr0 = COPY [[COPY7]](s32)
; CHECK: $vgpr1 = COPY [[COPY8]](s32)
; CHECK: [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
; CHECK: S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1
%add = add <2 x i16> %arg0, %arg0
ret <2 x i16> %add
}
451 changes: 181 additions & 270 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll

Large diffs are not rendered by default.

77 changes: 33 additions & 44 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -104,23 +104,16 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
; GFX6-LABEL: v_fma_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16:
@@ -156,24 +149,21 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX6-LABEL: v_fma_v2f16_fneg_lhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs:
@@ -211,24 +201,21 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX6-LABEL: v_fma_v2f16_fneg_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_rhs:
@@ -266,26 +253,28 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v6
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, v2, v6
; GFX6-NEXT: s_mov_b32 s4, 0x80008000
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX6-NEXT: v_fma_f32 v0, v0, v1, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
; GFX6-NEXT: v_fma_f32 v1, v2, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs:
73 changes: 33 additions & 40 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -143,24 +143,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX6-LABEL: v_pow_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v2, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16:
@@ -228,25 +222,23 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v2, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
; GFX6-NEXT: v_exp_f32_e32 v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
@@ -318,25 +310,23 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v2, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
@@ -408,8 +398,15 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_mov_b32 s4, 0x80008000
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
@@ -425,10 +422,6 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
179 changes: 86 additions & 93 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3235,25 +3235,25 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: s_fshl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s5, s2, 15
; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000
; GFX6-NEXT: s_lshr_b32 s3, s0, 16
; GFX6-NEXT: s_lshr_b32 s4, s2, 16
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
; GFX6-NEXT: s_lshl_b32 s0, s0, s5
; GFX6-NEXT: s_and_b32 s5, s1, 0xffff
; GFX6-NEXT: s_lshr_b32 s5, s5, 1
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX6-NEXT: s_lshr_b32 s2, s5, s2
; GFX6-NEXT: s_and_b32 s6, s4, 15
; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
; GFX6-NEXT: s_lshl_b32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s6, 0xffff
; GFX6-NEXT: s_andn2_b32 s4, 15, s4
; GFX6-NEXT: s_and_b32 s2, s2, s6
; GFX6-NEXT: s_lshr_b32 s2, s2, 1
; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s2, s4, 15
; GFX6-NEXT: s_and_b32 s2, s5, 15
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX6-NEXT: s_andn2_b32 s4, 15, s4
; GFX6-NEXT: s_lshl_b32 s2, s3, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 17
; GFX6-NEXT: s_lshl_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s2, s3, s6
; GFX6-NEXT: s_andn2_b32 s4, 15, s5
; GFX6-NEXT: s_lshr_b32 s2, s2, 1
; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000
; GFX6-NEXT: s_lshr_b32 s1, s1, s3
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshr_b32 s2, s2, s3
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
@@ -3349,31 +3349,27 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-LABEL: v_fshl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v5, 15, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16
; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v4
; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i16:
@@ -3434,24 +3430,21 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_fshl_v2i16_4_8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3
; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000
; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: s_bfe_u32 s5, 11, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i16_4_8:
@@ -3570,23 +3563,23 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
; GFX6: ; %bb.0:
; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: s_mov_b32 s0, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
; GFX6-NEXT: s_lshr_b32 s2, s2, 1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
; GFX6-NEXT: s_and_b32 s0, s3, s0
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX6-NEXT: s_lshr_b32 s0, s1, 17
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
@@ -3660,29 +3653,29 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshl_v2i16_svs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX6-NEXT: s_and_b32 s4, s1, 15
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_lshl_b32 s0, s0, s4
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_andn2_b32 s1, 15, s3
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 17, v0
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_lshl_b32 s0, s2, s0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: s_andn2_b32 s2, 15, s3
; GFX6-NEXT: s_lshl_b32 s0, s1, s0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_v2i16_svs:
@@ -3746,22 +3739,22 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshl_v2i16_vss:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s3, s1, 15
; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_lshr_b32 s2, s1, 16
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0
; GFX6-NEXT: s_and_b32 s3, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s3, s3, 1
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_lshr_b32 s1, s3, s1
; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
; GFX6-NEXT: s_and_b32 s1, s2, 15
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_lshr_b32 s0, s0, 17
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: s_and_b32 s0, s1, s4
; GFX6-NEXT: s_andn2_b32 s2, 15, s3
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, s1
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
291 changes: 147 additions & 144 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -570,8 +570,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8)
; CHECK: $vgpr0 = COPY [[SEXT]](s32)
; CHECK: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[LOAD]](s8)
; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[SEXT]](s16)
; CHECK: $vgpr0 = COPY [[SEXT1]](s32)
; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -627,8 +628,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8)
; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
; CHECK: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[LOAD]](s8)
; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT]](s16)
; CHECK: $vgpr0 = COPY [[ZEXT1]](s32)
; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -258,7 +258,8 @@ define void @void_func_i24_zeroext(i24 zeroext %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
; CHECK: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 24
; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_ZEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store 3 into `i24 addrspace(1)* undef`, align 4, addrspace 1)
@@ -273,7 +274,8 @@ define void @void_func_i24_signext(i24 signext %arg0) #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 24
; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_SEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store 3 into `i24 addrspace(1)* undef`, align 4, addrspace 1)
@@ -2105,10 +2107,11 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD]](s32)
; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 2 from %fixed-stack.2, align 4, addrspace 5)
; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 2 from %fixed-stack.1, align 8, addrspace 5)
; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
@@ -2120,8 +2123,8 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
; CHECK: [[COPY35:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: [[COPY36:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[TRUNC]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[TRUNC]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[TRUNC1]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD2]](s16), [[COPY35]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD3]](s16), [[COPY36]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1)
; CHECK: [[COPY37:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
42 changes: 20 additions & 22 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -738,14 +738,12 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16:
@@ -776,11 +774,11 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_lshr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 15, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 15, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16_15:
@@ -812,12 +810,12 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
; GFX6-LABEL: s_lshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s2, s2, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s0, s0, s1
; GFX6-NEXT: s_lshr_b32 s1, s2, s3
; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s2, s3, s4
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@@ -867,13 +865,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: lshr_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
@@ -904,13 +902,13 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: lshr_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
72 changes: 59 additions & 13 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -429,8 +429,15 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_or_b32 s0, s2, s0
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16:
@@ -451,8 +458,15 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1)
define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_commute:
@@ -473,8 +487,15 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s3, -1
; GFX6-NEXT: s_or_b32 s0, s2, s1
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_use:
@@ -501,9 +522,19 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s4, -1
; GFX6-NEXT: s_or_b32 s0, s2, s1
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s3, s4, s1
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
@@ -529,12 +560,27 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %
}

define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GCN-LABEL: v_orn2_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX6-LABEL: v_orn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_orn2_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_orn2_v2i16:
; GFX10: ; %bb.0:
52 changes: 20 additions & 32 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -218,33 +218,23 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX6-LABEL: v_roundeven_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16:
@@ -282,35 +272,33 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX6-LABEL: v_roundeven_v2f16_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-NEXT: v_rndne_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_rndne_f32_e32 v1, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16_fneg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_rndne_f32_e32 v1, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16_fneg:
35 changes: 11 additions & 24 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2702,35 +2702,28 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
; GFX6-NEXT: v_max_i32_e32 v1, v5, v1
; GFX6-NEXT: v_min_i32_e32 v1, v1, v4
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
@@ -2775,20 +2768,18 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_saddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_min_i32 s7, s0, 0
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_sub_i32 s7, s5, s7
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_max_i32 s6, s0, 0
; GFX6-NEXT: s_sub_i32 s6, s4, s6
; GFX6-NEXT: s_max_i32 s1, s7, s1
; GFX6-NEXT: s_min_i32 s1, s1, s6
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_max_i32 s2, s7, s2
; GFX6-NEXT: s_min_i32 s2, s2, s6
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_max_i32 s3, s1, 0
; GFX6-NEXT: s_sub_i32 s3, s4, s3
@@ -2863,11 +2854,9 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: saddsat_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-NEXT: s_min_i32 s5, s0, 0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_sub_i32 s5, s3, s5
; GFX6-NEXT: s_brev_b32 s2, -2
@@ -2938,11 +2927,9 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: saddsat_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
; GFX6-NEXT: s_brev_b32 s2, -2
12 changes: 10 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -611,9 +611,13 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_lshl_b32 s0, s0, 2
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: ; return to shader part epilog
@@ -661,9 +665,13 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
35 changes: 12 additions & 23 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -731,15 +731,10 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v2
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16:
@@ -770,11 +765,8 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_shl_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16_15:
@@ -806,11 +798,10 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
; GFX6-LABEL: s_shl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_lshl_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s2, s3, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
Expand Down Expand Up @@ -856,9 +847,8 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: shl_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1
; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
Expand Down Expand Up @@ -894,12 +884,11 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: shl_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37 changes: 12 additions & 25 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2688,35 +2688,28 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_ssubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:
@@ -2761,20 +2754,18 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_ssubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_max_i32 s6, s0, -1
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_sub_i32 s6, s6, s4
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_min_i32 s7, s0, -1
; GFX6-NEXT: s_sub_i32 s7, s7, s5
; GFX6-NEXT: s_max_i32 s1, s6, s1
; GFX6-NEXT: s_min_i32 s1, s1, s7
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_max_i32 s2, s6, s2
; GFX6-NEXT: s_min_i32 s2, s2, s7
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_max_i32 s3, s1, -1
; GFX6-NEXT: s_sub_i32 s3, s3, s4
@@ -2849,11 +2840,9 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: ssubsat_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: s_max_i32 s4, s0, -1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_sub_i32 s4, s4, s2
; GFX6-NEXT: s_brev_b32 s3, 1
@@ -2924,11 +2913,9 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: ssubsat_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
; GFX6-NEXT: s_brev_b32 s3, 1
28 changes: 9 additions & 19 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -1748,22 +1748,18 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_uaddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0
; GFX6-NEXT: v_min_u32_e32 v1, v4, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:
@@ -1795,14 +1791,12 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_uaddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_not_b32 s4, s0
; GFX6-NEXT: s_min_u32 s1, s4, s1
; GFX6-NEXT: s_add_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_min_u32 s2, s4, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_not_b32 s3, s1
; GFX6-NEXT: s_min_u32 s2, s3, s2
@@ -1847,9 +1841,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: uaddsat_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: v_min_u32_e32 v0, s2, v0
@@ -1893,9 +1885,7 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: uaddsat_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
24 changes: 7 additions & 17 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1662,20 +1662,16 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
@@ -1707,13 +1703,11 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_min_u32 s2, s0, s2
; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_min_u32 s1, s0, s1
; GFX6-NEXT: s_sub_i32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
@@ -1757,8 +1751,6 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: usubsat_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
@@ -1801,8 +1793,6 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: usubsat_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_min_u32_e32 v2, s0, v0
7 changes: 7 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -25,6 +25,13 @@ entry:
define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v2i16_one_use:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s4
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s4
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_xor_b32 s0, s0, s1
; GFX7-NEXT: s_xor_b32 s0, s0, -1
; GFX7-NEXT: ; return to shader part epilog