47 changes: 47 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1106,14 +1106,21 @@ let OtherPredicates = [HasFlatAddressSpace] in {

def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i16>;
//def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i8>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, v2i8>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, v2i8>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, v2i8>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, v2i8>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, v2i8>;
//def : FlatLoadPat <FLAT_LOAD_UBYTE, load_flat, i8>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
@@ -1125,6 +1132,9 @@ def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;

def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
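// Store v2i8 like a 16-bit scalar: a byte store for truncating stores, a 16-bit store otherwise.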
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, v2i8>;
//def : FlatStorePat <FLAT_STORE_BYTE, store_flat, i8>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, v2i8>;

foreach vt = Reg32Types.types in {
def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
@@ -1150,6 +1160,10 @@ def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;

//def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i8>;
def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, v2i8>;
def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, v2i8>;

foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>;
@@ -1350,18 +1364,29 @@ let OtherPredicates = [HasFlatGlobalInsts] in {

defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i16>;
//defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i8>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, v2i8>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, v2i8>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, v2i8>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, v2i8>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, v2i8>;
//defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, load_global, i8>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, v2i8>;

foreach vt = Reg32Types.types in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, load_global, vt>;
@@ -1392,6 +1417,11 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;

//defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, store_global, i8>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, v2i8>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, v2i8>;

let OtherPredicates = [HasD16LoadStore] in {
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
@@ -1417,6 +1447,12 @@ defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;

//defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i8>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, v2i8>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, v2i8>;

defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;

@@ -1521,6 +1557,13 @@ defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, i16>;

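// v2i8 private (scratch) loads reuse the byte/short load patterns used for flat and global above.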
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, v2i8>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, v2i8>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, v2i8>;
defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, v2i8>;
//defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, load_private, i8>;

foreach vt = Reg32Types.types in {
defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
defm : ScratchFLATStorePats <SCRATCH_STORE_DWORD, store_private, vt>;
@@ -1544,6 +1587,10 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;

//defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, store_private, i8>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, v2i8>;
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, v2i8>;

let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in {
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
130 changes: 107 additions & 23 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -83,6 +83,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

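// Register the new 8-bit types as 32-bit scalar register values.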
addRegisterClass(MVT::v4i8, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v2i8, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::i8, &AMDGPU::SReg_32RegClass);
//addRegisterClass(MVT::i8, &AMDGPU::VReg_32RegClass);

addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

const SIRegisterInfo *TRI = STI.getRegisterInfo();
@@ -173,20 +178,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
//setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
//setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
//setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
//setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
//setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
@@ -242,7 +247,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, MVT::v4i8,
MVT::v2i8}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -538,6 +544,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

/*
setOperationAction(ISD::STORE, MVT::v2i8, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i8, MVT::i16);
setOperationAction(ISD::LOAD, MVT::v2i8, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2i8, MVT::i16);
*/

// XXX - Do these do anything? Vector constants turn into build_vector.
setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

@@ -607,7 +620,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

if (!Subtarget->hasVOP3PInsts())
setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);

setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8, Custom);

setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
@@ -645,7 +660,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

setOperationAction(ISD::VECTOR_SHUFFLE,
{MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
MVT::v16f16, MVT::v16i16},
MVT::v16f16, MVT::v16i16, MVT::v4i8, MVT::v2i8},
Custom);

for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
@@ -826,8 +841,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
return VT.isInteger() ? MVT::i32 : MVT::f32;
}
if (Size == 8)
return Subtarget->has16BitInsts() ? MVT::v4i8 : MVT::i32;

if (Size < 16)
if (Size < 16 && Size != 8)
return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
}
@@ -843,13 +860,16 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
EVT VT) const {
if (CC == CallingConv::AMDGPU_KERNEL)
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

if (VT.isVector()) {
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();

// FIXME: Should probably promote 8-bit vectors to i16.
if (Size == 8 && Subtarget->has16BitInsts())
return (NumElts + 2) / 4;

if (Size == 16 && Subtarget->has16BitInsts())
return (NumElts + 1) / 2;

Expand All @@ -872,6 +892,12 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
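// 8-bit vectors are broken down into v4i8 pieces, one 32-bit register each.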
if (Size == 8 && Subtarget->has16BitInsts()) {
RegisterVT = MVT::v4i8;
NumIntermediates = (NumElts + 1) / 4;
IntermediateVT = RegisterVT;
return NumIntermediates;
}
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will be still be
// inconsistent.
@@ -4653,6 +4679,10 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,


SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
errs() << "SITL::LowerOp on op\n";
Op.dump();
errs() << "\n";

switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -4686,6 +4716,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
errs() << "calling lowerEVE\n";
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
@@ -5689,9 +5720,14 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
unsigned EltSize = EltVT.getSizeInBits();
SDLoc SL(Op);


// Specially handle the case of v4i16 with static indexing.
unsigned NumElts = VecVT.getVectorNumElements();
auto KIdx = dyn_cast<ConstantSDNode>(Idx);

errs() << "legalizing insert_ve with num elts, eltsize " << NumElts << " " << EltSize << "\n";


if (NumElts == 4 && EltSize == 16 && KIdx) {
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

@@ -5761,13 +5797,19 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);


errs() << "in lowerEVE\n";


EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
EVT VecVT = Vec.getValueType();
unsigned VecSize = VecVT.getSizeInBits();
EVT EltVT = VecVT.getVectorElementType();

errs() << "found EVE with res: " << ResultVT.getEVTString() << " and src: " << VecVT.getEVTString() << "\n";

DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

// Make sure we do any optimizations that will make it easier to fold
@@ -5843,6 +5885,11 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}

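// An i8 result is produced by truncating the extracted element.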
if (ResultVT == MVT::i8) {
SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i8, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}

return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}

@@ -5853,12 +5900,22 @@ static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {

SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
errs() << "in SIISelLowering lowerVECTOR_SHUFFLE\n";
SDLoc SL(Op);
EVT ResultVT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);

EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
EVT EltVT = PackVT.getVectorElementType();
EVT PackVT;
EVT EltVT;
unsigned ScalarSize = ResultVT.getVectorElementType().getSizeInBits();
if (ScalarSize == 8) {
  // Pack pairs of 8-bit elements as v2i8.
  PackVT = MVT::v2i8;
} else {
  PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
}
EltVT = PackVT.getVectorElementType();
int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

// vector_shuffle <0,1,6,7> lhs, rhs
@@ -5924,6 +5981,8 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SDLoc SL(Op);
EVT VT = Op.getValueType();

errs() << "in lowerBuild_Vector with VT: " << VT.getEVTString() << "\n";

if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8i16 || VT == MVT::v8f16) {
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
@@ -5969,32 +6028,57 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}

assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
if (VT != MVT::v2i8) {
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
}


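// The packing below generalizes the v2i16/v2f16 path: bitcast f16 sources to i16, zero-extend
// both elements to the packed integer width, shift the high element left by its size in bits,
// and OR the halves together.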
EVT SrcVT = Op.getOperand(1).getValueType(); // i8, i16
EVT BCVT = SrcVT == MVT::f16 ? MVT::i16 : SrcVT;

unsigned VecSize = VT.getSizeInBits(); // 16, 32
EVT EltVT = SrcVT.isVector() ? SrcVT.getVectorElementType() : SrcVT.getScalarType();

unsigned EltSize = EltVT.getSizeInBits();

SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
// Shift amount for the packed high element: EltSize bits (1 << Log2(EltSize)).
SDValue ScaledShift = DAG.getNode(ISD::SHL, SL, MVT::i32, DAG.getConstant(1, SL, MVT::i32), ScaleFactor);

MVT IntVT = MVT::getIntegerVT(VecSize); // i16, i32
MVT FloatVT = MVT::getFloatingPointVT(VecSize); // f32
MVT RestIntVT = MVT::getIntegerVT(VT.getSizeInBits());

SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);

// Avoid adding defined bits with the zero_extend.
if (Hi.isUndef()) {
Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
Lo = DAG.getNode(ISD::BITCAST, SL, BCVT, Lo);
SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, IntVT, Lo);
return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
}

Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
Hi = DAG.getNode(ISD::BITCAST, SL, BCVT, Hi);
Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Hi);

SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
DAG.getConstant(16, SL, MVT::i32));
SDValue ShlHi = DAG.getNode(ISD::SHL, SL, IntVT, Hi, ScaledShift);
if (Lo.isUndef())
return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
Lo = DAG.getNode(ISD::BITCAST, SL, BCVT, Lo);
Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Lo);

SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
SDValue Or = DAG.getNode(ISD::OR, SL, IntVT, Lo, ShlHi);
return DAG.getNode(ISD::BITCAST, SL, VT, Or);
//errs() << "Build Final node : \n";
//temp->dump();
//errs() << "\n";
//return Or;
}

bool
75 changes: 75 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1308,11 +1308,15 @@ foreach Index = 0-31 in {
// FIXME: Why do only some of these type combinations for SReg and
// VReg?
// 16-bit bitcast

def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;

def : BitConvert <v2i8, i16, SReg_32>;
def : BitConvert <i16, v2i8, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
@@ -1329,6 +1333,9 @@ def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;

def : BitConvert <v4i8, i32, SReg_32>;
def : BitConvert <v4i8, v2i16, SReg_32>;

// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
@@ -2782,6 +2789,74 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

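// A v2i8 whose low element is zero is built by shifting the high element left by 8.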
def : GCNPat <
(v2i8 (DivergentBinFrag<build_vector> (i8 0), (i8 SReg_32:$src1))),
(v2i8 (V_LSHLREV_B32_e64 (i8 8), SReg_32:$src1))
>;

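// Assemble a v4i8 in a 32-bit register by shifting each byte into position and OR-ing the results.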
def : GCNPat <
(v4i8 (build_vector (i8 SReg_32:$src0), (i8 SReg_32:$src1), (i8 SReg_32:$src2), (i8 SReg_32:$src3))),
(v4i8 (V_OR_B32_e64
(S_LSHL_B32 SReg_32:$src3, (i32 24)),
(V_OR_B32_e64
(S_LSHL_B32 SReg_32:$src2, (i32 16)),
(V_OR_B32_e64
(S_LSHL_B32 SReg_32:$src1, (i32 8)),
SReg_32:$src0))))
>;

/*
def : GCNPat <
(v4i8 (build_vector (i8 SReg_32:$src0), (i8 SReg_32:$src1), (i8 SReg_32:$src2), (i8 SReg_32:$src3))),
(v4i8 (i32 (V_OR_B32_e64 (i32 (S_LSHL_B32 SReg_32:$src3, (i32 24))), (i32 (V_OR_B32_e64 (i32 (S_LSHL_B32 SReg_32:$src2, (i32 16))), (i32 (V_OR_B32_e64 (i32 (S_LSHL_B32 SReg_32:$src1, (i32 8))), SReg_32:$src0)))))))
>;
*/
/*
def : GCNPat <
(v2i8 (build_vector (i8:$src0), (i8:$src1))),
(v2i8 (i16 (V_OR_B32_e64 (i16 (S_LSHL_B32 SReg_32:$src1, (i32 8))), SReg_32:$src0)))
>;


def : GCNPat <
(v2i8 (build_vector i8:$src0, (i8 undef))),
(COPY $src0)
>;

def : GCNPat <
(v2i8 (DivergentBinFrag<build_vector> (i8 undef), (i8 SReg_32:$src1))),
(V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)
>;
*/


foreach Ty = [i16, f16] in {
30 changes: 15 additions & 15 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -369,7 +369,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
}

// SGPR 32-bit registers
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
@@ -406,7 +406,7 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;

// Trap handler TMP 32-bit registers
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add (sequence "TTMP%u", 0, 15))> {
let isAllocatable = 0;
let HasSGPR = 1;
@@ -527,8 +527,8 @@ class RegisterTypes<list<ValueType> reg_types> {
list<ValueType> types = reg_types;
}

def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
def Reg16Types : RegisterTypes<[i16, f16, v2i8, i8]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v4i8, p2, p3, p5, p6]>;

let HasVGPR = 1 in {
def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -600,7 +600,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
}

// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 0;
let Size = 32;
@@ -639,7 +639,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
// Register classes used as source and destination
//===----------------------------------------------------------------------===//

def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -662,7 +662,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
let GeneratePressureSet = 0, HasSGPR = 1 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE,
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
@@ -680,7 +680,7 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let AllocationPriority = 0;
}

def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 0;
}
@@ -691,7 +691,7 @@ def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let AllocationPriority = 0;
}

def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 0;
}
@@ -710,20 +710,20 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
} // End GeneratePressureSet = 0

// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8, i1], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 0;
let HasSGPR = 1;
}

let GeneratePressureSet = 0 in {
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasSGPR = 1;
}

def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4i8, v2i8, i8], 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 1;
@@ -807,7 +807,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Re
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}

def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
@@ -887,14 +887,14 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
let HasVGPR = 1;
}

def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
}

def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i8], 32,
(add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;