Skip to content

Commit

Permalink
[AMDGPU] Allow 8,16 bit sources in calculateSrcByte
Browse files Browse the repository at this point in the history
This is required for many trees produced in practice for i8 CodeGen.

Differential Revision: https://reviews.llvm.org/D155864

Change-Id: Iac01d183d9998b15138bdc7a5051e3bed338e7d9
  • Loading branch information
jrbyrnes committed Jul 28, 2023
1 parent 27f39ad commit 391249d
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 65 deletions.
85 changes: 55 additions & 30 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10428,10 +10428,12 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Depth >= 6)
return std::nullopt;

auto ValueSize = Op.getValueSizeInBits();
if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
return std::nullopt;

switch (Op->getOpcode()) {
case ISD::TRUNCATE: {
if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
return std::nullopt;
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}

Expand All @@ -10451,9 +10453,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
}

default: {
if (Op.getScalarValueSizeInBits() != 32)
return std::nullopt;

return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
}
Expand Down Expand Up @@ -10595,6 +10594,17 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return std::nullopt;
}

case ISD::CopyFromReg: {
auto BitWidth = Op.getScalarValueSizeInBits();
if (BitWidth % 8)
llvm_unreachable("Invalid type in CopyFromReg");

if (BitWidth / 8 > Index)
return calculateSrcByte(Op, StartingIndex, Index);

return std::nullopt;
}

case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
Expand Down Expand Up @@ -10631,7 +10641,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}

// Returns true if the Operand is a scalar that is 16 bits, or is extended from a 16-bit value
static bool is16BitScalarOp(SDValue &Operand) {
static bool isExtendedFrom16Bits(SDValue &Operand) {

switch (Operand.getOpcode()) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
Expand All @@ -10647,7 +10658,7 @@ static bool is16BitScalarOp(SDValue &Operand) {
auto MemVT = L->getMemoryVT();
return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
}
return false;
return L->getMemoryVT().getSizeInBits() == 16;
}
default:
return false;
Expand Down Expand Up @@ -10675,29 +10686,29 @@ static bool addresses16Bits(int Mask) {
// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.
static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
SDValue &OtherOp) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;

// ByteProvider only accepts byte-sized operands
assert(Op.getValueType().getSizeInBits() == 32);
assert(OtherOp.getValueType().getSizeInBits() == 32);
assert(Op.getValueType().isByteSized());
assert(OtherOp.getValueType().isByteSized());

auto OpIs16Bit = is16BitScalarOp(Op);
auto OtherOpIs16Bit = is16BitScalarOp(Op);
auto TempOp = peekThroughBitcasts(Op);
auto TempOtherOp = peekThroughBitcasts(OtherOp);

// If there is a size mismatch, then we must use masking on at least one
// operand
if (OpIs16Bit != OtherOpIs16Bit)
auto OpIs16Bit =
TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
if (!OpIs16Bit)
return true;

// If both operands are 16 bit, return whether or not we cleanly address both
if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
isExtendedFrom16Bits(TempOtherOp);
if (!OtherOpIs16Bit)
return true;

// Both are 32 bit operands
return true;
// Do we cleanly address both
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
}

SDValue SITargetLowering::performOrCombine(SDNode *N,
Expand Down Expand Up @@ -10822,8 +10833,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
// TODO support constantZero
if (!P || P->isConstantZero())
if (!P || P->isConstantZero()) {
return SDValue();
}

PermNodes.push_back(*P);
}
Expand All @@ -10832,7 +10844,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,

int FirstSrc = 0;
std::optional<int> SecondSrc;
uint64_t permMask = 0x00000000;
uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
Expand All @@ -10843,15 +10855,15 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
if (SecondSrc.has_value())
if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
return SDValue();

// Set the index of the second distinct Src node
SecondSrc = i;
assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
32);
assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
SrcByteAdjust = 0;
}
assert(PermOp.SrcOffset + SrcByteAdjust < 8);
assert(!DAG.getDataLayout().isBigEndian());
permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}

SDValue Op = *PermNodes[FirstSrc].Src;
Expand All @@ -10860,8 +10872,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,

// Check that we are not just extracting the bytes in order from an op
if (Op == OtherOp) {
int Low16 = permMask & 0xffff;
int Hi16 = (permMask & 0xffff0000) >> 16;
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;

bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
Expand All @@ -10871,10 +10883,23 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return Op;
}

if (hasEightBitAccesses(permMask, Op, OtherOp)) {
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
SDLoc DL(N);
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
if (Op.getValueSizeInBits() < 32)
// If the ultimate src is less than 32 bits, then we will only be
// using bytes 0 through (Op.getValueSizeInBits() / 8) - 1 in the or.
// calculateByteProvider would not have returned Op as a source if we
// used a byte that is outside its ValueType. Thus, we are free to
// ANY_EXTEND as the extended bits are don't-cares.
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op);

if (OtherOp.getValueSizeInBits() < 32)
OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);

return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
DAG.getConstant(permMask, DL, MVT::i32));
DAG.getConstant(PermMask, DL, MVT::i32));
}
}
}
Expand Down
12 changes: 1 addition & 11 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1263,18 +1263,13 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
; GFX9-LABEL: test_ret_v3bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v3bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
ret <3 x bfloat> %in
Expand Down Expand Up @@ -1802,9 +1797,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
Expand Down Expand Up @@ -1841,11 +1833,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/load-hi16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
Expand Down Expand Up @@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/load-lo16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -621,10 +621,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_mov_b32 s4, 0x3020504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -734,12 +734,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_mov_b32 s4, 0x3020504
; GFX803-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: ds_write_b16 v3, v4
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
Expand Down
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/AMDGPU/permute_i8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2717,3 +2717,37 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
ret void
}

define void @Source16Bit(i16 %in, <2 x i16> %reg) {
; GFX10-LABEL: Source16Bit:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: Source16Bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x3050204
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
entry:
%elt0 = extractelement <2 x i16> %reg, i32 1
%e0b0 = and i16 %elt0, 255
%e0b1 = and i16 %elt0, -256
%e1b0 = and i16 %in, 255
%e1b1 = and i16 %in, -256
%tmp0 = shl i16 %e0b0, 8
%byte0 = or i16 %tmp0, %e1b0
%tmp2 = lshr i16 %e1b1, 8
%byte1 = or i16 %e0b1, %tmp2
%ext0 = zext i16 %byte0 to i32
%ext1 = zext i16 %byte1 to i32
%shifted = shl i32 %ext1, 16
%result = or i32 %shifted, %ext0
store i32 %result, ptr addrspace(1) undef
ret void
}
Loading

0 comments on commit 391249d

Please sign in to comment.