From 391249d1afe47d1671486a267eaf821a694987ea Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes
Date: Thu, 29 Jun 2023 15:20:29 -0700
Subject: [PATCH] [AMDGPU] Allow 8,16 bit sources in calculateSrcByte

This is required for many trees produced in practice by i8 CodeGen.

Differential Revision: https://reviews.llvm.org/D155864

Change-Id: Iac01d183d9998b15138bdc7a5051e3bed338e7d9
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 85 +++++++++++++++--------
 llvm/test/CodeGen/AMDGPU/bf16.ll          | 12 +---
 llvm/test/CodeGen/AMDGPU/load-hi16.ll     | 36 +++++-----
 llvm/test/CodeGen/AMDGPU/load-lo16.ll     |  8 +--
 llvm/test/CodeGen/AMDGPU/permute_i8.ll    | 34 +++++++++
 llvm/test/CodeGen/AMDGPU/trunc-combine.ll |  4 +-
 6 files changed, 114 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d7bf5561a7c38..5509b408eb495 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10428,10 +10428,12 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
   if (Depth >= 6)
     return std::nullopt;
 
+  auto ValueSize = Op.getValueSizeInBits();
+  if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+    return std::nullopt;
+
   switch (Op->getOpcode()) {
   case ISD::TRUNCATE: {
-    if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
-      return std::nullopt;
     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
   }
 
@@ -10451,9 +10453,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
   }
 
   default: {
-    if (Op.getScalarValueSizeInBits() != 32)
-      return std::nullopt;
-
     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
   }
   }
@@ -10595,6 +10594,17 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     return std::nullopt;
   }
 
+  case ISD::CopyFromReg: {
+    auto BitWidth = Op.getScalarValueSizeInBits();
+    if (BitWidth % 8)
+      llvm_unreachable("Invalid type in CopyFromReg");
+
+    if (BitWidth / 8 > Index)
+      return calculateSrcByte(Op, StartingIndex, Index);
+
+    return std::nullopt;
+  }
+
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
@@ -10631,7 +10641,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 }
 
-// Returns true if the Operand is a scalar and is 16 bits
-static bool is16BitScalarOp(SDValue &Operand) {
+// Returns true if the Operand is a scalar that is extended from a 16 bit value
+static bool isExtendedFrom16Bits(SDValue &Operand) {
+
   switch (Operand.getOpcode()) {
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
@@ -10647,7 +10658,7 @@ static bool is16BitScalarOp(SDValue &Operand) {
       auto MemVT = L->getMemoryVT();
       return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
     }
-    return false;
+    return L->getMemoryVT().getSizeInBits() == 16;
   }
   default:
     return false;
@@ -10675,29 +10686,29 @@ static bool addresses16Bits(int Mask) {
 // Do not lower into v_perm if the operands are actually 16 bit
 // and the selected bits (based on PermMask) correspond with two
 // easily addressable 16 bit operands.
-static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
+static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
                                 SDValue &OtherOp) {
   int Low16 = PermMask & 0xffff;
   int Hi16 = (PermMask & 0xffff0000) >> 16;
 
-  // ByteProvider only accepts 32 bit operands
-  assert(Op.getValueType().getSizeInBits() == 32);
-  assert(OtherOp.getValueType().getSizeInBits() == 32);
+  assert(Op.getValueType().isByteSized());
+  assert(OtherOp.getValueType().isByteSized());
 
-  auto OpIs16Bit = is16BitScalarOp(Op);
-  auto OtherOpIs16Bit = is16BitScalarOp(Op);
+  auto TempOp = peekThroughBitcasts(Op);
+  auto TempOtherOp = peekThroughBitcasts(OtherOp);
 
-  // If there is a size mismatch, then we must use masking on at least one
-  // operand
-  if (OpIs16Bit != OtherOpIs16Bit)
+  auto OpIs16Bit =
+      TempOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
+  if (!OpIs16Bit)
     return true;
 
-  // If both operands are 16 bit, return whether or not we cleanly address both
-  if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
-    return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
+                        isExtendedFrom16Bits(TempOtherOp);
+  if (!OtherOpIs16Bit)
+    return true;
 
-  // Both are 32 bit operands
-  return true;
+  // Do we cleanly address both?
+  return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
 }
 
 SDValue SITargetLowering::performOrCombine(SDNode *N,
@@ -10822,8 +10833,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     std::optional<ByteProvider<SDValue>> P =
         calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
     // TODO support constantZero
-    if (!P || P->isConstantZero())
+    if (!P || P->isConstantZero()) {
       return SDValue();
+    }
 
     PermNodes.push_back(*P);
   }
@@ -10832,7 +10844,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
 
   int FirstSrc = 0;
   std::optional<int> SecondSrc;
-  uint64_t permMask = 0x00000000;
+  uint64_t PermMask = 0x00000000;
 
   for (size_t i = 0; i < PermNodes.size(); i++) {
     auto PermOp = PermNodes[i];
     // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
@@ -10843,15 +10855,15 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
       if (SecondSrc.has_value())
         if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
           return SDValue();
 
+      // Set the index of the second distinct Src node
       SecondSrc = i;
-      assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
-             32);
+      assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
       SrcByteAdjust = 0;
     }
     assert(PermOp.SrcOffset + SrcByteAdjust < 8);
     assert(!DAG.getDataLayout().isBigEndian());
-    permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+    PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
   }
 
   SDValue Op = *PermNodes[FirstSrc].Src;
@@ -10860,8 +10872,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
 
   // Check that we are not just extracting the bytes in order from an op
   if (Op == OtherOp) {
-    int Low16 = permMask & 0xffff;
-    int Hi16 = (permMask & 0xffff0000) >> 16;
+    int Low16 = PermMask & 0xffff;
+    int Hi16 = (PermMask & 0xffff0000) >> 16;
 
     bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
     bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
@@ -10871,10 +10883,23 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
       return Op;
   }
 
-  if (hasEightBitAccesses(permMask, Op, OtherOp)) {
+  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
     SDLoc DL(N);
+    assert(Op.getValueType().isByteSized() &&
+           OtherOp.getValueType().isByteSized());
+    if (Op.getValueSizeInBits() < 32)
+      // If the ultimate src is less than 32 bits, then we will only be
+      // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
+      // CalculateByteProvider would not have returned Op as source if we
+      // used a byte that is outside its ValueType. Thus, we are free to
+      // ANY_EXTEND as the extended bits are don't-cares.
+      Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op);
+
+    if (OtherOp.getValueSizeInBits() < 32)
+      OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, OtherOp);
+
     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
-                       DAG.getConstant(permMask, DL, MVT::i32));
+                       DAG.getConstant(PermMask, DL, MVT::i32));
   }
 }
 }
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 700325859151e..c354f783f5766 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1263,18 +1263,13 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
 ; GFX9-LABEL: test_ret_v3bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_ret_v3bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <3 x bfloat> %in
@@ -1802,9 +1797,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v4
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -1841,11 +1833,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v4
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 52f6ca52d6b23..aa034cc185b47 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void
@load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 { ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16) ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) # ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: 
v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace( ; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3) ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v2, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: ds_write_b16 v1, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4 ; GFX803-NEXT: v_mov_b32_e32 v0, v2 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index 3e8e3dec7f44c..0c8209baf09cd 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -621,10 +621,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, < ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 ; GFX803-NEXT: v_mov_b32_e32 v2, 0 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -734,12 +734,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: ds_write_b16 v3, v4 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 378907d20738f..234161a36ee03 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -2717,3 +2717,37 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 ret void } + +define void @Source16Bit(i16 %in, <2 x i16> %reg) { +; GFX10-LABEL: Source16Bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: Source16Bit: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x3050204 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +entry: + %elt0 = extractelement <2 x i16> %reg, i32 1 + %e0b0 = and i16 %elt0, 255 + %e0b1 = and i16 %elt0, -256 + %e1b0 = and i16 %in, 255 + %e1b1 = and i16 %in, -256 + %tmp0 = shl i16 %e0b0, 8 + %byte0 = or i16 %tmp0, %e1b0 + %tmp2 = lshr i16 %e1b1, 8 + %byte1 = or i16 %e0b1, %tmp2 + %ext0 = zext i16 %byte0 to i32 + %ext1 = zext i16 %byte1 to i32 + %shifted = shl i32 %ext1, 16 + %result = or i32 %shifted, %ext0 + store i32 %result, ptr addrspace(1) undef + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index afec8f3512650..d200b25c17d33 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -150,8 +150,8 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 { ; VI-LABEL: trunc_v2i64_arg_to_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b32 s4, 0x1000504 +; VI-NEXT: v_perm_b32 v0, v0, v2, s4 ; VI-NEXT: s_setpc_b64 s[30:31] %trunc = trunc <2 x i64> %arg0 to <2 x i16> ret <2 x i16> %trunc
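
Note for reviewers: the new check lines are easier to audit if you decode the v_perm_b32 selectors that appear above (0x1000504, 0x3020504, 0x3050204). Each selector byte i picks destination byte i from the 8-byte concatenation of the two sources, with src1 supplying bytes 0-3 (least significant) and src0 supplying bytes 4-7. Below is a minimal host-side C++ sketch of that selection, assuming little-endian byte numbering and ignoring the special selector values above 7 (sign and constant selects); the helper name PermB32 is made up for illustration:

// C++ model of the v_perm_b32 byte select (sketch only).
#include <cstdint>
#include <cstdio>

static uint32_t PermB32(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  // Bytes 0-3 of the combined value come from Src1, bytes 4-7 from Src0.
  uint64_t Bytes = (uint64_t(Src0) << 32) | Src1;
  uint32_t Dst = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff; // selector for dst byte I
    Dst |= uint32_t((Bytes >> (8 * S)) & 0xff) << (8 * I);
  }
  return Dst;
}

int main() {
  // 0x1000504 places the low half of src0 in the low 16 bits of the result
  // and the low half of src1 in the high 16 bits -- the pattern used by the
  // load-hi16 checks above.
  std::printf("0x%08x\n", PermB32(0x44332211, 0x88776655, 0x1000504));
  // prints 0x66552211
}

This is consistent with how PermMask is assembled in performOrCombine: bytes of the first distinct source are addressed at offsets 4-7 and bytes of the second at offsets 0-3, which is why the well-formed low halves checked there are 0x0504 (first source) and 0x0100 (second source).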