[AMDGPU]: Accept constant zero bytes in v_perm OrCombine #66533
base: main

Conversation
@llvm/pr-subscribers-backend-amdgpu

Changes

This provides the capability for the OR combine to produce v_perm instructions with masks containing 0x0c (the constant-zero byte selector).

Patch is 47.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66533.diff

9 Files Affected:
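As background for the 0x0c masks that appear in the patch and the updated tests: below is a minimal, unofficial C++ model of v_perm_b32 byte selection, covering only the selector values this patch relies on (0-7 pick a byte of the concatenated sources, 0x0c yields a constant zero byte; the ISA's other selectors, such as sign replication and constant 0xFF, are deliberately omitted). Operand order follows the Src1:Src2 comment in the patch, with the first source occupying the high bytes.

#include <cassert>
#include <cstdint>

// Sketch of v_perm_b32: byte i of Mask selects byte i of Dst.
// Selectors 0-7 index the 64-bit value Hi:Lo (Lo = bytes 0-3,
// Hi = bytes 4-7); selector 0x0c produces a constant zero byte.
uint32_t permB32(uint32_t Hi, uint32_t Lo, uint32_t Mask) {
  uint64_t Src = ((uint64_t)Hi << 32) | Lo;
  uint32_t Dst = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t Sel = (Mask >> (8 * I)) & 0xff;
    uint8_t Byte = 0;
    if (Sel <= 7)
      Byte = (Src >> (8 * Sel)) & 0xff;
    else
      assert(Sel == 0x0c && "only the constant-zero selector is modeled");
    Dst |= (uint32_t)Byte << (8 * I);
  }
  return Dst;
}

int main() {
  // Mask 0xc0c0004, as in the updated tests: byte 0 comes from the first
  // source, byte 1 from the second, bytes 2-3 are constant zero.
  assert(permB32(0xAA, 0xBB, 0x0c0c0004) == 0xBBAA);
  // Mask 0x0c0c0c0c selects zero everywhere, which is why the combine
  // folds it straight to the constant 0.
  assert(permB32(0x12345678, 0x9ABCDEF0, 0x0c0c0c0c) == 0);
}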
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c85ec3f9f5212f..18cd159780e9d1c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10700,6 +10700,23 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueSizeInBits();
+ if (ScalarSize != 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
+ SrcIndex = ScalarSize == 8 ? VecIdx : VecIdx * 2 + SrcIndex;
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ // The scalar is 32 bits, so just use the scalar
+ return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+ }
+
default: {
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
@@ -10958,6 +10975,9 @@ static bool addresses16Bits(int Mask) {
int Low8 = Mask & 0xff;
int Hi8 = (Mask & 0xff00) >> 8;
+ if (Low8 == 0x0c || Hi8 == 0x0c)
+ return false;
+
assert(Low8 < 8 && Hi8 < 8);
// Are the bytes contiguous in the order of increasing addresses.
bool IsConsecutive = (Hi8 - Low8 == 1);
@@ -11052,12 +11072,33 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// If all the uses of an or need to extract the individual elements, do not
// attempt to lower into v_perm
auto usesCombinedOperand = [](SDNode *OrUse) {
+ // The combined bytes seem to be getting extracted
+ if (OrUse->getOpcode() == ISD::SRL || OrUse->getOpcode() == ISD::TRUNCATE)
+ return false;
+
+ if (OrUse->getOpcode() == ISD::AND) {
+ auto SelectMask = dyn_cast<ConstantSDNode>(OrUse->getOperand(1));
+ if (SelectMask && (SelectMask->getZExtValue() == 0xFF))
+ return false;
+ }
+
+ if (OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE0 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE1 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE2 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE3) {
+ return false;
+ }
+
+ if (auto StoreUse = dyn_cast<StoreSDNode>(OrUse))
+ if (StoreUse->isTruncatingStore() &&
+ StoreUse->getMemoryVT().getSizeInBits() == 8)
+ return false;
+
// If we have any non-vectorized use, then it is a candidate for v_perm
- if (OrUse->getOpcode() != ISD::BITCAST ||
- !OrUse->getValueType(0).isVector())
+ if (!(OrUse->getValueType(0).isVector() &&
+ OrUse->getOpcode() != ISD::BUILD_VECTOR))
return true;
- // If we have any non-vectorized use, then it is a candidate for v_perm
for (auto VUse : OrUse->uses()) {
if (!VUse->getValueType(0).isVector())
return true;
@@ -11119,8 +11160,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
- // TODO support constantZero
- if (!P || P->isConstantZero())
+ if (!P)
return SDValue();
PermNodes.push_back(*P);
@@ -11128,11 +11168,17 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
if (PermNodes.size() != 4)
return SDValue();
- int FirstSrc = 0;
+ size_t FirstSrc = 0;
std::optional<int> SecondSrc;
uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
+ if (PermOp.isConstantZero()) {
+ if (FirstSrc == i)
+ ++FirstSrc;
+ PermMask |= 0x0c << (i * 8);
+ continue;
+ }
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
// by sizeof(Src2) = 4
int SrcByteAdjust = 4;
@@ -11152,6 +11198,10 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}
+ SDLoc DL(N);
+ if (PermMask == 0x0c0c0c0c)
+ return DAG.getConstant(0, DL, MVT::i32);
+
SDValue Op = *PermNodes[FirstSrc].Src;
SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
: *PermNodes[FirstSrc].Src;
@@ -11170,7 +11220,6 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
- SDLoc DL(N);
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 901cbd4a5272059..a02d11533a988f1 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1428,7 +1428,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s8, 0x4000405
+; VI-NEXT: s_mov_b32 s8, 0xc0c0004
+; VI-NEXT: s_mov_b32 s9, 0x4000405
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
@@ -1438,35 +1439,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ubyte v6, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4
+; VI-NEXT: flat_load_ubyte v7, v[2:3]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[0:1]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[2:3]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
+; VI-NEXT: v_perm_b32 v3, v7, v6, s8
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_perm_b32 v0, v1, v0, s8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
+; VI-NEXT: v_perm_b32 v4, v3, v0, s9
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v3
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_perm_b32 v4, v4, v5, s8
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1475,24 +1472,24 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3
+; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] offset:3
+; GFX10-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3
; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; GFX10-NEXT: v_perm_b32 v5, v1, v2, 0xc0c0004
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4
+; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0xc0c0004
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v5
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x4000405
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
-; GFX10-NEXT: global_store_dword v7, v4, s[2:3]
+; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT: global_store_dword v6, v4, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1505,16 +1502,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3
; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3
; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2
-; GFX9-NEXT: s_mov_b32 s4, 0x4000405
+; GFX9-NEXT: s_mov_b32 s4, 0xc0c0004
+; GFX9-NEXT: s_mov_b32 s5, 0x4000405
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; GFX9-NEXT: v_perm_b32 v0, v1, v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
+; GFX9-NEXT: v_perm_b32 v1, v4, v2, s4
+; GFX9-NEXT: v_perm_b32 v4, v0, v1, s5
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4
; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX9-NEXT: global_store_dword v5, v4, s[2:3]
; GFX9-NEXT: s_endpgm
@@ -1527,19 +1525,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2
-; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3
-; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3
+; GFX11-NEXT: global_load_u8 v2, v0, s[4:5] offset:3
+; GFX11-NEXT: global_load_u8 v3, v0, s[6:7] offset:3
; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; GFX11-NEXT: v_perm_b32 v4, v1, v2, 0xc0c0004
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v5, v0, v3, 0xc0c0004
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
+; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v4
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v5
; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT: global_store_b32 v6, v4, s[2:3]
@@ -1794,43 +1793,46 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_mov_b32 s4, 0xc0c0004
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v10, v[2:3]
-; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
+; VI-NEXT: flat_load_ubyte v9, v[2:3]
+; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: flat_load_ubyte v7, v[8:9]
-; VI-NEXT: flat_load_ubyte v8, v[2:3]
-; VI-NEXT: flat_load_ubyte v2, v[0:1]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v9, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_perm_b32 v7, v8, v9, s4
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_perm_b32 v1, v6, v10, s4
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v2
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v3, v1
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -1839,90 +1841,86 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
-; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
-; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4
-; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
-; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
-; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
+; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0xc0c0004
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT: v_perm_b32 v8, v6, v5, 0xc0c0004
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v1
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8
+; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v7i8_to_v7f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4
-; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2
-; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3]
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:6
+; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v6, v0, s[2:3] offset:3
+; GFX9-NEXT: s_mov_b32 s2, 0xc0c0004
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8
+; GFX9-NEXT: v_perm_b32 v0, v5, v4, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
-; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
+; GFX9-NEXT: v_perm_b32 v8, v3, v6, s2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v1
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v8
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: load_v7i8_to_v7f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6
-; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
-; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:4
+; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:6
+; GFX11-NEXT: global_load_u8 v3, v0, s[2:3] offset:3
+; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:2
; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
-; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4
; GFX11-NEXT: ...
[truncated]
Should compute instead of assuming element size
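A sketch of one way to read this suggestion (hypothetical, not code from the patch): in the EXTRACT_VECTOR_ELT case above, derive the byte offset from the element size rather than special-casing 8-bit and 16-bit elements.

  // Hypothetical generalization: bytes per element computed from
  // ScalarSize instead of assuming it is exactly 8 or 16. For an 8-bit
  // element the incoming SrcIndex is 0, so this lines up with the
  // original special case.
  if ((VecIdx + 1) * ScalarSize > 32)
    return std::nullopt;
  uint64_t BytesPerElt = ScalarSize / 8;
  SrcIndex = VecIdx * BytesPerElt + SrcIndex;
  return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);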
Force-pushed from 0a523a2 to fc904cc
Passes psdb
Needs update to main
Force-pushed from fc904cc to c803764
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from c803764 to 2b083d4
Force-pushed from 2b083d4 to a7b5122
Just a rebase for 3115ad8; still need to address comments.
Force-pushed from a7b5122 to d083871
Change-Id: I454ccee1e33867359ae8053464a2ca57a669d73f
Force-pushed from d083871 to 0b201d6
The latest iteration adds a heuristic that disables the perm combine if the two operands are i8s (either actual i8s or i8s post-legalization), since this type of perm can always be represented as lshl_or. I have also updated the heuristic so that we bypass it if the subtarget generation doesn't support lshl_or. This bypass has introduced a set of lit changes (usually in the form of VI checks) that are independent of the constant-zero work; they occur because we now allow the perm combine to fire more often.
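To illustrate the equivalence this heuristic relies on, a self-contained sketch (same unofficial byte-selection model as earlier, not code from the patch): a byte-select perm of two single-byte operands computes exactly what v_lshl_or_b32 computes, so the perm form buys nothing for pure i8 inputs.

#include <cassert>
#include <cstdint>

// Same minimal v_perm_b32 model as above: selectors 0-7 pick a byte of
// Hi:Lo; any other selector (0x0c here) contributes a zero byte.
static uint32_t permB32(uint32_t Hi, uint32_t Lo, uint32_t Mask) {
  uint64_t Src = ((uint64_t)Hi << 32) | Lo;
  uint32_t Dst = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t Sel = (Mask >> (8 * I)) & 0xff;
    if (Sel <= 7)
      Dst |= (uint32_t)((Src >> (8 * Sel)) & 0xff) << (8 * I);
  }
  return Dst;
}

int main() {
  uint32_t A = 0xDE, B = 0xAD; // two zero-extended i8 operands
  // v_lshl_or_b32 dst, A, 8, B computes (A << 8) | B ...
  uint32_t LshlOr = (A << 8) | B;
  // ... and a perm taking byte 1 from A, byte 0 from B, and constant
  // zero for the upper bytes produces the identical value.
  assert(permB32(A, B, 0x0c0c0400) == LshlOr);
}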
Instead of rebasing, you should push new commits and merge commits. I have no idea what was in the previous version now
Change-Id: I32516bfc8ff6bc88048b7300c8a82ba8cef68c0d
Force-pushed from 45aad68 to 4c7d899
I see how that could make things harder; noted. Technically, there is a roundabout way to get this info: click the "Compare" button in the GUI, or, if you prefer the CLI and have a local branch up-to-date with this one, run something along the lines of the commands below.
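(A hypothetical example, not part of the original comment; it assumes the two pushed heads shown above, e.g. 45aad68 and 4c7d899, are still reachable.)

git fetch origin pull/66533/head    # current PR head; older heads can often still be fetched by SHA
git range-diff 45aad68...4c7d899    # commit-by-commit comparison of the two revisions
git diff 45aad68 4c7d899            # or a plain tree diff between them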
But yes, I do understand this is not ideal and is more work for the reviewer. FYI, the new / unreviewed changes are the inclusion of:
; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4
This is another size regression